117 lines
5.3 KiB
Diff
117 lines
5.3 KiB
Diff
|
From 52d71ad0e5c27808e7d8eea8a0920298837e408c Mon Sep 17 00:00:00 2001
|
||
|
From: Xavi Hernandez <xhernandez@redhat.com>
|
||
|
Date: Wed, 17 Jul 2019 14:50:22 +0200
|
||
|
Subject: [PATCH 271/276] cluster/ec: fix EIO error for concurrent writes on
|
||
|
sparse files
|
||
|
|
||
|
EC doesn't allow concurrent writes on overlapping areas, they are
|
||
|
serialized. However non-overlapping writes are serviced in parallel.
|
||
|
When a write is not aligned, EC first needs to read the entire chunk
|
||
|
from disk, apply the modified fragment and write it again.
|
||
|
|
||
|
The problem appears on sparse files because a write to an offset
|
||
|
implicitly creates data on offsets below it (so, in some way, they
|
||
|
are overlapping). For example, if a file is empty and we read 10 bytes
|
||
|
from offset 10, read() will return 0 bytes. Now, if we write one byte
|
||
|
at offset 1M and retry the same read, the system call will return 10
|
||
|
bytes (all containing 0's).
|
||
|
|
||
|
So if we have two writes, the first one at offset 10 and the second one
|
||
|
at offset 1M, EC will send both in parallel because they do not overlap.
|
||
|
However, the first one will try to read missing data from the first chunk
|
||
|
(i.e. offsets 0 to 9) to recombine the entire chunk and do the final write.
|
||
|
This read will happen in parallel with the write to 1M. What could happen
|
||
|
is that half of the bricks process the write before the read, and the
|
||
|
half do the read before the write. Some bricks will return 10 bytes of
|
||
|
data while the otherw will return 0 bytes (because the file on the brick
|
||
|
has not been expanded yet).
|
||
|
|
||
|
When EC tries to recombine the answers from the bricks, it can't, because
|
||
|
it needs more than half consistent answers to recover the data. So this
|
||
|
read fails with EIO error. This error is propagated to the parent write,
|
||
|
which is aborted and EIO is returned to the application.
|
||
|
|
||
|
The issue happened because EC assumed that a write to a given offset
|
||
|
implies that offsets below it exist.
|
||
|
|
||
|
This fix prevents the read of the chunk from bricks if the current size
|
||
|
of the file is smaller than the read chunk offset. This size is
|
||
|
correctly tracked, so this fixes the issue.
|
||
|
|
||
|
Also modifying ec-stripe.t file for Test #13 within it.
|
||
|
In this patch, if a file size is less than the offset we are writing, we
|
||
|
fill zeros in head and tail and do not consider it strip cache miss.
|
||
|
That actually make sense as we know what data that part holds and there is
|
||
|
no need of reading it from bricks.
|
||
|
|
||
|
Upstream-patch: https://review.gluster.org/c/glusterfs/+/23066
|
||
|
Change-Id: Ic342e8c35c555b8534109e9314c9a0710b6225d6
|
||
|
fixes: bz#1731448
|
||
|
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
|
||
|
Reviewed-on: https://code.engineering.redhat.com/gerrit/177975
|
||
|
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
||
|
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
|
||
|
---
|
||
|
tests/basic/ec/ec-stripe.t | 2 +-
|
||
|
xlators/cluster/ec/src/ec-inode-write.c | 26 +++++++++++++++++---------
|
||
|
2 files changed, 18 insertions(+), 10 deletions(-)
|
||
|
|
||
|
diff --git a/tests/basic/ec/ec-stripe.t b/tests/basic/ec/ec-stripe.t
|
||
|
index 1e940eb..98b9229 100644
|
||
|
--- a/tests/basic/ec/ec-stripe.t
|
||
|
+++ b/tests/basic/ec/ec-stripe.t
|
||
|
@@ -202,7 +202,7 @@ TEST truncate -s 0 $B0/test_file
|
||
|
TEST truncate -s 0 $M0/test_file
|
||
|
TEST dd if=$B0/misc_file of=$B0/test_file bs=1022 count=5 oflag=seek_bytes,sync seek=400 conv=notrunc
|
||
|
TEST dd if=$B0/misc_file of=$M0/test_file bs=1022 count=5 oflag=seek_bytes,sync seek=400 conv=notrunc
|
||
|
-check_statedump_md5sum 4 5
|
||
|
+check_statedump_md5sum 4 4
|
||
|
clean_file_unmount
|
||
|
|
||
|
### 14 - Truncate to invalidate all but one the stripe in cache ####
|
||
|
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
|
||
|
index ea55140..a45e6d6 100644
|
||
|
--- a/xlators/cluster/ec/src/ec-inode-write.c
|
||
|
+++ b/xlators/cluster/ec/src/ec-inode-write.c
|
||
|
@@ -2013,20 +2013,28 @@ ec_writev_start(ec_fop_data_t *fop)
|
||
|
if (err != 0) {
|
||
|
goto failed_fd;
|
||
|
}
|
||
|
+ tail = fop->size - fop->user_size - fop->head;
|
||
|
if (fop->head > 0) {
|
||
|
- found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD);
|
||
|
- if (!found_stripe) {
|
||
|
- if (ec_make_internal_fop_xdata(&xdata)) {
|
||
|
- err = -ENOMEM;
|
||
|
- goto failed_xdata;
|
||
|
+ if (current > fop->offset) {
|
||
|
+ found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD);
|
||
|
+ if (!found_stripe) {
|
||
|
+ if (ec_make_internal_fop_xdata(&xdata)) {
|
||
|
+ err = -ENOMEM;
|
||
|
+ goto failed_xdata;
|
||
|
+ }
|
||
|
+ ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
|
||
|
+ ec_writev_merge_head, NULL, fd, ec->stripe_size,
|
||
|
+ fop->offset, 0, xdata);
|
||
|
+ }
|
||
|
+ } else {
|
||
|
+ memset(fop->vector[0].iov_base, 0, fop->head);
|
||
|
+ memset(fop->vector[0].iov_base + fop->size - tail, 0, tail);
|
||
|
+ if (ec->stripe_cache && (fop->size <= ec->stripe_size)) {
|
||
|
+ ec_add_stripe_in_cache(ec, fop);
|
||
|
}
|
||
|
- ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
|
||
|
- ec_writev_merge_head, NULL, fd, ec->stripe_size,
|
||
|
- fop->offset, 0, xdata);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
- tail = fop->size - fop->user_size - fop->head;
|
||
|
if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) {
|
||
|
/* Current locking scheme will make sure the 'current' below will
|
||
|
* never decrease while the fop is in progress, so the checks will
|
||
|
--
|
||
|
1.8.3.1
|
||
|
|