kernel/ext4-fix-race-between-write-and-fcntl-F_SETFL.patch

180 lines
6.2 KiB
Diff

From: Dmitry Monakhov <dmonakhov@xxxxxxxx>
Date: Thu, 9 Oct 2014 15:14:47 +0400
Subject: [PATCH] ext4: fix race between write and fcntl(F_SETFL)
O_DIRECT flags can be toggeled via fcntl(F_SETFL).
But this value checked twice inside ext4_file_write_iter() and __generic_file_write()
which result in BUG_ON (see typical stack trace below)
In order to fix this we have to use our own copy of __generic_file_write and
pass o_direct status explicitly.
TESTCASE: xfstest:generic/326 (http://patchwork.ozlabs.org/patch/397949/)
kernel BUG at fs/ext4/inode.c:2960!
invalid opcode: 0000 [#1] SMP
Modules linked in: brd iTCO_wdt lpc_ich mfd_core igb ptp dm_mirror dm_region_hash dm_log dm_mod
CPU: 6 PID: 5505 Comm: aio-dio-fcntl-r Not tainted 3.17.0-rc2-00176-gff5c017 #161
Hardware name: Intel Corporation W2600CR/W2600CR, BIOS SE5C600.86B.99.99.x028.061320111235 06/13/2011
task: ffff88080e95a7c0 ti: ffff88080f908000 task.ti: ffff88080f908000
RIP: 0010:[<ffffffff811fabf2>] [<ffffffff811fabf2>] ext4_direct_IO+0x162/0x3d0
RSP: 0018:ffff88080f90bb58 EFLAGS: 00010246
RAX: 0000000000000400 RBX: ffff88080fdb2a28 RCX: 00000000a802c818
RDX: 0000040000080000 RSI: ffff88080d8aeb80 RDI: 0000000000000001
RBP: ffff88080f90bbc8 R08: 0000000000000000 R09: 0000000000001581
R10: 0000000000000000 R11: 0000000000000000 R12: ffff88080d8aeb80
R13: ffff88080f90bbf8 R14: ffff88080fdb28c8 R15: ffff88080fdb2a28
FS: 00007f23b2055700(0000) GS:ffff880818400000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f23b2045000 CR3: 000000080cedf000 CR4: 00000000000407e0
Stack:
ffff88080f90bb98 0000000000000000 7ffffffffffffffe ffff88080fdb2c30
0000000000000200 0000000000000200 0000000000000001 0000000000000200
ffff88080f90bbc8 ffff88080fdb2c30 ffff88080f90be08 0000000000000200
Call Trace:
[<ffffffff8112ca9d>] generic_file_direct_write+0xed/0x180
[<ffffffff8112f2b2>] __generic_file_write_iter+0x222/0x370
[<ffffffff811f495b>] ext4_file_write_iter+0x34b/0x400
[<ffffffff811bd709>] ? aio_run_iocb+0x239/0x410
[<ffffffff811bd709>] ? aio_run_iocb+0x239/0x410
[<ffffffff810990e5>] ? local_clock+0x25/0x30
[<ffffffff810abd94>] ? __lock_acquire+0x274/0x700
[<ffffffff811f4610>] ? ext4_unwritten_wait+0xb0/0xb0
[<ffffffff811bd756>] aio_run_iocb+0x286/0x410
[<ffffffff810990e5>] ? local_clock+0x25/0x30
[<ffffffff810ac359>] ? lock_release_holdtime+0x29/0x190
[<ffffffff811bc05b>] ? lookup_ioctx+0x4b/0xf0
[<ffffffff811bde3b>] do_io_submit+0x55b/0x740
[<ffffffff811bdcaa>] ? do_io_submit+0x3ca/0x740
[<ffffffff811be030>] SyS_io_submit+0x10/0x20
[<ffffffff815ce192>] system_call_fastpath+0x16/0x1b
Code: 01 48 8b 80 f0 01 00 00 48 8b 18 49 8b 45 10 0f 85 f1 01 00 00 48 03 45 c8 48 3b 43 48 0f 8f e3 01 00 00 49 83 7c 24 18 00 75 04 <0f> 0b eb fe f0 ff 83 ec 01 00 00 49 8b 44 24 18 8b 00 85 c0 89
RIP [<ffffffff811fabf2>] ext4_direct_IO+0x162/0x3d0
RSP <ffff88080f90bb58>
Upstream-status: Submitted but likely not accepted
Bugzilla: 1152608
Reported-by: Sasha Levin <sasha.levin@xxxxxxxxxx>
Signed-off-by: Dmitry Monakhov <dmonakhov@xxxxxxxxxx>
---
fs/ext4/file.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 95 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index aca7b24a4432..8477eb259809 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -88,6 +88,100 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
return 0;
}
+/**
+ * copy of __generic_file_write_iter with explicit O_DIRECT status
+ * @iocb: IO state structure (file, offset, etc.)
+ * @from: iov_iter with data to write
+ * @direct: perform O_DIRECT IO
+ */
+static ssize_t
+__ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from, int direct)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ loff_t pos = iocb->ki_pos;
+ ssize_t written = 0;
+ ssize_t err;
+ ssize_t status;
+ size_t count = iov_iter_count(from);
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = mapping->backing_dev_info;
+ err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+ if (err)
+ goto out;
+
+ if (count == 0)
+ goto out;
+
+ iov_iter_truncate(from, count);
+
+ err = file_remove_suid(file);
+ if (err)
+ goto out;
+
+ err = file_update_time(file);
+ if (err)
+ goto out;
+
+ /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
+ if (unlikely(direct)) {
+ loff_t endbyte;
+
+ written = generic_file_direct_write(iocb, from, pos);
+ if (written < 0 || written == count)
+ goto out;
+
+ /*
+ * direct-io write to a hole: fall through to buffered I/O
+ * for completing the rest of the request.
+ */
+ pos += written;
+ count -= written;
+
+ status = generic_perform_write(file, from, pos);
+ /*
+ * If generic_perform_write() returned a synchronous error
+ * then we want to return the number of bytes which were
+ * direct-written, or the error code if that was zero. Note
+ * that this differs from normal direct-io semantics, which
+ * will return -EFOO even if some bytes were written.
+ */
+ if (unlikely(status < 0)) {
+ err = status;
+ goto out;
+ }
+ iocb->ki_pos = pos + status;
+ /*
+ * We need to ensure that the page cache pages are written to
+ * disk and invalidated to preserve the expected O_DIRECT
+ * semantics.
+ */
+ endbyte = pos + status - 1;
+ err = filemap_write_and_wait_range(file->f_mapping, pos,
+ endbyte);
+ if (err == 0) {
+ written += status;
+ invalidate_mapping_pages(mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
+ } else {
+ /*
+ * We don't know how much we wrote, so just return
+ * the number of bytes which were direct-written
+ */
+ }
+ } else {
+ written = generic_perform_write(file, from, pos);
+ if (likely(written >= 0))
+ iocb->ki_pos = pos + written;
+ }
+out:
+ current->backing_dev_info = NULL;
+ return written ? written : err;
+}
+
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
@@ -172,7 +266,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
}
- ret = __generic_file_write_iter(iocb, from);
+ ret = __ext4_file_write_iter(iocb, from, o_direct);
mutex_unlock(&inode->i_mutex);
if (ret > 0) {
--
1.9.3