diff --git a/kvm-block-posix-Always-allocate-the-first-block.patch b/kvm-block-posix-Always-allocate-the-first-block.patch new file mode 100644 index 0000000..6b4229c --- /dev/null +++ b/kvm-block-posix-Always-allocate-the-first-block.patch @@ -0,0 +1,386 @@ +From 58dc8ae23325384b0d9494d203254dc2f6a99255 Mon Sep 17 00:00:00 2001 +From: Max Reitz +Date: Mon, 9 Sep 2019 07:38:21 +0100 +Subject: [PATCH 5/6] block: posix: Always allocate the first block + +RH-Author: Max Reitz +Message-id: <20190909073822.26191-3-mreitz@redhat.com> +Patchwork-id: 90333 +O-Subject: [RHEL-AV-8.1.0 qemu-kvm PATCH 2/3] block: posix: Always allocate the first block +Bugzilla: 1749134 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth +RH-Acked-by: Kevin Wolf + +From: Nir Soffer + +When creating an image with preallocation "off" or "falloc", the first +block of the image is typically not allocated. When using Gluster +storage backed by XFS filesystem, reading this block using direct I/O +succeeds regardless of request length, fooling alignment detection. + +In this case we fallback to a safe value (4096) instead of the optimal +value (512), which may lead to unneeded data copying when aligning +requests. Allocating the first block avoids the fallback. + +Since we allocate the first block even with preallocation=off, we no +longer create images with zero disk size: + + $ ./qemu-img create -f raw test.raw 1g + Formatting 'test.raw', fmt=raw size=1073741824 + + $ ls -lhs test.raw + 4.0K -rw-r--r--. 
1 nsoffer nsoffer 1.0G Aug 16 23:48 test.raw + +And converting the image requires additional cluster: + + $ ./qemu-img measure -f raw -O qcow2 test.raw + required size: 458752 + fully allocated size: 1074135040 + +When using format like vmdk with multiple files per image, we allocate +one block per file: + + $ ./qemu-img create -f vmdk -o subformat=twoGbMaxExtentFlat test.vmdk 4g + Formatting 'test.vmdk', fmt=vmdk size=4294967296 compat6=off hwversion=undefined subformat=twoGbMaxExtentFlat + + $ ls -lhs test*.vmdk + 4.0K -rw-r--r--. 1 nsoffer nsoffer 2.0G Aug 27 03:23 test-f001.vmdk + 4.0K -rw-r--r--. 1 nsoffer nsoffer 2.0G Aug 27 03:23 test-f002.vmdk + 4.0K -rw-r--r--. 1 nsoffer nsoffer 353 Aug 27 03:23 test.vmdk + +I did quick performance test for copying disks with qemu-img convert to +new raw target image to Gluster storage with sector size of 512 bytes: + + for i in $(seq 10); do + rm -f dst.raw + sleep 10 + time ./qemu-img convert -f raw -O raw -t none -T none src.raw dst.raw + done + +Here is a table comparing the total time spent: + +Type Before(s) After(s) Diff(%) +--------------------------------------- +real 530.028 469.123 -11.4 +user 17.204 10.768 -37.4 +sys 17.881 7.011 -60.7 + +We can see very clear improvement in CPU usage. + +Signed-off-by: Nir Soffer +Message-id: 20190827010528.8818-2-nsoffer@redhat.com +Reviewed-by: Max Reitz +Signed-off-by: Max Reitz +(cherry picked from commit 3a20013fbb26d2a1bd11ef148eefdb1508783787) +Signed-off-by: Max Reitz +Signed-off-by: Danilo C. L. 
de Paula +--- + block/file-posix.c | 51 ++++++++++++++++++++++++++++++++++++++++ + tests/qemu-iotests/059.out | 2 +- + tests/qemu-iotests/150.out | 11 --------- + tests/qemu-iotests/150.out.qcow2 | 11 +++++++++ + tests/qemu-iotests/150.out.raw | 12 ++++++++++ + tests/qemu-iotests/175 | 19 ++++++++++----- + tests/qemu-iotests/175.out | 8 +++---- + tests/qemu-iotests/178.out.qcow2 | 4 ++-- + tests/qemu-iotests/221.out | 12 ++++++---- + tests/qemu-iotests/253.out | 12 ++++++---- + 10 files changed, 110 insertions(+), 32 deletions(-) + delete mode 100644 tests/qemu-iotests/150.out + create mode 100644 tests/qemu-iotests/150.out.qcow2 + create mode 100644 tests/qemu-iotests/150.out.raw + +diff --git a/block/file-posix.c b/block/file-posix.c +index b8b4dad..8ea9889 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -1749,6 +1749,43 @@ static int handle_aiocb_discard(void *opaque) + return ret; + } + ++/* ++ * Help alignment probing by allocating the first block. ++ * ++ * When reading with direct I/O from unallocated area on Gluster backed by XFS, ++ * reading succeeds regardless of request length. In this case we fallback to ++ * safe alignment which is not optimal. Allocating the first block avoids this ++ * fallback. ++ * ++ * fd may be opened with O_DIRECT, but we don't know the buffer alignment or ++ * request alignment, so we use safe values. ++ * ++ * Returns: 0 on success, -errno on failure. Since this is an optimization, ++ * caller may ignore failures. ++ */ ++static int allocate_first_block(int fd, size_t max_size) ++{ ++ size_t write_size = (max_size < MAX_BLOCKSIZE) ++ ? BDRV_SECTOR_SIZE ++ : MAX_BLOCKSIZE; ++ size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize()); ++ void *buf; ++ ssize_t n; ++ int ret; ++ ++ buf = qemu_memalign(max_align, write_size); ++ memset(buf, 0, write_size); ++ ++ do { ++ n = pwrite(fd, buf, write_size, 0); ++ } while (n == -1 && errno == EINTR); ++ ++ ret = (n == -1) ? 
-errno : 0; ++ ++ qemu_vfree(buf); ++ return ret; ++} ++ + static int handle_aiocb_truncate(void *opaque) + { + RawPosixAIOData *aiocb = opaque; +@@ -1788,6 +1825,17 @@ static int handle_aiocb_truncate(void *opaque) + /* posix_fallocate() doesn't set errno. */ + error_setg_errno(errp, -result, + "Could not preallocate new data"); ++ } else if (current_length == 0) { ++ /* ++ * posix_fallocate() uses fallocate() if the filesystem ++ * supports it, or fallback to manually writing zeroes. If ++ * fallocate() was used, unaligned reads from the fallocated ++ * area in raw_probe_alignment() will succeed, hence we need to ++ * allocate the first block. ++ * ++ * Optimize future alignment probing; ignore failures. ++ */ ++ allocate_first_block(fd, offset); + } + } else { + result = 0; +@@ -1849,6 +1897,9 @@ static int handle_aiocb_truncate(void *opaque) + if (ftruncate(fd, offset) != 0) { + result = -errno; + error_setg_errno(errp, -result, "Could not resize file"); ++ } else if (current_length == 0 && offset > current_length) { ++ /* Optimize future alignment probing; ignore failures. 
*/ ++ allocate_first_block(fd, offset); + } + return result; + default: +diff --git a/tests/qemu-iotests/059.out b/tests/qemu-iotests/059.out +index 4fab42a..fe3f861 100644 +--- a/tests/qemu-iotests/059.out ++++ b/tests/qemu-iotests/059.out +@@ -27,7 +27,7 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824000 subformat=twoGbMax + image: TEST_DIR/t.vmdk + file format: vmdk + virtual size: 0.977 TiB (1073741824000 bytes) +-disk size: 16 KiB ++disk size: 1.97 MiB + Format specific information: + cid: XXXXXXXX + parent cid: XXXXXXXX +diff --git a/tests/qemu-iotests/150.out b/tests/qemu-iotests/150.out +deleted file mode 100644 +index 2a54e8d..0000000 +--- a/tests/qemu-iotests/150.out ++++ /dev/null +@@ -1,11 +0,0 @@ +-QA output created by 150 +- +-=== Mapping sparse conversion === +- +-Offset Length File +- +-=== Mapping non-sparse conversion === +- +-Offset Length File +-0 0x100000 TEST_DIR/t.IMGFMT +-*** done +diff --git a/tests/qemu-iotests/150.out.qcow2 b/tests/qemu-iotests/150.out.qcow2 +new file mode 100644 +index 0000000..2a54e8d +--- /dev/null ++++ b/tests/qemu-iotests/150.out.qcow2 +@@ -0,0 +1,11 @@ ++QA output created by 150 ++ ++=== Mapping sparse conversion === ++ ++Offset Length File ++ ++=== Mapping non-sparse conversion === ++ ++Offset Length File ++0 0x100000 TEST_DIR/t.IMGFMT ++*** done +diff --git a/tests/qemu-iotests/150.out.raw b/tests/qemu-iotests/150.out.raw +new file mode 100644 +index 0000000..3cdc772 +--- /dev/null ++++ b/tests/qemu-iotests/150.out.raw +@@ -0,0 +1,12 @@ ++QA output created by 150 ++ ++=== Mapping sparse conversion === ++ ++Offset Length File ++0 0x1000 TEST_DIR/t.IMGFMT ++ ++=== Mapping non-sparse conversion === ++ ++Offset Length File ++0 0x100000 TEST_DIR/t.IMGFMT ++*** done +diff --git a/tests/qemu-iotests/175 b/tests/qemu-iotests/175 +index 51e62c8..7ba28b3 100755 +--- a/tests/qemu-iotests/175 ++++ b/tests/qemu-iotests/175 +@@ -37,14 +37,16 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 + # the file size. 
This function hides the resulting difference in the + # stat -c '%b' output. + # Parameter 1: Number of blocks an empty file occupies +-# Parameter 2: Image size in bytes ++# Parameter 2: Minimal number of blocks in an image ++# Parameter 3: Image size in bytes + _filter_blocks() + { + extra_blocks=$1 +- img_size=$2 ++ min_blocks=$2 ++ img_size=$3 + +- sed -e "s/blocks=$extra_blocks\\(\$\\|[^0-9]\\)/nothing allocated/" \ +- -e "s/blocks=$((extra_blocks + img_size / 512))\\(\$\\|[^0-9]\\)/everything allocated/" ++ sed -e "s/blocks=$min_blocks\\(\$\\|[^0-9]\\)/min allocation/" \ ++ -e "s/blocks=$((extra_blocks + img_size / 512))\\(\$\\|[^0-9]\\)/max allocation/" + } + + # get standard environment, filters and checks +@@ -60,16 +62,21 @@ size=$((1 * 1024 * 1024)) + touch "$TEST_DIR/empty" + extra_blocks=$(stat -c '%b' "$TEST_DIR/empty") + ++# We always write the first byte; check how many blocks this filesystem ++# allocates to match empty image allocation. ++printf "\0" > "$TEST_DIR/empty" ++min_blocks=$(stat -c '%b' "$TEST_DIR/empty") ++ + echo + echo "== creating image with default preallocation ==" + _make_test_img $size | _filter_imgfmt +-stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $size ++stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $size + + for mode in off full falloc; do + echo + echo "== creating image with preallocation $mode ==" + IMGOPTS=preallocation=$mode _make_test_img $size | _filter_imgfmt +- stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $size ++ stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $size + done + + # success, all done +diff --git a/tests/qemu-iotests/175.out b/tests/qemu-iotests/175.out +index 6d9a5ed..263e521 100644 +--- a/tests/qemu-iotests/175.out ++++ b/tests/qemu-iotests/175.out +@@ -2,17 +2,17 @@ QA output created by 175 + + == creating image with default preallocation == + Formatting 'TEST_DIR/t.IMGFMT', 
fmt=IMGFMT size=1048576 +-size=1048576, nothing allocated ++size=1048576, min allocation + + == creating image with preallocation off == + Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=off +-size=1048576, nothing allocated ++size=1048576, min allocation + + == creating image with preallocation full == + Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=full +-size=1048576, everything allocated ++size=1048576, max allocation + + == creating image with preallocation falloc == + Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=falloc +-size=1048576, everything allocated ++size=1048576, max allocation + *** done +diff --git a/tests/qemu-iotests/178.out.qcow2 b/tests/qemu-iotests/178.out.qcow2 +index 55a8dc9..9e7d8c4 100644 +--- a/tests/qemu-iotests/178.out.qcow2 ++++ b/tests/qemu-iotests/178.out.qcow2 +@@ -101,7 +101,7 @@ converted image file size in bytes: 196608 + == raw input image with data (human) == + + Formatting 'TEST_DIR/t.qcow2', fmt=IMGFMT size=1073741824 +-required size: 393216 ++required size: 458752 + fully allocated size: 1074135040 + wrote 512/512 bytes at offset 512 + 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +@@ -257,7 +257,7 @@ converted image file size in bytes: 196608 + + Formatting 'TEST_DIR/t.qcow2', fmt=IMGFMT size=1073741824 + { +- "required": 393216, ++ "required": 458752, + "fully-allocated": 1074135040 + } + wrote 512/512 bytes at offset 512 +diff --git a/tests/qemu-iotests/221.out b/tests/qemu-iotests/221.out +index 9f9dd52..dca024a 100644 +--- a/tests/qemu-iotests/221.out ++++ b/tests/qemu-iotests/221.out +@@ -3,14 +3,18 @@ QA output created by 221 + === Check mapping of unaligned raw image === + + Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65537 +-[{ "start": 0, "length": 66048, "depth": 0, "zero": true, "data": false, "offset": OFFSET}] +-[{ "start": 0, "length": 66048, "depth": 0, "zero": true, "data": false, "offset": OFFSET}] ++[{ "start": 0, 
"length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, ++{ "start": 4096, "length": 61952, "depth": 0, "zero": true, "data": false, "offset": OFFSET}] ++[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, ++{ "start": 4096, "length": 61952, "depth": 0, "zero": true, "data": false, "offset": OFFSET}] + wrote 1/1 bytes at offset 65536 + 1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +-[{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false, "offset": OFFSET}, ++[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, ++{ "start": 4096, "length": 61440, "depth": 0, "zero": true, "data": false, "offset": OFFSET}, + { "start": 65536, "length": 1, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, + { "start": 65537, "length": 511, "depth": 0, "zero": true, "data": false, "offset": OFFSET}] +-[{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false, "offset": OFFSET}, ++[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, ++{ "start": 4096, "length": 61440, "depth": 0, "zero": true, "data": false, "offset": OFFSET}, + { "start": 65536, "length": 1, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, + { "start": 65537, "length": 511, "depth": 0, "zero": true, "data": false, "offset": OFFSET}] + *** done +diff --git a/tests/qemu-iotests/253.out b/tests/qemu-iotests/253.out +index 607c0ba..3d08b30 100644 +--- a/tests/qemu-iotests/253.out ++++ b/tests/qemu-iotests/253.out +@@ -3,12 +3,16 @@ QA output created by 253 + === Check mapping of unaligned raw image === + + Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048575 +-[{ "start": 0, "length": 1048576, "depth": 0, "zero": true, "data": false, "offset": OFFSET}] +-[{ "start": 0, "length": 1048576, "depth": 0, "zero": true, "data": false, "offset": OFFSET}] ++[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET}, ++{ "start": 4096, "length": 1044480, "depth": 0, "zero": true, "data": false, "offset": OFFSET}] ++[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, ++{ "start": 4096, "length": 1044480, "depth": 0, "zero": true, "data": false, "offset": OFFSET}] + wrote 65535/65535 bytes at offset 983040 + 63.999 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +-[{ "start": 0, "length": 983040, "depth": 0, "zero": true, "data": false, "offset": OFFSET}, ++[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, ++{ "start": 4096, "length": 978944, "depth": 0, "zero": true, "data": false, "offset": OFFSET}, + { "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": OFFSET}] +-[{ "start": 0, "length": 983040, "depth": 0, "zero": true, "data": false, "offset": OFFSET}, ++[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, ++{ "start": 4096, "length": 978944, "depth": 0, "zero": true, "data": false, "offset": OFFSET}, + { "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": OFFSET}] + *** done +-- +1.8.3.1 + diff --git a/kvm-ehci-fix-queue-dev-null-ptr-dereference.patch b/kvm-ehci-fix-queue-dev-null-ptr-dereference.patch new file mode 100644 index 0000000..9adeaeb --- /dev/null +++ b/kvm-ehci-fix-queue-dev-null-ptr-dereference.patch @@ -0,0 +1,50 @@ +From 0b38614471dbc44b87a1d2449e602df50c3ff535 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Thu, 5 Sep 2019 08:50:37 +0100 +Subject: [PATCH 2/6] ehci: fix queue->dev null ptr dereference +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20190905085037.5648-2-dgilbert@redhat.com> +Patchwork-id: 90288 +O-Subject: [RHEL-AV-8.1 qemu-kvm PATCH 1/1] ehci: fix queue->dev null ptr dereference +Bugzilla: 1746790 +RH-Acked-by: Peter Xu +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Juan Quintela + +From: Gerd Hoffmann + +In case we don't have a device for an active queue, just skip +processing the queue (same we do for inactive queues) and log +a guest bug. + +Reported-by: Guenter Roeck +Signed-off-by: Gerd Hoffmann +Tested-by: Guenter Roeck +Message-id: 20190821085319.13711-1-kraxel@redhat.com +(cherry picked from commit 1be344b7ad25d572dadeee46d80f0103354352b2) +Signed-off-by: Danilo C. L. de Paula +--- + hw/usb/hcd-ehci.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/hw/usb/hcd-ehci.c b/hw/usb/hcd-ehci.c +index 62dab05..5f089f3 100644 +--- a/hw/usb/hcd-ehci.c ++++ b/hw/usb/hcd-ehci.c +@@ -1834,6 +1834,9 @@ static int ehci_state_fetchqtd(EHCIQueue *q) + ehci_set_state(q->ehci, q->async, EST_EXECUTING); + break; + } ++ } else if (q->dev == NULL) { ++ ehci_trace_guest_bug(q->ehci, "no device attached to queue"); ++ ehci_set_state(q->ehci, q->async, EST_HORIZONTALQH); + } else { + p = ehci_alloc_packet(q); + p->qtdaddr = q->qtdaddr; +-- +1.8.3.1 + diff --git a/kvm-file-posix-Handle-undetectable-alignment.patch b/kvm-file-posix-Handle-undetectable-alignment.patch new file mode 100644 index 0000000..f5f883b --- /dev/null +++ b/kvm-file-posix-Handle-undetectable-alignment.patch @@ -0,0 +1,129 @@ +From 790cebc2a1d8de8d93b2a2a0ef19e31c767f4f1c Mon Sep 17 00:00:00 2001 +From: Max Reitz +Date: Mon, 9 Sep 2019 07:38:20 +0100 +Subject: [PATCH 4/6] file-posix: Handle undetectable alignment + +RH-Author: Max Reitz +Message-id: <20190909073822.26191-2-mreitz@redhat.com> +Patchwork-id: 90332 +O-Subject: [RHEL-AV-8.1.0 qemu-kvm PATCH 1/3] file-posix: Handle undetectable alignment +Bugzilla: 1749134 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth +RH-Acked-by: 
Kevin Wolf + +From: Nir Soffer + +In some cases buf_align or request_alignment cannot be detected: + +1. With Gluster, buf_align cannot be detected since the actual I/O is + done on Gluster server, and qemu buffer alignment does not matter. + Since we don't have alignment requirement, buf_align=1 is the best + value. + +2. With local XFS filesystem, buf_align cannot be detected if reading + from unallocated area. In this case we must align the buffer, but we + don't know what is the correct size. Using the wrong alignment results + in I/O error. + +3. With Gluster backed by XFS, request_alignment cannot be detected if + reading from unallocated area. In this case we need to use the + correct alignment, and failing to do so results in I/O errors. + +4. With NFS, the server does not use direct I/O, so neither buf_align nor + request_alignment can be detected. In this case we don't need any + alignment so we can use buf_align=1 and request_alignment=1. + +These cases seem to work when storage sector size is 512 bytes, because +the current code starts checking align=512. If the check succeeds +because alignment cannot be detected we use 512. But this does not work +for storage with 4k sector size. + +To determine if we can detect the alignment, we probe first with +align=1. If probing succeeds, maybe there are no alignment requirements +(cases 1, 4) or we are probing unallocated area (cases 2, 3). Since we +don't have any way to tell, we treat this as undetectable alignment. If +probing with align=1 fails with EINVAL, but probing with one of the +expected alignments succeeds, we know that we found a working alignment. + +Practically the alignment requirements are the same for buffer +alignment, buffer length, and offset in file. So in case we cannot +detect buf_align, we can use request alignment. If we cannot detect +request alignment, we can fallback to a safe value. To use this logic, +we probe first request alignment instead of buf_align. 
+ +Here is a table showing the behaviour with current code (the value in +parenthesis is the optimal value). + +Case Sector buf_align (opt) request_alignment (opt) result + +Signed-off-by: Danilo C. L. de Paula +--- + block/file-posix.c | 36 +++++++++++++++++++++++++----------- + 1 file changed, 25 insertions(+), 11 deletions(-) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 4479cc7..b8b4dad 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -323,6 +323,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) + BDRVRawState *s = bs->opaque; + char *buf; + size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize()); ++ size_t alignments[] = {1, 512, 1024, 2048, 4096}; + + /* For SCSI generic devices the alignment is not really used. + With buffered I/O, we don't have any restrictions. */ +@@ -349,25 +350,38 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) + } + #endif + +- /* If we could not get the sizes so far, we can only guess them */ +- if (!s->buf_align) { ++ /* ++ * If we could not get the sizes so far, we can only guess them. First try ++ * to detect request alignment, since it is more likely to succeed. Then ++ * try to detect buf_align, which cannot be detected in some cases (e.g. ++ * Gluster). If buf_align cannot be detected, we fallback to the value of ++ * request_alignment. ++ */ ++ ++ if (!bs->bl.request_alignment) { ++ int i; + size_t align; +- buf = qemu_memalign(max_align, 2 * max_align); +- for (align = 512; align <= max_align; align <<= 1) { +- if (raw_is_io_aligned(fd, buf + align, max_align)) { +- s->buf_align = align; ++ buf = qemu_memalign(max_align, max_align); ++ for (i = 0; i < ARRAY_SIZE(alignments); i++) { ++ align = alignments[i]; ++ if (raw_is_io_aligned(fd, buf, align)) { ++ /* Fallback to safe value. */ ++ bs->bl.request_alignment = (align != 1) ? 
align : max_align; + break; + } + } + qemu_vfree(buf); + } + +- if (!bs->bl.request_alignment) { ++ if (!s->buf_align) { ++ int i; + size_t align; +- buf = qemu_memalign(s->buf_align, max_align); +- for (align = 512; align <= max_align; align <<= 1) { +- if (raw_is_io_aligned(fd, buf, align)) { +- bs->bl.request_alignment = align; ++ buf = qemu_memalign(max_align, 2 * max_align); ++ for (i = 0; i < ARRAY_SIZE(alignments); i++) { ++ align = alignments[i]; ++ if (raw_is_io_aligned(fd, buf + align, max_align)) { ++ /* Fallback to request_alignment. */ ++ s->buf_align = (align != 1) ? align : bs->bl.request_alignment; + break; + } + } +-- +1.8.3.1 + diff --git a/kvm-iotests-Test-allocate_first_block-with-O_DIRECT.patch b/kvm-iotests-Test-allocate_first_block-with-O_DIRECT.patch new file mode 100644 index 0000000..75c738d --- /dev/null +++ b/kvm-iotests-Test-allocate_first_block-with-O_DIRECT.patch @@ -0,0 +1,109 @@ +From b261b31812a3e89a9104fb33bb2339b1742ac494 Mon Sep 17 00:00:00 2001 +From: Max Reitz +Date: Mon, 9 Sep 2019 07:38:22 +0100 +Subject: [PATCH 6/6] iotests: Test allocate_first_block() with O_DIRECT + +RH-Author: Max Reitz +Message-id: <20190909073822.26191-4-mreitz@redhat.com> +Patchwork-id: 90334 +O-Subject: [RHEL-AV-8.1.0 qemu-kvm PATCH 3/3] iotests: Test allocate_first_block() with O_DIRECT +Bugzilla: 1749134 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth +RH-Acked-by: Kevin Wolf + +From: Nir Soffer + +Using block_resize we can test allocate_first_block() with file +descriptor opened with O_DIRECT, ensuring that it works for any size +larger than 4096 bytes. + +Testing smaller sizes is tricky as the result depends on the filesystem +used for testing. For example on NFS any size will work since O_DIRECT +does not require any alignment. 
+ +Signed-off-by: Nir Soffer +Reviewed-by: Max Reitz +Message-id: 20190827010528.8818-3-nsoffer@redhat.com +Signed-off-by: Max Reitz +(cherry picked from commit 7e3dc2ba9a11862d4e1a08325b9165f27a1b1e7c) +Signed-off-by: Max Reitz +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/175 | 28 ++++++++++++++++++++++++++++ + tests/qemu-iotests/175.out | 8 ++++++++ + 2 files changed, 36 insertions(+) + +diff --git a/tests/qemu-iotests/175 b/tests/qemu-iotests/175 +index 7ba28b3..55db280 100755 +--- a/tests/qemu-iotests/175 ++++ b/tests/qemu-iotests/175 +@@ -49,6 +49,23 @@ _filter_blocks() + -e "s/blocks=$((extra_blocks + img_size / 512))\\(\$\\|[^0-9]\\)/max allocation/" + } + ++# Resize image using block_resize. ++# Parameter 1: image path ++# Parameter 2: new size ++_block_resize() ++{ ++ local path=$1 ++ local size=$2 ++ ++ $QEMU -qmp stdio -nographic -nodefaults \ ++ -blockdev file,node-name=file,filename=$path,cache.direct=on \ ++ </dev/null ++ stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $new_size ++done ++ + # success, all done + echo "*** done" + rm -f $seq.full +diff --git a/tests/qemu-iotests/175.out b/tests/qemu-iotests/175.out +index 263e521..39c2ee0 100644 +--- a/tests/qemu-iotests/175.out ++++ b/tests/qemu-iotests/175.out +@@ -15,4 +15,12 @@ size=1048576, max allocation + == creating image with preallocation falloc == + Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=falloc + size=1048576, max allocation ++ ++== resize empty image with block_resize == ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=0 ++size=4096, min allocation ++ ++== resize empty image with block_resize == ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=0 ++size=1048576, min allocation + *** done +-- +1.8.3.1 + diff --git a/kvm-migration-Do-not-re-read-the-clock-on-pre_save-in-ca.patch b/kvm-migration-Do-not-re-read-the-clock-on-pre_save-in-ca.patch new file mode 100644 index 0000000..604a4bc --- /dev/null +++ 
b/kvm-migration-Do-not-re-read-the-clock-on-pre_save-in-ca.patch @@ -0,0 +1,101 @@ +From 8b0a69f11052d271ef49db0051d79e7f1a6102be Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Mon, 2 Sep 2019 04:20:32 +0100 +Subject: [PATCH 1/6] migration: Do not re-read the clock on pre_save in case + of paused guest +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: David Gibson +Message-id: <20190902042032.25987-1-dgibson@redhat.com> +Patchwork-id: 90226 +O-Subject: [RHEL-AV-8.1.0 qemu-kvm PATCH] migration: Do not re-read the clock on pre_save in case of paused guest +Bugzilla: 1747836 +RH-Acked-by: Thomas Huth +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laurent Vivier + +From: "Maxiwell S. Garcia" + +Re-read the timebase before migrate was ported from x86 commit: + 6053a86fe7bd: kvmclock: reduce kvmclock difference on migration + +The clock move makes the guest know about the paused time between +the stop and migrate commands. This is an issue in an already-paused +VM because some side effects, like process stalls, could happen +after migration. + +So, this patch checks the runstate of guest in the pre_save handler and +does not re-read the timebase in case of paused state (cold migration). + +Signed-off-by: Maxiwell S. Garcia +Message-Id: <20190711194702.26598-1-maxiwell@linux.ibm.com> +Signed-off-by: David Gibson +(cherry picked from commit d14f33976282a8744ca1bf1d64e73996c145aa3f) + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1747836 +Branch: rhel8/rhel-av-8.1.0/master-4.1.0 +Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=23282250 +Testing: Booted guest with this qemu + +Signed-off-by: David Gibson +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/ppc/ppc.c | 13 +++++++++---- + target/ppc/cpu-qom.h | 1 + + 2 files changed, 10 insertions(+), 4 deletions(-) + +diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c +index a9e508c..8572e45 100644 +--- a/hw/ppc/ppc.c ++++ b/hw/ppc/ppc.c +@@ -1008,6 +1008,8 @@ static void timebase_save(PPCTimebase *tb) + * there is no need to update it from KVM here + */ + tb->guest_timebase = ticks + first_ppc_cpu->env.tb_env->tb_offset; ++ ++ tb->runstate_paused = runstate_check(RUN_STATE_PAUSED); + } + + static void timebase_load(PPCTimebase *tb) +@@ -1051,9 +1053,9 @@ void cpu_ppc_clock_vm_state_change(void *opaque, int running, + } + + /* +- * When migrating, read the clock just before migration, +- * so that the guest clock counts during the events +- * between: ++ * When migrating a running guest, read the clock just ++ * before migration, so that the guest clock counts ++ * during the events between: + * + * * vm_stop() + * * +@@ -1068,7 +1070,10 @@ static int timebase_pre_save(void *opaque) + { + PPCTimebase *tb = opaque; + +- timebase_save(tb); ++ /* guest_timebase won't be overridden in case of paused guest */ ++ if (!tb->runstate_paused) { ++ timebase_save(tb); ++ } + + return 0; + } +diff --git a/target/ppc/cpu-qom.h b/target/ppc/cpu-qom.h +index be9b4c3..5fbcdee 100644 +--- a/target/ppc/cpu-qom.h ++++ b/target/ppc/cpu-qom.h +@@ -201,6 +201,7 @@ typedef struct PowerPCCPUClass { + typedef struct PPCTimebase { + uint64_t guest_timebase; + int64_t time_of_the_day_ns; ++ bool runstate_paused; + } PPCTimebase; + + extern const struct VMStateDescription vmstate_ppc_timebase; +-- +1.8.3.1 + diff --git a/kvm-spapr-Use-SHUTDOWN_CAUSE_SUBSYSTEM_RESET-for-CAS-reb.patch b/kvm-spapr-Use-SHUTDOWN_CAUSE_SUBSYSTEM_RESET-for-CAS-reb.patch new file mode 100644 index 0000000..8b3c06e --- /dev/null +++ b/kvm-spapr-Use-SHUTDOWN_CAUSE_SUBSYSTEM_RESET-for-CAS-reb.patch @@ -0,0 +1,60 @@ +From c8d3479746b17fcdf56b8afb3eccdba2c14578e8 Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: 
Fri, 6 Sep 2019 03:58:36 +0100 +Subject: [PATCH 3/6] spapr: Use SHUTDOWN_CAUSE_SUBSYSTEM_RESET for CAS reboots + +RH-Author: David Gibson +Message-id: <20190906035836.23689-1-dgibson@redhat.com> +Patchwork-id: 90293 +O-Subject: [RHEL-AV-8.1.0 qemu-kvm PATCH] spapr: Use SHUTDOWN_CAUSE_SUBSYSTEM_RESET for CAS reboots +Bugzilla: 1743477 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Thomas Huth +RH-Acked-by: Danilo de Paula + +From: David Gibson + +The sPAPR platform includes feature negotiation between the guest and +platform. That sometimes requires reconfiguring the virtual hardware, and +in some cases that is a complex enough process that we trigger a system +reset to handle it. That interacts badly with -no-reboot - we trigger the +reboot, -no-reboot means we exit and so the guest never gets to try again. + +Eventually we want to get rid of CAS reboots entirely, since they're odd +and irritating for the user. But in the meantime we can fix the -no-reboot +problem by using SHUTDOWN_CAUSE_SUBSYSTEM_RESET which ignores -no-reboot +and seems to be designed for this sort of faux-reset for internal purposes +only. + +Signed-off-by: David Gibson +(cherry picked from commit 9146206eb26c1436c80a7c2ca1e4c5f86b27179d) + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1743477 +Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=23395494 +Branch: rhel-av-8.1.0/master-4.1.0 +Testing: Started a guest and verified that -no-reboot no longer + prevents the CAS reboot to negotiate XIVE support from + completing + +Signed-off-by: David Gibson +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/ppc/spapr_hcall.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c +index 6808d4c..687bb7b 100644 +--- a/hw/ppc/spapr_hcall.c ++++ b/hw/ppc/spapr_hcall.c +@@ -1672,7 +1672,7 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, + spapr_ovec_cleanup(ov5_updates); + + if (spapr->cas_reboot) { +- qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); ++ qemu_system_reset_request(SHUTDOWN_CAUSE_SUBSYSTEM_RESET); + } + + return H_SUCCESS; +-- +1.8.3.1 + diff --git a/qemu-kvm.spec b/qemu-kvm.spec index 7618078..b087571 100644 --- a/qemu-kvm.spec +++ b/qemu-kvm.spec @@ -67,7 +67,7 @@ Obsoletes: %1-rhev Summary: QEMU is a machine emulator and virtualizer Name: qemu-kvm Version: 4.1.0 -Release: 7%{?dist} +Release: 8%{?dist} # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped Epoch: 15 License: GPLv2 and GPLv2+ and CC-BY @@ -166,6 +166,18 @@ Patch41: kvm-multifd-Use-number-of-channels-as-listen-backlog.patch Patch42: kvm-pseries-Fix-compat_pvr-on-reset.patch # For bz#1744107 - Migration from P8(qemu4.1) to P9(qemu4.1), after migration, qemu crash on destination with error message "qemu-kvm: error while loading state for instance 0x1 of device 'cpu'" Patch43: kvm-spapr-Set-compat-mode-in-spapr_core_plug.patch +# For bz#1747836 - Call traces after guest migration due to incorrect handling of the timebase +Patch44: kvm-migration-Do-not-re-read-the-clock-on-pre_save-in-ca.patch +# For bz#1746790 - qemu core dump while migrate from RHEL7.6 to RHEL8.1 +Patch45: kvm-ehci-fix-queue-dev-null-ptr-dereference.patch +# For bz#1743477 - Since bd94bc06479a "spapr: change default interrupt mode to 'dual'", QEMU resets the machine to select the appropriate interrupt controller. And -no-reboot prevents that. 
+Patch46: kvm-spapr-Use-SHUTDOWN_CAUSE_SUBSYSTEM_RESET-for-CAS-reb.patch +# For bz#1749134 - I/O error when virtio-blk disk is backed by a raw image on 4k disk +Patch47: kvm-file-posix-Handle-undetectable-alignment.patch +# For bz#1749134 - I/O error when virtio-blk disk is backed by a raw image on 4k disk +Patch48: kvm-block-posix-Always-allocate-the-first-block.patch +# For bz#1749134 - I/O error when virtio-blk disk is backed by a raw image on 4k disk +Patch49: kvm-iotests-Test-allocate_first_block-with-O_DIRECT.patch BuildRequires: wget BuildRequires: rpm-build @@ -1107,6 +1119,22 @@ useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \ %changelog +* Mon Sep 09 2019 Danilo Cesar Lemes de Paula - 4.1.0-8.el8 +- kvm-migration-Do-not-re-read-the-clock-on-pre_save-in-ca.patch [bz#1747836] +- kvm-ehci-fix-queue-dev-null-ptr-dereference.patch [bz#1746790] +- kvm-spapr-Use-SHUTDOWN_CAUSE_SUBSYSTEM_RESET-for-CAS-reb.patch [bz#1743477] +- kvm-file-posix-Handle-undetectable-alignment.patch [bz#1749134] +- kvm-block-posix-Always-allocate-the-first-block.patch [bz#1749134] +- kvm-iotests-Test-allocate_first_block-with-O_DIRECT.patch [bz#1749134] +- Resolves: bz#1743477 + (Since bd94bc06479a "spapr: change default interrupt mode to 'dual'", QEMU resets the machine to select the appropriate interrupt controller. And -no-reboot prevents that.) +- Resolves: bz#1746790 + (qemu core dump while migrate from RHEL7.6 to RHEL8.1) +- Resolves: bz#1747836 + (Call traces after guest migration due to incorrect handling of the timebase) +- Resolves: bz#1749134 + (I/O error when virtio-blk disk is backed by a raw image on 4k disk) + * Fri Sep 06 2019 Danilo Cesar Lemes de Paula - 4.1.0-7.el8 - kvm-trace-Clarify-DTrace-SystemTap-help-message.patch [bz#1516220] - kvm-socket-Add-backlog-parameter-to-socket_listen.patch [bz#1726898]