180 lines
7.7 KiB
Diff
180 lines
7.7 KiB
Diff
From 419aa23f0c6338b01b644094c3af2c024470e9b2 Mon Sep 17 00:00:00 2001
|
|
From: Nir Soffer <nsoffer@redhat.com>
|
|
Date: Sat, 10 Jul 2021 02:35:27 +0300
|
|
Subject: [PATCH] v2v: rhv-upload-plugin: Fix waiting for finalize
|
|
|
|
Waiting for image transfer finalize is complex. In the past we tried to
|
|
simplify the process by waiting on the disk status, but turns out that
|
|
due to the way oVirt lock the disk, this is not reliable.
|
|
|
|
These is finalize success flow:
|
|
|
|
1. User asks to finalize the transfer
|
|
2. oVirt sets transfer phase to FINALIZING_SUCCESS
|
|
3. oVirt finalizes the transfer
|
|
4. oVirt sets disk status to OK
|
|
5. oVirt unlocks the disk and changes transfer phase to FINISHED_SUCCESS
|
|
6. oVirt removes the transfer
|
|
|
|
In oVirt logs we can see that disk status changes to OK about 3 seconds
|
|
before the disk is actually unlocked. This is a very old problem that is
|
|
unlikely to be fixed.
|
|
|
|
The only way to wait for transfer completion is to poll the transfer
|
|
phase, but oVirt makes this hard by removing the transfer shortly after
|
|
it completes, so we may not be able to get the FINISHED_SUCCESS phase.
|
|
If the transfer was removed before we got one of the final phases, we
|
|
need to check the disk status to understand the result of transfer.
|
|
|
|
oVirt 4.4.7 made polling transfer phase easier by keeping the transfer
|
|
after completion, but we need to support older versions so we must have
|
|
generic code that work with any version.
|
|
|
|
To make debugging easier, we log the transfer phase during polling. Here
|
|
is a typical transfer log when finalizing transfer:
|
|
|
|
finalizing transfer 59e545f3-db1f-4a6b-90b1-80ac66572faa
|
|
transfer 59e545f3-db1f-4a6b-90b1-80ac66572faa is finalizing_success
|
|
transfer 59e545f3-db1f-4a6b-90b1-80ac66572faa is finalizing_success
|
|
transfer 59e545f3-db1f-4a6b-90b1-80ac66572faa is finalizing_success
|
|
transfer 59e545f3-db1f-4a6b-90b1-80ac66572faa is finalizing_success
|
|
transfer 59e545f3-db1f-4a6b-90b1-80ac66572faa is finished_success
|
|
transfer 59e545f3-db1f-4a6b-90b1-80ac66572faa finalized in 5.153 seconds
|
|
|
|
Signed-off-by: Nir Soffer <nsoffer@redhat.com>
|
|
Fixes: https://bugzilla.redhat.com/1976024
|
|
(cherry picked from commit 79702b28329d15a7485801ed7e915d486fcc0cf4)
|
|
---
|
|
v2v/rhv-upload-plugin.py | 98 ++++++++++++++++++++++++++++------------
|
|
1 file changed, 69 insertions(+), 29 deletions(-)
|
|
|
|
diff --git a/v2v/rhv-upload-plugin.py b/v2v/rhv-upload-plugin.py
|
|
index 7cd6dea6..61ade1a8 100644
|
|
--- a/v2v/rhv-upload-plugin.py
|
|
+++ b/v2v/rhv-upload-plugin.py
|
|
@@ -601,17 +601,29 @@ def finalize_transfer(connection, transfer, disk_id):
|
|
"""
|
|
Finalize a transfer, making the transfer disk available.
|
|
|
|
- If finalizing succeeds, transfer's phase will change to FINISHED_SUCCESS
|
|
- and the transer's disk status will change to OK. On errors, the transfer's
|
|
- phase will change to FINISHED_FAILURE and the disk status will change to
|
|
- ILLEGAL and it will be removed. In both cases the transfer entity will be
|
|
- removed shortly after.
|
|
+ If finalizing succeeds, the transfer's disk status will change to OK
|
|
+ and transfer's phase will change to FINISHED_SUCCESS. Unfortunately,
|
|
+ the disk status is modified before the transfer finishes, and oVirt
|
|
+ may still hold a lock on the disk at this point.
|
|
|
|
- If oVirt fails to finalize the transfer, transfer's phase will change to
|
|
- PAUSED_SYSTEM. In this case the disk's status will change to ILLEGAL and it
|
|
- will not be removed.
|
|
+ The only way to make sure that the disk is unlocked, is to wait
|
|
+ until the transfer phase switches FINISHED_SUCCESS. Unfortunately
|
|
+ oVirt makes this hard to use because the transfer is removed shortly
|
|
+ after switching the phase to the final phase. However if the
|
|
+ transfer was removed, we can be sure that the disk is not locked,
|
|
+ since oVirt releases the locks before removing the transfer.
|
|
|
|
- For simplicity, we track only disk's status changes.
|
|
+ On errors, the transfer's phase will change to FINISHED_FAILURE and
|
|
+ the disk status will change to ILLEGAL and it will be removed. Again
|
|
+ the transfer will be removed shortly after that.
|
|
+
|
|
+ If oVirt fails to finalize the transfer, transfer's phase will
|
|
+ change to PAUSED_SYSTEM. In this case the disk's status will change
|
|
+ to ILLEGAL and it will not be removed.
|
|
+
|
|
+ oVirt 4.4.7 made waiting for transfer easier by keeping transfers
|
|
+ after they complete, but we must support older versions so we have
|
|
+ generic code that work with any version.
|
|
|
|
For more info see:
|
|
- http://ovirt.github.io/ovirt-engine-api-model/4.4/#services/image_transfer
|
|
@@ -626,34 +638,62 @@ def finalize_transfer(connection, transfer, disk_id):
|
|
|
|
transfer_service.finalize()
|
|
|
|
- disk_service = (connection.system_service()
|
|
- .disks_service()
|
|
- .disk_service(disk_id))
|
|
-
|
|
while True:
|
|
time.sleep(1)
|
|
try:
|
|
- disk = disk_service.get()
|
|
+ transfer = transfer_service.get()
|
|
except sdk.NotFoundError:
|
|
- # Disk verification failed and the system removed the disk.
|
|
- raise RuntimeError(
|
|
- "transfer %s failed: disk %s was removed"
|
|
- % (transfer.id, disk_id))
|
|
+ # Transfer was removed (ovirt < 4.4.7). We need to check the
|
|
+ # disk status to understand if the transfer was successful.
|
|
+ # Due to the way oVirt does locking, we know that the disk
|
|
+ # is unlocked at this point so we can check only once.
|
|
|
|
- if disk.status == types.DiskStatus.ILLEGAL:
|
|
- # Disk verification failed or transfer was paused by the system.
|
|
- raise RuntimeError(
|
|
- "transfer %s failed: disk is ILLEGAL" % transfer.id)
|
|
+ debug("transfer %s was removed, checking disk %s status"
|
|
+ % (transfer.id, disk_id))
|
|
+
|
|
+ disk_service = (connection.system_service()
|
|
+ .disks_service()
|
|
+ .disk_service(disk_id))
|
|
|
|
- if disk.status == types.DiskStatus.OK:
|
|
- debug("transfer %s finalized in %.3f seconds"
|
|
- % (transfer.id, time.time() - start))
|
|
- break
|
|
+ try:
|
|
+ disk = disk_service.get()
|
|
+ except sdk.NotFoundError:
|
|
+ raise RuntimeError(
|
|
+ "transfer %s failed: disk %s was removed"
|
|
+ % (transfer.id, disk_id))
|
|
+
|
|
+ debug("disk %s is %s" % (disk_id, disk.status))
|
|
+
|
|
+ if disk.status == types.DiskStatus.OK:
|
|
+ break
|
|
|
|
- if time.time() > start + timeout:
|
|
raise RuntimeError(
|
|
- "timed out waiting for transfer %s to finalize"
|
|
- % transfer.id)
|
|
+ "transfer %s failed: disk is %s" % (transfer.id, disk.status))
|
|
+ else:
|
|
+ # Transfer exists, check if it reached one of the final
|
|
+ # phases, or we timed out.
|
|
+
|
|
+ debug("transfer %s is %s" % (transfer.id, transfer.phase))
|
|
+
|
|
+ if transfer.phase == types.ImageTransferPhase.FINISHED_SUCCESS:
|
|
+ break
|
|
+
|
|
+ if transfer.phase == types.ImageTransferPhase.FINISHED_FAILURE:
|
|
+ raise RuntimeError(
|
|
+ "transfer %s has failed" % (transfer.id,))
|
|
+
|
|
+ if transfer.phase == types.ImageTransferPhase.PAUSED_SYSTEM:
|
|
+ raise RuntimeError(
|
|
+ "transfer %s was paused by system" % (transfer.id,))
|
|
+
|
|
+ if time.time() > start + timeout:
|
|
+ raise RuntimeError(
|
|
+ "timed out waiting for transfer %s to finalize, "
|
|
+ "transfer is %s"
|
|
+ % (transfer.id, transfer.phase))
|
|
+
|
|
+ debug("transfer %s finalized in %.3f seconds"
|
|
+ % (transfer.id, time.time() - start))
|
|
|
|
|
|
def transfer_supports_format():
|
|
--
|
|
2.27.0
|
|
|