From a91da7741464dadeb306a741b4fb562e49ffea57 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 7 Feb 2023 15:57:11 -0500 Subject: [PATCH 5/8] util/userfaultfd: Support /dev/userfaultfd RH-Author: Peter Xu RH-MergeRequest: 149: Support /dev/userfaultfd RH-Bugzilla: 2158704 RH-Acked-by: Dr. David Alan Gilbert RH-Acked-by: quintela1 RH-Acked-by: Miroslav Rezanina RH-Commit: [3/3] 5f427d8c18c210ff8f66724c9e358a7120619e69 (peterx/qemu-kvm) Teach QEMU to use /dev/userfaultfd when it existed and fallback to the system call if either it's not there or doesn't have enough permission. Firstly, as long as the app has permission to access /dev/userfaultfd, it always have the ability to trap kernel faults which QEMU mostly wants. Meanwhile, in some context (e.g. containers) the userfaultfd syscall can be forbidden, so it can be the major way to use postcopy in a restricted environment with strict seccomp setup. Signed-off-by: Peter Xu Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela (cherry picked from commit c40c0463413b941c13fe5f99a90c02d7d6584828) Signed-off-by: Peter Xu --- util/trace-events | 1 + util/userfaultfd.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/util/trace-events b/util/trace-events index c8f53d7d9f..16f78d8fe5 100644 --- a/util/trace-events +++ b/util/trace-events @@ -93,6 +93,7 @@ qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_siz qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p" #userfaultfd.c +uffd_detect_open_mode(int mode) "%d" uffd_query_features_nosys(int err) "errno: %i" uffd_query_features_api_failed(int err) "errno: %i" uffd_create_fd_nosys(int err) "errno: %i" diff --git a/util/userfaultfd.c b/util/userfaultfd.c index 4953b3137d..fdff4867e8 100644 --- a/util/userfaultfd.c +++ b/util/userfaultfd.c @@ -18,10 +18,42 @@ #include #include #include +#include + +typedef enum { + UFFD_UNINITIALIZED = 0, + UFFD_USE_DEV_PATH, + UFFD_USE_SYSCALL, +} uffd_open_mode; int uffd_open(int flags) { #if defined(__NR_userfaultfd) + static uffd_open_mode open_mode; + static int uffd_dev; + + /* Detect how to generate uffd desc when run the 1st time */ + if (open_mode == UFFD_UNINITIALIZED) { + /* + * Make /dev/userfaultfd the default approach because it has better + * permission controls, meanwhile allows kernel faults without any + * privilege requirement (e.g. SYS_CAP_PTRACE). + */ + uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); + if (uffd_dev >= 0) { + open_mode = UFFD_USE_DEV_PATH; + } else { + /* Fallback to the system call */ + open_mode = UFFD_USE_SYSCALL; + } + trace_uffd_detect_open_mode(open_mode); + } + + if (open_mode == UFFD_USE_DEV_PATH) { + assert(uffd_dev >= 0); + return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags); + } + return syscall(__NR_userfaultfd, flags); #else return -EINVAL; -- 2.31.1