diff --git a/COPYING-5.14.0-570.25.1.el9 b/COPYING-5.14.0-570.26.1.el9 similarity index 100% rename from COPYING-5.14.0-570.25.1.el9 rename to COPYING-5.14.0-570.26.1.el9 diff --git a/Makefile.rhelver b/Makefile.rhelver index 02ac9c0224..f74ec79875 100644 --- a/Makefile.rhelver +++ b/Makefile.rhelver @@ -12,7 +12,7 @@ RHEL_MINOR = 6 # # Use this spot to avoid future merge conflicts. # Do not trim this comment. -RHEL_RELEASE = 570.25.1 +RHEL_RELEASE = 570.26.1 # # ZSTREAM diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ca1f39e496..1906e41430 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -97,6 +97,7 @@ config ARM64 select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_SUPPORTS_PAGE_TABLE_CHECK select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_RT select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 9e44e693fc..7fa291d3f9 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -354,6 +354,7 @@ static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages) /* * Select all bits except the pfn */ +#define pte_pgprot pte_pgprot static inline pgprot_t pte_pgprot(pte_t pte) { unsigned long pfn = pte_pfn(pte); @@ -527,6 +528,14 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP))); } +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +#define pmd_special(pte) (!!((pmd_val(pte) & PTE_SPECIAL))) +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return set_pmd_bit(pmd, __pgprot(PTE_SPECIAL)); +} +#endif + #define __pmd_to_phys(pmd) __pte_to_phys(pmd_pte(pmd)) #define __phys_to_pmd_val(phys) __phys_to_pte_val(phys) #define pmd_pfn(pmd) ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT) @@ -544,6 +553,27 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) #define pfn_pud(pfn,prot) __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +#define pud_special(pte) pte_special(pud_pte(pud)) +#define pud_mkspecial(pte) pte_pud(pte_mkspecial(pud_pte(pud))) +#endif + +#define pmd_pgprot pmd_pgprot +static inline pgprot_t pmd_pgprot(pmd_t pmd) +{ + unsigned long pfn = pmd_pfn(pmd); + + return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd)); +} + +#define pud_pgprot pud_pgprot +static inline pgprot_t pud_pgprot(pud_t pud) +{ + unsigned long pfn = pud_pfn(pud); + + return __pgprot(pud_val(pfn_pud(pfn, __pgprot(0))) ^ pud_val(pud)); +} + static inline void __set_pte_at(struct mm_struct *mm, unsigned long __always_unused addr, pte_t *ptep, pte_t pte, unsigned int nr) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 5c12b43e74..4833c86ec4 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -396,33 +396,35 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) #define __flush_tlb_range_op(op, start, pages, stride, \ asid, tlb_level, tlbi_user, lpa2) \ do { \ + typeof(start) __flush_start = start; \ + typeof(pages) __flush_pages = pages; \ int num = 0; \ int scale = 3; \ int shift = lpa2 ? 16 : PAGE_SHIFT; \ unsigned long addr; \ \ - while (pages > 0) { \ + while (__flush_pages > 0) { \ if (!system_supports_tlb_range() || \ - pages == 1 || \ - (lpa2 && start != ALIGN(start, SZ_64K))) { \ - addr = __TLBI_VADDR(start, asid); \ + __flush_pages == 1 || \ + (lpa2 && __flush_start != ALIGN(__flush_start, SZ_64K))) { \ + addr = __TLBI_VADDR(__flush_start, asid); \ __tlbi_level(op, addr, tlb_level); \ if (tlbi_user) \ __tlbi_user_level(op, addr, tlb_level); \ - start += stride; \ - pages -= stride >> PAGE_SHIFT; \ + __flush_start += stride; \ + __flush_pages -= stride >> PAGE_SHIFT; \ continue; \ } \ \ - num = __TLBI_RANGE_NUM(pages, scale); \ + num = __TLBI_RANGE_NUM(__flush_pages, scale); \ if (num >= 0) { \ - addr = __TLBI_VADDR_RANGE(start >> shift, asid, \ + addr = __TLBI_VADDR_RANGE(__flush_start >> shift, asid, \ scale, num, tlb_level); \ __tlbi(r##op, addr); \ if (tlbi_user) \ __tlbi_user(r##op, addr); \ - start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \ - pages -= __TLBI_RANGE_PAGES(num, scale); \ + __flush_start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \ + __flush_pages -= __TLBI_RANGE_PAGES(num, scale);\ } \ scale--; \ } \ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 6543642f56..96f6376ad2 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -44,6 +44,7 @@ static inline unsigned long pte_pfn(pte_t pte) /* * Select all bits except the pfn */ +#define pte_pgprot pte_pgprot static inline pgprot_t pte_pgprot(pte_t pte) { unsigned long pte_flags; diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 2d2d224207..61ec7143cd 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -941,6 +941,7 @@ static inline int pte_unused(pte_t pte) * young/old accounting is not supported, i.e _PAGE_PROTECT and _PAGE_INVALID * must not be set. */ +#define pte_pgprot pte_pgprot static inline pgprot_t pte_pgprot(pte_t pte) { unsigned long pte_flags = pte_val(pte) & _PAGE_CHG_MASK; diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index 5880893329..84a8c8f517 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -118,12 +118,11 @@ static inline int __memcpy_toio_inuser(void __iomem *dst, SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, const void __user *, user_buffer, size_t, length) { + struct follow_pfnmap_args args = { }; u8 local_buf[64]; void __iomem *io_addr; void *buf; struct vm_area_struct *vma; - pte_t *ptep; - spinlock_t *ptl; long ret; if (!zpci_is_enabled()) @@ -169,11 +168,13 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock_mmap; - ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); + args.address = mmio_addr; + args.vma = vma; + ret = follow_pfnmap_start(&args); if (ret) goto out_unlock_mmap; - io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) | + io_addr = (void __iomem *)((args.pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK)); if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) @@ -181,7 +182,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, ret = zpci_memcpy_toio(io_addr, buf, length); out_unlock_pt: - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); out_unlock_mmap: mmap_read_unlock(current->mm); out_free: @@ -260,12 +261,11 @@ static inline int __memcpy_fromio_inuser(void __user *dst, SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, void __user *, user_buffer, size_t, length) { + struct follow_pfnmap_args args = { }; u8 local_buf[64]; void __iomem *io_addr; void *buf; struct vm_area_struct *vma; - pte_t *ptep; - spinlock_t *ptl; long ret; if (!zpci_is_enabled()) @@ -308,11 +308,13 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock_mmap; - ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); + args.vma = vma; + args.address = mmio_addr; + ret = follow_pfnmap_start(&args); if (ret) goto out_unlock_mmap; - io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) | + io_addr = (void __iomem *)((args.pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK)); if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) { @@ -322,7 +324,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, ret = zpci_memcpy_fromio(buf, io_addr, length); out_unlock_pt: - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); out_unlock_mmap: mmap_read_unlock(current->mm); diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 896d9b7867..da3a5f673c 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -782,6 +782,7 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) return __pmd(pte_val(pte)); } +#define pmd_pgprot pmd_pgprot static inline pgprot_t pmd_pgprot(pmd_t entry) { unsigned long val = pmd_val(entry); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a5a59118ef..5d4f050bd5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,6 +28,7 @@ config X86_64 select ARCH_HAS_GIGANTIC_PAGE select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_RT select HAVE_ARCH_SOFT_DIRTY select MODULES_USE_ELF_RELA diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 8149afec43..c5bc120fad 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -121,6 +121,34 @@ extern pmdval_t early_pmd_flags; #define arch_end_context_switch(prev) do {} while(0) #endif /* CONFIG_PARAVIRT_XXL */ +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +{ + pmdval_t v = native_pmd_val(pmd); + + return native_make_pmd(v | set); +} + +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +{ + pmdval_t v = native_pmd_val(pmd); + + return native_make_pmd(v & ~clear); +} + +static inline pud_t pud_set_flags(pud_t pud, pudval_t set) +{ + pudval_t v = native_pud_val(pud); + + return native_make_pud(v | set); +} + +static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) +{ + pudval_t v = native_pud_val(pud); + + return native_make_pud(v & ~clear); +} + /* * The following only work if pte_present() is true. * Undefined behaviour if not.. @@ -310,6 +338,30 @@ static inline int pud_devmap(pud_t pud) } #endif +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +static inline bool pmd_special(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_SPECIAL; +} + +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_SPECIAL); +} +#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ + +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +static inline bool pud_special(pud_t pud) +{ + return pud_flags(pud) & _PAGE_SPECIAL; +} + +static inline pud_t pud_mkspecial(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_SPECIAL); +} +#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ + static inline int pgd_devmap(pgd_t pgd) { return 0; @@ -480,20 +532,6 @@ static inline pte_t pte_mkdevmap(pte_t pte) return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP); } -static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) -{ - pmdval_t v = native_pmd_val(pmd); - - return native_make_pmd(v | set); -} - -static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) -{ - pmdval_t v = native_pmd_val(pmd); - - return native_make_pmd(v & ~clear); -} - /* See comments above mksaveddirty_shift() */ static inline pmd_t pmd_mksaveddirty(pmd_t pmd) { @@ -588,20 +626,6 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); #define pmd_mkwrite pmd_mkwrite -static inline pud_t pud_set_flags(pud_t pud, pudval_t set) -{ - pudval_t v = native_pud_val(pud); - - return native_make_pud(v | set); -} - -static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) -{ - pudval_t v = native_pud_val(pud); - - return native_make_pud(v & ~clear); -} - /* See comments above mksaveddirty_shift() */ static inline pud_t pud_mksaveddirty(pud_t pud) { diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index c0d56c02b8..9e84bcedd9 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -834,7 +834,7 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz return ret; } - for_each_node(nid) { + for_each_node_with_cpus(nid) { cpu = cpumask_first(cpumask_of_node(nid)); c = &cpu_data(cpu); diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 36b603d0cd..fd210b362a 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -947,6 +948,26 @@ static void free_pfn_range(u64 paddr, unsigned long size) memtype_free(paddr, paddr + size); } +static int follow_phys(struct vm_area_struct *vma, unsigned long *prot, + resource_size_t *phys) +{ + struct follow_pfnmap_args args = { .vma = vma, .address = vma->vm_start }; + + if (follow_pfnmap_start(&args)) + return -EINVAL; + + /* Never return PFNs of anon folios in COW mappings. */ + if (!args.special) { + follow_pfnmap_end(&args); + return -EINVAL; + } + + *prot = pgprot_val(args.pgprot); + *phys = (resource_size_t)args.pfn << PAGE_SHIFT; + follow_pfnmap_end(&args); + return 0; +} + static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, pgprot_t *pgprot) { @@ -964,7 +985,7 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, * detect the PFN. If we need the cachemode as well, we're out of luck * for now and have to fail fork(). */ - if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) { + if (!follow_phys(vma, &prot, paddr)) { if (pgprot) *pgprot = __pgprot(prot); return 0; diff --git a/configs/kernel-5.14.0-aarch64-64k-debug.config b/configs/kernel-5.14.0-aarch64-64k-debug.config index ffbbd0d2d5..366a420477 100644 --- a/configs/kernel-5.14.0-aarch64-64k-debug.config +++ b/configs/kernel-5.14.0-aarch64-64k-debug.config @@ -1076,6 +1076,8 @@ CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y # CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set # CONFIG_READ_ONLY_THP_FOR_FS is not set +CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y diff --git a/configs/kernel-5.14.0-aarch64-64k.config b/configs/kernel-5.14.0-aarch64-64k.config index ac272010b4..863379693d 100644 --- a/configs/kernel-5.14.0-aarch64-64k.config +++ b/configs/kernel-5.14.0-aarch64-64k.config @@ -1072,6 +1072,8 @@ CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y # CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set # CONFIG_READ_ONLY_THP_FOR_FS is not set +CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y diff --git a/configs/kernel-5.14.0-aarch64-debug.config b/configs/kernel-5.14.0-aarch64-debug.config index 1d04e6b661..3f095a4cca 100644 --- a/configs/kernel-5.14.0-aarch64-debug.config +++ b/configs/kernel-5.14.0-aarch64-debug.config @@ -1079,6 +1079,8 @@ CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y # CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set CONFIG_THP_SWAP=y # CONFIG_READ_ONLY_THP_FOR_FS is not set +CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y diff --git a/configs/kernel-5.14.0-aarch64.config b/configs/kernel-5.14.0-aarch64.config index d8d030e0d8..0e16af1904 100644 --- a/configs/kernel-5.14.0-aarch64.config +++ b/configs/kernel-5.14.0-aarch64.config @@ -1075,6 +1075,8 @@ CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y # CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set CONFIG_THP_SWAP=y # CONFIG_READ_ONLY_THP_FOR_FS is not set +CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y diff --git a/configs/kernel-5.14.0-x86_64-debug.config b/configs/kernel-5.14.0-x86_64-debug.config index cf7b76ff65..2dd50663a7 100644 --- a/configs/kernel-5.14.0-x86_64-debug.config +++ b/configs/kernel-5.14.0-x86_64-debug.config @@ -1138,6 +1138,9 @@ CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y # CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set CONFIG_THP_SWAP=y # CONFIG_READ_ONLY_THP_FOR_FS is not set +CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PUD_PFNMAP=y CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y diff --git a/configs/kernel-5.14.0-x86_64.config b/configs/kernel-5.14.0-x86_64.config index 18f8ddbcb7..856f304b1f 100644 --- a/configs/kernel-5.14.0-x86_64.config +++ b/configs/kernel-5.14.0-x86_64.config @@ -1133,6 +1133,9 @@ CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y # CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set CONFIG_THP_SWAP=y # CONFIG_READ_ONLY_THP_FOR_FS is not set +CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PUD_PFNMAP=y CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 36442eeffb..73aa4347a2 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -600,6 +600,9 @@ static bool turbo_is_disabled(void) { u64 misc_en; + if (!cpu_feature_enabled(X86_FEATURE_IDA)) + return true; + rdmsrl(MSR_IA32_MISC_ENABLE, misc_en); return !!(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index ffda816e01..c9eaba2276 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -57,11 +58,6 @@ struct vfio_pci_vf_token { int users; }; -struct vfio_pci_mmap_vma { - struct vm_area_struct *vma; - struct list_head vma_next; -}; - static inline bool vfio_vga_disabled(void) { #ifdef CONFIG_VFIO_PCI_VGA @@ -1610,100 +1606,20 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu } EXPORT_SYMBOL_GPL(vfio_pci_core_write); -/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ -static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try) +static void vfio_pci_zap_bars(struct vfio_pci_core_device *vdev) { - struct vfio_pci_mmap_vma *mmap_vma, *tmp; + struct vfio_device *core_vdev = &vdev->vdev; + loff_t start = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX); + loff_t end = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX); + loff_t len = end - start; - /* - * Lock ordering: - * vma_lock is nested under mmap_lock for vm_ops callback paths. - * The memory_lock semaphore is used by both code paths calling - * into this function to zap vmas and the vm_ops.fault callback - * to protect the memory enable state of the device. - * - * When zapping vmas we need to maintain the mmap_lock => vma_lock - * ordering, which requires using vma_lock to walk vma_list to - * acquire an mm, then dropping vma_lock to get the mmap_lock and - * reacquiring vma_lock. This logic is derived from similar - * requirements in uverbs_user_mmap_disassociate(). - * - * mmap_lock must always be the top-level lock when it is taken. - * Therefore we can only hold the memory_lock write lock when - * vma_list is empty, as we'd need to take mmap_lock to clear - * entries. vma_list can only be guaranteed empty when holding - * vma_lock, thus memory_lock is nested under vma_lock. - * - * This enables the vm_ops.fault callback to acquire vma_lock, - * followed by memory_lock read lock, while already holding - * mmap_lock without risk of deadlock. - */ - while (1) { - struct mm_struct *mm = NULL; - - if (try) { - if (!mutex_trylock(&vdev->vma_lock)) - return 0; - } else { - mutex_lock(&vdev->vma_lock); - } - while (!list_empty(&vdev->vma_list)) { - mmap_vma = list_first_entry(&vdev->vma_list, - struct vfio_pci_mmap_vma, - vma_next); - mm = mmap_vma->vma->vm_mm; - if (mmget_not_zero(mm)) - break; - - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - mm = NULL; - } - if (!mm) - return 1; - mutex_unlock(&vdev->vma_lock); - - if (try) { - if (!mmap_read_trylock(mm)) { - mmput(mm); - return 0; - } - } else { - mmap_read_lock(mm); - } - if (try) { - if (!mutex_trylock(&vdev->vma_lock)) { - mmap_read_unlock(mm); - mmput(mm); - return 0; - } - } else { - mutex_lock(&vdev->vma_lock); - } - list_for_each_entry_safe(mmap_vma, tmp, - &vdev->vma_list, vma_next) { - struct vm_area_struct *vma = mmap_vma->vma; - - if (vma->vm_mm != mm) - continue; - - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - - zap_vma_ptes(vma, vma->vm_start, - vma->vm_end - vma->vm_start); - } - mutex_unlock(&vdev->vma_lock); - mmap_read_unlock(mm); - mmput(mm); - } + unmap_mapping_range(core_vdev->inode->i_mapping, start, len, true); } void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev) { - vfio_pci_zap_and_vma_lock(vdev, false); down_write(&vdev->memory_lock); - mutex_unlock(&vdev->vma_lock); + vfio_pci_zap_bars(vdev); } u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev) @@ -1725,100 +1641,83 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 c up_write(&vdev->memory_lock); } -/* Caller holds vma_lock */ -static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev, - struct vm_area_struct *vma) -{ - struct vfio_pci_mmap_vma *mmap_vma; - - mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL_ACCOUNT); - if (!mmap_vma) - return -ENOMEM; - - mmap_vma->vma = vma; - list_add(&mmap_vma->vma_next, &vdev->vma_list); - - return 0; -} - -/* - * Zap mmaps on open so that we can fault them in on access and therefore - * our vma_list only tracks mappings accessed since last zap. - */ -static void vfio_pci_mmap_open(struct vm_area_struct *vma) -{ - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); -} - -static void vfio_pci_mmap_close(struct vm_area_struct *vma) +static unsigned long vma_to_pfn(struct vm_area_struct *vma) { struct vfio_pci_core_device *vdev = vma->vm_private_data; - struct vfio_pci_mmap_vma *mmap_vma; + int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + u64 pgoff; - mutex_lock(&vdev->vma_lock); - list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { - if (mmap_vma->vma == vma) { - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - break; - } - } - mutex_unlock(&vdev->vma_lock); + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff; } -static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) +static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, + unsigned int order) { struct vm_area_struct *vma = vmf->vma; struct vfio_pci_core_device *vdev = vma->vm_private_data; - struct vfio_pci_mmap_vma *mmap_vma; - vm_fault_t ret = VM_FAULT_NOPAGE; + unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1); + unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + unsigned long pfn = vma_to_pfn(vma) + pgoff; + vm_fault_t ret = VM_FAULT_SIGBUS; + + if (order && (addr < vma->vm_start || + addr + (PAGE_SIZE << order) > vma->vm_end || + pfn & ((1 << order) - 1))) { + ret = VM_FAULT_FALLBACK; + goto out; + } - mutex_lock(&vdev->vma_lock); down_read(&vdev->memory_lock); - /* - * Memory region cannot be accessed if the low power feature is engaged - * or memory access is disabled. - */ - if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) { - ret = VM_FAULT_SIGBUS; - goto up_out; + if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) + goto out_unlock; + + switch (order) { + case 0: + ret = vmf_insert_pfn(vma, vmf->address, pfn); + break; +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP + case PMD_ORDER: + ret = vmf_insert_pfn_pmd(vmf, + __pfn_to_pfn_t(pfn, PFN_DEV), false); + break; +#endif +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP + case PUD_ORDER: + ret = vmf_insert_pfn_pud(vmf, + __pfn_to_pfn_t(pfn, PFN_DEV), false); + break; +#endif + default: + ret = VM_FAULT_FALLBACK; } - /* - * We populate the whole vma on fault, so we need to test whether - * the vma has already been mapped, such as for concurrent faults - * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if - * we ask it to fill the same range again. - */ - list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { - if (mmap_vma->vma == vma) - goto up_out; - } - - if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) { - ret = VM_FAULT_SIGBUS; - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); - goto up_out; - } - - if (__vfio_pci_add_vma(vdev, vma)) { - ret = VM_FAULT_OOM; - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); - } - -up_out: +out_unlock: up_read(&vdev->memory_lock); - mutex_unlock(&vdev->vma_lock); +out: + dev_dbg_ratelimited(&vdev->pdev->dev, + "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n", + __func__, order, + vma->vm_pgoff >> + (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT), + pgoff, (unsigned int)ret); + return ret; } +static vm_fault_t vfio_pci_mmap_page_fault(struct vm_fault *vmf) +{ + return vfio_pci_mmap_huge_fault(vmf, 0); +} + static const struct vm_operations_struct vfio_pci_mmap_ops = { - .open = vfio_pci_mmap_open, - .close = vfio_pci_mmap_close, - .fault = vfio_pci_mmap_fault, + .fault = vfio_pci_mmap_page_fault, +#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP + .huge_fault = vfio_pci_mmap_huge_fault, +#endif }; int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) @@ -1880,11 +1779,12 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma vma->vm_private_data = vdev; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); /* - * See remap_pfn_range(), called from vfio_pci_fault() but we can't - * change vm_flags within the fault handler. Set them now. + * Set vm_flags now, they should not be changed in the fault handler. + * We want the same flags and page protection (decrypted above) as + * io_remap_pfn_range() would set. * * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64, * allowing KVM stage 2 device mapping attributes to use Normal-NC @@ -2202,8 +2102,6 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) mutex_init(&vdev->ioeventfds_lock); INIT_LIST_HEAD(&vdev->dummy_resources_list); INIT_LIST_HEAD(&vdev->ioeventfds_list); - mutex_init(&vdev->vma_lock); - INIT_LIST_HEAD(&vdev->vma_list); INIT_LIST_HEAD(&vdev->sriov_pfs_item); init_rwsem(&vdev->memory_lock); xa_init(&vdev->ctx); @@ -2219,7 +2117,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev) mutex_destroy(&vdev->igate); mutex_destroy(&vdev->ioeventfds_lock); - mutex_destroy(&vdev->vma_lock); kfree(vdev->region); kfree(vdev->pm_save); } @@ -2497,26 +2394,15 @@ unwind: return ret; } -/* - * We need to get memory_lock for each device, but devices can share mmap_lock, - * therefore we need to zap and hold the vma_lock for each device, and only then - * get each memory_lock. - */ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, struct vfio_pci_group_info *groups, struct iommufd_ctx *iommufd_ctx) { - struct vfio_pci_core_device *cur_mem; - struct vfio_pci_core_device *cur_vma; - struct vfio_pci_core_device *cur; + struct vfio_pci_core_device *vdev; struct pci_dev *pdev; - bool is_mem = true; int ret; mutex_lock(&dev_set->lock); - cur_mem = list_first_entry(&dev_set->device_list, - struct vfio_pci_core_device, - vdev.dev_set_list); pdev = vfio_pci_dev_set_resettable(dev_set); if (!pdev) { @@ -2533,7 +2419,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, if (ret) goto err_unlock; - list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) { bool owned; /* @@ -2557,38 +2443,38 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, * Otherwise, reset is not allowed. */ if (iommufd_ctx) { - int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev, + int devid = vfio_iommufd_get_dev_id(&vdev->vdev, iommufd_ctx); owned = (devid > 0 || devid == -ENOENT); } else { - owned = vfio_dev_in_groups(&cur_vma->vdev, groups); + owned = vfio_dev_in_groups(&vdev->vdev, groups); } if (!owned) { ret = -EINVAL; - goto err_undo; + break; } /* - * Locking multiple devices is prone to deadlock, runaway and - * unwind if we hit contention. + * Take the memory write lock for each device and zap BAR + * mappings to prevent the user accessing the device while in + * reset. Locking multiple devices is prone to deadlock, + * runaway and unwind if we hit contention. */ - if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) { + if (!down_write_trylock(&vdev->memory_lock)) { ret = -EBUSY; - goto err_undo; + break; } - } - cur_vma = NULL; - list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) { - if (!down_write_trylock(&cur_mem->memory_lock)) { - ret = -EBUSY; - goto err_undo; - } - mutex_unlock(&cur_mem->vma_lock); + vfio_pci_zap_bars(vdev); + } + + if (!list_entry_is_head(vdev, + &dev_set->device_list, vdev.dev_set_list)) { + vdev = list_prev_entry(vdev, vdev.dev_set_list); + goto err_undo; } - cur_mem = NULL; /* * The pci_reset_bus() will reset all the devices in the bus. @@ -2599,25 +2485,22 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, * cause the PCI config space reset without restoring the original * state (saved locally in 'vdev->pm_save'). */ - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) - vfio_pci_set_power_state(cur, PCI_D0); + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) + vfio_pci_set_power_state(vdev, PCI_D0); ret = pci_reset_bus(pdev); -err_undo: - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { - if (cur == cur_mem) - is_mem = false; - if (cur == cur_vma) - break; - if (is_mem) - up_write(&cur->memory_lock); - else - mutex_unlock(&cur->vma_lock); - } + vdev = list_last_entry(&dev_set->device_list, + struct vfio_pci_core_device, vdev.dev_set_list); + +err_undo: + list_for_each_entry_from_reverse(vdev, &dev_set->device_list, + vdev.dev_set_list) + up_write(&vdev->memory_lock); + + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) + pm_runtime_put(&vdev->pdev->dev); - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) - pm_runtime_put(&cur->pdev->dev); err_unlock: mutex_unlock(&dev_set->lock); return ret; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 6c6586af79..f8b8f3bcc7 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -105,9 +105,9 @@ struct vfio_dma { struct vfio_batch { struct page **pages; /* for pin_user_pages_remote */ struct page *fallback_page; /* if pages alloc fails */ - int capacity; /* length of pages array */ - int size; /* of batch currently */ - int offset; /* of next entry in pages */ + unsigned int capacity; /* length of pages array */ + unsigned int size; /* of batch currently */ + unsigned int offset; /* of next entry in pages */ }; struct vfio_iommu_group { @@ -474,12 +474,12 @@ static int put_pfn(unsigned long pfn, int prot) #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *)) -static void vfio_batch_init(struct vfio_batch *batch) +static void __vfio_batch_init(struct vfio_batch *batch, bool single) { batch->size = 0; batch->offset = 0; - if (unlikely(disable_hugepages)) + if (single || unlikely(disable_hugepages)) goto fallback; batch->pages = (struct page **) __get_free_page(GFP_KERNEL); @@ -494,6 +494,16 @@ fallback: batch->capacity = 1; } +static void vfio_batch_init(struct vfio_batch *batch) +{ + __vfio_batch_init(batch, false); +} + +static void vfio_batch_init_single(struct vfio_batch *batch) +{ + __vfio_batch_init(batch, true); +} + static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma) { while (batch->size) { @@ -513,14 +523,12 @@ static void vfio_batch_fini(struct vfio_batch *batch) static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, unsigned long vaddr, unsigned long *pfn, - bool write_fault) + unsigned long *addr_mask, bool write_fault) { - pte_t *ptep; - pte_t pte; - spinlock_t *ptl; + struct follow_pfnmap_args args = { .vma = vma, .address = vaddr }; int ret; - ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl); + ret = follow_pfnmap_start(&args); if (ret) { bool unlocked = false; @@ -534,43 +542,51 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, if (ret) return ret; - ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl); + ret = follow_pfnmap_start(&args); if (ret) return ret; } - pte = ptep_get(ptep); - - if (write_fault && !pte_write(pte)) + if (write_fault && !args.writable) { ret = -EFAULT; - else - *pfn = pte_pfn(pte); + } else { + *pfn = args.pfn; + *addr_mask = args.addr_mask; + } - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); return ret; } /* * Returns the positive number of pfns successfully obtained or a negative - * error code. + * error code. The initial pfn is stored in the pfn arg. For page-backed + * pfns, the provided batch is also updated to indicate the filled pages and + * initial offset. For VM_PFNMAP pfns, only the returned number of pfns and + * returned initial pfn are provided; subsequent pfns are contiguous. */ -static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, - long npages, int prot, unsigned long *pfn, - struct page **pages) +static long vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, + unsigned long npages, int prot, unsigned long *pfn, + struct vfio_batch *batch) { + unsigned long pin_pages = min_t(unsigned long, npages, batch->capacity); struct vm_area_struct *vma; unsigned int flags = 0; - int ret; + long ret; if (prot & IOMMU_WRITE) flags |= FOLL_WRITE; mmap_read_lock(mm); - ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM, - pages, NULL); + ret = pin_user_pages_remote(mm, vaddr, pin_pages, flags | FOLL_LONGTERM, + batch->pages, NULL); if (ret > 0) { - *pfn = page_to_pfn(pages[0]); + *pfn = page_to_pfn(batch->pages[0]); + batch->size = ret; + batch->offset = 0; goto done; + } else if (!ret) { + ret = -EFAULT; } vaddr = untagged_addr_remote(mm, vaddr); @@ -579,15 +595,22 @@ retry: vma = vma_lookup(mm, vaddr); if (vma && vma->vm_flags & VM_PFNMAP) { - ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE); + unsigned long addr_mask; + + ret = follow_fault_pfn(vma, mm, vaddr, pfn, &addr_mask, + prot & IOMMU_WRITE); if (ret == -EAGAIN) goto retry; if (!ret) { - if (is_invalid_reserved_pfn(*pfn)) - ret = 1; - else + if (is_invalid_reserved_pfn(*pfn)) { + unsigned long epfn; + + epfn = (*pfn | (~addr_mask >> PAGE_SHIFT)) + 1; + ret = min_t(long, npages, epfn - *pfn); + } else { ret = -EFAULT; + } } } done: @@ -601,7 +624,7 @@ done: * first page and all consecutive pages with the same locking. */ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, - long npage, unsigned long *pfn_base, + unsigned long npage, unsigned long *pfn_base, unsigned long limit, struct vfio_batch *batch) { unsigned long pfn; @@ -623,32 +646,42 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, *pfn_base = 0; } + if (unlikely(disable_hugepages)) + npage = 1; + while (npage) { if (!batch->size) { /* Empty batch, so refill it. */ - long req_pages = min_t(long, npage, batch->capacity); - - ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot, - &pfn, batch->pages); + ret = vaddr_get_pfns(mm, vaddr, npage, dma->prot, + &pfn, batch); if (ret < 0) goto unpin_out; - batch->size = ret; - batch->offset = 0; - if (!*pfn_base) { *pfn_base = pfn; rsvd = is_invalid_reserved_pfn(*pfn_base); } + + /* Handle pfnmap */ + if (!batch->size) { + if (pfn != *pfn_base + pinned || !rsvd) + goto out; + + pinned += ret; + npage -= ret; + vaddr += (PAGE_SIZE * ret); + iova += (PAGE_SIZE * ret); + continue; + } } /* - * pfn is preset for the first iteration of this inner loop and - * updated at the end to handle a VM_PFNMAP pfn. In that case, - * batch->pages isn't valid (there's no struct page), so allow - * batch->pages to be touched only when there's more than one - * pfn to check, which guarantees the pfns are from a - * !VM_PFNMAP vma. + * pfn is preset for the first iteration of this inner loop + * due to the fact that vaddr_get_pfns() needs to provide the + * initial pfn for pfnmaps. Therefore to reduce redundancy, + * the next pfn is fetched at the end of the loop. + * A PageReserved() page could still qualify as page backed + * and rsvd here, and therefore continues to use the batch. */ while (true) { if (pfn != *pfn_base + pinned || @@ -683,21 +716,12 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, pfn = page_to_pfn(batch->pages[batch->offset]); } - - if (unlikely(disable_hugepages)) - break; } out: ret = vfio_lock_acct(dma, lock_acct, false); unpin_out: - if (batch->size == 1 && !batch->offset) { - /* May be a VM_PFNMAP pfn, which the batch can't remember. */ - put_pfn(pfn, dma->prot); - batch->size = 0; - } - if (ret < 0) { if (pinned && !rsvd) { for (pfn = *pfn_base ; pinned ; pfn++, pinned--) @@ -712,7 +736,7 @@ unpin_out: } static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, - unsigned long pfn, long npage, + unsigned long pfn, unsigned long npage, bool do_accounting) { long unlocked = 0, locked = 0; @@ -735,7 +759,7 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, unsigned long *pfn_base, bool do_accounting) { - struct page *pages[1]; + struct vfio_batch batch; struct mm_struct *mm; int ret; @@ -743,7 +767,9 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, if (!mmget_not_zero(mm)) return -ENODEV; - ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages); + vfio_batch_init_single(&batch); + + ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, &batch); if (ret != 1) goto out; @@ -762,6 +788,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, } out: + vfio_batch_fini(&batch); mmput(mm); return ret; } diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 429529f7a4..245a99afe9 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -158,6 +158,8 @@ extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name, extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, struct cifsFileInfo **ret_file); +extern int cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode, + struct file *file); extern unsigned int smbCalcSize(void *buf); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index dd6cb08fd4..254347d16d 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -685,15 +685,23 @@ int cifs_open(struct inode *inode, struct file *file) rc = cifs_get_readable_path(tcon, full_path, &cfile); } if (rc == 0) { - if (file->f_flags == cfile->f_flags) { + unsigned int oflags = file->f_flags & ~(O_CREAT|O_EXCL|O_TRUNC); + unsigned int cflags = cfile->f_flags & ~(O_CREAT|O_EXCL|O_TRUNC); + + if (cifs_convert_flags(oflags, 0) == cifs_convert_flags(cflags, 0) && + (oflags & (O_SYNC|O_DIRECT)) == (cflags & (O_SYNC|O_DIRECT))) { file->private_data = cfile; spin_lock(&CIFS_I(inode)->deferred_lock); cifs_del_deferred_close(cfile); spin_unlock(&CIFS_I(inode)->deferred_lock); goto use_cache; - } else { - _cifsFileInfo_put(cfile, true, false); } + _cifsFileInfo_put(cfile, true, false); + } else { + /* hard link on the defeered close file */ + rc = cifs_get_hardlink_path(tcon, inode, file); + if (rc) + cifs_close_deferred_file(CIFS_I(inode)); } if (server->oplocks) @@ -1754,6 +1762,29 @@ cifs_move_llist(struct list_head *source, struct list_head *dest) list_move(li, dest); } +int +cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode, + struct file *file) +{ + struct cifsFileInfo *open_file = NULL; + struct cifsInodeInfo *cinode = CIFS_I(inode); + int rc = 0; + + spin_lock(&tcon->open_file_lock); + spin_lock(&cinode->open_file_lock); + + list_for_each_entry(open_file, &cinode->openFileList, flist) { + if (file->f_flags == open_file->f_flags) { + rc = -EINVAL; + break; + } + } + + spin_unlock(&cinode->open_file_lock); + spin_unlock(&tcon->open_file_lock); + return rc; +} + void cifs_free_llist(struct list_head *llist) { diff --git a/include/config/ARCH_SUPPORTS_HUGE_PFNMAP b/include/config/ARCH_SUPPORTS_HUGE_PFNMAP new file mode 100644 index 0000000000..e69de29bb2 diff --git a/include/config/ARCH_SUPPORTS_PMD_PFNMAP b/include/config/ARCH_SUPPORTS_PMD_PFNMAP new file mode 100644 index 0000000000..e69de29bb2 diff --git a/include/config/auto.conf b/include/config/auto.conf index b9f5a0a80e..81102308c9 100644 --- a/include/config/auto.conf +++ b/include/config/auto.conf @@ -1369,6 +1369,7 @@ CONFIG_CRYPTO_AES_ARM64_CE=y CONFIG_CORESIGHT_LINK_AND_SINK_TMC=m CONFIG_NFT_FIB_NETDEV=m CONFIG_SERIAL_EARLYCON=y +CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y CONFIG_CLS_U32_MARK=y CONFIG_SND_ICE1712=m CONFIG_GENERIC_IRQ_INJECTION=y @@ -1680,6 +1681,7 @@ CONFIG_ARCH_HAS_FAST_MULTIPLIER=y CONFIG_ATH9K_DEBUGFS=y CONFIG_NET_VENDOR_REALTEK=y CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y +CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y CONFIG_NETFILTER_XT_TARGET_HL=m CONFIG_MEGARAID_SAS=m CONFIG_MT792x_LIB=m diff --git a/include/generated/autoconf.h b/include/generated/autoconf.h index 1a74489160..2402b89ca7 100644 --- a/include/generated/autoconf.h +++ b/include/generated/autoconf.h @@ -1371,6 +1371,7 @@ #define CONFIG_CORESIGHT_LINK_AND_SINK_TMC_MODULE 1 #define CONFIG_NFT_FIB_NETDEV_MODULE 1 #define CONFIG_SERIAL_EARLYCON 1 +#define CONFIG_ARCH_SUPPORTS_PMD_PFNMAP 1 #define CONFIG_CLS_U32_MARK 1 #define CONFIG_SND_ICE1712_MODULE 1 #define CONFIG_GENERIC_IRQ_INJECTION 1 @@ -1682,6 +1683,7 @@ #define CONFIG_ATH9K_DEBUGFS 1 #define CONFIG_NET_VENDOR_REALTEK 1 #define CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG 1 +#define CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP 1 #define CONFIG_NETFILTER_XT_TARGET_HL_MODULE 1 #define CONFIG_MEGARAID_SAS_MODULE 1 #define CONFIG_MT792x_LIB_MODULE 1 diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index fc789c0ac8..eaba832e03 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -256,11 +256,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd); } -static inline bool is_huge_zero_pud(pud_t pud) -{ - return false; -} - struct page *mm_get_huge_zero_page(struct mm_struct *mm); void mm_put_huge_zero_page(struct mm_struct *mm); @@ -379,11 +374,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) return false; } -static inline bool is_huge_zero_pud(pud_t pud) -{ - return false; -} - static inline void mm_put_huge_zero_page(struct mm_struct *mm) { return; diff --git a/include/linux/mm.h b/include/linux/mm.h index 196c481ec1..ab100f6bd2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2427,15 +2427,42 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); -int follow_pte(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp); -int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn); -int follow_phys(struct vm_area_struct *vma, unsigned long address, - unsigned int flags, unsigned long *prot, resource_size_t *phys); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); +struct follow_pfnmap_args { + /** + * Inputs: + * @vma: Pointer to @vm_area_struct struct + * @address: the virtual address to walk + */ + struct vm_area_struct *vma; + unsigned long address; + /** + * Internals: + * + * The caller shouldn't touch any of these. + */ + spinlock_t *lock; + pte_t *ptep; + /** + * Outputs: + * + * @pfn: the PFN of the address + * @addr_mask: address mask covering pfn + * @pgprot: the pgprot_t of the mapping + * @writable: whether the mapping is writable + * @special: whether the mapping is a special mapping (real PFN maps) + */ + unsigned long pfn; + unsigned long addr_mask; + pgprot_t pgprot; + bool writable; + bool special; +}; +int follow_pfnmap_start(struct follow_pfnmap_args *args); +void follow_pfnmap_end(struct follow_pfnmap_args *args); + extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); @@ -2730,6 +2757,30 @@ static inline pte_t pte_mkspecial(pte_t pte) } #endif +#ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +static inline bool pmd_special(pmd_t pmd) +{ + return false; +} + +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return pmd; +} +#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ + +#ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +static inline bool pud_special(pud_t pud) +{ + return false; +} + +static inline pud_t pud_mkspecial(pud_t pud) +{ + return pud; +} +#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ + #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t pte) { diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index f62a9c9f3c..73eca45d91 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1685,6 +1685,18 @@ typedef unsigned int pgtbl_mod_mask; #define MAX_PTRS_PER_P4D PTRS_PER_P4D #endif +#ifndef pte_pgprot +#define pte_pgprot(x) ((pgprot_t) {0}) +#endif + +#ifndef pmd_pgprot +#define pmd_pgprot(x) ((pgprot_t) {0}) +#endif + +#ifndef pud_pgprot +#define pud_pgprot(x) ((pgprot_t) {0}) +#endif + /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 30c79194ee..fbb472dd99 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -93,8 +93,6 @@ struct vfio_pci_core_device { struct list_head sriov_pfs_item; struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; - struct mutex vma_lock; - struct list_head vma_list; struct rw_semaphore memory_lock; }; diff --git a/mm/Kconfig b/mm/Kconfig index a91823e31f..ab0e794e3f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -897,6 +897,19 @@ config READ_ONLY_THP_FOR_FS endif # TRANSPARENT_HUGEPAGE +# TODO: Allow to be enabled without THP +config ARCH_SUPPORTS_HUGE_PFNMAP + def_bool n + depends on TRANSPARENT_HUGEPAGE + +config ARCH_SUPPORTS_PMD_PFNMAP + def_bool y + depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE + +config ARCH_SUPPORTS_PUD_PFNMAP + def_bool y + depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD + # # UP and nommu archs use km based percpu allocator # diff --git a/mm/gup.c b/mm/gup.c index ad7345cfba..16cdddef91 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2903,6 +2903,9 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) return 0; + if (pmd_special(orig)) + return 0; + if (pmd_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; @@ -2947,6 +2950,9 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, if (!pud_access_permitted(orig, flags & FOLL_WRITE)) return 0; + if (pud_special(orig)) + return 0; + if (pud_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 20d9b3971d..c1cdbd21dd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -860,6 +860,8 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); if (pfn_t_devmap(pfn)) entry = pmd_mkdevmap(entry); + else + entry = pmd_mkspecial(entry); if (write) { entry = pmd_mkyoung(pmd_mkdirty(entry)); entry = maybe_pmd_mkwrite(entry, vma); @@ -943,10 +945,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, ptl = pud_lock(mm, pud); if (!pud_none(*pud)) { if (write) { - if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) { - WARN_ON_ONCE(!is_huge_zero_pud(*pud)); + if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn))) goto out_unlock; - } entry = pud_mkyoung(*pud); entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); if (pudp_set_access_flags(vma, addr, pud, entry, 1)) @@ -958,6 +958,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, entry = pud_mkhuge(pfn_t_pud(pfn, prot)); if (pfn_t_devmap(pfn)) entry = pud_mkdevmap(entry); + else + entry = pud_mkspecial(entry); if (write) { entry = pud_mkyoung(pud_mkdirty(entry)); entry = maybe_pud_mkwrite(entry, vma); @@ -1070,6 +1072,24 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pgtable_t pgtable = NULL; int ret = -ENOMEM; + pmd = pmdp_get_lockless(src_pmd); + if (unlikely(pmd_present(pmd) && pmd_special(pmd))) { + dst_ptl = pmd_lock(dst_mm, dst_pmd); + src_ptl = pmd_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + /* + * No need to recheck the pmd, it can't change with write + * mmap lock held here. + * + * Meanwhile, making sure it's not a CoW VMA with writable + * mapping, otherwise it means either the anon page wrongly + * applied special bit, or we made the PRIVATE mapping be + * able to wrongly write to the backend MMIO. + */ + VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd)); + goto set_pmd; + } + /* Skip if can be re-fill on fault */ if (!vma_is_anonymous(dst_vma)) return 0; @@ -1150,7 +1170,9 @@ out_zero_page: pmdp_set_wrprotect(src_mm, addr, src_pmd); if (!userfaultfd_wp(dst_vma)) pmd = pmd_clear_uffd_wp(pmd); - pmd = pmd_mkold(pmd_wrprotect(pmd)); + pmd = pmd_wrprotect(pmd); +set_pmd: + pmd = pmd_mkold(pmd); set_pmd_at(dst_mm, addr, dst_pmd, pmd); ret = 0; @@ -1235,21 +1257,15 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud))) goto out_unlock; - /* - * When page table lock is held, the huge zero pud should not be - * under splitting since we don't split the page itself, only pud to - * a page table. - */ - if (is_huge_zero_pud(pud)) { - /* No huge zero pud yet */ - } - /* * TODO: once we support anonymous pages, use page_try_dup_anon_rmap() * and split if duplicating fails. */ - pudp_set_wrprotect(src_mm, addr, src_pud); - pud = pud_mkold(pud_wrprotect(pud)); + if (is_cow_mapping(vma->vm_flags) && pud_write(pud)) { + pudp_set_wrprotect(src_mm, addr, src_pud); + pud = pud_wrprotect(pud); + } + pud = pud_mkold(pud); set_pud_at(dst_mm, addr, dst_pud, pud); ret = 0; diff --git a/mm/memory.c b/mm/memory.c index e2794e3b89..0338ced72b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -659,11 +659,10 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, { unsigned long pfn = pmd_pfn(pmd); - /* - * There is no pmd_special() but there may be special pmds, e.g. - * in a direct-access (dax) mapping, so let's just replicate the - * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here. - */ + /* Currently it's only used for huge pfnmaps */ + if (unlikely(pmd_special(pmd))) + return NULL; + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) @@ -5607,130 +5606,159 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ +static inline void pfnmap_args_setup(struct follow_pfnmap_args *args, + spinlock_t *lock, pte_t *ptep, + pgprot_t pgprot, unsigned long pfn_base, + unsigned long addr_mask, bool writable, + bool special) +{ + args->lock = lock; + args->ptep = ptep; + args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); + args->addr_mask = addr_mask; + args->pgprot = pgprot; + args->writable = writable; + args->special = special; +} + +static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma) +{ +#ifdef CONFIG_LOCKDEP + struct file *file = vma->vm_file; + struct address_space *mapping = file ? file->f_mapping : NULL; + + if (mapping) + lockdep_assert(lockdep_is_held(&vma->vm_file->f_mapping->i_mmap_rwsem) || + lockdep_is_held(&vma->vm_mm->mmap_lock)); + else + lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock)); +#endif +} + /** - * follow_pte - look up PTE at a user virtual address - * @mm: the mm_struct of the target address space - * @address: user virtual address - * @ptepp: location to store found PTE - * @ptlp: location to store the lock for the PTE + * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address + * @args: Pointer to struct @follow_pfnmap_args * - * On a successful return, the pointer to the PTE is stored in @ptepp; - * the corresponding lock is taken and its location is stored in @ptlp. - * The contents of the PTE are only stable until @ptlp is released; - * any further use, if any, must be protected against invalidation - * with MMU notifiers. + * The caller needs to setup args->vma and args->address to point to the + * virtual address as the target of such lookup. On a successful return, + * the results will be put into other output fields. + * + * After the caller finished using the fields, the caller must invoke + * another follow_pfnmap_end() to proper releases the locks and resources + * of such look up request. + * + * During the start() and end() calls, the results in @args will be valid + * as proper locks will be held. After the end() is called, all the fields + * in @follow_pfnmap_args will be invalid to be further accessed. Further + * use of such information after end() may require proper synchronizations + * by the caller with page table updates, otherwise it can create a + * security bug. + * + * If the PTE maps a refcounted page, callers are responsible to protect + * against invalidation with MMU notifiers; otherwise access to the PFN at + * a later point in time can trigger use-after-free. * * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore - * should be taken for read. + * should be taken for read, and the mmap semaphore cannot be released + * before the end() is invoked. * - * KVM uses this function. While it is arguably less bad than ``follow_pfn``, - * it is not a good general-purpose API. + * This function must not be used to modify PTE content. * - * Return: zero on success, -ve otherwise. + * Return: zero on success, negative otherwise. */ -int follow_pte(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp) +int follow_pfnmap_start(struct follow_pfnmap_args *args) { - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep; + struct vm_area_struct *vma = args->vma; + unsigned long address = args->address; + struct mm_struct *mm = vma->vm_mm; + spinlock_t *lock; + pgd_t *pgdp; + p4d_t *p4dp, p4d; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + pfnmap_lockdep_assert(vma); + + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto out; - p4d = p4d_offset(pgd, address); - if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d))) + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; +retry: + pgdp = pgd_offset(mm, address); + if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp))) goto out; - pud = pud_offset(p4d, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) + p4dp = p4d_offset(pgdp, address); + p4d = READ_ONCE(*p4dp); + if (p4d_none(p4d) || unlikely(p4d_bad(p4d))) goto out; - pmd = pmd_offset(pud, address); - VM_BUG_ON(pmd_trans_huge(*pmd)); + pudp = pud_offset(p4dp, address); + pud = READ_ONCE(*pudp); + if (pud_none(pud)) + goto out; + if (pud_leaf(pud)) { + lock = pud_lock(mm, pudp); + if (!unlikely(pud_leaf(pud))) { + spin_unlock(lock); + goto retry; + } + pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud), + pud_pfn(pud), PUD_MASK, pud_write(pud), + pud_special(pud)); + return 0; + } - ptep = pte_offset_map_lock(mm, pmd, address, ptlp); + pmdp = pmd_offset(pudp, address); + pmd = pmdp_get_lockless(pmdp); + if (pmd_leaf(pmd)) { + lock = pmd_lock(mm, pmdp); + if (!unlikely(pmd_leaf(pmd))) { + spin_unlock(lock); + goto retry; + } + pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd), + pmd_pfn(pmd), PMD_MASK, pmd_write(pmd), + pmd_special(pmd)); + return 0; + } + + ptep = pte_offset_map_lock(mm, pmdp, address, &lock); if (!ptep) goto out; - if (!pte_present(ptep_get(ptep))) + pte = ptep_get(ptep); + if (!pte_present(pte)) goto unlock; - *ptepp = ptep; + pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte), + pte_pfn(pte), PAGE_MASK, pte_write(pte), + pte_special(pte)); return 0; unlock: - pte_unmap_unlock(ptep, *ptlp); + pte_unmap_unlock(ptep, lock); out: return -EINVAL; } -EXPORT_SYMBOL_GPL(follow_pte); +EXPORT_SYMBOL_GPL(follow_pfnmap_start); /** - * follow_pfn - look up PFN at a user virtual address - * @vma: memory mapping - * @address: user virtual address - * @pfn: location to store found PFN + * follow_pfnmap_end(): End a follow_pfnmap_start() process + * @args: Pointer to struct @follow_pfnmap_args * - * Only IO mappings and raw PFN mappings are allowed. - * - * This function does not allow the caller to read the permissions - * of the PTE. Do not use it. - * - * Return: zero and the pfn at @pfn on success, -ve otherwise. + * Must be used in pair of follow_pfnmap_start(). See the start() function + * above for more information. */ -int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn) +void follow_pfnmap_end(struct follow_pfnmap_args *args) { - int ret = -EINVAL; - spinlock_t *ptl; - pte_t *ptep; - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return ret; - - ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); - if (ret) - return ret; - *pfn = pte_pfn(ptep_get(ptep)); - pte_unmap_unlock(ptep, ptl); - return 0; + if (args->lock) + spin_unlock(args->lock); + if (args->ptep) + pte_unmap(args->ptep); } -EXPORT_SYMBOL(follow_pfn); +EXPORT_SYMBOL_GPL(follow_pfnmap_end); #ifdef CONFIG_HAVE_IOREMAP_PROT -int follow_phys(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned long *prot, resource_size_t *phys) -{ - int ret = -EINVAL; - pte_t *ptep, pte; - spinlock_t *ptl; - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - goto out; - - if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) - goto out; - pte = ptep_get(ptep); - - /* Never return PFNs of anon folios in COW mappings. */ - if (vm_normal_folio(vma, address, pte)) - goto unlock; - - if ((flags & FOLL_WRITE) && !pte_write(pte)) - goto unlock; - - *prot = pgprot_val(pte_pgprot(pte)); - *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; - - ret = 0; -unlock: - pte_unmap_unlock(ptep, ptl); -out: - return ret; -} - /** * generic_access_phys - generic implementation for iomem mmap access * @vma: the vma to access @@ -5749,37 +5777,34 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, resource_size_t phys_addr; unsigned long prot = 0; void __iomem *maddr; - pte_t *ptep, pte; - spinlock_t *ptl; int offset = offset_in_page(addr); int ret = -EINVAL; - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return -EINVAL; + bool writable; + struct follow_pfnmap_args args = { .vma = vma, .address = addr }; retry: - if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + if (follow_pfnmap_start(&args)) return -EINVAL; - pte = ptep_get(ptep); - pte_unmap_unlock(ptep, ptl); + prot = pgprot_val(args.pgprot); + phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT; + writable = args.writable; + follow_pfnmap_end(&args); - prot = pgprot_val(pte_pgprot(pte)); - phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; - - if ((write & FOLL_WRITE) && !pte_write(pte)) + if ((write & FOLL_WRITE) && !writable) return -EINVAL; maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); if (!maddr) return -ENOMEM; - if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + if (follow_pfnmap_start(&args)) goto out_unmap; - if (!pte_same(pte, ptep_get(ptep))) { - pte_unmap_unlock(ptep, ptl); + if ((prot != pgprot_val(args.pgprot)) || + (phys_addr != (args.pfn << PAGE_SHIFT)) || + (writable != args.writable)) { + follow_pfnmap_end(&args); iounmap(maddr); - goto retry; } @@ -5788,7 +5813,7 @@ retry: else memcpy_fromio(buf, maddr + offset, len); ret = len; - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); out_unmap: iounmap(maddr); diff --git a/mm/nommu.c b/mm/nommu.c index f3f6a7e976..de9ecac05d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -110,27 +110,6 @@ unsigned int kobjsize(const void *objp) return page_size(page); } -/** - * follow_pfn - look up PFN at a user virtual address - * @vma: memory mapping - * @address: user virtual address - * @pfn: location to store found PFN - * - * Only IO mappings and raw PFN mappings are allowed. - * - * Returns zero and the pfn at @pfn on success, -ve otherwise. - */ -int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn) -{ - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return -EINVAL; - - *pfn = address >> PAGE_SHIFT; - return 0; -} -EXPORT_SYMBOL(follow_pfn); - LIST_HEAD(vmap_area_list); void vfree(const void *addr) diff --git a/redhat/kernel.changelog-9.6 b/redhat/kernel.changelog-9.6 index 45aa1339d2..c750b086df 100644 --- a/redhat/kernel.changelog-9.6 +++ b/redhat/kernel.changelog-9.6 @@ -1,3 +1,45 @@ +* Sat Jul 05 2025 CKI KWF Bot [5.14.0-570.26.1.el9_6] +- x86/microcode/AMD: Fix out-of-bounds on systems with CPU-less NUMA nodes (CKI Backport Bot) [RHEL-98996] {CVE-2025-21991} +- cpufreq: intel_pstate: Unchecked MSR aceess in legacy mode (David Arcari) [RHEL-90212] +- smb: client: fix perf regression with deferred closes (Paulo Alcantara) [RHEL-97482] +- smb3 client: fix open hardlink on deferred close file error (Paulo Alcantara) [RHEL-97482] +- Fix mmu notifiers for range-based invalidates (Jay Shin) [RHEL-93743] +- vfio/pci: Align huge faults to order (Alex Williamson) [RHEL-88275] +- vfio/type1: Use mapping page mask for pfnmaps (Alex Williamson) [RHEL-88275] +- mm: Provide address mask in struct follow_pfnmap_args (Alex Williamson) [RHEL-88275] +- vfio/type1: Use consistent types for page counts (Alex Williamson) [RHEL-88275] +- vfio/type1: Use vfio_batch for vaddr_get_pfns() (Alex Williamson) [RHEL-88275] +- vfio/type1: Convert all vaddr_get_pfns() callers to use vfio_batch (Alex Williamson) [RHEL-88275] +- vfio/type1: Catch zero from pin_user_pages_remote() (Alex Williamson) [RHEL-88275] +- vfio/pci: Fallback huge faults for unaligned pfn (Donald Dutile) [RHEL-85623] +- vfio/pci: implement huge_fault support (Donald Dutile) [RHEL-85623] +- vfio/pci: Remove unused struct 'vfio_pci_mmap_vma' (Donald Dutile) [RHEL-85623] +- vfio/pci: Insert full vma on mmap'd MMIO fault (Donald Dutile) [RHEL-85623] +- vfio/pci: Use unmap_mapping_range() (Donald Dutile) [RHEL-85623] +- mm/arm64: support large pfn mappings (Donald Dutile) [RHEL-85623] +- mm/x86: support large pfn mappings (Donald Dutile) [RHEL-85623] +- mm: remove follow_pte() (Donald Dutile) [RHEL-85623] +- mm: follow_pte() improvements (Donald Dutile) [RHEL-85623] +- mm/access_process_vm: use the new follow_pfnmap API (Donald Dutile) [RHEL-85623] +- vfio: use the new follow_pfnmap API (Donald Dutile) [RHEL-85623] +- mm/x86/pat: use the new follow_pfnmap API (Donald Dutile) [RHEL-85623] +- s390/pci_mmio: use follow_pfnmap API (Donald Dutile) [RHEL-85623] +- KVM: use follow_pfnmap API (Donald Dutile) [RHEL-85623] +- mm: pass VMA instead of MM to follow_pte() (Donald Dutile) [RHEL-85623] +- mm: move follow_phys to arch/x86/mm/pat/memtype.c (Donald Dutile) [RHEL-85623] +- mm: fix follow_pfnmap API lockdep assert (Donald Dutile) [RHEL-85623] +- mm: new follow_pfnmap API (Donald Dutile) [RHEL-85623] +- mm: remove follow_pfn (Donald Dutile) [RHEL-85623] +- mm: always define pxx_pgprot() (Donald Dutile) [RHEL-85623] +- mm/huge_memory: check pmd_special() only after pmd_present() (Donald Dutile) [RHEL-85623] +- mm/fork: accept huge pfnmap entries (Donald Dutile) [RHEL-85623] +- mm/pagewalk: check pfnmap for folio_walk_start() (Donald Dutile) [RHEL-85623] +- mm/gup: detect huge pfnmap entries in gup-fast (Donald Dutile) [RHEL-85623] +- mm: mark special bits for huge pfn mappings when inject (Donald Dutile) [RHEL-85623] +- mm: drop is_huge_zero_pud() (Donald Dutile) [RHEL-85623] +- mm: introduce ARCH_SUPPORTS_HUGE_PFNMAP and special bits to pmd/pud (Donald Dutile) [RHEL-85623] +Resolves: RHEL-85623, RHEL-88275, RHEL-90212, RHEL-93743, RHEL-97482, RHEL-98996 + * Sat Jun 28 2025 CKI KWF Bot [5.14.0-570.25.1.el9_6] - udf: Fix a slab-out-of-bounds write bug in udf_find_entry() (CKI Backport Bot) [RHEL-99124] {CVE-2022-49846} - vmxnet3: Fix malformed packet sizing in vmxnet3_process_xdp (CKI Backport Bot) [RHEL-97110] {CVE-2025-37799} diff --git a/scripts/basic/fixdep b/scripts/basic/fixdep index ac57412205..fc40813391 100755 Binary files a/scripts/basic/fixdep and b/scripts/basic/fixdep differ diff --git a/scripts/kconfig/conf b/scripts/kconfig/conf index baf1524040..dd279e0d9c 100755 Binary files a/scripts/kconfig/conf and b/scripts/kconfig/conf differ diff --git a/scripts/kconfig/conf.o b/scripts/kconfig/conf.o index b4bf3a12d7..e1deac6c57 100644 Binary files a/scripts/kconfig/conf.o and b/scripts/kconfig/conf.o differ diff --git a/scripts/kconfig/confdata.o b/scripts/kconfig/confdata.o index 6d7c905ba9..bfe1b5cdcd 100644 Binary files a/scripts/kconfig/confdata.o and b/scripts/kconfig/confdata.o differ diff --git a/scripts/kconfig/expr.o b/scripts/kconfig/expr.o index 4b71eca37d..a246c725bc 100644 Binary files a/scripts/kconfig/expr.o and b/scripts/kconfig/expr.o differ diff --git a/scripts/kconfig/lexer.lex.o b/scripts/kconfig/lexer.lex.o index 936f879cb3..4f1aae7c48 100644 Binary files a/scripts/kconfig/lexer.lex.o and b/scripts/kconfig/lexer.lex.o differ diff --git a/scripts/kconfig/menu.o b/scripts/kconfig/menu.o index 8cfe9ee404..17fab9bb93 100644 Binary files a/scripts/kconfig/menu.o and b/scripts/kconfig/menu.o differ diff --git a/scripts/kconfig/parser.tab.o b/scripts/kconfig/parser.tab.o index 3c121c7fb8..b97c979ad6 100644 Binary files a/scripts/kconfig/parser.tab.o and b/scripts/kconfig/parser.tab.o differ diff --git a/scripts/kconfig/preprocess.o b/scripts/kconfig/preprocess.o index f576fa5524..9a08c4ca8a 100644 Binary files a/scripts/kconfig/preprocess.o and b/scripts/kconfig/preprocess.o differ diff --git a/scripts/kconfig/symbol.o b/scripts/kconfig/symbol.o index d1bc21c270..f809f89b8c 100644 Binary files a/scripts/kconfig/symbol.o and b/scripts/kconfig/symbol.o differ diff --git a/scripts/kconfig/util.o b/scripts/kconfig/util.o index ea9489e4aa..eb2329c962 100644 Binary files a/scripts/kconfig/util.o and b/scripts/kconfig/util.o differ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b163a079fe..279572294a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2878,13 +2878,11 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, unsigned long addr, bool write_fault, bool *writable, kvm_pfn_t *p_pfn) { + struct follow_pfnmap_args args = { .vma = vma, .address = addr }; kvm_pfn_t pfn; - pte_t *ptep; - pte_t pte; - spinlock_t *ptl; int r; - r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); + r = follow_pfnmap_start(&args); if (r) { /* * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does @@ -2899,21 +2897,19 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, if (r) return r; - r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); + r = follow_pfnmap_start(&args); if (r) return r; } - pte = ptep_get(ptep); - - if (write_fault && !pte_write(pte)) { + if (write_fault && !args.writable) { pfn = KVM_PFN_ERR_RO_FAULT; goto out; } if (writable) - *writable = pte_write(pte); - pfn = pte_pfn(pte); + *writable = args.writable; + pfn = args.pfn; /* * Get a reference here because callers of *hva_to_pfn* and @@ -2934,9 +2930,8 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, */ if (!kvm_try_get_pfn(pfn)) r = -EFAULT; - out: - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); *p_pfn = pfn; return r;