From b0aa308873f88fcc9ddef1e6ecb7a4f32dee327e Mon Sep 17 00:00:00 2001 From: almalinux-bot-kernel Date: Wed, 29 Oct 2025 04:09:27 +0000 Subject: [PATCH] Import of kernel-4.18.0-553.81.1.el8_10 --- Makefile.rhelver | 2 +- arch/arm64/include/asm/pgtable.h | 11 +- arch/arm64/kernel/setup.c | 3 - arch/arm64/kernel/smp.c | 2 + arch/arm64/mm/mmu.c | 3 +- arch/powerpc/include/asm/pgtable.h | 4 - arch/powerpc/mm/init_64.c | 5 +- arch/powerpc/mm/pgtable_64.c | 1 - arch/powerpc/platforms/pseries/lpar.c | 4 +- arch/s390/include/asm/extable.h | 9 +- arch/s390/mm/gmap.c | 2 +- arch/s390/mm/hugetlbpage.c | 2 +- arch/x86/include/asm/tlbflush.h | 2 +- arch/x86/mm/kaslr.c | 8 +- arch/x86/mm/pat/cpa-test.c | 2 +- drivers/base/node.c | 150 +------ drivers/net/ethernet/mellanox/mlx5/core/fw.c | 4 + .../net/ethernet/mellanox/mlx5/core/health.c | 8 + .../ethernet/mellanox/mlx5/core/lib/pci_vsc.c | 4 + drivers/scsi/lpfc/lpfc_nvmet.c | 10 +- fs/dlm/lock.c | 2 +- fs/dlm/recover.c | 2 +- fs/efivarfs/super.c | 4 + fs/fs-writeback.c | 41 +- fs/proc/task_mmu.c | 8 +- include/linux/gfp.h | 2 + include/linux/hugetlb.h | 19 + include/linux/memcontrol.h | 9 +- include/linux/memregion.h | 2 +- include/linux/node.h | 24 +- include/linux/pagemap.h | 62 ++- include/linux/shmem_fs.h | 10 +- include/linux/swap.h | 7 + include/linux/tracehook.h | 2 +- include/trace/events/writeback.h | 2 +- lib/radix-tree.c | 1 - lib/slub_kunit.c | 2 +- mm/cma.c | 2 +- mm/compaction.c | 94 +++-- mm/filemap.c | 72 ++-- mm/gup.c | 4 +- mm/huge_memory.c | 6 +- mm/hugetlb.c | 71 ++-- mm/internal.h | 9 +- mm/khugepaged.c | 2 +- mm/madvise.c | 21 +- mm/memcontrol.c | 68 +-- mm/memory-failure.c | 21 +- mm/memory.c | 4 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 18 +- mm/mincore.c | 28 +- mm/page_alloc.c | 12 +- mm/page_isolation.c | 10 + mm/page_reporting.c | 6 +- mm/percpu.c | 8 +- mm/readahead.c | 3 +- mm/shmem.c | 392 +++++++++--------- mm/slab_common.c | 24 +- mm/swap.c | 4 + mm/swap_state.c | 38 +- mm/vmalloc.c | 48 ++- mm/zswap.c | 16 +- net/bluetooth/hci_core.c | 16 +- net/bluetooth/l2cap_core.c | 8 +- net/wireless/sme.c | 5 +- 66 files changed, 767 insertions(+), 680 deletions(-) diff --git a/Makefile.rhelver b/Makefile.rhelver index f197d0f7f5..7c6d625f11 100644 --- a/Makefile.rhelver +++ b/Makefile.rhelver @@ -12,7 +12,7 @@ RHEL_MINOR = 10 # # Use this spot to avoid future merge conflicts. # Do not trim this comment. -RHEL_RELEASE = 553.80.1 +RHEL_RELEASE = 553.81.1 # # ZSTREAM diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 838e1d667a..925facf6fe 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -188,7 +188,7 @@ static inline pte_t pte_wrprotect(pte_t pte) * clear), set the PTE_DIRTY bit. */ if (pte_hw_dirty(pte)) - pte = pte_mkdirty(pte); + pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); pte = clear_pte_bit(pte, __pgprot(PTE_WRITE)); pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); @@ -675,8 +675,15 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) PTE_PROT_NONE | PTE_VALID | PTE_WRITE | PTE_GP; /* preserve the hardware dirty information */ if (pte_hw_dirty(pte)) - pte = pte_mkdirty(pte); + pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); + pte_val(pte) = (pte_val(pte) & ~mask) | (pgprot_val(newprot) & mask); + /* + * If we end up clearing hw dirtiness for a sw-dirty PTE, set hardware + * dirtiness again. + */ + if (pte_sw_dirty(pte)) + pte = pte_mkdirty(pte); return pte; } diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index fc6c540117..7c9f92247f 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -353,9 +353,6 @@ void __init setup_arch(char **cmdline_p) smp_init_cpus(); smp_build_mpidr_hash(); - /* Init percpu seeds for random tags after cpus are set up. */ - kasan_init_sw_tags(); - #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* * Make sure init_thread_info.ttbr0 always generates translation diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 366f375144..c390369bb5 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -474,6 +474,8 @@ void __init smp_prepare_boot_cpu(void) init_gic_priority_masking(); kasan_init_hw_tags(); + /* Init percpu seeds for random tags after cpus are set up. */ + kasan_init_sw_tags(); } static u64 __init of_get_cpu_mpidr(struct device_node *dn) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 39ce5afd51..538d7bdeff 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1434,7 +1434,8 @@ int arch_add_memory(int nid, u64 start, u64 size, __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size); else { - max_pfn = PFN_UP(start + size); + /* Address of hotplugged memory can be smaller */ + max_pfn = max(max_pfn, PFN_UP(start + size)); max_low_pfn = max_pfn; } diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 0d1e5b6a87..2cd14454fe 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -68,13 +68,9 @@ static inline void mark_initmem_nx(void) { } #define is_ioremap_addr is_ioremap_addr static inline bool is_ioremap_addr(const void *x) { -#ifdef CONFIG_MMU unsigned long addr = (unsigned long)x; return addr >= IOREMAP_BASE && addr < IOREMAP_END; -#else - return false; -#endif } #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index d92d4b5fef..3e9b44ef36 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -198,7 +198,7 @@ static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long star unsigned long nr_pfn = page_size / sizeof(struct page); unsigned long start_pfn = page_to_pfn((struct page *)start); - if ((start_pfn + nr_pfn) > altmap->end_pfn) + if ((start_pfn + nr_pfn - 1) > altmap->end_pfn) return true; if (start_pfn < altmap->base_pfn) @@ -305,8 +305,7 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, start = ALIGN_DOWN(start, page_size); if (altmap) { alt_start = altmap->base_pfn; - alt_end = altmap->base_pfn + altmap->reserve + - altmap->free + altmap->alloc + altmap->align; + alt_end = altmap->base_pfn + altmap->reserve + altmap->free; } pr_debug("vmemmap_free %lx...%lx\n", start, end); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index a9a640317e..01d64c441b 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -99,7 +99,6 @@ EXPORT_SYMBOL(__vmalloc_end); unsigned long __kernel_io_start; EXPORT_SYMBOL(__kernel_io_start); unsigned long __kernel_io_end; -EXPORT_SYMBOL(__kernel_io_end); struct page *vmemmap; EXPORT_SYMBOL(vmemmap); unsigned long __pte_frag_nr; diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 395c508eda..a47fe99bee 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -521,8 +521,10 @@ static ssize_t vcpudispatch_stats_write(struct file *file, const char __user *p, if (cmd) { rc = init_cpu_associativity(); - if (rc) + if (rc) { + destroy_cpu_associativity(); goto out; + } for_each_possible_cpu(cpu) { disp = per_cpu_ptr(&vcpu_disp_data, cpu); diff --git a/arch/s390/include/asm/extable.h b/arch/s390/include/asm/extable.h index f38da2bf29..d64b9e0473 100644 --- a/arch/s390/include/asm/extable.h +++ b/arch/s390/include/asm/extable.h @@ -71,8 +71,13 @@ static inline void swap_ex_entry_fixup(struct exception_table_entry *a, { a->fixup = b->fixup + delta; b->fixup = tmp.fixup - delta; - a->handler = b->handler + delta; - b->handler = tmp.handler - delta; + a->handler = b->handler; + if (a->handler) + a->handler += delta; + b->handler = tmp.handler; + if (b->handler) + b->handler -= delta; } +#define swap_ex_entry_fixup swap_ex_entry_fixup #endif diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 6e49e3c722..49d203aaf5 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2652,7 +2652,7 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, return 0; start = pmd_val(*pmd) & HPAGE_MASK; - end = start + HPAGE_SIZE - 1; + end = start + HPAGE_SIZE; __storage_key_init_range(start, end); set_bit(PG_arch_1, &page->flags); cond_resched(); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index c904101b16..8111583e4b 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -146,7 +146,7 @@ static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) } if (!test_and_set_bit(PG_arch_1, &page->flags)) - __storage_key_init_range(paddr, paddr + size - 1); + __storage_key_init_range(paddr, paddr + size); } void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d5ffeabc94..844608948b 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -571,7 +571,7 @@ struct flush_tlb_info { flush_tlb_mm_range((vma)->vm_mm, start, end, \ ((vma)->vm_flags & VM_HUGETLB) \ ? huge_page_shift(hstate_vma(vma)) \ - : PAGE_SHIFT, false) + : PAGE_SHIFT, true) extern void flush_tlb_all(void); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index dc6182eece..a37c2873fd 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -182,11 +182,11 @@ static void __meminit init_trampoline_pud(void) set_p4d(p4d_tramp, __p4d(_KERNPG_TABLE | __pa(pud_page_tramp))); - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); + trampoline_pgd_entry = + __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)); } else { - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); + trampoline_pgd_entry = + __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)); } } diff --git a/arch/x86/mm/pat/cpa-test.c b/arch/x86/mm/pat/cpa-test.c index facce271e8..0d24ab91cc 100644 --- a/arch/x86/mm/pat/cpa-test.c +++ b/arch/x86/mm/pat/cpa-test.c @@ -184,7 +184,7 @@ static int pageattr_test(void) break; case 1: - err = change_page_attr_set(addrs, len[1], PAGE_CPA_TEST, 1); + err = change_page_attr_set(addrs, len[i], PAGE_CPA_TEST, 1); break; case 2: diff --git a/drivers/base/node.c b/drivers/base/node.c index fb1d56f977..f7dafc501a 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -20,6 +20,7 @@ #include #include #include +#include static struct bus_type node_subsys = { .name = "node", @@ -584,64 +585,9 @@ static const struct attribute_group *node_dev_groups[] = { NULL }; -#ifdef CONFIG_HUGETLBFS -/* - * hugetlbfs per node attributes registration interface: - * When/if hugetlb[fs] subsystem initializes [sometime after this module], - * it will register its per node attributes for all online nodes with - * memory. It will also call register_hugetlbfs_with_node(), below, to - * register its attribute registration functions with this node driver. - * Once these hooks have been initialized, the node driver will call into - * the hugetlb module to [un]register attributes for hot-plugged nodes. - */ -static node_registration_func_t __hugetlb_register_node; -static node_registration_func_t __hugetlb_unregister_node; - -static inline bool hugetlb_register_node(struct node *node) -{ - if (__hugetlb_register_node && - node_state(node->dev.id, N_MEMORY)) { - __hugetlb_register_node(node); - return true; - } - return false; -} - -static inline void hugetlb_unregister_node(struct node *node) -{ - if (__hugetlb_unregister_node) - __hugetlb_unregister_node(node); -} - -void register_hugetlbfs_with_node(node_registration_func_t doregister, - node_registration_func_t unregister) -{ - __hugetlb_register_node = doregister; - __hugetlb_unregister_node = unregister; -} -#else -static inline void hugetlb_register_node(struct node *node) {} - -static inline void hugetlb_unregister_node(struct node *node) {} -#endif - static void node_device_release(struct device *dev) { - struct node *node = to_node(dev); - -#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS) - /* - * We schedule the work only when a memory section is - * onlined/offlined on this node. When we come here, - * all the memory on this node has been offlined, - * so we won't enqueue new work to this work. - * - * The work is using node->node_work, so we should - * flush work before freeing the memory. - */ - flush_work(&node->node_work); -#endif - kfree(node); + kfree(to_node(dev)); } /* @@ -660,13 +606,13 @@ static int register_node(struct node *node, int num) node->dev.groups = node_dev_groups; error = device_register(&node->dev); - if (error) + if (error) { put_device(&node->dev); - else { + } else { hugetlb_register_node(node); - compaction_register_node(node); } + return error; } @@ -679,8 +625,8 @@ static int register_node(struct node *node, int num) */ void unregister_node(struct node *node) { + hugetlb_unregister_node(node); compaction_unregister_node(node); - hugetlb_unregister_node(node); /* no-op, if memoryless node */ node_remove_accesses(node); node_remove_caches(node); device_unregister(&node->dev); @@ -904,83 +850,21 @@ void register_memory_blocks_under_node(int nid, unsigned long start_pfn, (void *)&nid, func); return; } - -#ifdef CONFIG_HUGETLBFS -/* - * Handle per node hstate attribute [un]registration on transistions - * to/from memoryless state. - */ -static void node_hugetlb_work(struct work_struct *work) -{ - struct node *node = container_of(work, struct node, node_work); - - /* - * We only get here when a node transitions to/from memoryless state. - * We can detect which transition occurred by examining whether the - * node has memory now. hugetlb_register_node() already check this - * so we try to register the attributes. If that fails, then the - * node has transitioned to memoryless, try to unregister the - * attributes. - */ - if (!hugetlb_register_node(node)) - hugetlb_unregister_node(node); -} - -static void init_node_hugetlb_work(int nid) -{ - INIT_WORK(&node_devices[nid]->node_work, node_hugetlb_work); -} - -static int node_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct memory_notify *mnb = arg; - int nid = mnb->status_change_nid; - - switch (action) { - case MEM_ONLINE: - case MEM_OFFLINE: - /* - * offload per node hstate [un]registration to a work thread - * when transitioning to/from memoryless state. - */ - if (nid != NUMA_NO_NODE) - schedule_work(&node_devices[nid]->node_work); - break; - - case MEM_GOING_ONLINE: - case MEM_GOING_OFFLINE: - case MEM_CANCEL_ONLINE: - case MEM_CANCEL_OFFLINE: - default: - break; - } - - return NOTIFY_OK; -} -#endif /* CONFIG_HUGETLBFS */ #endif /* CONFIG_MEMORY_HOTPLUG */ -#if !defined(CONFIG_MEMORY_HOTPLUG) || !defined(CONFIG_HUGETLBFS) -static inline int node_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - return NOTIFY_OK; -} - -static void init_node_hugetlb_work(int nid) { } - -#endif - int __register_one_node(int nid) { int error; int cpu; + struct node *node; - node_devices[nid] = kzalloc(sizeof(struct node), GFP_KERNEL); - if (!node_devices[nid]) + node = kzalloc(sizeof(struct node), GFP_KERNEL); + if (!node) return -ENOMEM; + INIT_LIST_HEAD(&node->access_list); + node_devices[nid] = node; + error = register_node(node_devices[nid], nid); /* link cpu under this node */ @@ -989,9 +873,6 @@ int __register_one_node(int nid) register_cpu_under_node(cpu, nid); } - INIT_LIST_HEAD(&node_devices[nid]->access_list); - /* initialize work queue for memory hot plug */ - init_node_hugetlb_work(nid); node_init_caches(nid); return error; @@ -1062,13 +943,8 @@ static const struct attribute_group *cpu_root_attr_groups[] = { NULL, }; -#define NODE_CALLBACK_PRI 2 /* lower than SLAB */ void __init node_dev_init(void) { - static struct notifier_block node_memory_callback_nb = { - .notifier_call = node_memory_callback, - .priority = NODE_CALLBACK_PRI, - }; int ret, i; BUILD_BUG_ON(ARRAY_SIZE(node_state_attr) != NR_NODE_STATES); @@ -1078,8 +954,6 @@ void __init node_dev_init(void) if (ret) panic("%s() failed to register subsystem: %d\n", __func__, ret); - register_hotmemory_notifier(&node_memory_callback_nb); - /* * Create all node devices, which will properly link the node * to applicable memory block devices and already created cpu devices. diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 3b29f4367d..8b0311b74e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -372,6 +372,10 @@ int mlx5_cmd_fast_teardown_hca(struct mlx5_core_dev *dev) do { if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) break; + if (pci_channel_offline(dev->pdev)) { + mlx5_core_err(dev, "PCI channel offline, stop waiting for NIC IFC\n"); + return -EACCES; + } cond_resched(); } while (!time_after(jiffies, end)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 871c32dda6..0235ba8b75 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -261,6 +261,10 @@ recover_from_sw_reset: do { if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) break; + if (pci_channel_offline(dev->pdev)) { + mlx5_core_err(dev, "PCI channel offline, stop waiting for NIC IFC\n"); + goto unlock; + } msleep(20); } while (!time_after(jiffies, end)); @@ -330,6 +334,10 @@ int mlx5_health_wait_pci_up(struct mlx5_core_dev *dev) mlx5_core_warn(dev, "device is being removed, stop waiting for PCI\n"); return -ENODEV; } + if (pci_channel_offline(dev->pdev)) { + mlx5_core_err(dev, "PCI channel offline, stop waiting for PCI\n"); + return -EACCES; + } msleep(100); } return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c index 6b774e0c27..d0b595ba61 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c @@ -74,6 +74,10 @@ int mlx5_vsc_gw_lock(struct mlx5_core_dev *dev) ret = -EBUSY; goto pci_unlock; } + if (pci_channel_offline(dev->pdev)) { + ret = -EACCES; + goto pci_unlock; + } /* Check if semaphore is already locked */ ret = vsc_read(dev, VSC_SEMAPHORE_OFFSET, &lock_val); diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index d2011e93f1..7f4ab8dbf5 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -1241,7 +1241,7 @@ lpfc_nvmet_defer_rcv(struct nvmet_fc_target_port *tgtport, struct lpfc_nvmet_tgtport *tgtp; struct lpfc_async_xchg_ctx *ctxp = container_of(rsp, struct lpfc_async_xchg_ctx, hdlrctx.fcp_req); - struct rqb_dmabuf *nvmebuf = ctxp->rqb_buffer; + struct rqb_dmabuf *nvmebuf; struct lpfc_hba *phba = ctxp->phba; unsigned long iflag; @@ -1249,13 +1249,18 @@ lpfc_nvmet_defer_rcv(struct nvmet_fc_target_port *tgtport, lpfc_nvmeio_data(phba, "NVMET DEFERRCV: xri x%x sz %d CPU %02x\n", ctxp->oxid, ctxp->size, raw_smp_processor_id()); + spin_lock_irqsave(&ctxp->ctxlock, iflag); + nvmebuf = ctxp->rqb_buffer; if (!nvmebuf) { + spin_unlock_irqrestore(&ctxp->ctxlock, iflag); lpfc_printf_log(phba, KERN_INFO, LOG_NVME_IOERR, "6425 Defer rcv: no buffer oxid x%x: " "flg %x ste %x\n", ctxp->oxid, ctxp->flag, ctxp->state); return; } + ctxp->rqb_buffer = NULL; + spin_unlock_irqrestore(&ctxp->ctxlock, iflag); tgtp = phba->targetport->private; if (tgtp) @@ -1263,9 +1268,6 @@ lpfc_nvmet_defer_rcv(struct nvmet_fc_target_port *tgtport, /* Free the nvmebuf since a new buffer already replaced it */ nvmebuf->hrq->rqbp->rqb_free_buffer(phba, nvmebuf); - spin_lock_irqsave(&ctxp->ctxlock, iflag); - ctxp->rqb_buffer = NULL; - spin_unlock_irqrestore(&ctxp->ctxlock, iflag); } static void diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 1199205060..6a36bae143 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -5650,7 +5650,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, if (rl->rl_status == DLM_LKSTS_CONVERT && middle_conversion(lkb)) { /* We may need to adjust grmode depending on other granted locks. */ - log_limit(ls, "%s %x middle convert gr %d rq %d remote %d %x", + log_rinfo(ls, "%s %x middle convert gr %d rq %d remote %d %x", __func__, lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode, lkb->lkb_nodeid, lkb->lkb_remid); rsb_set_flag(r, RSB_RECOVER_CONVERT); diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index 400cb8e948..461cd283f8 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -846,7 +846,7 @@ static void recover_conversion(struct dlm_rsb *r) */ if (((lkb->lkb_grmode == DLM_LOCK_PR) && (other_grmode == DLM_LOCK_CW)) || ((lkb->lkb_grmode == DLM_LOCK_CW) && (other_grmode == DLM_LOCK_PR))) { - log_limit(ls, "%s %x gr %d rq %d, remote %d %x, other_lkid %u, other gr %d, set gr=NL", + log_rinfo(ls, "%s %x gr %d rq %d, remote %d %x, other_lkid %u, other gr %d, set gr=NL", __func__, lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode, lkb->lkb_nodeid, lkb->lkb_remid, other_lkid, other_grmode); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 061ef959a2..e05f9c9f38 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -51,6 +51,10 @@ static int efivarfs_d_compare(const struct dentry *dentry, { int guid = len - EFI_VARIABLE_GUID_LEN; + /* Parallel lookups may produce a temporary invalid filename */ + if (guid <= 0) + return 1; + if (name->len != len) return 1; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 60fc70b3c2..61459dca30 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -608,6 +608,24 @@ out_free: kfree(isw); } +static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw, + struct list_head *list, int *nr) +{ + struct inode *inode; + + list_for_each_entry(inode, list, i_io_list) { + if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + continue; + + isw->inodes[*nr] = inode; + (*nr)++; + + if (*nr >= WB_MAX_INODES_PER_ISW - 1) + return true; + } + return false; +} + /** * cleanup_offline_cgwb - detach associated inodes * @wb: target wb @@ -620,7 +638,6 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) { struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; - struct inode *inode; int nr; bool restart = false; @@ -642,17 +659,17 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) nr = 0; spin_lock(&wb->list_lock); - list_for_each_entry(inode, &wb->b_attached, i_io_list) { - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) - continue; - - isw->inodes[nr++] = inode; - - if (nr >= WB_MAX_INODES_PER_ISW - 1) { - restart = true; - break; - } - } + /* + * In addition to the inodes that have completed writeback, also switch + * cgwbs for those inodes only with dirty timestamps. Otherwise, those + * inodes won't be written back for a long time when lazytime is + * enabled, and thus pinning the dying cgwbs. It won't break the + * bandwidth restrictions, as writeback of inode metadata is not + * accounted for. + */ + restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr); + if (!restart) + restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr); spin_unlock(&wb->list_lock); /* no attached inodes? bail out */ diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3e6b456742..ec52e808eb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -528,16 +528,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap && pte_none(*pte))) { - page = find_get_entry(vma->vm_file->f_mapping, + page = xa_load(&vma->vm_file->f_mapping->i_pages, linear_page_index(vma, addr)); - if (!page) - return; - if (xa_is_value(page)) mss->swap += PAGE_SIZE; - else - put_page(page); - return; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 2dac4b49d5..01759d5c71 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -596,6 +596,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); extern void pm_restrict_gfp_mask(void); extern void pm_restore_gfp_mask(void); +extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma); + #ifdef CONFIG_PM_SLEEP extern bool pm_suspended_storage(void); #else diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6605a0d41b..810e8ba974 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -15,6 +15,7 @@ struct ctl_table; struct user_struct; struct mmu_gather; +struct node; #ifndef is_hugepd typedef struct { unsigned long pd; } hugepd_t; @@ -584,6 +585,7 @@ struct huge_bootmem_page { struct hstate *hstate; }; +void wait_for_freed_hugetlb_pages(void); struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, @@ -843,6 +845,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, } #endif +#ifdef CONFIG_NUMA +void hugetlb_register_node(struct node *node); +void hugetlb_unregister_node(struct node *node); +#endif + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; @@ -851,6 +858,10 @@ static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) return NULL; } +static inline void wait_for_freed_hugetlb_pages(void) +{ +} + static inline struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) @@ -1000,6 +1011,14 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr pte_t *ptep, pte_t pte, unsigned long sz) { } + +static inline void hugetlb_register_node(struct node *node) +{ +} + +static inline void hugetlb_unregister_node(struct node *node) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2b9c6b1b71..ab77fb4b0a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -943,7 +943,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } -void mem_cgroup_handle_over_high(void); +void mem_cgroup_handle_over_high(gfp_t gfp_mask); unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); @@ -1403,7 +1403,7 @@ static inline void unlock_page_memcg(struct page *page) { } -static inline void mem_cgroup_handle_over_high(void) +static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { } @@ -1615,10 +1615,13 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, static inline void mem_cgroup_track_foreign_dirty(struct page *page, struct bdi_writeback *wb) { + struct mem_cgroup *memcg; + if (mem_cgroup_disabled()) return; - if (unlikely(&page_memcg(page)->css != wb->memcg_css)) + memcg = page_memcg(page); + if (unlikely(memcg && &memcg->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(page, wb); } diff --git a/include/linux/memregion.h b/include/linux/memregion.h index e11595256c..c04c4fd2e2 100644 --- a/include/linux/memregion.h +++ b/include/linux/memregion.h @@ -16,7 +16,7 @@ static inline int memregion_alloc(gfp_t gfp) { return -ENOMEM; } -void memregion_free(int id) +static inline void memregion_free(int id) { } #endif diff --git a/include/linux/node.h b/include/linux/node.h index 9ec680dd60..427a5975cf 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -2,15 +2,15 @@ /* * include/linux/node.h - generic node definition * - * This is mainly for topological representation. We define the - * basic 'struct node' here, which can be embedded in per-arch + * This is mainly for topological representation. We define the + * basic 'struct node' here, which can be embedded in per-arch * definitions of processors. * * Basic handling of the devices is done in drivers/base/node.c - * and system devices are handled in drivers/base/sys.c. + * and system devices are handled in drivers/base/sys.c. * * Nodes are exported via driverfs in the class/node/devices/ - * directory. + * directory. */ #ifndef _LINUX_NODE_H_ #define _LINUX_NODE_H_ @@ -18,7 +18,6 @@ #include #include #include -#include /** * struct node_hmem_attrs - heterogeneous memory performance attributes @@ -84,10 +83,6 @@ static inline void node_set_perf_attrs(unsigned int nid, struct node { struct device dev; struct list_head access_list; - -#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS) - struct work_struct node_work; -#endif #ifdef CONFIG_HMEM_REPORTING struct list_head cache_attrs; struct device *cache_dev; @@ -96,7 +91,6 @@ struct node { struct memory_block; extern struct node *node_devices[]; -typedef void (*node_registration_func_t)(struct node *); #if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA) void register_memory_blocks_under_node(int nid, unsigned long start_pfn, @@ -144,11 +138,6 @@ extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); extern int register_memory_node_under_compute_node(unsigned int mem_nid, unsigned int cpu_nid, unsigned access); - -#ifdef CONFIG_HUGETLBFS -extern void register_hugetlbfs_with_node(node_registration_func_t doregister, - node_registration_func_t unregister); -#endif #else static inline void node_dev_init(void) { @@ -176,11 +165,6 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk) { } - -static inline void register_hugetlbfs_with_node(node_registration_func_t reg, - node_registration_func_t unreg) -{ -} #endif #define to_node(device) container_of(device, struct node, dev) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d561363be0..e1fd4f5e96 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -243,6 +243,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, #define FGP_NOFS 0x00000010 #define FGP_NOWAIT 0x00000020 #define FGP_FOR_MMAP 0x00000040 +#define FGP_HEAD 0x00000080 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, int fgp_flags, gfp_t cache_gfp_mask); @@ -272,20 +273,39 @@ static inline struct page *find_get_page_flags(struct address_space *mapping, /** * find_lock_page - locate, pin and lock a pagecache page * @mapping: the address_space to search - * @offset: the page index + * @index: the page index * - * Looks up the page cache slot at @mapping & @offset. If there is a + * Looks up the page cache entry at @mapping & @index. If there is a * page cache page, it is returned locked and with an increased * refcount. * - * Otherwise, %NULL is returned. - * - * find_lock_page() may sleep. + * Context: May sleep. + * Return: A struct page or %NULL if there is no page in the cache for this + * index. */ static inline struct page *find_lock_page(struct address_space *mapping, - pgoff_t offset) + pgoff_t index) { - return pagecache_get_page(mapping, offset, FGP_LOCK, 0); + return pagecache_get_page(mapping, index, FGP_LOCK, 0); +} + +/** + * find_lock_head - Locate, pin and lock a pagecache page. + * @mapping: The address_space to search. + * @index: The page index. + * + * Looks up the page cache entry at @mapping & @index. If there is a + * page cache page, its head page is returned locked and with an increased + * refcount. + * + * Context: May sleep. + * Return: A struct page which is !PageTail, or %NULL if there is no page + * in the cache for this index. + */ +static inline struct page *find_lock_head(struct address_space *mapping, + pgoff_t index) +{ + return pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0); } /** @@ -336,18 +356,28 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } -static inline struct page *find_subpage(struct page *page, pgoff_t offset) +/* Does this page contain this index? */ +static inline bool thp_contains(struct page *head, pgoff_t index) { - if (PageHuge(page)) - return page; - - VM_BUG_ON_PAGE(PageTail(page), page); - - return page + (offset & (compound_nr(page) - 1)); + /* HugeTLBfs indexes the page cache in units of hpage_size */ + if (PageHuge(head)) + return head->index == index; + return page_index(head) == (index & ~(thp_nr_pages(head) - 1UL)); +} + +/* + * Given the page we found in the page cache, return the page corresponding + * to this index in the file + */ +static inline struct page *find_subpage(struct page *head, pgoff_t index) +{ + /* HugeTLBfs wants the head page regardless */ + if (PageHuge(head)) + return head; + + return head + (index & (thp_nr_pages(head) - 1)); } -struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); -struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices); diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 65e6a3fa97..ad1322e8f0 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -79,7 +79,12 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); extern int shmem_unuse(unsigned int type, bool frontswap, unsigned long *fs_pages_to_unuse); -extern bool shmem_huge_enabled(struct vm_area_struct *vma); +extern bool shmem_is_huge(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index); +static inline bool shmem_huge_enabled(struct vm_area_struct *vma) +{ + return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff); +} extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); @@ -87,9 +92,8 @@ extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, /* Flag allocation requirements to shmem_getpage */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ + SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ - SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */ - SGP_HUGE, /* like SGP_CACHE, huge pages preferred */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; diff --git a/include/linux/swap.h b/include/linux/swap.h index d0f44b3760..6ec6b42b5a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -423,6 +423,7 @@ extern void free_pages_and_swap_cache(struct page **, int); extern struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); +struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index); extern struct page *read_swap_cache_async(swp_entry_t, gfp_t, struct vm_area_struct *vma, unsigned long addr, bool do_poll); @@ -584,6 +585,12 @@ static inline struct page *lookup_swap_cache(swp_entry_t swp, return NULL; } +static inline +struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) +{ + return find_get_page(mapping, index); +} + static inline int add_to_swap(struct page *page) { return 0; diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 40b0b4c1bf..05b2d77d26 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -187,7 +187,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) if (unlikely(current->task_works)) task_work_run(); - mem_cgroup_handle_over_high(); + mem_cgroup_handle_over_high(GFP_KERNEL); blkcg_maybe_throttle_current(); } diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 42c2518227..372177e4fe 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(writeback_page_template, strscpy_pad(__entry->name, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); - __entry->ino = mapping ? mapping->host->i_ino : 0; + __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0; __entry->index = page->index; ), diff --git a/lib/radix-tree.c b/lib/radix-tree.c index bcbcbc3168..092a94de90 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1170,7 +1170,6 @@ static void set_iter_tags(struct radix_tree_iter *iter, void __rcu **radix_tree_iter_resume(void __rcu **slot, struct radix_tree_iter *iter) { - slot++; iter->index = __radix_tree_iter_add(iter, 1); iter->next_index = iter->index; iter->tags = 0; diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c index 7a0564d7cb..9384747d90 100644 --- a/lib/slub_kunit.c +++ b/lib/slub_kunit.c @@ -39,7 +39,7 @@ static void test_next_pointer(struct kunit *test) ptr_addr = (unsigned long *)(p + s->offset); tmp = *ptr_addr; - p[s->offset] = 0x12; + p[s->offset] = ~p[s->offset]; /* * Expecting three errors. diff --git a/mm/cma.c b/mm/cma.c index c22cb34d73..ca228a9418 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -490,7 +490,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, */ if (page) { for (i = 0; i < count; i++) - page_kasan_tag_reset(page + i); + page_kasan_tag_reset(nth_page(page, i)); } if (ret && !no_warn) { diff --git a/mm/compaction.c b/mm/compaction.c index a4aab90d8b..2e761ec7d5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1050,12 +1050,12 @@ isolate_success: /* * Avoid isolating too much unless this block is being - * rescanned (e.g. dirty/writeback pages, parallel allocation) + * fully scanned (e.g. dirty/writeback pages, parallel allocation) * or a lock is contended. For contention, isolate quickly to * potentially remove one source of contention. */ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX && - !cc->rescan && !cc->contended) { + !cc->finish_pageblock && !cc->contended) { ++low_pfn; break; } @@ -1117,14 +1117,14 @@ isolate_abort: } /* - * Updated the cached scanner pfn once the pageblock has been scanned + * Update the cached scanner pfn once the pageblock has been scanned. * Pages will either be migrated in which case there is no point * scanning in the near future or migration failed in which case the * failure reason may persist. The block is marked for skipping if * there were no pages isolated in the block or if the block is * rescanned twice in a row. */ - if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) { + if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) { if (valid_page && !skip_updated) set_pageblock_skip(valid_page); update_cached_migrate(cc, low_pfn); @@ -1710,6 +1710,13 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) if (cc->ignore_skip_hint) return pfn; + /* + * If the pageblock should be finished then do not select a different + * pageblock. + */ + if (cc->finish_pageblock) + return pfn; + /* * If the migrate_pfn is not at the start of a zone or the start * of a pageblock then assume this is a continuation of a previous @@ -2112,13 +2119,6 @@ static enum compact_result compact_finished(struct compact_control *cc) return ret; } -/* - * compaction_suitable: Is this suitable to run compaction on this zone now? - * Returns - * COMPACT_SKIPPED - If there are too few free pages for compaction - * COMPACT_SUCCESS - If the allocation would succeed without compaction - * COMPACT_CONTINUE - If compaction should run now - */ static enum compact_result __compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int highest_zoneidx, @@ -2162,6 +2162,13 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, return COMPACT_CONTINUE; } +/* + * compaction_suitable: Is this suitable to run compaction on this zone now? + * Returns + * COMPACT_SKIPPED - If there are too few free pages for compaction + * COMPACT_SUCCESS - If the allocation would succeed without compaction + * COMPACT_CONTINUE - If compaction should run now + */ enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int highest_zoneidx) @@ -2317,22 +2324,23 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) { int err; - unsigned long start_pfn = cc->migrate_pfn; + unsigned long iteration_start_pfn = cc->migrate_pfn; /* - * Avoid multiple rescans which can happen if a page cannot be - * isolated (dirty/writeback in async mode) or if the migrated - * pages are being allocated before the pageblock is cleared. - * The first rescan will capture the entire pageblock for - * migration. If it fails, it'll be marked skip and scanning - * will proceed as normal. + * Avoid multiple rescans of the same pageblock which can + * happen if a page cannot be isolated (dirty/writeback in + * async mode) or if the migrated pages are being allocated + * before the pageblock is cleared. The first rescan will + * capture the entire pageblock for migration. If it fails, + * it'll be marked skip and scanning will proceed as normal. */ - cc->rescan = false; + cc->finish_pageblock = false; if (pageblock_start_pfn(last_migrated_pfn) == - pageblock_start_pfn(start_pfn)) { - cc->rescan = true; + pageblock_start_pfn(iteration_start_pfn)) { + cc->finish_pageblock = true; } +rescan: switch (isolate_migratepages(cc)) { case ISOLATE_ABORT: ret = COMPACT_CONTENDED; @@ -2353,8 +2361,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) goto check_drain; case ISOLATE_SUCCESS: update_cached = false; - last_migrated_pfn = start_pfn; - ; + last_migrated_pfn = iteration_start_pfn; } err = migrate_pages(&cc->migratepages, compaction_alloc, @@ -2377,18 +2384,37 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) goto out; } /* - * We failed to migrate at least one page in the current - * order-aligned block, so skip the rest of it. + * If an ASYNC or SYNC_LIGHT fails to migrate a page + * within the current order-aligned block, scan the + * remainder of the pageblock. This will mark the + * pageblock "skip" to avoid rescanning in the near + * future. This will isolate more pages than necessary + * for the request but avoid loops due to + * fast_find_migrateblock revisiting blocks that were + * recently partially scanned. */ - if (cc->direct_compaction && - (cc->mode == MIGRATE_ASYNC)) { - cc->migrate_pfn = block_end_pfn( - cc->migrate_pfn - 1, cc->order); - /* Draining pcplists is useless in this case */ - last_migrated_pfn = 0; + if (cc->direct_compaction && !cc->finish_pageblock && + (cc->mode < MIGRATE_SYNC)) { + cc->finish_pageblock = true; + + /* + * Draining pcplists does not help THP if + * any page failed to migrate. Even after + * drain, the pageblock will not be free. + */ + if (cc->order == COMPACTION_HPAGE_ORDER) + last_migrated_pfn = 0; + + goto rescan; } } + /* Stop if a page has been captured */ + if (capc && capc->page) { + ret = COMPACT_SUCCESS; + break; + } + check_drain: /* * Has the migration scanner moved away from the previous @@ -2407,12 +2433,6 @@ check_drain: last_migrated_pfn = 0; } } - - /* Stop if a page has been captured */ - if (capc && capc->page) { - ret = COMPACT_SUCCESS; - break; - } } out: diff --git a/mm/filemap.c b/mm/filemap.c index bdef5f9ef4..fbdd1b7281 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1733,19 +1733,19 @@ EXPORT_SYMBOL(page_cache_prev_miss); /** * find_get_entry - find and get a page cache entry * @mapping: the address_space to search - * @offset: the page cache index + * @index: The page cache index. * * Looks up the page cache slot at @mapping & @offset. If there is a - * page cache page, it is returned with an increased refcount. + * page cache page, the head page is returned with an increased refcount. * * If the slot holds a shadow entry of a previously evicted page, or a * swap entry from shmem/tmpfs, it is returned. * - * Return: the found page or shadow entry, %NULL if nothing is found. + * Return: The head page or shadow entry, %NULL if nothing is found. */ -struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) +struct page *find_get_entry(struct address_space *mapping, pgoff_t index) { - XA_STATE(xas, &mapping->i_pages, offset); + XA_STATE(xas, &mapping->i_pages, index); struct page *page; rcu_read_lock(); @@ -1773,49 +1773,44 @@ repeat: put_page(page); goto repeat; } - page = find_subpage(page, offset); out: rcu_read_unlock(); return page; } -EXPORT_SYMBOL(find_get_entry); /** - * find_lock_entry - locate, pin and lock a page cache entry - * @mapping: the address_space to search - * @offset: the page cache index + * find_lock_entry - Locate and lock a page cache entry. + * @mapping: The address_space to search. + * @index: The page cache index. * - * Looks up the page cache slot at @mapping & @offset. If there is a - * page cache page, it is returned locked and with an increased - * refcount. + * Looks up the page at @mapping & @index. If there is a page in the + * cache, the head page is returned locked and with an increased refcount. * * If the slot holds a shadow entry of a previously evicted page, or a * swap entry from shmem/tmpfs, it is returned. * - * find_lock_entry() may sleep. - * - * Return: the found page or shadow entry, %NULL if nothing is found. + * Context: May sleep. + * Return: The head page or shadow entry, %NULL if nothing is found. */ -struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) +struct page *find_lock_entry(struct address_space *mapping, pgoff_t index) { struct page *page; repeat: - page = find_get_entry(mapping, offset); + page = find_get_entry(mapping, index); if (page && !xa_is_value(page)) { lock_page(page); /* Has the page been truncated? */ - if (unlikely(page_mapping(page) != mapping)) { + if (unlikely(page->mapping != mapping)) { unlock_page(page); put_page(page); goto repeat; } - VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); + VM_BUG_ON_PAGE(!thp_contains(page, index), page); } return page; } -EXPORT_SYMBOL(find_lock_entry); /** * pagecache_get_page - Find and get a reference to a page. @@ -1830,6 +1825,8 @@ EXPORT_SYMBOL(find_lock_entry); * * * %FGP_ACCESSED - The page will be marked accessed. * * %FGP_LOCK - The page is returned locked. + * * %FGP_HEAD - If the page is present and a THP, return the head page + * rather than the exact page specified by the index. * * %FGP_CREAT - If no page is present then a new page is allocated using * @gfp_mask and added to the page cache and the VM's LRU list. * The page is returned locked and with an increased refcount. @@ -1870,12 +1867,12 @@ repeat: } /* Has the page been truncated? */ - if (unlikely(compound_head(page)->mapping != mapping)) { + if (unlikely(page->mapping != mapping)) { unlock_page(page); put_page(page); goto repeat; } - VM_BUG_ON_PAGE(page->index != index, page); + VM_BUG_ON_PAGE(!thp_contains(page, index), page); } if (page && (fgp_flags & FGP_ACCESSED)) @@ -1885,6 +1882,8 @@ repeat: if (page_is_idle(page)) clear_page_idle(page); } + if (!(fgp_flags & FGP_HEAD)) + page = find_subpage(page, index); no_page: if (!page && (fgp_flags & FGP_CREAT)) { @@ -1945,6 +1944,11 @@ EXPORT_SYMBOL(pagecache_get_page); * Any shadow entries of evicted pages, or swap entries from * shmem/tmpfs, are included in the returned array. * + * If it finds a Transparent Huge Page, head or tail, find_get_entries() + * stops at that page: the caller is likely to have a better way to handle + * the compound page as a whole, and then skip its extent, than repeatedly + * calling find_get_entries() to return all its tails. + * * Return: the number of pages and shadow entries which were found. */ unsigned find_get_entries(struct address_space *mapping, @@ -1976,8 +1980,15 @@ unsigned find_get_entries(struct address_space *mapping, /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - page = find_subpage(page, xas.xa_index); + /* + * Terminate early on finding a THP, to allow the caller to + * handle it all at once; but continue if this is hugetlbfs. + */ + if (PageTransHuge(page) && !PageHuge(page)) { + page = find_subpage(page, xas.xa_index); + nr_entries = ret + 1; + } export: indices[ret] = xas.xa_index; entries[ret] = page; @@ -2042,7 +2053,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { - *start = page->index + 1; + *start = xas.xa_index + 1; goto out; } continue; @@ -2123,7 +2134,7 @@ retry: EXPORT_SYMBOL(find_get_pages_contig); /** - * find_get_pages_range_tag - find and return pages in given range matching @tag + * find_get_pages_range_tag - Find and return head pages matching @tag. * @mapping: the address_space to search * @index: the starting page index * @end: The final page index (inclusive) @@ -2131,8 +2142,9 @@ EXPORT_SYMBOL(find_get_pages_contig); * @nr_pages: the maximum number of pages * @pages: where the resulting pages are placed * - * Like find_get_pages, except we only return pages which are tagged with - * @tag. We update @index to index the next page for the traversal. + * Like find_get_pages(), except we only return head pages which are tagged + * with @tag. @index is updated to the index immediately after the last + * page we return, ready for the next iteration. * * Return: the number of pages which were found. */ @@ -2166,9 +2178,9 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = find_subpage(page, xas.xa_index); + pages[ret] = page; if (++ret == nr_pages) { - *index = page->index + 1; + *index = page->index + thp_nr_pages(page); goto out; } continue; diff --git a/mm/gup.c b/mm/gup.c index aa9e600b62..666ff4e5d4 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1921,8 +1921,8 @@ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) } while (start != end); mmap_read_unlock(mm); - if (size > (unsigned long)uaddr - start) - return size - ((unsigned long)uaddr - start); + if (size > start - (unsigned long)uaddr) + return size - (start - (unsigned long)uaddr); return 0; } EXPORT_SYMBOL(fault_in_safe_writeable); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5b679c61c0..3b1d9cc39d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -664,9 +664,9 @@ release: * available * never: never stall for any thp allocation */ -static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) +gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma) { - const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); @@ -749,7 +749,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) } return ret; } - gfp = alloc_hugepage_direct_gfpmask(vma); + gfp = vma_thp_gfp_mask(vma); page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f1385ebb3b..85e3888693 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -2002,6 +2003,14 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, return page; } +void wait_for_freed_hugetlb_pages(void) +{ + if (llist_empty(&hpage_freelist)) + return; + + flush_work(&free_hpage_work); +} + /* page migration callback function */ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) @@ -3061,24 +3070,8 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, return retval; } -static void __init hugetlb_sysfs_init(void) -{ - struct hstate *h; - int err; - - hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); - if (!hugepages_kobj) - return; - - for_each_hstate(h) { - err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, - hstate_kobjs, &hstate_attr_group); - if (err) - pr_err("HugeTLB: Unable to add hstate %s", h->name); - } -} - #ifdef CONFIG_NUMA +static bool hugetlb_sysfs_initialized __ro_after_init; /* * node_hstate/s - associate per node hstate attributes, via their kobjects, @@ -3134,7 +3127,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) * Unregister hstate attributes from a single node device. * No-op if no hstate attributes attached. */ -static void hugetlb_unregister_node(struct node *node) +void hugetlb_unregister_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; @@ -3159,12 +3152,15 @@ static void hugetlb_unregister_node(struct node *node) * Register hstate attributes for a single node device. * No-op if attributes already registered. */ -static void hugetlb_register_node(struct node *node) +void hugetlb_register_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; int err; + if (!hugetlb_sysfs_initialized) + return; + if (nhs->hugepages_kobj) return; /* already allocated */ @@ -3195,18 +3191,8 @@ static void __init hugetlb_register_all_nodes(void) { int nid; - for_each_node_state(nid, N_MEMORY) { - struct node *node = node_devices[nid]; - if (node->dev.id == nid) - hugetlb_register_node(node); - } - - /* - * Let the node device driver know we're here so it can - * [un]register hstate attributes on node hotplug. - */ - register_hugetlbfs_with_node(hugetlb_register_node, - hugetlb_unregister_node); + for_each_online_node(nid) + hugetlb_register_node(node_devices[nid]); } #else /* !CONFIG_NUMA */ @@ -3222,6 +3208,28 @@ static void hugetlb_register_all_nodes(void) { } #endif +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, + hstate_kobjs, &hstate_attr_group); + if (err) + pr_err("HugeTLB: Unable to add hstate %s", h->name); + } + +#ifdef CONFIG_NUMA + hugetlb_sysfs_initialized = true; +#endif + hugetlb_register_all_nodes(); +} + static int __init hugetlb_init(void) { int i; @@ -3271,7 +3279,6 @@ static int __init hugetlb_init(void) report_hugepages(); hugetlb_sysfs_init(); - hugetlb_register_all_nodes(); hugetlb_cgroup_file_init(); #ifdef CONFIG_SMP diff --git a/mm/internal.h b/mm/internal.h index f11a5be64e..fbf4f2fd74 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -89,6 +89,9 @@ static inline void ra_submit(struct file_ra_state *ra, ra->start, ra->size, ra->async_size); } +struct page *find_get_entry(struct address_space *mapping, pgoff_t index); +struct page *find_lock_entry(struct address_space *mapping, pgoff_t index); + /** * page_evictable - test whether a page is evictable * @page: the page to test @@ -272,7 +275,11 @@ struct compact_control { bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock or sched contention */ - bool rescan; /* Rescanning the same pageblock */ + bool finish_pageblock; /* Scan the remainder of a pageblock. Used + * when there are potentially transient + * isolation or migration failures to + * ensure forward progress. + */ bool alloc_contig; /* alloc_contig_range allocation */ }; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 8fe629e536..04c6ca4e34 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1730,7 +1730,7 @@ static void collapse_file(struct mm_struct *mm, xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_getpage(mapping->host, index, &page, - SGP_NOHUGE)) { + SGP_NOALLOC)) { result = SCAN_FAIL; goto xa_unlocked; } diff --git a/mm/madvise.c b/mm/madvise.c index 7598ec47d9..0d767763fd 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -244,25 +244,28 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct address_space *mapping) { - pgoff_t index; + XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); + pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1); struct page *page; - swp_entry_t swap; - for (; start < end; start += PAGE_SIZE) { - index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + rcu_read_lock(); + xas_for_each(&xas, page, end_index) { + swp_entry_t swap; - page = find_get_entry(mapping, index); - if (!xa_is_value(page)) { - if (page) - put_page(page); + if (!xa_is_value(page)) continue; - } + xas_pause(&xas); + rcu_read_unlock(); + swap = radix_to_swp_entry(page); page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, NULL, 0, false); if (page) put_page(page); + + rcu_read_lock(); } + rcu_read_unlock(); lru_add_drain(); /* Push any new pages onto the LRU now */ } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d2de9d6764..6e2a077af4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2609,10 +2609,11 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, } /* - * Scheduled by try_charge() to be executed from the userland return path - * and reclaims memory over the high limit. + * Reclaims memory over the high limit. Called directly from + * try_charge() (context permitting), as well as from the userland + * return path where reclaim is always able to block. */ -void mem_cgroup_handle_over_high(void) +void mem_cgroup_handle_over_high(gfp_t gfp_mask) { unsigned long penalty_jiffies; unsigned long pflags; @@ -2629,6 +2630,17 @@ void mem_cgroup_handle_over_high(void) current->memcg_nr_pages_over_high = 0; retry_reclaim: + /* + * Bail if the task is already exiting. Unlike memory.max, + * memory.high enforcement isn't as strict, and there is no + * OOM killer involved, which means the excess could already + * be much bigger (and still growing) than it could for + * memory.max; the dying task could get stuck in fruitless + * reclaim for a long time, which isn't desirable. + */ + if (task_is_dying()) + goto out; + /* * The allocating task should reclaim at least the batch size, but for * subsequent retries we only want to do what's necessary to prevent oom @@ -2640,7 +2652,7 @@ retry_reclaim: */ nr_reclaimed = reclaim_high(memcg, in_retry ? SWAP_CLUSTER_MAX : nr_pages, - GFP_KERNEL); + gfp_mask); /* * memory.high is breached and reclaim is unable to keep up. Throttle @@ -2679,6 +2691,9 @@ retry_reclaim: } /* + * Reclaim didn't manage to push usage below the limit, slow + * this allocating task down. + * * If we exit early, we're guaranteed to die (since * schedule_timeout_killable sets TASK_KILLABLE). This means we don't * need to account for any ill-begotten jiffies to pay them off later. @@ -2864,11 +2879,17 @@ done_restock: } } while ((memcg = parent_mem_cgroup(memcg))); + /* + * Reclaim is set up above to be called from the userland + * return path. But also attempt synchronous reclaim to avoid + * excessive overrun while the task is still inside the + * kernel. If this is successful, the return path will see it + * when it rechecks the overage and simply bail out. + */ if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && !(current->flags & PF_MEMALLOC) && - gfpflags_allow_blocking(gfp_mask)) { - mem_cgroup_handle_over_high(); - } + gfpflags_allow_blocking(gfp_mask)) + mem_cgroup_handle_over_high(gfp_mask); return 0; } @@ -4677,7 +4698,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats_delayed(); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); @@ -5548,7 +5569,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) return -ENOMEM; } - if (unlikely(mem_cgroup_is_root(memcg))) + if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled()) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); @@ -5850,35 +5871,15 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, static struct page *mc_handle_file_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, swp_entry_t *entry) { - struct page *page = NULL; - struct address_space *mapping; - pgoff_t pgoff; - if (!vma->vm_file) /* anonymous vma */ return NULL; if (!(mc.flags & MOVE_FILE)) return NULL; - mapping = vma->vm_file->f_mapping; - pgoff = linear_page_index(vma, addr); - /* page is moved even if it's not RSS of this task(page-faulted). */ -#ifdef CONFIG_SWAP /* shmem/tmpfs may report page out on swap: account for that too. */ - if (shmem_mapping(mapping)) { - page = find_get_entry(mapping, pgoff); - if (xa_is_value(page)) { - swp_entry_t swp = radix_to_swp_entry(page); - *entry = swp; - page = find_get_page(swap_address_space(swp), - swp_offset(swp)); - } - } else - page = find_get_page(mapping, pgoff); -#else - page = find_get_page(mapping, pgoff); -#endif - return page; + return find_get_incore_page(vma->vm_file->f_mapping, + linear_page_index(vma, addr)); } /** @@ -6774,6 +6775,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, return err; while (nr_reclaimed < nr_to_reclaim) { + /* Will converge on zero, but reclaim enforces a minimum */ + unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; unsigned long reclaimed; if (signal_pending(current)) @@ -6788,8 +6791,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, lru_add_drain_all(); reclaimed = try_to_free_mem_cgroup_pages(memcg, - min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX), - GFP_KERNEL, true); + batch_size, GFP_KERNEL, true); if (!reclaimed && !nr_retries--) return -EAGAIN; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b17f0e2344..5f584fb174 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1973,16 +1973,6 @@ static int soft_offline_in_use_page(struct page *page) return __soft_offline_page(page); } -static int soft_offline_free_page(struct page *page) -{ - int rc = 0; - - if (!page_handle_poison(page, true, false)) - rc = -EBUSY; - - return rc; -} - static void put_ref_page(struct page *page) { if (page) @@ -2045,10 +2035,13 @@ retry: if (ret > 0) { ret = soft_offline_in_use_page(page); } else if (ret == 0) { - if (soft_offline_free_page(page) && try_again) { - try_again = false; - flags &= ~MF_COUNT_INCREASED; - goto retry; + if (!page_handle_poison(page, true, false)) { + if (try_again) { + try_again = false; + flags &= ~MF_COUNT_INCREASED; + goto retry; + } + ret = -EBUSY; } } else if (ret == -EIO) { pr_info("%s: %#lx: unknown page type: %lx (%pGp)\n", diff --git a/mm/memory.c b/mm/memory.c index a628697ff6..5052fbb5c7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2235,11 +2235,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, if (fn) { do { if (create || !pte_none(*pte)) { - err = fn(pte++, addr, data); + err = fn(pte, addr, data); if (err) break; } - } while (addr += PAGE_SIZE, addr != end); + } while (pte++, addr += PAGE_SIZE, addr != end); } arch_leave_lazy_mmu_mode(); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 62e469ace6..b1f7577b0b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1290,7 +1290,7 @@ static int scan_movable_pages(unsigned long start, unsigned long end, */ if (HPageMigratable(head)) goto found; - skip = compound_nr(head) - (page - head); + skip = compound_nr(head) - (pfn - page_to_pfn(head)); pfn += skip - 1; } return -ENOENT; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 66710b7a52..dac62c5ff9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2986,8 +2986,9 @@ out: * @pol: pointer to mempolicy to be formatted * * Convert @pol into a string. If @buffer is too short, truncate the string. - * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the - * longest flag, "relative", and to display at least a few node ids. + * Recommend a @maxlen of at least 51 for the longest mode, "weighted + * interleave", plus the longest flag flags, "relative|balancing", and to + * display at least a few node ids. */ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { @@ -2996,7 +2997,10 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; - if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { + if (pol && + pol != &default_policy && + !(pol >= &preferred_node_policy[0] && + pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) { mode = pol->mode; flags = pol->flags; } @@ -3023,12 +3027,18 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += snprintf(p, buffer + maxlen - p, "="); /* - * Currently, the only defined flags are mutually exclusive + * Static and relative are mutually exclusive. */ if (flags & MPOL_F_STATIC_NODES) p += snprintf(p, buffer + maxlen - p, "static"); else if (flags & MPOL_F_RELATIVE_NODES) p += snprintf(p, buffer + maxlen - p, "relative"); + + if (flags & MPOL_F_NUMA_BALANCING) { + if (!is_power_of_2(flags & MPOL_MODE_FLAGS)) + p += snprintf(p, buffer + maxlen - p, "|"); + p += snprintf(p, buffer + maxlen - p, "balancing"); + } } if (!nodes_empty(nodes)) diff --git a/mm/mincore.c b/mm/mincore.c index 97973ea98d..85f84aa489 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -48,7 +48,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, * and is up to date; i.e. that no page-in operation would be required * at this time if an application were to map and access this page. */ -static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) +static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) { unsigned char present = 0; struct page *page; @@ -59,31 +59,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ -#ifdef CONFIG_SWAP - if (shmem_mapping(mapping)) { - page = find_get_entry(mapping, pgoff); - /* - * shmem/tmpfs may return swap: account for swapcache - * page too. - */ - if (xa_is_value(page)) { - swp_entry_t swp = radix_to_swp_entry(page); - struct swap_info_struct *si; - - /* Prevent swap device to being swapoff under us */ - si = get_swap_device(swp); - if (si) { - page = find_get_page(swap_address_space(swp), - swp_offset(swp)); - put_swap_device(si); - } else - page = NULL; - } - } else - page = find_get_page(mapping, pgoff); -#else - page = find_get_page(mapping, pgoff); -#endif + page = find_get_incore_page(mapping, index); if (page) { present = PageUptodate(page); put_page(page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2c94a3719f..bcdc02a714 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2132,6 +2132,14 @@ void __init page_alloc_init_late(void) /* Block until all are initialised */ wait_for_completion(&pgdat_init_all_done_comp); + /* + * The number of managed pages has changed due to the initialisation + * so the pcpu batch and high limits needs to be updated or the limits + * will be artificially small. + */ + for_each_populated_zone(zone) + zone_pcp_update(zone); + /* * We initialized the rest of the deferred pages. Permanently disable * on-demand struct page initialization. @@ -8892,7 +8900,6 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages) } EXPORT_SYMBOL(free_contig_range); -#ifdef CONFIG_MEMORY_HOTPLUG /* * The zone indicated has a new number of managed_pages; batch sizes and percpu * page high values need to be recalculated. @@ -8903,7 +8910,6 @@ void __meminit zone_pcp_update(struct zone *zone) zone_set_pageset_high_and_batch(zone); mutex_unlock(&pcp_batch_high_lock); } -#endif /* * Effectively disable pcplists for the zone by setting the high limit to 0 @@ -9034,6 +9040,7 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, next_page = page; current_buddy = page + size; } + page = next_page; if (set_page_guard(zone, current_buddy, high, migratetype)) continue; @@ -9041,7 +9048,6 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, if (current_buddy != target) { add_to_free_list(current_buddy, zone, high, migratetype); set_buddy_order(current_buddy, high); - page = next_page; } } } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 037eb6efb2..7ce499c23d 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -282,6 +282,16 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, struct page *page; struct zone *zone; + /* + * Due to the deferred freeing of hugetlb folios, the hugepage folios may + * not immediately release to the buddy system. This can cause PageBuddy() + * to fail in __test_page_isolated_in_pageblock(). To ensure that the + * hugetlb folios are properly released back to the buddy system, we + * invoke the wait_for_freed_hugetlb_folios() function to wait for the + * release to complete. + */ + wait_for_freed_hugetlb_pages(); + /* * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages * are not aligned to pageblock_nr_pages. diff --git a/mm/page_reporting.c b/mm/page_reporting.c index c3ab32b0b0..b034ef7a24 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -319,7 +319,8 @@ int page_reporting_register(struct page_reporting_dev_info *prdev) mutex_lock(&page_reporting_mutex); /* nothing to do if already in use */ - if (rcu_access_pointer(pr_dev_info)) { + if (rcu_dereference_protected(pr_dev_info, + lockdep_is_held(&page_reporting_mutex))) { err = -EBUSY; goto err_out; } @@ -350,7 +351,8 @@ void page_reporting_unregister(struct page_reporting_dev_info *prdev) { mutex_lock(&page_reporting_mutex); - if (rcu_access_pointer(pr_dev_info) == prdev) { + if (prdev == rcu_dereference_protected(pr_dev_info, + lockdep_is_held(&page_reporting_mutex))) { /* Disable page reporting notification */ RCU_INIT_POINTER(pr_dev_info, NULL); synchronize_rcu(); diff --git a/mm/percpu.c b/mm/percpu.c index a95c439fe1..eb53d3e663 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1741,7 +1741,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, gfp = current_gfp_context(gfp); /* whitelisted flags that can be passed to the backing allocators */ pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); - is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; + is_atomic = !gfpflags_allow_blocking(gfp); do_warn = !(gfp & __GFP_NOWARN); /* @@ -2243,7 +2243,12 @@ static void pcpu_balance_workfn(struct work_struct *work) * to grow other chunks. This then gives pcpu_reclaim_populated() time * to move fully free chunks to the active list to be freed if * appropriate. + * + * Enforce GFP_NOIO allocations because we have pcpu_alloc users + * constrained to GFP_NOIO/NOFS contexts and they could form lock + * dependency through pcpu_alloc_mutex */ + unsigned int flags = memalloc_noio_save(); mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); @@ -2254,6 +2259,7 @@ static void pcpu_balance_workfn(struct work_struct *work) spin_unlock_irq(&pcpu_lock); mutex_unlock(&pcpu_alloc_mutex); + memalloc_noio_restore(flags); } /** diff --git a/mm/readahead.c b/mm/readahead.c index c0fab7f8e3..9834ae0128 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -673,7 +673,8 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count) */ ret = -EINVAL; if (!f.file->f_mapping || !f.file->f_mapping->a_ops || - !S_ISREG(file_inode(f.file)->i_mode)) + (!S_ISREG(file_inode(f.file)->i_mode) && + !S_ISBLK(file_inode(f.file)->i_mode))) goto out; ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED); diff --git a/mm/shmem.c b/mm/shmem.c index 6663310c30..e25df59f85 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -413,7 +413,38 @@ static bool shmem_confirm_swap(struct address_space *mapping, #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ -static int shmem_huge __read_mostly; +static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; + +bool shmem_is_huge(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index) +{ + loff_t i_size; + + if (shmem_huge == SHMEM_HUGE_DENY) + return false; + if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) + return false; + if (shmem_huge == SHMEM_HUGE_FORCE) + return true; + + switch (SHMEM_SB(inode->i_sb)->huge) { + case SHMEM_HUGE_ALWAYS: + return true; + case SHMEM_HUGE_WITHIN_SIZE: + index = round_up(index, HPAGE_PMD_NR); + i_size = round_up(i_size_read(inode), PAGE_SIZE); + if (i_size >= HPAGE_PMD_SIZE && (i_size >> PAGE_SHIFT) >= index) + return true; + /* fall through */ + case SHMEM_HUGE_ADVISE: + if (vma && (vma->vm_flags & VM_HUGEPAGE)) + return true; + fallthrough; + default: + return false; + } +} #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static int shmem_parse_huge(const char *str) @@ -587,6 +618,12 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY +bool shmem_is_huge(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index) +{ + return false; +} + static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_split) { @@ -594,15 +631,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) -{ - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) && - shmem_huge != SHMEM_HUGE_DENY) - return true; - return false; -} - /* * Like add_to_page_cache_locked, but error if expected item has gone. */ @@ -786,7 +814,6 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma) void shmem_unlock_mapping(struct address_space *mapping) { struct pagevec pvec; - pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index = 0; pagevec_init(&pvec); @@ -794,22 +821,40 @@ void shmem_unlock_mapping(struct address_space *mapping) * Minor point, but we might as well stop if someone else SHM_LOCKs it. */ while (!mapping_unevictable(mapping)) { - /* - * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it - * has finished, if it hits a row of PAGEVEC_SIZE swap entries. - */ - pvec.nr = find_get_entries(mapping, index, - PAGEVEC_SIZE, pvec.pages, indices); - if (!pvec.nr) + if (!pagevec_lookup(&pvec, mapping, &index)) break; - index = indices[pvec.nr - 1] + 1; - pagevec_remove_exceptionals(&pvec); check_move_unevictable_pages(&pvec); pagevec_release(&pvec); cond_resched(); } } +/* + * Check whether a hole-punch or truncation needs to split a huge page, + * returning true if no split was required, or the split has been successful. + * + * Eviction (or truncation to 0 size) should never need to split a huge page; + * but in rare cases might do so, if shmem_undo_range() failed to trylock on + * head, and then succeeded to trylock on tail. + * + * A split can only succeed when there are no additional references on the + * huge page: so the split below relies upon find_get_entries() having stopped + * when it found a subpage of the huge page, without getting further references. + */ +static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end) +{ + if (!PageTransCompound(page)) + return true; + + /* Just proceed to delete a huge page wholly within the range punched */ + if (PageHead(page) && + page->index >= start && page->index + HPAGE_PMD_NR <= end) + return true; + + /* Try to split huge page, so we can truly punch the hole or truncate */ + return split_huge_page(page) >= 0; +} + /* * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. @@ -863,31 +908,11 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (!trylock_page(page)) continue; - if (PageTransTail(page)) { - /* Middle of THP: zero out the page */ - clear_highpage(page); - unlock_page(page); - continue; - } else if (PageTransHuge(page)) { - if (index == round_down(end, HPAGE_PMD_NR)) { - /* - * Range ends in the middle of THP: - * zero out the page - */ - clear_highpage(page); - unlock_page(page); - continue; - } - index += HPAGE_PMD_NR - 1; - i += HPAGE_PMD_NR - 1; - } - - if (!unfalloc || !PageUptodate(page)) { - VM_BUG_ON_PAGE(PageTail(page), page); - if (page_mapping(page) == mapping) { - VM_BUG_ON_PAGE(PageWriteback(page), page); + if ((!unfalloc || !PageUptodate(page)) && + page_mapping(page) == mapping) { + VM_BUG_ON_PAGE(PageWriteback(page), page); + if (shmem_punch_compound(page, start, end)) truncate_inode_page(mapping, page); - } } unlock_page(page); } @@ -961,43 +986,25 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, lock_page(page); - if (PageTransTail(page)) { - /* Middle of THP: zero out the page */ - clear_highpage(page); - unlock_page(page); - /* - * Partial thp truncate due 'start' in middle - * of THP: don't need to look on these pages - * again on !pvec.nr restart. - */ - if (index != round_down(end, HPAGE_PMD_NR)) - start++; - continue; - } else if (PageTransHuge(page)) { - if (index == round_down(end, HPAGE_PMD_NR)) { - /* - * Range ends in the middle of THP: - * zero out the page - */ - clear_highpage(page); - unlock_page(page); - continue; - } - index += HPAGE_PMD_NR - 1; - i += HPAGE_PMD_NR - 1; - } - if (!unfalloc || !PageUptodate(page)) { - VM_BUG_ON_PAGE(PageTail(page), page); - if (page_mapping(page) == mapping) { - VM_BUG_ON_PAGE(PageWriteback(page), page); - truncate_inode_page(mapping, page); - } else { + if (page_mapping(page) != mapping) { /* Page was replaced by swap: retry */ unlock_page(page); index--; break; } + VM_BUG_ON_PAGE(PageWriteback(page), page); + if (shmem_punch_compound(page, start, end)) + truncate_inode_page(mapping, page); + else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + /* Wipe the page and don't get stuck */ + clear_highpage(page); + flush_dcache_page(page); + set_page_dirty(page); + if (index < + round_up(start, HPAGE_PMD_NR)) + start = index + 1; + } } unlock_page(page); } @@ -1024,7 +1031,6 @@ static int shmem_getattr(const struct path *path, struct kstat *stat, { struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb); if (info->alloced - info->swapped != inode->i_mapping->nrpages) { spin_lock_irq(&info->lock); @@ -1033,7 +1039,7 @@ static int shmem_getattr(const struct path *path, struct kstat *stat, } generic_fillattr(inode, stat); - if (is_huge_enabled(sb_info)) + if (shmem_is_huge(NULL, inode, 0)) stat->blksize = HPAGE_PMD_SIZE; return 0; @@ -1043,7 +1049,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int error; error = setattr_prepare(dentry, attr); @@ -1079,24 +1084,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); - - /* - * Part of the huge page can be beyond i_size: subject - * to shrink under memory pressure. - */ - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - spin_lock(&sbinfo->shrinklist_lock); - /* - * _careful to defend against unlocked access to - * ->shrink_list in shmem_unused_huge_shrink() - */ - if (list_empty_careful(&info->shrinklist)) { - list_add_tail(&info->shrinklist, - &sbinfo->shrinklist); - sbinfo->shrinklist_len++; - } - spin_unlock(&sbinfo->shrinklist_lock); - } } } @@ -1281,6 +1268,7 @@ int shmem_unuse(unsigned int type, bool frontswap, return 0; mutex_lock(&shmem_swaplist_mutex); +start_over: list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); @@ -1300,13 +1288,15 @@ int shmem_unuse(unsigned int type, bool frontswap, cond_resched(); mutex_lock(&shmem_swaplist_mutex); - next = list_next_entry(info, swaplist); - if (!info->swapped) - list_del_init(&info->swaplist); if (atomic_dec_and_test(&info->stop_eviction)) wake_up_var(&info->stop_eviction); if (error) break; + if (list_empty(&info->swaplist)) + goto start_over; + next = list_next_entry(info, swaplist); + if (!info->swapped) + list_del_init(&info->swaplist); } mutex_unlock(&shmem_swaplist_mutex); @@ -1324,7 +1314,19 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) swp_entry_t swap; pgoff_t index; - VM_BUG_ON_PAGE(PageCompound(page), page); + /* + * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or + * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, + * and its shmem_writeback() needs them to be split when swapping. + */ + if (PageTransCompound(page)) { + /* Ensure the subpages are still dirty */ + SetPageDirty(page); + if (split_huge_page(page) < 0) + goto redirty; + ClearPageDirty(page); + } + BUG_ON(!PageLocked(page)); mapping = page->mapping; index = page->index; @@ -1492,6 +1494,30 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, return page; } +/* + * Make sure huge_gfp is always more limited than limit_gfp. + * Some of the flags set permissions, while others set limitations. + */ +static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) +{ + gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; + gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; + gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; + gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); + + /* Allow allocations only from the originally specified zones. */ + result |= zoneflags; + + /* + * Minimize the result gfp by taking the union with the deny flags, + * and the intersection of the allow flags. + */ + result |= (limit_gfp & denyflags); + result |= (huge_gfp & limit_gfp) & allowflags; + + return result; +} + static struct page *shmem_alloc_hugepage(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { @@ -1506,8 +1532,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, return NULL; shmem_pseudo_vma_init(&pvma, info, hindex); - page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), + true); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); @@ -1755,16 +1781,14 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; struct page *page; - enum sgp_type sgp_huge = sgp; pgoff_t hindex = index; + gfp_t huge_gfp; int error; int once = 0; int alloced = 0; if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; - if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) - sgp = SGP_CACHE; repeat: if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { @@ -1785,25 +1809,31 @@ repeat: return error; } - if (page && sgp == SGP_WRITE) - mark_page_accessed(page); - - /* fallocated page? */ - if (page && !PageUptodate(page)) { + if (page) { + hindex = page->index; + if (sgp == SGP_WRITE) + mark_page_accessed(page); + if (PageUptodate(page)) + goto out; + /* fallocated page */ if (sgp != SGP_READ) goto clear; unlock_page(page); put_page(page); - page = NULL; - } - if (page || sgp == SGP_READ) { - *pagep = page; - return 0; } /* - * Fast cache lookup did not find it: - * bring it back from swap or allocate. + * SGP_READ: succeed on hole, with NULL page, letting caller zero. + * SGP_NOALLOC: fail on hole, with NULL page, letting caller fail. + */ + *pagep = NULL; + if (sgp == SGP_READ) + return 0; + if (sgp == SGP_NOALLOC) + return -ENOENT; + + /* + * Fast cache lookup and swap lookup did not find it: allocate. */ if (vma && userfaultfd_missing(vma)) { @@ -1811,34 +1841,15 @@ repeat: return 0; } - /* shmem_symlink() */ - if (mapping->a_ops != &shmem_aops) + /* Never use a huge page for shmem_symlink() */ + if (S_ISLNK(inode->i_mode)) goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) + if (!shmem_is_huge(vma, inode, index)) goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_FORCE) - goto alloc_huge; - switch (sbinfo->huge) { - loff_t i_size; - pgoff_t off; - case SHMEM_HUGE_NEVER: - goto alloc_nohuge; - case SHMEM_HUGE_WITHIN_SIZE: - off = round_up(index, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) - goto alloc_huge; - /* fallthrough */ - case SHMEM_HUGE_ADVISE: - if (sgp_huge == SGP_HUGE) - goto alloc_huge; - /* TODO: implement fadvise() hints */ - goto alloc_nohuge; - } -alloc_huge: - page = shmem_alloc_and_acct_page(gfp, inode, index, true); + huge_gfp = vma_thp_gfp_mask(vma); + huge_gfp = limit_gfp_mask(huge_gfp, gfp); + page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true); if (IS_ERR(page)) { alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, @@ -1921,14 +1932,13 @@ clear: * it now, lest undo on failure cancel our earlier guarantee. */ if (sgp != SGP_WRITE && !PageUptodate(page)) { - struct page *head = compound_head(page); int i; - for (i = 0; i < compound_nr(head); i++) { - clear_highpage(head + i); - flush_dcache_page(head + i); + for (i = 0; i < compound_nr(page); i++) { + clear_highpage(page + i); + flush_dcache_page(page + i); } - SetPageUptodate(head); + SetPageUptodate(page); } /* Perhaps the file has been truncated since we checked */ @@ -1944,6 +1954,7 @@ clear: error = -EINVAL; goto unlock; } +out: *pagep = page + index - hindex; return 0; @@ -1991,7 +2002,6 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); - enum sgp_type sgp; int err; vm_fault_t ret = VM_FAULT_LOCKED; @@ -2054,15 +2064,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) spin_unlock(&inode->i_lock); } - sgp = SGP_CACHE; - - if ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - sgp = SGP_NOHUGE; - else if (vma->vm_flags & VM_HUGEPAGE) - sgp = SGP_HUGE; - - err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, + err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, gfp, vma, vmf, &ret); if (err) return vmf_error(err); @@ -3138,12 +3140,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); - if (error) { - if (error != -EOPNOTSUPP) { - iput(inode); - return error; - } - error = 0; + if (error && error != -EOPNOTSUPP) { + iput(inode); + return error; } inode->i_size = len-1; @@ -3441,6 +3440,8 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) unsigned long long size; char *rest; int opt; + kuid_t kuid; + kgid_t kgid; opt = fs_parse(fc, shmem_fs_parameters, param, &result); if (opt < 0) @@ -3476,14 +3477,32 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) ctx->mode = result.uint_32 & 07777; break; case Opt_uid: - ctx->uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(ctx->uid)) + kuid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(kuid)) goto bad_value; + + /* + * The requested uid must be representable in the + * filesystem's idmapping. + */ + if (!kuid_has_mapping(fc->user_ns, kuid)) + goto bad_value; + + ctx->uid = kuid; break; case Opt_gid: - ctx->gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(ctx->gid)) + kgid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(kgid)) goto bad_value; + + /* + * The requested gid must be representable in the + * filesystem's idmapping. + */ + if (!kgid_has_mapping(fc->user_ns, kgid)) + goto bad_value; + + ctx->gid = kgid; break; case Opt_huge: ctx->huge = result.uint_32; @@ -3953,7 +3972,7 @@ int __init shmem_init(void) if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else - shmem_huge = 0; /* just in case it was patched */ + shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ #endif return 0; @@ -4019,43 +4038,6 @@ struct kobj_attribute shmem_enabled_attr = __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -bool shmem_huge_enabled(struct vm_area_struct *vma) -{ - struct inode *inode = file_inode(vma->vm_file); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - loff_t i_size; - pgoff_t off; - - if ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return false; - if (shmem_huge == SHMEM_HUGE_FORCE) - return true; - if (shmem_huge == SHMEM_HUGE_DENY) - return false; - switch (sbinfo->huge) { - case SHMEM_HUGE_NEVER: - return false; - case SHMEM_HUGE_ALWAYS: - return true; - case SHMEM_HUGE_WITHIN_SIZE: - off = round_up(vma->vm_pgoff, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) - return true; - /* fall through */ - case SHMEM_HUGE_ADVISE: - /* TODO: implement fadvise() hints */ - return (vma->vm_flags & VM_HUGEPAGE); - default: - VM_BUG_ON(1); - return false; - } -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - #else /* !CONFIG_SHMEM */ /* diff --git a/mm/slab_common.c b/mm/slab_common.c index 34c5bef3ce..0bf0bf6029 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -748,21 +748,23 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) size_t kmalloc_size_roundup(size_t size) { - struct kmem_cache *c; + if (size && size <= KMALLOC_MAX_CACHE_SIZE) { + /* + * The flags don't matter since size_index is common to all. + * Neither does the caller for just getting ->object_size. + */ + return kmalloc_slab(size, GFP_KERNEL)->object_size; + } - /* Short-circuit the 0 size case. */ - if (unlikely(size == 0)) - return 0; - /* Short-circuit saturated "too-large" case. */ - if (unlikely(size == SIZE_MAX)) - return SIZE_MAX; /* Above the smaller buckets, size is a multiple of page size. */ - if (size > KMALLOC_MAX_CACHE_SIZE) + if (size && size <= KMALLOC_MAX_SIZE) return PAGE_SIZE << get_order(size); - /* The flags don't matter since size_index is common to all. */ - c = kmalloc_slab(size, GFP_KERNEL); - return c ? c->object_size : 0; + /* + * Return 'size' for 0 - kmalloc() returns ZERO_SIZE_PTR + * and very large size - kmalloc() may fail. + */ + return size; } EXPORT_SYMBOL(kmalloc_size_roundup); diff --git a/mm/swap.c b/mm/swap.c index bf459c2683..81ec55308f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -969,6 +969,10 @@ void __pagevec_lru_add(struct pagevec *pvec) * ascending indexes. There may be holes in the indices due to * not-present entries. * + * Only one subpage of a Transparent Huge Page is returned in one call: + * allowing truncate_inode_pages_range() to evict the whole THP without + * cycling through a pagevec of extra references. + * * pagevec_lookup_entries() returns the number of entries which were * found. */ diff --git a/mm/swap_state.c b/mm/swap_state.c index b9b9aa94ee..233e4b6884 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "internal.h" #include @@ -88,11 +89,9 @@ void *get_shadow_from_swap_cache(swp_entry_t entry) pgoff_t idx = swp_offset(entry); struct page *page; - page = find_get_entry(address_space, idx); + page = xa_load(&address_space->i_pages, idx); if (xa_is_value(page)) return page; - if (page) - put_page(page); return NULL; } @@ -392,6 +391,39 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, return page; } +/** + * find_get_incore_page - Find and get a page from the page or swap caches. + * @mapping: The address_space to search. + * @index: The page cache index. + * + * This differs from find_get_page() in that it will also look for the + * page in the swap cache. + * + * Return: The found page or %NULL. + */ +struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) +{ + swp_entry_t swp; + struct swap_info_struct *si; + struct page *page = find_get_entry(mapping, index); + + if (!page) + return page; + if (!xa_is_value(page)) + return find_subpage(page, index); + if (!shmem_mapping(mapping)) + return NULL; + + swp = radix_to_swp_entry(page); + /* Prevent swapoff from happening to us */ + si = get_swap_device(swp); + if (!si) + return NULL; + page = find_get_page(swap_address_space(swp), swp_offset(swp)); + put_swap_device(si); + return page; +} + struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7a4fe1bbcf..35d9b1cc9f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -437,9 +437,9 @@ unsigned long vmalloc_nr_pages(void) return atomic_long_read(&nr_vmalloc_pages); } -static struct vmap_area *__find_vmap_area(unsigned long addr) +static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) { - struct rb_node *n = vmap_area_root.rb_node; + struct rb_node *n = root->rb_node; while (n) { struct vmap_area *va; @@ -1403,7 +1403,7 @@ static struct vmap_area *find_vmap_area(unsigned long addr) struct vmap_area *va; spin_lock(&vmap_area_lock); - va = __find_vmap_area(addr); + va = __find_vmap_area(addr, &vmap_area_root); spin_unlock(&vmap_area_lock); return va; @@ -2135,7 +2135,7 @@ struct vm_struct *remove_vm_area(const void *addr) might_sleep(); spin_lock(&vmap_area_lock); - va = __find_vmap_area((unsigned long)addr); + va = __find_vmap_area((unsigned long)addr, &vmap_area_root); if (va && va->vm) { struct vm_struct *vm = va->vm; @@ -2397,10 +2397,16 @@ struct vmap_pfn_data { static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) { struct vmap_pfn_data *data = private; + unsigned long pfn = data->pfns[data->idx]; + pte_t ptent; - if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx]))) + if (WARN_ON_ONCE(pfn_valid(pfn))) return -EINVAL; - *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot)); + + ptent = pte_mkspecial(pfn_pte(pfn, data->prot)); + set_pte_at(&init_mm, addr, pte, ptent); + + data->idx++; return 0; } @@ -2427,6 +2433,10 @@ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) free_vm_area(area); return NULL; } + + flush_cache_vmap((unsigned long)area->addr, + (unsigned long)area->addr + count * PAGE_SIZE); + return area->addr; } EXPORT_SYMBOL_GPL(vmap_pfn); @@ -3533,14 +3543,32 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) #ifdef CONFIG_PRINTK bool vmalloc_dump_obj(void *object) { - struct vm_struct *vm; void *objp = (void *)PAGE_ALIGN((unsigned long)object); + const void *caller; + struct vm_struct *vm; + struct vmap_area *va; + unsigned long addr; + unsigned int nr_pages; - vm = find_vm_area(objp); - if (!vm) + if (!spin_trylock(&vmap_area_lock)) return false; + va = __find_vmap_area((unsigned long)objp, &vmap_area_root); + if (!va) { + spin_unlock(&vmap_area_lock); + return false; + } + + vm = va->vm; + if (!vm) { + spin_unlock(&vmap_area_lock); + return false; + } + addr = (unsigned long)vm->addr; + caller = vm->caller; + nr_pages = vm->nr_pages; + spin_unlock(&vmap_area_lock); pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", - vm->nr_pages, (unsigned long)vm->addr, vm->caller); + nr_pages, addr, caller); return true; } #endif diff --git a/mm/zswap.c b/mm/zswap.c index a856681594..b9881c1185 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -43,6 +43,8 @@ #include #include +#include "internal.h" + /********************************* * statistics **********************************/ @@ -533,9 +535,19 @@ static void shrink_worker(struct work_struct *w) { struct zswap_pool *pool = container_of(w, typeof(*pool), shrink_work); + int ret, failures = 0; - if (zpool_shrink(pool->zpool, 1, NULL)) - zswap_reject_reclaim_fail++; + do { + ret = zpool_shrink(pool->zpool, 1, NULL); + if (ret) { + zswap_reject_reclaim_fail++; + if (ret != -EAGAIN) + break; + if (++failures == MAX_RECLAIM_RETRIES) + break; + } + cond_resched(); + } while (!zswap_can_accept()); zswap_pool_put(pool); } diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index deb24272d0..1f82d98d7a 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1072,9 +1072,9 @@ void hci_uuids_clear(struct hci_dev *hdev) void hci_link_keys_clear(struct hci_dev *hdev) { - struct link_key *key; + struct link_key *key, *tmp; - list_for_each_entry(key, &hdev->link_keys, list) { + list_for_each_entry_safe(key, tmp, &hdev->link_keys, list) { list_del_rcu(&key->list); kfree_rcu(key, rcu); } @@ -1082,9 +1082,9 @@ void hci_link_keys_clear(struct hci_dev *hdev) void hci_smp_ltks_clear(struct hci_dev *hdev) { - struct smp_ltk *k; + struct smp_ltk *k, *tmp; - list_for_each_entry(k, &hdev->long_term_keys, list) { + list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } @@ -1092,9 +1092,9 @@ void hci_smp_ltks_clear(struct hci_dev *hdev) void hci_smp_irks_clear(struct hci_dev *hdev) { - struct smp_irk *k; + struct smp_irk *k, *tmp; - list_for_each_entry(k, &hdev->identity_resolving_keys, list) { + list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } @@ -1102,9 +1102,9 @@ void hci_smp_irks_clear(struct hci_dev *hdev) void hci_blocked_keys_clear(struct hci_dev *hdev) { - struct blocked_key *b; + struct blocked_key *b, *tmp; - list_for_each_entry(b, &hdev->blocked_keys, list) { + list_for_each_entry_safe(b, tmp, &hdev->blocked_keys, list) { list_del_rcu(&b->list); kfree_rcu(b, rcu); } diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index e5c4730277..447d4d3b3b 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -4310,6 +4310,12 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, } } + chan = l2cap_chan_hold_unless_zero(chan); + if (!chan) { + err = -EBADSLT; + goto unlock; + } + err = 0; l2cap_chan_lock(chan); @@ -4339,6 +4345,7 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, } l2cap_chan_unlock(chan); + l2cap_chan_put(chan); unlock: mutex_unlock(&conn->chan_lock); @@ -4671,7 +4678,6 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, chan = l2cap_get_chan_by_scid(conn, scid); if (!chan) { - mutex_unlock(&conn->chan_lock); return 0; } diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 7bdeb8eea9..ed053782d8 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -898,13 +898,16 @@ void __cfg80211_connect_result(struct net_device *dev, if (!wdev->u.client.ssid_len) { rcu_read_lock(); for_each_valid_link(cr, link) { + u32 ssid_len; + ssid = ieee80211_bss_get_elem(cr->links[link].bss, WLAN_EID_SSID); if (!ssid || !ssid->datalen) continue; - memcpy(wdev->u.client.ssid, ssid->data, ssid->datalen); + ssid_len = min(ssid->datalen, IEEE80211_MAX_SSID_LEN); + memcpy(wdev->u.client.ssid, ssid->data, ssid_len); wdev->u.client.ssid_len = ssid->datalen; break; }