forked from rpms/kernel
		
	CVE-2012-1179 fix pmd_bad() triggering in code paths holding mmap_sem read mode
This commit is contained in:
		
							parent
							
								
									c4069fe6d2
								
							
						
					
					
						commit
						6875c9a8ca
					
				| @ -775,6 +775,9 @@ Patch21302: sony-laptop-Enable-keyboard-backlight-by-default.patch | |||||||
| # Disable threading in hibernate compression | # Disable threading in hibernate compression | ||||||
| Patch21303: disable-threading-in-compression-for-hibernate.patch | Patch21303: disable-threading-in-compression-for-hibernate.patch | ||||||
| 
 | 
 | ||||||
|  | #rhbz 803809 CVE-2012-1179 | ||||||
|  | Patch21304: mm-thp-fix-pmd_bad-triggering.patch | ||||||
|  | 
 | ||||||
| Patch21400: unhandled-irqs-switch-to-polling.patch | Patch21400: unhandled-irqs-switch-to-polling.patch | ||||||
| 
 | 
 | ||||||
| Patch22000: weird-root-dentry-name-debug.patch | Patch22000: weird-root-dentry-name-debug.patch | ||||||
| @ -1486,6 +1489,9 @@ ApplyPatch unhandled-irqs-switch-to-polling.patch | |||||||
| 
 | 
 | ||||||
| ApplyPatch weird-root-dentry-name-debug.patch | ApplyPatch weird-root-dentry-name-debug.patch | ||||||
| 
 | 
 | ||||||
|  | #rhbz 803809 CVE-2012-1179 | ||||||
|  | ApplyPatch mm-thp-fix-pmd_bad-triggering.patch | ||||||
|  | 
 | ||||||
| #Highbank clock functions | #Highbank clock functions | ||||||
| ApplyPatch highbank-export-clock-functions.patch  | ApplyPatch highbank-export-clock-functions.patch  | ||||||
| 
 | 
 | ||||||
| @ -2344,6 +2350,9 @@ fi | |||||||
| #                 ||----w | | #                 ||----w | | ||||||
| #                 ||     || | #                 ||     || | ||||||
| %changelog | %changelog | ||||||
|  | * Thu Mar 15 2012 Justin M. Forbes <jforbes@redhat.com> | ||||||
|  | - CVE-2012-1179 fix pmd_bad() triggering in code paths holding mmap_sem read mode (rhbz 803809) | ||||||
|  | 
 | ||||||
| * Wed Mar 14 2012 Josh Boyer <jwboyer@redhat.com> | * Wed Mar 14 2012 Josh Boyer <jwboyer@redhat.com> | ||||||
| - Fixup irqpoll patch to only activate on machines with ASM108x PCI bridge | - Fixup irqpoll patch to only activate on machines with ASM108x PCI bridge | ||||||
| 
 | 
 | ||||||
|  | |||||||
							
								
								
									
										447
									
								
								mm-thp-fix-pmd_bad-triggering.patch
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										447
									
								
								mm-thp-fix-pmd_bad-triggering.patch
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,447 @@ | |||||||
|  | In some cases it may happen that pmd_none_or_clear_bad() is called | ||||||
|  | with the mmap_sem hold in read mode. In those cases the huge page | ||||||
|  | faults can allocate hugepmds under pmd_none_or_clear_bad() and that | ||||||
|  | can trigger a false positive from pmd_bad() that will not like to see | ||||||
|  | a pmd materializing as trans huge. | ||||||
|  | 
 | ||||||
|  | It's not khugepaged the problem, khugepaged holds the mmap_sem in | ||||||
|  | write mode (and all those sites must hold the mmap_sem in read mode to | ||||||
|  | prevent pagetables to go away from under them, during code review it | ||||||
|  | seems vm86 mode on 32bit kernels requires that too unless it's | ||||||
|  | restricted to 1 thread per process or UP builds). The race is only | ||||||
|  | with the huge pagefaults that can convert a pmd_none() into a | ||||||
|  | pmd_trans_huge(). | ||||||
|  | 
 | ||||||
|  | Effectively all these pmd_none_or_clear_bad() sites running with | ||||||
|  | mmap_sem in read mode are somewhat speculative with the page faults, | ||||||
|  | and the result is always undefined when they run simultaneously. This | ||||||
|  | is probably why it wasn't common to run into this. For example if the | ||||||
|  | madvise(MADV_DONTNEED) runs zap_page_range() shortly before the page | ||||||
|  | fault, the hugepage will not be zapped, if the page fault runs first | ||||||
|  | it will be zapped. | ||||||
|  | 
 | ||||||
|  | Altering pmd_bad() not to error out if it finds hugepmds won't be | ||||||
|  | enough to fix this, because zap_pmd_range would then proceed to call | ||||||
|  | zap_pte_range (which would be incorrect if the pmd become a | ||||||
|  | pmd_trans_huge()). | ||||||
|  | 
 | ||||||
|  | The simplest way to fix this is to read the pmd in the local stack | ||||||
|  | (regardless of what we read, no need of actual CPU barriers, only | ||||||
|  | compiler barrier needed), and be sure it is not changing under the | ||||||
|  | code that computes its value. Even if the real pmd is changing under | ||||||
|  | the value we hold on the stack, we don't care. If we actually end up | ||||||
|  | in zap_pte_range it means the pmd was not none already and it was not | ||||||
|  | huge, and it can't become huge from under us (khugepaged locking | ||||||
|  | explained above). | ||||||
|  | 
 | ||||||
|  | All we need is to enforce that there is no way anymore that in a code | ||||||
|  | path like below, pmd_trans_huge can be false, but | ||||||
|  | pmd_none_or_clear_bad can run into a hugepmd. The overhead of a | ||||||
|  | barrier() is just a compiler tweak and should not be measurable (I | ||||||
|  | only added it for THP builds). I don't exclude different compiler | ||||||
|  | versions may have prevented the race too by caching the value of *pmd | ||||||
|  | on the stack (that hasn't been verified, but it wouldn't be impossible | ||||||
|  | considering pmd_none_or_clear_bad, pmd_bad, pmd_trans_huge, pmd_none | ||||||
|  | are all inlines and there's no external function called in between | ||||||
|  | pmd_trans_huge and pmd_none_or_clear_bad). | ||||||
|  | 
 | ||||||
|  | 		if (pmd_trans_huge(*pmd)) { | ||||||
|  | 			if (next-addr != HPAGE_PMD_SIZE) { | ||||||
|  | 				VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); | ||||||
|  | 				split_huge_page_pmd(vma->vm_mm, pmd); | ||||||
|  | 			} else if (zap_huge_pmd(tlb, vma, pmd, addr)) | ||||||
|  | 				continue; | ||||||
|  | 			/* fall through */ | ||||||
|  | 		} | ||||||
|  | 		if (pmd_none_or_clear_bad(pmd)) | ||||||
|  | 
 | ||||||
|  | Because this race condition could be exercised without special | ||||||
|  | privileges this was reported in CVE-2012-1179. | ||||||
|  | 
 | ||||||
|  | The race was identified and fully explained by Ulrich who debugged it. | ||||||
|  | I'm quoting his accurate explanation below, for reference. | ||||||
|  | 
 | ||||||
|  | ====== start quote =======
 | ||||||
|  |   mapcount 0 page_mapcount 1 | ||||||
|  |   kernel BUG at mm/huge_memory.c:1384! | ||||||
|  | 
 | ||||||
|  | At some point prior to the panic, a "bad pmd ..." message similar to the | ||||||
|  | following is logged on the console: | ||||||
|  | 
 | ||||||
|  |   mm/memory.c:145: bad pmd ffff8800376e1f98(80000000314000e7). | ||||||
|  | 
 | ||||||
|  | The "bad pmd ..." message is logged by pmd_clear_bad() before it clears | ||||||
|  | the page's PMD table entry. | ||||||
|  | 
 | ||||||
|  |     143 void pmd_clear_bad(pmd_t *pmd) | ||||||
|  |     144 { | ||||||
|  | ->  145         pmd_ERROR(*pmd);
 | ||||||
|  |     146         pmd_clear(pmd); | ||||||
|  |     147 } | ||||||
|  | 
 | ||||||
|  | After the PMD table entry has been cleared, there is an inconsistency | ||||||
|  | between the actual number of PMD table entries that are mapping the page | ||||||
|  | and the page's map count (_mapcount field in struct page). When the page | ||||||
|  | is subsequently reclaimed, __split_huge_page() detects this inconsistency. | ||||||
|  | 
 | ||||||
|  |    1381         if (mapcount != page_mapcount(page)) | ||||||
|  |    1382                 printk(KERN_ERR "mapcount %d page_mapcount %d\n", | ||||||
|  |    1383                        mapcount, page_mapcount(page)); | ||||||
|  | -> 1384         BUG_ON(mapcount != page_mapcount(page));
 | ||||||
|  | 
 | ||||||
|  | The root cause of the problem is a race of two threads in a multithreaded | ||||||
|  | process. Thread B incurs a page fault on a virtual address that has never | ||||||
|  | been accessed (PMD entry is zero) while Thread A is executing an madvise() | ||||||
|  | system call on a virtual address within the same 2 MB (huge page) range. | ||||||
|  | 
 | ||||||
|  |            virtual address space | ||||||
|  |           .---------------------. | ||||||
|  |           |                     | | ||||||
|  |           |                     | | ||||||
|  |         .-|---------------------| | ||||||
|  |         | |                     | | ||||||
|  |         | |                     |<-- B(fault) | ||||||
|  |         | |                     | | ||||||
|  |   2 MB  | |/////////////////////|-. | ||||||
|  |   huge <  |/////////////////////|  > A(range) | ||||||
|  |   page  | |/////////////////////|-' | ||||||
|  |         | |                     | | ||||||
|  |         | |                     | | ||||||
|  |         '-|---------------------| | ||||||
|  |           |                     | | ||||||
|  |           |                     | | ||||||
|  |           '---------------------' | ||||||
|  | 
 | ||||||
|  | - Thread A is executing an madvise(..., MADV_DONTNEED) system call
 | ||||||
|  |   on the virtual address range "A(range)" shown in the picture. | ||||||
|  | 
 | ||||||
|  | sys_madvise | ||||||
|  |   // Acquire the semaphore in shared mode. | ||||||
|  |   down_read(¤t->mm->mmap_sem) | ||||||
|  |   ... | ||||||
|  |   madvise_vma | ||||||
|  |     switch (behavior) | ||||||
|  |     case MADV_DONTNEED: | ||||||
|  |          madvise_dontneed | ||||||
|  |            zap_page_range | ||||||
|  |              unmap_vmas | ||||||
|  |                unmap_page_range | ||||||
|  |                  zap_pud_range | ||||||
|  |                    zap_pmd_range | ||||||
|  |                      // | ||||||
|  |                      // Assume that this huge page has never been accessed. | ||||||
|  |                      // I.e. content of the PMD entry is zero (not mapped). | ||||||
|  |                      // | ||||||
|  |                      if (pmd_trans_huge(*pmd)) { | ||||||
|  |                          // We don't get here due to the above assumption. | ||||||
|  |                      } | ||||||
|  |                      // | ||||||
|  |                      // Assume that Thread B incurred a page fault and | ||||||
|  |          .---------> // sneaks in here as shown below. | ||||||
|  |          |           // | ||||||
|  |          |           if (pmd_none_or_clear_bad(pmd)) | ||||||
|  |          |               { | ||||||
|  |          |                 if (unlikely(pmd_bad(*pmd))) | ||||||
|  |          |                     pmd_clear_bad | ||||||
|  |          |                     { | ||||||
|  |          |                       pmd_ERROR | ||||||
|  |          |                         // Log "bad pmd ..." message here. | ||||||
|  |          |                       pmd_clear | ||||||
|  |          |                         // Clear the page's PMD entry. | ||||||
|  |          |                         // Thread B incremented the map count | ||||||
|  |          |                         // in page_add_new_anon_rmap(), but | ||||||
|  |          |                         // now the page is no longer mapped | ||||||
|  |          |                         // by a PMD entry (-> inconsistency). | ||||||
|  |          |                     } | ||||||
|  |          |               } | ||||||
|  |          | | ||||||
|  |          v | ||||||
|  | - Thread B is handling a page fault on virtual address "B(fault)" shown
 | ||||||
|  |   in the picture. | ||||||
|  | 
 | ||||||
|  | ... | ||||||
|  | do_page_fault | ||||||
|  |   __do_page_fault | ||||||
|  |     // Acquire the semaphore in shared mode. | ||||||
|  |     down_read_trylock(&mm->mmap_sem) | ||||||
|  |     ... | ||||||
|  |     handle_mm_fault | ||||||
|  |       if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) | ||||||
|  |           // We get here due to the above assumption (PMD entry is zero). | ||||||
|  |           do_huge_pmd_anonymous_page | ||||||
|  |             alloc_hugepage_vma | ||||||
|  |               // Allocate a new transparent huge page here. | ||||||
|  |             ... | ||||||
|  |             __do_huge_pmd_anonymous_page | ||||||
|  |               ... | ||||||
|  |               spin_lock(&mm->page_table_lock) | ||||||
|  |               ... | ||||||
|  |               page_add_new_anon_rmap | ||||||
|  |                 // Here we increment the page's map count (starts at -1). | ||||||
|  |                 atomic_set(&page->_mapcount, 0) | ||||||
|  |               set_pmd_at | ||||||
|  |                 // Here we set the page's PMD entry which will be cleared | ||||||
|  |                 // when Thread A calls pmd_clear_bad(). | ||||||
|  |               ... | ||||||
|  |               spin_unlock(&mm->page_table_lock) | ||||||
|  | 
 | ||||||
|  | The mmap_sem does not prevent the race because both threads are acquiring | ||||||
|  | it in shared mode (down_read). Thread B holds the page_table_lock while | ||||||
|  | the page's map count and PMD table entry are updated. However, Thread A | ||||||
|  | does not synchronize on that lock. | ||||||
|  | ====== end quote =======
 | ||||||
|  | 
 | ||||||
|  | Reported-by: Ulrich Obergfell <uobergfe@redhat.com> | ||||||
|  | Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> | ||||||
|  | ---
 | ||||||
|  |  arch/x86/kernel/vm86_32.c     |    2 + | ||||||
|  |  fs/proc/task_mmu.c            |    9 ++++++ | ||||||
|  |  include/asm-generic/pgtable.h |   57 +++++++++++++++++++++++++++++++++++++++++ | ||||||
|  |  mm/memcontrol.c               |    4 +++ | ||||||
|  |  mm/memory.c                   |   14 ++++++++-- | ||||||
|  |  mm/mempolicy.c                |    2 +- | ||||||
|  |  mm/mincore.c                  |    2 +- | ||||||
|  |  mm/pagewalk.c                 |    2 +- | ||||||
|  |  mm/swapfile.c                 |    4 +-- | ||||||
|  |  9 files changed, 87 insertions(+), 9 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
 | ||||||
|  | index b466cab..328cb37 100644
 | ||||||
|  | --- a/arch/x86/kernel/vm86_32.c
 | ||||||
|  | +++ b/arch/x86/kernel/vm86_32.c
 | ||||||
|  | @@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
 | ||||||
|  |  	spinlock_t *ptl; | ||||||
|  |  	int i; | ||||||
|  |   | ||||||
|  | +	down_write(&mm->mmap_sem);
 | ||||||
|  |  	pgd = pgd_offset(mm, 0xA0000); | ||||||
|  |  	if (pgd_none_or_clear_bad(pgd)) | ||||||
|  |  		goto out; | ||||||
|  | @@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
 | ||||||
|  |  	} | ||||||
|  |  	pte_unmap_unlock(pte, ptl); | ||||||
|  |  out: | ||||||
|  | +	up_write(&mm->mmap_sem);
 | ||||||
|  |  	flush_tlb(); | ||||||
|  |  } | ||||||
|  |   | ||||||
|  | diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
 | ||||||
|  | index 7dcd2a2..3efa725 100644
 | ||||||
|  | --- a/fs/proc/task_mmu.c
 | ||||||
|  | +++ b/fs/proc/task_mmu.c
 | ||||||
|  | @@ -409,6 +409,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 | ||||||
|  |  	} else { | ||||||
|  |  		spin_unlock(&walk->mm->page_table_lock); | ||||||
|  |  	} | ||||||
|  | +
 | ||||||
|  | +	if (pmd_trans_unstable(pmd))
 | ||||||
|  | +		return 0;
 | ||||||
|  |  	/* | ||||||
|  |  	 * The mmap_sem held all the way back in m_start() is what | ||||||
|  |  	 * keeps khugepaged out of here and from collapsing things | ||||||
|  | @@ -507,6 +510,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 | ||||||
|  |  	struct page *page; | ||||||
|  |   | ||||||
|  |  	split_huge_page_pmd(walk->mm, pmd); | ||||||
|  | +	if (pmd_trans_unstable(pmd))
 | ||||||
|  | +		return 0;
 | ||||||
|  |   | ||||||
|  |  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||||||
|  |  	for (; addr != end; pte++, addr += PAGE_SIZE) { | ||||||
|  | @@ -670,6 +675,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 | ||||||
|  |  	int err = 0; | ||||||
|  |   | ||||||
|  |  	split_huge_page_pmd(walk->mm, pmd); | ||||||
|  | +	if (pmd_trans_unstable(pmd))
 | ||||||
|  | +		return 0;
 | ||||||
|  |   | ||||||
|  |  	/* find the first VMA at or above 'addr' */ | ||||||
|  |  	vma = find_vma(walk->mm, addr); | ||||||
|  | @@ -961,6 +968,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 | ||||||
|  |  		spin_unlock(&walk->mm->page_table_lock); | ||||||
|  |  	} | ||||||
|  |   | ||||||
|  | +	if (pmd_trans_unstable(pmd))
 | ||||||
|  | +		return 0;
 | ||||||
|  |  	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); | ||||||
|  |  	do { | ||||||
|  |  		struct page *page = can_gather_numa_stats(*pte, md->vma, addr); | ||||||
|  | diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
 | ||||||
|  | index 76bff2b..10f8291 100644
 | ||||||
|  | --- a/include/asm-generic/pgtable.h
 | ||||||
|  | +++ b/include/asm-generic/pgtable.h
 | ||||||
|  | @@ -443,6 +443,63 @@ static inline int pmd_write(pmd_t pmd)
 | ||||||
|  |  #endif /* __HAVE_ARCH_PMD_WRITE */ | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  | +/*
 | ||||||
|  | + * This function is meant to be used by sites walking pagetables with
 | ||||||
|  | + * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
 | ||||||
|  | + * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd
 | ||||||
|  | + * into a null pmd and the transhuge page fault can convert a null pmd
 | ||||||
|  | + * into an hugepmd or into a regular pmd (if the hugepage allocation
 | ||||||
|  | + * fails). While holding the mmap_sem in read mode the pmd becomes
 | ||||||
|  | + * stable and stops changing under us only if it's not null and not a
 | ||||||
|  | + * transhuge pmd. When those races occurs and this function makes a
 | ||||||
|  | + * difference vs the standard pmd_none_or_clear_bad, the result is
 | ||||||
|  | + * undefined so behaving like if the pmd was none is safe (because it
 | ||||||
|  | + * can return none anyway). The compiler level barrier() is critically
 | ||||||
|  | + * important to compute the two checks atomically on the same pmdval.
 | ||||||
|  | + */
 | ||||||
|  | +static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
 | ||||||
|  | +{
 | ||||||
|  | +	/* depend on compiler for an atomic pmd read */
 | ||||||
|  | +	pmd_t pmdval = *pmd;
 | ||||||
|  | +	/*
 | ||||||
|  | +	 * The barrier will stabilize the pmdval in a register or on
 | ||||||
|  | +	 * the stack so that it will stop changing under the code.
 | ||||||
|  | +	 */
 | ||||||
|  | +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 | ||||||
|  | +	barrier();
 | ||||||
|  | +#endif
 | ||||||
|  | +	if (pmd_none(pmdval))
 | ||||||
|  | +		return 1;
 | ||||||
|  | +	if (unlikely(pmd_bad(pmdval))) {
 | ||||||
|  | +		if (!pmd_trans_huge(pmdval))
 | ||||||
|  | +			pmd_clear_bad(pmd);
 | ||||||
|  | +		return 1;
 | ||||||
|  | +	}
 | ||||||
|  | +	return 0;
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +/*
 | ||||||
|  | + * This is a noop if Transparent Hugepage Support is not built into
 | ||||||
|  | + * the kernel. Otherwise it is equivalent to
 | ||||||
|  | + * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in
 | ||||||
|  | + * places that already verified the pmd is not none and they want to
 | ||||||
|  | + * walk ptes while holding the mmap sem in read mode (write mode don't
 | ||||||
|  | + * need this). If THP is not enabled, the pmd can't go away under the
 | ||||||
|  | + * code even if MADV_DONTNEED runs, but if THP is enabled we need to
 | ||||||
|  | + * run a pmd_trans_unstable before walking the ptes after
 | ||||||
|  | + * split_huge_page_pmd returns (because it may have run when the pmd
 | ||||||
|  | + * become null, but then a page fault can map in a THP and not a
 | ||||||
|  | + * regular page).
 | ||||||
|  | + */
 | ||||||
|  | +static inline int pmd_trans_unstable(pmd_t *pmd)
 | ||||||
|  | +{
 | ||||||
|  | +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 | ||||||
|  | +	return pmd_none_or_trans_huge_or_clear_bad(pmd);
 | ||||||
|  | +#else
 | ||||||
|  | +	return 0;
 | ||||||
|  | +#endif
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  |  #endif /* !__ASSEMBLY__ */ | ||||||
|  |   | ||||||
|  |  #endif /* _ASM_GENERIC_PGTABLE_H */ | ||||||
|  | diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 | ||||||
|  | index d0e57a3..67b0578 100644
 | ||||||
|  | --- a/mm/memcontrol.c
 | ||||||
|  | +++ b/mm/memcontrol.c
 | ||||||
|  | @@ -5193,6 +5193,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 | ||||||
|  |  	spinlock_t *ptl; | ||||||
|  |   | ||||||
|  |  	split_huge_page_pmd(walk->mm, pmd); | ||||||
|  | +	if (pmd_trans_unstable(pmd))
 | ||||||
|  | +		return 0;
 | ||||||
|  |   | ||||||
|  |  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||||||
|  |  	for (; addr != end; pte++, addr += PAGE_SIZE) | ||||||
|  | @@ -5355,6 +5357,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 | ||||||
|  |  	spinlock_t *ptl; | ||||||
|  |   | ||||||
|  |  	split_huge_page_pmd(walk->mm, pmd); | ||||||
|  | +	if (pmd_trans_unstable(pmd))
 | ||||||
|  | +		return 0;
 | ||||||
|  |  retry: | ||||||
|  |  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||||||
|  |  	for (; addr != end; addr += PAGE_SIZE) { | ||||||
|  | diff --git a/mm/memory.c b/mm/memory.c
 | ||||||
|  | index fa2f04e..e3090fc 100644
 | ||||||
|  | --- a/mm/memory.c
 | ||||||
|  | +++ b/mm/memory.c
 | ||||||
|  | @@ -1251,12 +1251,20 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 | ||||||
|  |  				VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); | ||||||
|  |  				split_huge_page_pmd(vma->vm_mm, pmd); | ||||||
|  |  			} else if (zap_huge_pmd(tlb, vma, pmd, addr)) | ||||||
|  | -				continue;
 | ||||||
|  | +				goto next;
 | ||||||
|  |  			/* fall through */ | ||||||
|  |  		} | ||||||
|  | -		if (pmd_none_or_clear_bad(pmd))
 | ||||||
|  | -			continue;
 | ||||||
|  | +		/*
 | ||||||
|  | +		 * Here there can be other concurrent MADV_DONTNEED or
 | ||||||
|  | +		 * trans huge page faults running, and if the pmd is
 | ||||||
|  | +		 * none or trans huge it can change under us. This is
 | ||||||
|  | +		 * because MADV_DONTNEED holds the mmap_sem in read
 | ||||||
|  | +		 * mode.
 | ||||||
|  | +		 */
 | ||||||
|  | +		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 | ||||||
|  | +			goto next;
 | ||||||
|  |  		next = zap_pte_range(tlb, vma, pmd, addr, next, details); | ||||||
|  | +	next:
 | ||||||
|  |  		cond_resched(); | ||||||
|  |  	} while (pmd++, addr = next, addr != end); | ||||||
|  |   | ||||||
|  | diff --git a/mm/mempolicy.c b/mm/mempolicy.c
 | ||||||
|  | index 47296fe..0a37570 100644
 | ||||||
|  | --- a/mm/mempolicy.c
 | ||||||
|  | +++ b/mm/mempolicy.c
 | ||||||
|  | @@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 | ||||||
|  |  	do { | ||||||
|  |  		next = pmd_addr_end(addr, end); | ||||||
|  |  		split_huge_page_pmd(vma->vm_mm, pmd); | ||||||
|  | -		if (pmd_none_or_clear_bad(pmd))
 | ||||||
|  | +		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 | ||||||
|  |  			continue; | ||||||
|  |  		if (check_pte_range(vma, pmd, addr, next, nodes, | ||||||
|  |  				    flags, private)) | ||||||
|  | diff --git a/mm/mincore.c b/mm/mincore.c
 | ||||||
|  | index 636a868..936b4ce 100644
 | ||||||
|  | --- a/mm/mincore.c
 | ||||||
|  | +++ b/mm/mincore.c
 | ||||||
|  | @@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 | ||||||
|  |  			} | ||||||
|  |  			/* fall through */ | ||||||
|  |  		} | ||||||
|  | -		if (pmd_none_or_clear_bad(pmd))
 | ||||||
|  | +		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 | ||||||
|  |  			mincore_unmapped_range(vma, addr, next, vec); | ||||||
|  |  		else | ||||||
|  |  			mincore_pte_range(vma, pmd, addr, next, vec); | ||||||
|  | diff --git a/mm/pagewalk.c b/mm/pagewalk.c
 | ||||||
|  | index 2f5cf10..aa9701e 100644
 | ||||||
|  | --- a/mm/pagewalk.c
 | ||||||
|  | +++ b/mm/pagewalk.c
 | ||||||
|  | @@ -59,7 +59,7 @@ again:
 | ||||||
|  |  			continue; | ||||||
|  |   | ||||||
|  |  		split_huge_page_pmd(walk->mm, pmd); | ||||||
|  | -		if (pmd_none_or_clear_bad(pmd))
 | ||||||
|  | +		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 | ||||||
|  |  			goto again; | ||||||
|  |  		err = walk_pte_range(pmd, addr, next, walk); | ||||||
|  |  		if (err) | ||||||
|  | diff --git a/mm/swapfile.c b/mm/swapfile.c
 | ||||||
|  | index d999f09..f31b29d 100644
 | ||||||
|  | --- a/mm/swapfile.c
 | ||||||
|  | +++ b/mm/swapfile.c
 | ||||||
|  | @@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 | ||||||
|  |  	pmd = pmd_offset(pud, addr); | ||||||
|  |  	do { | ||||||
|  |  		next = pmd_addr_end(addr, end); | ||||||
|  | -		if (unlikely(pmd_trans_huge(*pmd)))
 | ||||||
|  | -			continue;
 | ||||||
|  | -		if (pmd_none_or_clear_bad(pmd))
 | ||||||
|  | +		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 | ||||||
|  |  			continue; | ||||||
|  |  		ret = unuse_pte_range(vma, pmd, addr, next, entry, page); | ||||||
|  |  		if (ret) | ||||||
|  | 
 | ||||||
|  | --
 | ||||||
|  | To unsubscribe, send a message with 'unsubscribe linux-mm' in | ||||||
|  | the body to majordomo@kvack.org.  For more info on Linux MM, | ||||||
|  | see: http://www.linux-mm.org/ . | ||||||
|  | Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/ | ||||||
|  | Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user