[LTS 9.4] hugetlb: CVE-2025-38084, CVE-2025-38085, CVE-2024-57883 #819
base: ciqlts9_4
Conversation
🤖 Validation Checks In Progress
Workflow run: https://github.com/ctrliq/kernel-src-tree/actions/runs/21229505497
🔍 Upstream Linux Kernel Commit Check
This is an automated message from the kernel commit checker workflow.
🔍 Interdiff Analysis
diff -u b/mm/hugetlb.c b/mm/hugetlb.c
--- b/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7358,9 +7377,6 @@ INTERDIFF: rejected hunk from patch2, cannot diff context
if (!(vma->vm_flags & VM_MAYSHARE))
return;
- start = ALIGN(vma->vm_start, PUD_SIZE);
- end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
-
if (start >= end)
return;
@@ -7375,9 +7394,6 @@ INTERDIFF: rejected hunk from patch1, cannot diff context
if (!(vma->vm_flags & VM_MAYSHARE))
return;
- start = ALIGN(vma->vm_start, PUD_SIZE);
- end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
-
if (start >= end)
return;
diff -u b/include/linux/hugetlb.h b/include/linux/hugetlb.h
--- b/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -255,6 +255,7 @@ INTERDIFF: rejected hunk from patch1, cannot diff context
bool is_hugetlb_entry_migration(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
#else /* !CONFIG_HUGETLB_PAGE */
@@ -279,6 +279,7 @@ INTERDIFF: rejected hunk from patch2, cannot diff context
bool is_hugetlb_entry_hwpoisoned(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
void fixup_hugetlb_reservations(struct vm_area_struct *vma);
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
#else /* !CONFIG_HUGETLB_PAGE */
@@ -461,8 +461,8 @@
-{
-}
+
+static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
#endif /* !CONFIG_HUGETLB_PAGE */
-
-#ifndef pgd_write
+/*
+ * hugepages at page global directory. If arch support
diff -u b/mm/hugetlb.c b/mm/hugetlb.c
--- b/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -95,7 +95,7 @@ INTERDIFF: rejected hunk from patch1, cannot diff context
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end, bool take_locks);
static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
@@ -121,7 +121,7 @@ INTERDIFF: rejected hunk from patch2, cannot diff context
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end, bool take_locks);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
static void hugetlb_free_folio(struct folio *folio)
@@ -4846,7 +4846,7 @@
* MM, VMA and rmap all write-locked to prevent concurrent page table
* walks (except hardware and gup_fast()).
*/
- mmap_assert_write_locked(vma->vm_mm);
+ vma_assert_write_locked(vma);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
if (addr & ~PUD_MASK) {
@@ -7371,4 +7371,4 @@
- spin_unlock_irq(&hugetlb_lock);
+ }
}
/*
@@ -7404,4 +7404,4 @@
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
if (take_locks) {
@@ -7444,4 +7444,4 @@
/* take_locks = */ true);
}
-/*
+#ifdef CONFIG_CMA
reverted:
--- b/mm/mmap.c
+++ a/mm/mmap.c
@@ -815,15 +815,7 @@
}
}
again:
- /*
- * Get rid of huge pages and shared page tables straddling the split
- * boundary.
- */
vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
- if (is_vm_hugetlb_page(orig_vma)) {
- hugetlb_split(orig_vma, start);
- hugetlb_split(orig_vma, end);
- }
if (file) {
mapping = file->f_mapping;
only in patch2:
unchanged:
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -539,7 +539,14 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
init_vma_prep(&vp, vma);
vp.insert = new;
vma_prepare(&vp);
+
+ /*
+ * Get rid of huge pages and shared page tables straddling the split
+ * boundary.
+ */
vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_split(vma, addr);
if (new_below) {
vma->vm_start = addr;
only in patch2:
unchanged:
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -932,6 +932,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
(void)next;
}
+static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
+
static inline void vma_iter_free(struct vma_iterator *vmi)
{
mas_destroy(&vmi->mas);
diff -u b/include/linux/mm.h b/include/linux/mm.h
--- b/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2536,6 +2536,6 @@
- if (!pmd_ptlock_init(ptdesc))
+ if (!pmd_ptlock_init(page))
return false;
- __folio_set_pgtable(folio);
- lruvec_stat_add_folio(folio, NR_PAGETABLE);
+ __SetPageTable(page);
+ inc_lruvec_page_state(page, NR_PAGETABLE);
return true;
}
@@ -2539,6 +2539,9 @@ INTERDIFF: rejected hunk from patch1, cannot diff context
if (!pmd_ptlock_init(page))
return false;
__SetPageTable(page);
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+ atomic_set(&page->pt_share_count, 0);
+#endif
inc_lruvec_page_state(page, NR_PAGETABLE);
return true;
}
@@ -3125,6 +3125,7 @@ INTERDIFF: rejected hunk from patch2, cannot diff context
if (!pmd_ptlock_init(ptdesc))
return false;
__folio_set_pgtable(folio);
+ ptdesc_pmd_pts_init(ptdesc);
lruvec_stat_add_folio(folio, NR_PAGETABLE);
return true;
}
diff -u b/include/linux/mm_types.h b/include/linux/mm_types.h
--- b/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -170,6 +170,18446744073709551610 @@
- const struct page *: (const struct ptdesc *)(p), \
- struct page *: (struct ptdesc *)(p)))
-
-/*
- * Used for sizing the vmemmap region on some architectures
- */
+ union {
+ struct mm_struct *pt_mm; /* x86 pgds only */
+ atomic_t pt_frag_refcount; /* powerpc */
+ };
+#if ALLOC_SPLIT_PTLOCKS
+ spinlock_t *ptl;
@@ -173,6 +173,9 @@ INTERDIFF: rejected hunk from patch1, cannot diff context
union {
struct mm_struct *pt_mm; /* x86 pgds only */
atomic_t pt_frag_refcount; /* powerpc */
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+ RH_KABI_BROKEN_INSERT(atomic_t pt_share_count)
+#endif
};
#if ALLOC_SPLIT_PTLOCKS
spinlock_t *ptl;
@@ -445,6 +445,7 @@ INTERDIFF: rejected hunk from patch2, cannot diff context
* @pt_index: Used for s390 gmap.
* @pt_mm: Used for x86 pgds.
* @pt_frag_refcount: For fragmented page table tracking. Powerpc only.
+ * @pt_share_count: Used for HugeTLB PMD page table share count.
* @_pt_pad_2: Padding to ensure proper alignment.
* @ptl: Lock for the page table.
* @__page_type: Same as page->page_type. Unused for page tables.
@@ -471,6 +472,9 @@ INTERDIFF: rejected hunk from patch2, cannot diff context
pgoff_t pt_index;
struct mm_struct *pt_mm;
atomic_t pt_frag_refcount;
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+ atomic_t pt_share_count;
+#endif
};
union {
@@ -516,6 +520,32 @@ INTERDIFF: rejected hunk from patch2, cannot diff context
const struct page *: (const struct ptdesc *)(p), \
struct page *: (struct ptdesc *)(p)))
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
+{
+ atomic_set(&ptdesc->pt_share_count, 0);
+}
+
+static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc)
+{
+ atomic_inc(&ptdesc->pt_share_count);
+}
+
+static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc)
+{
+ atomic_dec(&ptdesc->pt_share_count);
+}
+
+static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc)
+{
+ return atomic_read(&ptdesc->pt_share_count);
+}
+#else
+static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
+{
+}
+#endif
+
/*
* Used for sizing the vmemmap region on some architectures
*/
diff -u b/mm/hugetlb.c b/mm/hugetlb.c
--- b/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7139,5 +7139,5 @@
pud_clear(pud);
- ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
+ atomic_dec(&virt_to_page(ptep)->pt_share_count);
mm_dec_nr_pmds(mm);
return 1;
@@ -7141,6 +7141,13 @@ INTERDIFF: rejected hunk from patch1, cannot diff context
return 0;
pud_clear(pud);
+ /*
+ * Once our caller drops the rmap lock, some other process might be
+ * using this page table as a normal, non-hugetlb page table.
+ * Wait for pending gup_fast() in other threads to finish before letting
+ * that happen.
+ */
+ tlb_remove_table_sync_one();
atomic_dec(&virt_to_page(ptep)->pt_share_count);
mm_dec_nr_pmds(mm);
return 1;
@@ -7629,6 +7629,13 @@ INTERDIFF: rejected hunk from patch2, cannot diff context
return 0;
pud_clear(pud);
+ /*
+ * Once our caller drops the rmap lock, some other process might be
+ * using this page table as a normal, non-hugetlb page table.
+ * Wait for pending gup_fast() in other threads to finish before letting
+ * that happen.
+ */
+ tlb_remove_table_sync_one();
ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
mm_dec_nr_pmds(mm);
return 1;
diff -u b/mm/hugetlb.c b/mm/hugetlb.c
--- b/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5013,9 +5013,9 @@
break;
}
-#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
/* If the pagetables are shared, there is nothing to do */
- if (atomic_read(&virt_to_page(dst_pte)->pt_share_count)) {
+ if (ptdesc_pmd_is_shared(virt_to_ptdesc(dst_pte))) {
addr |= last_addr_mask;
continue;
}
@@ -5023,7 +5023,0 @@
- hugetlb_vma_assert_locked(vma);
- if (sz != PMD_SIZE)
- return 0;
- if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep)))
- return 0;
-
- pud_clear(pud);
@@ -7597,7 +7592,7 @@ INTERDIFF: rejected hunk from patch2, cannot diff context
hugetlb_vma_assert_locked(vma);
if (sz != PMD_SIZE)
return 0;
- if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep)))
+ if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep)))
return 0;
pud_clear(pud);
only in patch2:
unchanged:
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -631,6 +631,11 @@ static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc)
{
return atomic_read(&ptdesc->pt_share_count);
}
+
+static inline bool ptdesc_pmd_is_shared(struct ptdesc *ptdesc)
+{
+ return !!ptdesc_pmd_pts_count(ptdesc);
+}
#else
static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
{
This is an automated interdiff check for backported commits.
JIRA PR Check Results: 5 commit(s) with issues found.
❌ Validation checks completed with issues
View full results: https://github.com/ctrliq/kernel-src-tree/actions/runs/21229505497
Fresh fixes just from yesterday. Thanks, bot. |
Yeah, please. It's nice to have the cve-bf next to the commits they fix, as long as the PR does not get too big.
@pvts-mat what @roxanan1996 said: if it gets super complex with the bug fixes and/or additional pre-conditionals, please note it and we can potentially waive the Bug Fixes.
Just dropping an update that this PR is being worked on. There's one bugfix I'll be arguing for dropping; building the case now. Will update the PR soon.
Not all bugfixes need to be backported, as sometimes it's a bigger issue than the fix.
In case this wasn't clear, please don't include this.
Yeah, I just wanted to provide some explanation. Added the "CVE-2024-57883 fresh fixes" section and expanded the "Commits" one. Three out of four fixes were included. |
jira VULN-71578 cve-pre CVE-2025-38084 commit-author James Houghton <[email protected]> commit b30c14c PMD sharing can only be done in PUD_SIZE-aligned pieces of VMAs; however, it is possible that HugeTLB VMAs are split without unsharing the PMDs first. Without this fix, it is possible to hit the uffd-wp-related WARN_ON_ONCE in hugetlb_change_protection [1]. The key there is that hugetlb_unshare_all_pmds will not attempt to unshare PMDs in non-PUD_SIZE-aligned sections of the VMA. It might seem ideal to unshare in hugetlb_vm_op_open, but we need to unshare in both the new and old VMAs, so unsharing in hugetlb_vm_op_split seems natural. [1]: https://lore.kernel.org/linux-mm/CADrL8HVeOkj0QH5VZZbRzybNE8CG-tEGFshnA+bG9nMgcWtBSg@mail.gmail.com/ Link: https://lkml.kernel.org/r/[email protected] Fixes: 6dfeaff ("hugetlb/userfaultfd: unshare all pmds for hugetlbfs when register wp") Signed-off-by: James Houghton <[email protected]> Reviewed-by: Mike Kravetz <[email protected]> Acked-by: Peter Xu <[email protected]> Cc: Axel Rasmussen <[email protected]> Cc: Muchun Song <[email protected]> Cc: <[email protected]> Signed-off-by: Andrew Morton <[email protected]> (cherry picked from commit b30c14c) Signed-off-by: Marcin Wcisło <[email protected]>
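In short, b30c14c clamps the unshare range to the PUD_SIZE-aligned interior of the VMA, as also visible in the interdiff hunks above. A condensed sketch of that logic (not the verbatim backport):

	/* Only the PUD_SIZE-aligned middle of a VMA can hold shared PMDs. */
	unsigned long start = ALIGN(vma->vm_start, PUD_SIZE);
	unsigned long end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);

	if (start >= end)	/* no aligned range, nothing could be shared */
		return;
	/* ... unshare PMDs in [start, end) ... */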
jira VULN-71578 cve CVE-2025-38084 commit-author Jann Horn <[email protected]> commit 081056d upstream-diff Used linux-5.15.y backport 366298f2b04d2bf1f2f2b7078405bdf9df9bd5d0 as a base. Modified `hugetlb_unshare_pmds()' to wrap in the `take_lock == true' branches what was there originally. This change is the equivalent of applying upstream 081056d to the `hugetlb_unshare_pmds()' function and linux-5.15.y backport 366298f to everything else. Currently, __split_vma() triggers hugetlb page table unsharing through vm_ops->may_split(). This happens before the VMA lock and rmap locks are taken - which is too early, it allows racing VMA-locked page faults in our process and racing rmap walks from other processes to cause page tables to be shared again before we actually perform the split. Fix it by explicitly calling into the hugetlb unshare logic from __split_vma() in the same place where THP splitting also happens. At that point, both the VMA and the rmap(s) are write-locked. An annoying detail is that we can now call into the helper hugetlb_unshare_pmds() from two different locking contexts: 1. from hugetlb_split(), holding: - mmap lock (exclusively) - VMA lock - file rmap lock (exclusively) 2. hugetlb_unshare_all_pmds(), which I think is designed to be able to call us with only the mmap lock held (in shared mode), but currently only runs while holding mmap lock (exclusively) and VMA lock Backporting note: This commit fixes a racy protection that was introduced in commit b30c14c ("hugetlb: unshare some PMDs when splitting VMAs"); that commit claimed to fix an issue introduced in 5.13, but it should actually also go all the way back. [[email protected]: v2] Link: https://lkml.kernel.org/r/[email protected] Link: https://lkml.kernel.org/r/[email protected] Link: https://lkml.kernel.org/r/[email protected] Fixes: 39dde65 ("[PATCH] shared page table for hugetlb page") Signed-off-by: Jann Horn <[email protected]> Cc: Liam Howlett <[email protected]> Reviewed-by: Lorenzo Stoakes <[email protected]> Reviewed-by: Oscar Salvador <[email protected]> Cc: Lorenzo Stoakes <[email protected]> Cc: Vlastimil Babka <[email protected]> Cc: <[email protected]> [b30c14c: hugetlb: unshare some PMDs when splitting VMAs] Cc: <[email protected]> Signed-off-by: Andrew Morton <[email protected]> (cherry picked from commit 366298f2b04d2bf1f2f2b7078405bdf9df9bd5d0) Signed-off-by: Marcin Wcisło <[email protected]>
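The two locking contexts described above are what the new take_locks parameter distinguishes. A condensed sketch of the pattern applied to hugetlb_unshare_pmds() here (simplified, not the verbatim backport):

	static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
					 unsigned long start, unsigned long end,
					 bool take_locks)
	{
		/* ... range checks, mmu_notifier setup ... */
		if (take_locks) {
			/* Context 2: caller holds only the mmap lock. */
			hugetlb_vma_lock_write(vma);
			i_mmap_lock_write(vma->vm_file->f_mapping);
		} else {
			/* Context 1: hugetlb_split() already holds VMA and rmap locks. */
			i_mmap_assert_write_locked(vma->vm_file->f_mapping);
		}
		/* ... walk [start, end) and huge_pmd_unshare() ... */
		if (take_locks) {
			i_mmap_unlock_write(vma->vm_file->f_mapping);
			hugetlb_vma_unlock_write(vma);
		}
	}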
jira VULN-46930 cve CVE-2024-57883 commit-author Liu Shixin <[email protected]> commit 59d9094 upstream-diff Stable 6.1 backport 02333ac1c35370517a19a4a131332a9690c6a5c7 was used for the actual (clean) cherry pick. Additionally the `atomic_t pt_share_count' field in `include/linux/mm_types.h' was wrapped in RH_KABI_BROKEN_INSERT macro to avoid kABI checker complains. It's justified, because the inserted field (it's included, as CONFIG_ARCH_WANT_HUGE_PMD_SHARE gets enabled for at least `kernel-x86_64-rhel.config') is placed within a union which already contained a field of the same type `atomic_t pt_frag_refcount', so the size of it cannot change. Moreover this union serves as a scratch space for the subsystems using the struct page. Upon releasing the ownership to buddy allocator the union contents no longer matter. When the page is allocated again the scratch space will be used by the new owner in its own way. The folio refcount may be increased unexpectly through try_get_folio() by caller such as split_huge_pages. In huge_pmd_unshare(), we use refcount to check whether a pmd page table is shared. The check is incorrect if the refcount is increased by the above caller, and this can cause the page table leaked: BUG: Bad page state in process sh pfn:109324 page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324 flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff) page_type: f2(table) raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000 raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000 page dumped because: nonzero mapcount ... CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G B 6.13.0-rc2master+ ctrliq#7 Tainted: [B]=BAD_PAGE Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 Call trace: show_stack+0x20/0x38 (C) dump_stack_lvl+0x80/0xf8 dump_stack+0x18/0x28 bad_page+0x8c/0x130 free_page_is_bad_report+0xa4/0xb0 free_unref_page+0x3cc/0x620 __folio_put+0xf4/0x158 split_huge_pages_all+0x1e0/0x3e8 split_huge_pages_write+0x25c/0x2d8 full_proxy_write+0x64/0xd8 vfs_write+0xcc/0x280 ksys_write+0x70/0x110 __arm64_sys_write+0x24/0x38 invoke_syscall+0x50/0x120 el0_svc_common.constprop.0+0xc8/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x34/0x128 el0t_64_sync_handler+0xc8/0xd0 el0t_64_sync+0x190/0x198 The issue may be triggered by damon, offline_page, page_idle, etc, which will increase the refcount of page table. 1. The page table itself will be discarded after reporting the "nonzero mapcount". 2. The HugeTLB page mapped by the page table miss freeing since we treat the page table as shared and a shared page table will not be unmapped. Fix it by introducing independent PMD page table shared count. As described by comment, pt_index/pt_mm/pt_frag_refcount are used for s390 gmap, x86 pgds and powerpc, pt_share_count is used for x86/arm64/riscv pmds, so we can reuse the field as pt_share_count. Link: https://lkml.kernel.org/r/[email protected] Fixes: 39dde65 ("[PATCH] shared page table for hugetlb page") Signed-off-by: Liu Shixin <[email protected]> Cc: Kefeng Wang <[email protected]> Cc: Ken Chen <[email protected]> Cc: Muchun Song <[email protected]> Cc: Nanyong Sun <[email protected]> Cc: Jane Chu <[email protected]> Cc: <[email protected]> Signed-off-by: Andrew Morton <[email protected]> (cherry picked from commit 02333ac1c35370517a19a4a131332a9690c6a5c7) Signed-off-by: Marcin Wcisło <[email protected]>
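The core idea of the fix, a dedicated share counter that speculative folio references cannot disturb, can be modeled in a small standalone C program (ptdesc_model and the helper names are illustrative, not kernel API):

	#include <stdatomic.h>
	#include <stdio.h>

	/* Userspace model of the change in 59d9094; not kernel code. */
	struct ptdesc_model {
		atomic_int refcount;       /* perturbed by speculative walkers    */
		atomic_int pt_share_count; /* touched only by share/unshare paths */
	};

	static int old_check(struct ptdesc_model *pt)  /* buggy: page_count() > 1 */
	{
		return atomic_load(&pt->refcount) > 1;
	}

	static int new_check(struct ptdesc_model *pt)  /* fixed: dedicated counter */
	{
		return atomic_load(&pt->pt_share_count) != 0;
	}

	int main(void)
	{
		struct ptdesc_model pt = { 1, 0 }; /* allocated, never shared */

		/* A try_get_folio()-style speculative reference arrives... */
		atomic_fetch_add(&pt.refcount, 1);

		/* ...and only the old check wrongly reports the table as shared. */
		printf("old: %d, new: %d\n", old_check(&pt), new_check(&pt));
		return 0;
	}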
jira VULN-71587 cve CVE-2025-38085 commit-author Jann Horn <[email protected]> commit 1013af4 upstream-diff Stable 6.1 b7754d3aa7bf9f62218d096c0c8f6c13698fac8b was used for the actual (clean) cherry pick huge_pmd_unshare() drops a reference on a page table that may have previously been shared across processes, potentially turning it into a normal page table used in another process in which unrelated VMAs can afterwards be installed. If this happens in the middle of a concurrent gup_fast(), gup_fast() could end up walking the page tables of another process. While I don't see any way in which that immediately leads to kernel memory corruption, it is really weird and unexpected. Fix it with an explicit broadcast IPI through tlb_remove_table_sync_one(), just like we do in khugepaged when removing page tables for a THP collapse. Link: https://lkml.kernel.org/r/[email protected] Link: https://lkml.kernel.org/r/[email protected] Fixes: 39dde65 ("[PATCH] shared page table for hugetlb page") Signed-off-by: Jann Horn <[email protected]> Reviewed-by: Lorenzo Stoakes <[email protected]> Cc: Liam Howlett <[email protected]> Cc: Muchun Song <[email protected]> Cc: Oscar Salvador <[email protected]> Cc: Vlastimil Babka <[email protected]> Cc: <[email protected]> Signed-off-by: Andrew Morton <[email protected]> (cherry picked from commit b7754d3aa7bf9f62218d096c0c8f6c13698fac8b) Signed-off-by: Marcin Wcisło <[email protected]>
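The essence of the fix, as also visible in the interdiff above, is a single barrier placed between clearing the PUD and dropping the share count. Paraphrased from the patched huge_pmd_unshare() tail:

	pud_clear(pud);
	/*
	 * Once our caller drops the rmap lock, some other process might be
	 * using this page table as a normal, non-hugetlb page table. The IPI
	 * broadcast waits out any pending gup_fast() walkers first (they run
	 * with interrupts disabled, so they cannot span the IPI).
	 */
	tlb_remove_table_sync_one();
	ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
	mm_dec_nr_pmds(mm);
	return 1;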
jira VULN-46930 cve-bf CVE-2024-57883 commit-author Jane Chu <[email protected]> commit 14967a9 upstream-diff This commit fixes `mm: hugetlb: independent PMD page table shared count' which was included in ciqlts9_4 by cherry-picking stable-6.1 backport 02333ac1c35370517a19a4a131332a9690c6a5c7 of kernel-mainline 59d9094. Differences between 02333ac and 59d9094 were driving the diffs between this commit and the upstrem 14967a9. include/linux/mm_types.h Removed the definition of `ptdesc_pmd_is_shared()' function in alignment with stable-5.15 backport 8410996 (it omits the definition of `ptdesc_pmd_pts_*()' functions family, to which `ptdesc_pmd_is_shared()' belongs). mm/hugetlb.c copy_hugetlb_page_range() 1. Used CONFIG_ARCH_WANT_HUGE_PMD_SHARE instead of CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING, because the latter was introduced only in the non-backported commit 188cac5. 2. Since `ptdesc_pmd_is_shared()' was not defined, read the `pt_share_count' field directly, as is done in the stable-5.15 backport 8410996. (Compare changes to `huge_pmd_unshare()' in `mm/hugetlb.c' between upstream 59d9094 and stable-5.15 8410996.) huge_pmd_unshare() No change to the conditional. It was arguably not needed in the upstream as well, probably introduced only for the sake of clarity in the presence of `ptdesc_pmd_is_shared()' function, which is missing here. commit 59d9094 ("mm: hugetlb: independent PMD page table shared count") introduced ->pt_share_count dedicated to hugetlb PMD share count tracking, but omitted fixing copy_hugetlb_page_range(), leaving the function relying on page_count() for tracking that no longer works. When lazy page table copy for hugetlb is disabled, that is, revert commit bcd51a3 ("hugetlb: lazy page table copies in fork()") fork()'ing with hugetlb PMD sharing quickly lockup - [ 239.446559] watchdog: BUG: soft lockup - CPU#75 stuck for 27s! [ 239.446611] RIP: 0010:native_queued_spin_lock_slowpath+0x7e/0x2e0 [ 239.446631] Call Trace: [ 239.446633] <TASK> [ 239.446636] _raw_spin_lock+0x3f/0x60 [ 239.446639] copy_hugetlb_page_range+0x258/0xb50 [ 239.446645] copy_page_range+0x22b/0x2c0 [ 239.446651] dup_mmap+0x3e2/0x770 [ 239.446654] dup_mm.constprop.0+0x5e/0x230 [ 239.446657] copy_process+0xd17/0x1760 [ 239.446660] kernel_clone+0xc0/0x3e0 [ 239.446661] __do_sys_clone+0x65/0xa0 [ 239.446664] do_syscall_64+0x82/0x930 [ 239.446668] ? count_memcg_events+0xd2/0x190 [ 239.446671] ? syscall_trace_enter+0x14e/0x1f0 [ 239.446676] ? syscall_exit_work+0x118/0x150 [ 239.446677] ? arch_exit_to_user_mode_prepare.constprop.0+0x9/0xb0 [ 239.446681] ? clear_bhb_loop+0x30/0x80 [ 239.446684] ? clear_bhb_loop+0x30/0x80 [ 239.446686] entry_SYSCALL_64_after_hwframe+0x76/0x7e There are two options to resolve the potential latent issue: 1. warn against PMD sharing in copy_hugetlb_page_range(), 2. fix it. This patch opts for the second option. While at it, simplify the comment, the details are not actually relevant anymore. Link: https://lkml.kernel.org/r/[email protected] Fixes: 59d9094 ("mm: hugetlb: independent PMD page table shared count") Signed-off-by: Jane Chu <[email protected]> Reviewed-by: Harry Yoo <[email protected]> Acked-by: Oscar Salvador <[email protected]> Acked-by: David Hildenbrand <[email protected]> Cc: Jann Horn <[email protected]> Cc: Liu Shixin <[email protected]> Cc: Muchun Song <[email protected]> Signed-off-by: Andrew Morton <[email protected]> (cherry picked from commit 14967a9) Signed-off-by: Marcin Wcisło <[email protected]>
jira VULN-46930 cve-bf CVE-2024-57883 commit-author David Hildenbrand (Red Hat) <[email protected]> commit ca1a47c upstream-diff Adaptation analogous to 3ab513d in relation to the upstream 14967a9 - inlined missing `ptdesc_pmd_is_shared()', `ptdesc_pmd_pts_count()', `virt_to_ptdesc()' Patch series "mm/hugetlb: fixes for PMD table sharing (incl. using mmu_gather)", v3. One functional fix, one performance regression fix, and two related comment fixes. I cleaned up my prototype I recently shared [1] for the performance fix, deferring most of the cleanups I had in the prototype to a later point. While doing that I identified the other things. The goal of this patch set is to be backported to stable trees "fairly" easily. At least patch ctrliq#1 and ctrliq#4. Patch ctrliq#1 fixes hugetlb_pmd_shared() not detecting any sharing Patch ctrliq#2 + ctrliq#3 are simple comment fixes that patch ctrliq#4 interacts with. Patch ctrliq#4 is a fix for the reported performance regression due to excessive IPI broadcasts during fork()+exit(). The last patch is all about TLB flushes, IPIs and mmu_gather. Read: complicated There are plenty of cleanups in the future to be had + one reasonable optimization on x86. But that's all out of scope for this series. Runtime tested, with a focus on fixing the performance regression using the original reproducer [2] on x86. This patch (of 4): We switched from (wrongly) using the page count to an independent shared count. Now, shared page tables have a refcount of 1 (excluding speculative references) and instead use ptdesc->pt_share_count to identify sharing. We didn't convert hugetlb_pmd_shared(), so right now, we would never detect a shared PMD table as such, because sharing/unsharing no longer touches the refcount of a PMD table. Page migration, like mbind() or migrate_pages() would allow for migrating folios mapped into such shared PMD tables, even though the folios are not exclusive. In smaps we would account them as "private" although they are "shared", and we would be wrongly setting the PM_MMAP_EXCLUSIVE in the pagemap interface. Fix it by properly using ptdesc_pmd_is_shared() in hugetlb_pmd_shared(). Link: https://lkml.kernel.org/r/[email protected] Link: https://lkml.kernel.org/r/[email protected] Link: https://lore.kernel.org/all/[email protected]/ [1] Link: https://lore.kernel.org/all/[email protected]/ [2] Fixes: 59d9094 ("mm: hugetlb: independent PMD page table shared count") Signed-off-by: David Hildenbrand (Red Hat) <[email protected]> Reviewed-by: Rik van Riel <[email protected]> Reviewed-by: Lance Yang <[email protected]> Tested-by: Lance Yang <[email protected]> Reviewed-by: Harry Yoo <[email protected]> Tested-by: Laurence Oberman <[email protected]> Reviewed-by: Lorenzo Stoakes <[email protected]> Acked-by: Oscar Salvador <[email protected]> Cc: Liu Shixin <[email protected]> Cc: Uschakow, Stanislav" <[email protected]> Cc: <[email protected]> Signed-off-by: Andrew Morton <[email protected]> (cherry picked from commit ca1a47c) Signed-off-by: Marcin Wcisło <[email protected]>
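The resulting check boils down to this sketch (assuming the ciqlts9_4 adaptation inlines the missing helpers, as the upstream-diff note says; upstream uses ptdesc_pmd_is_shared() directly):

	/* Sharing is now tracked by pt_share_count, not the page refcount,
	 * which speculative references can perturb. */
	static bool hugetlb_pmd_shared(pte_t *pte)
	{
		return atomic_read(&virt_to_page(pte)->pt_share_count) != 0;
	}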
jira VULN-46930 cve-bf CVE-2024-57883 commit-author David Hildenbrand (Red Hat) <[email protected]> commit 3937027 upstream-diff Resolved conflicts due to the 4e1f5f6 backport incorporating a typo 'refernece', which was fixed upstream (and expected by this patch) in b6c4660, but it didn't make it to ciqlts9_4. Could not have been picked as prerequisite due to multiple conflicts Ever since we stopped using the page count to detect shared PMD page tables, these comments are outdated. The only reason we have to flush the TLB early is because once we drop the i_mmap_rwsem, the previously shared page table could get freed (to then get reallocated and used for other purpose). So we really have to flush the TLB before that could happen. So let's simplify the comments a bit. The "If we unshared PMDs, the TLB flush was not recorded in mmu_gather." part introduced as in commit a4a118f ("hugetlbfs: flush TLBs correctly after huge_pmd_unshare") was confusing: sure it is recorded in the mmu_gather, otherwise tlb_flush_mmu_tlbonly() wouldn't do anything. So let's drop that comment while at it as well. We'll centralize these comments in a single helper as we rework the code next. Link: https://lkml.kernel.org/r/[email protected] Fixes: 59d9094 ("mm: hugetlb: independent PMD page table shared count") Signed-off-by: David Hildenbrand (Red Hat) <[email protected]> Reviewed-by: Rik van Riel <[email protected]> Tested-by: Laurence Oberman <[email protected]> Reviewed-by: Lorenzo Stoakes <[email protected]> Acked-by: Oscar Salvador <[email protected]> Reviewed-by: Harry Yoo <[email protected]> Cc: Liu Shixin <[email protected]> Cc: Lance Yang <[email protected]> Cc: "Uschakow, Stanislav" <[email protected]> Cc: <[email protected]> Signed-off-by: Andrew Morton <[email protected]> (cherry picked from commit 3937027) Signed-off-by: Marcin Wcisło <[email protected]>
jira VULN-46930 cve-bf CVE-2024-57883 commit-author David Hildenbrand (Red Hat) <[email protected]> commit a8682d5 PMD page table unsharing no longer touches the refcount of a PMD page table. Also, it is not about dropping the refcount of a "PMD page" but the "PMD page table". Let's just simplify by saying that the PMD page table was unmapped, consequently also unmapping the folio that was mapped into this page. This code should be deduplicated in the future. Link: https://lkml.kernel.org/r/[email protected] Fixes: 59d9094 ("mm: hugetlb: independent PMD page table shared count") Signed-off-by: David Hildenbrand (Red Hat) <[email protected]> Reviewed-by: Rik van Riel <[email protected]> Tested-by: Laurence Oberman <[email protected]> Reviewed-by: Lorenzo Stoakes <[email protected]> Acked-by: Oscar Salvador <[email protected]> Cc: Liu Shixin <[email protected]> Cc: Harry Yoo <[email protected]> Cc: Lance Yang <[email protected]> Cc: "Uschakow, Stanislav" <[email protected]> Cc: <[email protected]> Signed-off-by: Andrew Morton <[email protected]> (cherry picked from commit a8682d5) Signed-off-by: Marcin Wcisło <[email protected]>
[LTS 9.4]
CVE-2025-38084 VULN-71578
CVE-2025-38085 VULN-71587
CVE-2024-57883 VULN-46930
About
This PR is the LTS 9.4 version of #731. While CVE-2025-38084 and CVE-2025-38085 are not interdependent, their fixes appeared upstream in the same branch d3c82f6 and are usually backported together as well (CentOS 9 462b3c3, CentOS 10 42421eb, stable 6.1 b7754d3aa7bf9f62218d096c0c8f6c13698fac8b, stable 5.10 952596b08c74e8fe9e2883d1dc8a8f54a37384ec, stable 5.15 a3d864c901a300c295692d129159fc3001a56185). For a comparison of the commits related to these CVEs across the different kernel lines, see Appendix: Backports Overview.
Relation to the LTS 9.2 fix
The fix is, for the most part, the same as in LTS 9.2 #731, with some minor differences:
- The CVE-2025-38085 fix (mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race) and its prerequisite (mm: hugetlb: independent PMD page table shared count) were taken from linux-6.1.y instead of linux-5.15.y. This was only to avoid context conflicts, as linux-6.1.y has a history of the tlb module more similar to ciqlts9_4 than linux-5.15.y does. The commit diffs are practically the same.
- mm/hugetlb: make detecting shared pte more reliable wasn't backported as the prerequisite for the bugfix of the CVE-2025-38085 fix (the mm/hugetlb: fix copy_hugetlb_page_range() to use ->pt_share_count commit) because it was already backported to ciqlts9_4 as 643137f.
- No backport of mm/khugepaged: fix GUP-fast interaction by sending IPI as a prerequisite for the CVE-2025-38085 fix (mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race) was done, because it was already backported to ciqlts9_4 in f4c1e18.
- The CVE-2025-38084 fix (mm/hugetlb: unshare page tables during VMA split, not before) was taken from linux-5.15.y to minimize conflicts compared to the upstream, just like in the LTS 9.2 case; however, the end result differs from the linux-5.15.y pick for the LTS 9.2 version ad741c4 - see the hugetlb_unshare_pmds() function.
CVE-2025-38085 fix discussion
The LTS 9.2 PR #731 raised suspicion of kABI breakage, which was eventually resolved. The same situation can be found in this patch set. An attempt was made to avoid the mm: hugetlb: independent PMD page table shared count commit requiring the use of RH_KABI_BROKEN_INSERT, as it may not have been strictly required for the fix of CVE-2025-38085. However, it was decided to backport it to LTS 9.4 as well, because only one solution was found - the CentOS 9 fix 12a6db3 - which didn't incorporate this commit as a prerequisite. All other analyzed solutions did (CentOS 10 41f7eb5, stable 6.6 fe684290418ef9ef76630072086ee530b92f02b8, stable 6.1 b7754d3aa7bf9f62218d096c0c8f6c13698fac8b, stable 5.15 a3d864c901a300c295692d129159fc3001a56185, stable 5.10 952596b08c74e8fe9e2883d1dc8a8f54a37384ec).
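The size argument for the RH_KABI_BROKEN_INSERT justification is mechanical: adding a member to a union that already has a member of the same type cannot change its size or alignment. A standalone illustration (hypothetical type and union names, not the kernel's):

	#include <assert.h>

	typedef struct { int counter; } atomic_t_model; /* stand-in for atomic_t */

	union scratch_before {
		void *pt_mm;
		atomic_t_model pt_frag_refcount;
	};

	union scratch_after {                  /* after the RH_KABI_BROKEN_INSERT */
		void *pt_mm;
		atomic_t_model pt_frag_refcount;
		atomic_t_model pt_share_count; /* same type as an existing member */
	};

	static_assert(sizeof(union scratch_before) == sizeof(union scratch_after),
		      "inserting pt_share_count does not change the union size");

	int main(void) { return 0; }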
which refers to the line right after the introduced
tlb_remove_table_sync_one()call:In
ciqlts9_4withoutmm: hugetlb: independent PMD page table shared countthis line would bejust like it was in CentOS 9 fix 12a6db3. It could not have been determined whether these two situations were similar enough to warrant the exact same fix, so the more established solution was used, even though 12a6db3 cherry-picked cleanly.
Additionally, this prerequisite has its own associated CVE (CVE-2024-57883), which would probably have to be addressed eventually anyway.
CVE-2024-57883 fresh fixes
The recent bugfixes for the CVE-2024-57883 fix were part of the same upstream branch c25f2fb.
Of these, commits 2, 3 and 4 were backported. Commit 1 was dropped for the following reasons:
- The commit mm/hugetlb: fix excessive IPI broadcasts when unsharing PMD tables using mmu_gather represents more of an optimization than a bug fix, let alone a vulnerability fix. Moreover, the performance drop was observed in a highly specific environment: https://lore.kernel.org/all/[email protected]/
- The commit is relatively big and complicated, causing multiple conflicts upon cherry-picking onto ciqlts9_4. Although they were eventually resolved (see f0dd083 for reference), no other backports (official stable or CentOS) exist to compare the solution with. This increases the risk of an erroneous backport.
- The fix breaks the kABI with the change to struct mmu_gather in mm/mmu_gather.c: 8ce720d#diff-297cf0f839ab2f417e9f0f34ba2b1e49d6056e3c99595aabd6a3d3c17eee991eL364
The breakage occurs in the pv_ops variable, at the definition of the .mmu.tlb_remove_table field:
kernel-src-tree/arch/x86/kernel/paravirt.c, lines 321 to 322 in 36b870b
To establish whether the change is an actual kABI breakage, the usage of mmu_gather must be analyzed. The documentation of the struct suggests that it's local to the memory management module:
kernel-src-tree/include/asm-generic/tlb.h, lines 267 to 270 in 36b870b
However, the usage can also be found in the fs subsystem:
kernel-src-tree/fs/exec.c, line 690 in 36b870b
kernel-src-tree/fs/exec.c, line 761 in 36b870b
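These callers all follow the same on-stack pattern; a condensed sketch (simplified from the in-tree callers, not a complete function):

	struct mmu_gather tlb;    /* caller-owned, lives on the stack */

	tlb_gather_mmu(&tlb, mm); /* public initializer from mm/mmu_gather.c */
	/* ... unmap_vmas()/free_pgtables() record pages and tables in tlb ... */
	tlb_finish_mmu(&tlb);     /* flush TLBs and free everything gathered */

This is why any change to the struct layout is visible to every compilation unit that embeds it.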
The struct is never allocated dynamically and there is no API in include/asm-generic/tlb.h or include/linux/mm_types.h to achieve that. Instead it's allocated on the stack, as in the examples above (19 cases in total), and initialized with __tlb_gather_mmu(), accessible publicly through tlb_gather_mmu() and tlb_gather_mmu_fullmm() defined in mm/mmu_gather.c. This means that an external driver using this struct would probably allocate it on the stack as well, and the binary interface would break. It could not be established whether an external driver using the mmu_gather struct is a sufficiently unlikely scenario to warrant suppressing the kABI breakage warnings.
However, it's important to remember that the mmu_gather struct itself is not whitelisted, only the pv_ops variable, which takes a pointer to mmu_gather as an argument in one of its functions. It's possible that this function (.mmu.tlb_remove_table) is always called with an mmu_gather pointer provided only by the kernel, never by the user, which would mean the kABI isn't broken, as the __tlb_remove_page_size() function that .mmu.tlb_remove_table reduces to remains unchanged, with the same API as before. However, again, it could not be established whether mmu_gather is unlikely to ever be allocated by the user when using .mmu.tlb_remove_table.
Moreover, even if the kABI were not broken in practice, neither of the tested macros RH_KABI_BROKEN_INSERT and RH_KABI_EXCLUDE actually succeeds in suppressing the kABI check error.
It's unclear what the reason for this could be. The source of the breakage was definitively located in the modification of the mmu_gather struct, by eliminating the change there (and only there) while preserving the changes in all other places of the 8ce720d commit and obtaining a kABI-passing kernel build.
mmu_gatherstruct by eliminating the change there (and only there), while preserving changes in all other places of the 8ce720d commit and obtaining a kABI-passing kernel build.Commits
CVE-2025-38084
CVE-2025-38085 (+ CVE-2024-57883)
This fix for CVE-2024-57883 serves more as a prerequisite for the following commit than as a CVE fix in itself.
To track the differences from the upstream, refer to the definitions of the related functions/macros from kernel-mainline:
ptdesc_pmd_is_shared(): kernel-src-tree/include/linux/mm_types.h, lines 658 to 661 in 6bd9ed0
ptdesc_pmd_pts_count(): kernel-src-tree/include/linux/mm_types.h, lines 653 to 656 in 6bd9ed0
virt_to_ptdesc(): kernel-src-tree/include/linux/mm.h, lines 3121 to 3124 in 6bd9ed0
page_ptdesc(): kernel-src-tree/include/linux/mm_types.h, lines 633 to 635 in 6bd9ed0
kABI check: passed
Boot test: passed
boot-test.log
Kselftests: passed (relative)
Reference
kselftests–ciqlts9_4–run1.log
Patch
kselftests–ciqlts9_4-CVE-batch-18–run1.log
kselftests–ciqlts9_4-CVE-batch-18–run2.log
kselftests–ciqlts9_4-CVE-batch-18–run3.log
Comparison
The test results for the reference and the patch are the same.
Appendix: Backports Overview