Skip to content

Commit ee63072

Browse files
libcdb: improve the search speed of search_by_symbol_offsets (#2413)
* Add `hash_type` for `search_by_symbol_offsets` * Add docs * Update CHANGELOG * Allow search `id` in search_by_hash * Fix py2.7 test * Rename `hash_type` to `search_type` * Rename `TYPES['id']` to `TYPES['libs_id']` * Rename part `hex_encoded_id` to `search_target` * Turbofast extract build id * Fix docs * Add a map for types key * Extract proper buildid * Fix docs * Fix E0606 * Simplify get `mapped_type` * Add `search_by_libs_id` to `__all__` * Modify docs * Fix py2.7 * Fix reference --------- Co-authored-by: peace-maker <[email protected]>
1 parent 785ed9f commit ee63072

File tree

2 files changed

+140
-36
lines changed

2 files changed

+140
-36
lines changed

CHANGELOG.md

+2
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,13 @@ The table below shows which release corresponds to each branch, and what date th
7676
- [#2358][2358] Cache output of `asm()`
7777
- [#2457][2457] Catch exception of non-ELF files in checksec.
7878
- [#2444][2444] Add `ELF.close()` to release resources
79+
- [#2413][2413] libcdb: improve the search speed of `search_by_symbol_offsets` in local libc-database
7980

8081
[2471]: https://github.com/Gallopsled/pwntools/pull/2471
8182
[2358]: https://github.com/Gallopsled/pwntools/pull/2358
8283
[2457]: https://github.com/Gallopsled/pwntools/pull/2457
8384
[2444]: https://github.com/Gallopsled/pwntools/pull/2444
85+
[2413]: https://github.com/Gallopsled/pwntools/pull/2413
8486

8587
## 4.14.0 (`beta`)
8688

pwnlib/libcdb.py

+138-36
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,14 @@
88
import time
99
import six
1010
import tempfile
11+
import struct
1112

1213
from pwnlib.context import context
1314
from pwnlib.elf import ELF
1415
from pwnlib.filesystem.path import Path
1516
from pwnlib.log import getLogger
1617
from pwnlib.tubes.process import process
17-
from pwnlib.util.fiddling import enhex
18+
from pwnlib.util.fiddling import enhex, unhex
1819
from pwnlib.util.hashes import sha1filehex, sha256filehex, md5filehex
1920
from pwnlib.util.misc import read
2021
from pwnlib.util.misc import which
@@ -23,12 +24,46 @@
2324

2425
log = getLogger(__name__)
2526

26-
HASHES = {
27-
'build_id': lambda path: enhex(ELF(path, checksec=False).buildid or b''),
27+
28+
def _turbofast_extract_build_id(path):
    """
    Extract the GNU build ID of a file without fully parsing it as an ELF.

    Only the first 0x1000 bytes of the file are read and scanned for the
    header of an ``NT_GNU_BUILD_ID`` note. The full (slow) ELF parser is
    used as a fallback whenever no well-formed note lies completely inside
    that window, so a truncated or bogus ID is never returned.

    The searched pattern and the size fields are decoded as little-endian;
    files using another layout simply do not match and take the fallback.

    Elf_External_Note:

    0x00 +--------+
         | namesz | <- Size of entry's owner string
    0x04 +--------+
         | descsz | <- Size of the note descriptor
    0x08 +--------+
         |  type  | <- Interpretation of the descriptor
    0x0c +--------+
         |  name  | <- Start of the name+desc data
    ...  +--------+
         |  desc  |
    ...  +--------+

    Arguments:
        path(str):
            Path of the file to inspect.

    Returns:
        The hex encoded build ID. When the fallback parser reports no
        build ID, the hex encoding of an empty byte string is returned.
    """
    data = read(path, 0x1000)
    # search NT_GNU_BUILD_ID and b"GNU\x00" (type+name)
    idx = data.find(unhex("03000000474e5500"))

    # A real note header has namesz and descsz (4 bytes each) right in
    # front of the matched type+name, so a match closer than 8 bytes to the
    # start of the data cannot be a note.
    if idx >= 8:
        namesz, descsz = struct.unpack("<LL", data[idx-8: idx])
        desc = data[idx+8: idx+8+descsz]
        # namesz must cover exactly b"GNU\x00", and the descriptor has to be
        # non-empty and fully contained in the bytes we read; otherwise the
        # match is either a coincidence or cut off by the read window.
        if namesz == 4 and descsz and len(desc) == descsz:
            return enhex(desc)

    # No trustworthy note found in the first page: parse the ELF properly.
    return enhex(ELF(path, checksec=False).buildid or b'')
51+
52+
53+
TYPES = {
54+
'libs_id': None,
55+
'build_id': _turbofast_extract_build_id,
2856
'sha1': sha1filehex,
2957
'sha256': sha256filehex,
3058
'md5': md5filehex,
3159
}
60+
61+
# mapping for search result (same as libc.rip)
62+
MAP_TYPES = {
63+
'libs_id': 'id',
64+
'build_id': 'buildid'
65+
}
66+
3267
DEBUGINFOD_SERVERS = [
3368
'https://debuginfod.elfutils.org/',
3469
]
@@ -42,13 +77,16 @@
4277

4378
# https://gitlab.com/libcdb/libcdb wasn't updated after 2019,
4479
# but still is a massive database of older libc binaries.
45-
def provider_libcdb(hex_encoded_id, hash_type):
80+
def provider_libcdb(hex_encoded_id, search_type):
81+
if search_type == 'libs_id':
82+
return None
83+
4684
# Deferred import because it's slow
4785
import requests
4886
from six.moves import urllib
4987

5088
# Build the URL using the requested hash type
51-
url_base = "https://gitlab.com/libcdb/libcdb/raw/master/hashes/%s/" % hash_type
89+
url_base = "https://gitlab.com/libcdb/libcdb/raw/master/hashes/%s/" % search_type
5290
url = urllib.parse.urljoin(url_base, hex_encoded_id)
5391

5492
data = b""
@@ -58,15 +96,15 @@ def provider_libcdb(hex_encoded_id, hash_type):
5896
data = wget(url, timeout=20)
5997

6098
if not data:
61-
log.warn_once("Could not fetch libc for %s %s from libcdb", hash_type, hex_encoded_id)
99+
log.warn_once("Could not fetch libc for %s %s from libcdb", search_type, hex_encoded_id)
62100
break
63101

64102
# GitLab serves up symlinks as the relative path of their target, so follow it
65103
if data.startswith(b'..'):
66104
url = os.path.dirname(url) + '/'
67105
url = urllib.parse.urljoin(url.encode('utf-8'), data)
68106
except requests.RequestException as e:
69-
log.warn_once("Failed to fetch libc for %s %s from libcdb: %s", hash_type, hex_encoded_id, e)
107+
log.warn_once("Failed to fetch libc for %s %s from libcdb: %s", search_type, hex_encoded_id, e)
70108
return data
71109

72110
def query_libc_rip(params):
@@ -86,16 +124,17 @@ def query_libc_rip(params):
86124
return None
87125

88126
# https://libc.rip/
89-
def provider_libc_rip(hex_encoded_id, hash_type):
127+
def provider_libc_rip(search_target, search_type):
90128
# Build the request for the hash type
91129
# https://github.com/niklasb/libc-database/blob/master/searchengine/api.yml
92-
if hash_type == 'build_id':
93-
hash_type = 'buildid'
94-
params = {hash_type: hex_encoded_id}
130+
if search_type in MAP_TYPES.keys():
131+
search_type = MAP_TYPES[search_type]
132+
133+
params = {search_type: search_target}
95134

96135
libc_match = query_libc_rip(params)
97136
if not libc_match:
98-
log.warn_once("Could not find libc info for %s %s on libc.rip", hash_type, hex_encoded_id)
137+
log.warn_once("Could not find libc info for %s %s on libc.rip", search_type, search_target)
99138
return None
100139

101140
if len(libc_match) > 1:
@@ -107,13 +146,13 @@ def provider_libc_rip(hex_encoded_id, hash_type):
107146
data = wget(url, timeout=20)
108147

109148
if not data:
110-
log.warn_once("Could not fetch libc binary for %s %s from libc.rip", hash_type, hex_encoded_id)
149+
log.warn_once("Could not fetch libc binary for %s %s from libc.rip", search_type, search_target)
111150
return None
112151
return data
113152

114153
# Check if the local system libc matches the requested hash.
115-
def provider_local_system(hex_encoded_id, hash_type):
116-
if hash_type == 'id':
154+
def provider_local_system(hex_encoded_id, search_type):
155+
if search_type == 'libs_id':
117156
return None
118157
shell_path = os.environ.get('SHELL', None) or '/bin/sh'
119158
if not os.path.exists(shell_path):
@@ -123,22 +162,29 @@ def provider_local_system(hex_encoded_id, hash_type):
123162
if not local_libc:
124163
log.debug('Cannot lookup libc from shell %r. Skipping local system libc matching.', shell_path)
125164
return None
126-
if HASHES[hash_type](local_libc.path) == hex_encoded_id:
165+
if TYPES[search_type](local_libc.path) == hex_encoded_id:
127166
return local_libc.data
128167
return None
129168

130169
# Offline search https://github.com/niklasb/libc-database for hash type
131-
def provider_local_database(hex_encoded_id, hash_type):
170+
def provider_local_database(search_target, search_type):
132171
if not context.local_libcdb:
133172
return None
134173

135174
localdb = Path(context.local_libcdb)
136175
if not localdb.is_dir():
137176
return None
138177

139-
log.debug("Searching local libc database, %s: %s", hash_type, hex_encoded_id)
178+
# Handle the specific search type 'libs_id'
179+
if search_type == 'libs_id':
180+
libc_list = list(localdb.rglob("%s.so" % search_target))
181+
if len(libc_list) == 0:
182+
return None
183+
return read(libc_list[0])
184+
185+
log.debug("Searching local libc database, %s: %s", search_type, search_target)
140186
for libc_path in localdb.rglob("*.so"):
141-
if hex_encoded_id == HASHES[hash_type](libc_path):
187+
if search_target == TYPES[search_type](libc_path):
142188
return read(libc_path)
143189

144190
return None
@@ -185,11 +231,28 @@ def query_local_database(params):
185231
"online": [provider_libcdb, provider_libc_rip]
186232
}
187233

188-
def search_by_hash(hex_encoded_id, hash_type='build_id', unstrip=True, offline_only=False):
189-
assert hash_type in HASHES, hash_type
234+
def search_by_hash(search_target, search_type='build_id', unstrip=True, offline_only=False):
235+
"""search_by_hash(str, str, bool, bool) -> str
236+
Arguments:
237+
search_target(str):
238+
Use for searching the libc. This could be a hex encoded ID (`hex_encoded_id`) or a library
239+
name (`libs_id`). Depending on `search_type`, this can represent different types of encoded
240+
values or names.
241+
search_type(str):
242+
The type of the search to be performed, it should be one of the keys in the `TYPES` dictionary.
243+
unstrip(bool):
244+
Try to fetch debug info for the libc and apply it to the downloaded file.
245+
offline_only(bool):
246+
If True, restricts the search to offline providers only (local database). If False, it will also
247+
search online providers. Default is False.
248+
249+
Returns:
250+
Path to the downloaded library on disk, or :const:`None`.
251+
"""
252+
assert search_type in TYPES, search_type
190253

191254
# Ensure that the libcdb cache directory exists
192-
cache, cache_valid = _check_elf_cache('libcdb', hex_encoded_id, hash_type)
255+
cache, cache_valid = _check_elf_cache('libcdb', search_target, search_type)
193256
if cache_valid:
194257
return cache
195258

@@ -203,12 +266,12 @@ def search_by_hash(hex_encoded_id, hash_type='build_id', unstrip=True, offline_o
203266

204267
# Run through all available libc database providers to see if we have a match.
205268
for provider in providers:
206-
data = provider(hex_encoded_id, hash_type)
269+
data = provider(search_target, search_type)
207270
if data and data.startswith(b'\x7FELF'):
208271
break
209272

210273
if not data:
211-
log.warn_once("Could not find libc for %s %s anywhere", hash_type, hex_encoded_id)
274+
log.warn_once("Could not find libc for %s %s anywhere", search_type, search_target)
212275

213276
# Save whatever we got to the cache
214277
write(cache, data or b'')
@@ -257,7 +320,7 @@ def _search_debuginfo_by_hash(base_url, hex_encoded_id):
257320

258321
return cache
259322

260-
def _check_elf_cache(cache_type, hex_encoded_id, hash_type):
323+
def _check_elf_cache(cache_type, search_target, search_type):
261324
"""
262325
Check if there already is an ELF file for this hash in the cache.
263326
@@ -270,14 +333,14 @@ def _check_elf_cache(cache_type, hex_encoded_id, hash_type):
270333
True
271334
"""
272335
# Ensure that the cache directory exists
273-
cache_dir = os.path.join(context.cache_dir, cache_type, hash_type)
336+
cache_dir = os.path.join(context.cache_dir, cache_type, search_type)
274337

275338
if not os.path.isdir(cache_dir):
276339
os.makedirs(cache_dir)
277340

278341
# If we already downloaded the file, and it looks even passingly like
279342
# a valid ELF file, return it.
280-
cache = os.path.join(cache_dir, hex_encoded_id)
343+
cache = os.path.join(cache_dir, search_target)
281344

282345
if not os.path.exists(cache):
283346
return cache, False
@@ -289,7 +352,7 @@ def _check_elf_cache(cache_type, hex_encoded_id, hash_type):
289352
# Retry failed lookups after some time
290353
if time.time() > os.path.getmtime(cache) + NEGATIVE_CACHE_EXPIRY:
291354
return cache, False
292-
log.info_once("Skipping invalid cached ELF %s", hex_encoded_id)
355+
log.info_once("Skipping invalid cached ELF %s", search_target)
293356
return None, False
294357

295358
log.info_once("Using cached data from %r", cache)
@@ -583,7 +646,7 @@ def _handle_multiple_matching_libcs(matching_libcs):
583646
selected_index = options("Select the libc version to use:", [libc['id'] for libc in matching_libcs])
584647
return matching_libcs[selected_index]
585648

586-
def search_by_symbol_offsets(symbols, select_index=None, unstrip=True, return_as_list=False, offline_only=False):
649+
def search_by_symbol_offsets(symbols, select_index=None, unstrip=True, return_as_list=False, offline_only=False, search_type='build_id'):
587650
"""
588651
Lookup possible matching libc versions based on leaked function addresses.
589652
@@ -608,6 +671,8 @@ def search_by_symbol_offsets(symbols, select_index=None, unstrip=True, return_as
608671
offline_only(bool):
609672
When pass `offline_only=True`, restricts search mode to offline sources only,
610673
disable online lookup. Defaults to `False`, and enable both offline and online providers.
674+
search_type(str):
675+
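Which identifier of the matched libc is used to fetch it. Must be one of the keys of `TYPES` (e.g. `'build_id'` or `'libs_id'`). Defaults to `'build_id'`.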
611676
612677
Returns:
613678
Path to the downloaded library on disk, or :const:`None`.
@@ -626,6 +691,8 @@ def search_by_symbol_offsets(symbols, select_index=None, unstrip=True, return_as
626691
>>> for buildid in matched_libcs: # doctest +SKIP
627692
... libc = ELF(search_by_build_id(buildid)) # doctest +SKIP
628693
"""
694+
assert search_type in TYPES, search_type
695+
629696
for symbol, address in symbols.items():
630697
if isinstance(address, int):
631698
symbols[symbol] = hex(address)
@@ -661,21 +728,49 @@ def search_by_symbol_offsets(symbols, select_index=None, unstrip=True, return_as
661728
if return_as_list:
662729
return [libc['buildid'] for libc in matching_list]
663730

731+
mapped_type = MAP_TYPES.get(search_type, search_type)
732+
664733
# If there's only one match, return it directly
665734
if len(matching_list) == 1:
666-
return search_by_build_id(matching_list[0]['buildid'], unstrip=unstrip, offline_only=offline_only)
735+
return search_by_hash(matching_list[0][mapped_type], search_type=search_type, unstrip=unstrip, offline_only=offline_only)
667736

668737
# If a specific index is provided, validate it and return the selected libc
669738
if select_index is not None:
670739
if select_index > 0 and select_index <= len(matching_list):
671-
return search_by_build_id(matching_list[select_index - 1]['buildid'], unstrip=unstrip, offline_only=offline_only)
740+
return search_by_hash(matching_list[select_index - 1][mapped_type], search_type=search_type, unstrip=unstrip, offline_only=offline_only)
672741
else:
673742
log.error('Invalid selected libc index. %d is not in the range of 1-%d.', select_index, len(matching_list))
674743
return None
675744

676745
# Handle multiple matches interactively if no index is specified
677746
selected_libc = _handle_multiple_matching_libcs(matching_list)
678-
return search_by_build_id(selected_libc['buildid'], unstrip=unstrip, offline_only=offline_only)
747+
return search_by_hash(selected_libc[mapped_type], search_type=search_type, unstrip=unstrip, offline_only=offline_only)
748+
749+
def search_by_libs_id(libs_id, unstrip=True, offline_only=False):
    """
    Given a libs ID, attempt to find a matching libc.

    This is a thin wrapper which forwards to :func:`search_by_hash` with the
    search type ``'libs_id'``.

    Arguments:
        libs_id(str):
            Libs ID (e.g. 'libc6_...') of the library
        unstrip(bool):
            Try to fetch debug info for the libc and apply it to the downloaded file.
        offline_only(bool):
            When ``True``, only offline sources are searched and online lookup
            is disabled. Defaults to ``False``, which enables both the offline
            and the online providers.

    Returns:
        Path to the downloaded library on disk, or :const:`None`.

    Examples:

        >>> None == search_by_libs_id('XX')
        True
        >>> filename = search_by_libs_id('libc6_2.31-3_amd64')
        >>> hex(ELF(filename).symbols.read)
        '0xeef40'
    """
    return search_by_hash(
        libs_id,
        search_type='libs_id',
        unstrip=unstrip,
        offline_only=offline_only,
    )
679774

680775
def search_by_build_id(hex_encoded_id, unstrip=True, offline_only=False):
681776
"""
@@ -819,9 +914,16 @@ def _pack_libs_info(path, libs_id, libs_url, syms):
819914
info["libs_url"] = libs_url
820915
info["download_url"] = ""
821916

822-
for hash_type, hash_func in HASHES.items():
823-
# replace 'build_id' to 'buildid'
824-
info[hash_type.replace("_", "")] = hash_func(path)
917+
for search_type, hash_func in TYPES.items():
918+
# pass libs_id
919+
if search_type == 'libs_id':
920+
continue
921+
922+
# replace search_type
923+
if search_type in MAP_TYPES.keys():
924+
search_type = MAP_TYPES[search_type]
925+
926+
info[search_type] = hash_func(path)
825927

826928
default_symbol_list = [
827929
"__libc_start_main_ret", "dup2", "printf", "puts", "read", "system", "str_bin_sh"
@@ -886,4 +988,4 @@ def get_build_id_offsets():
886988
}.get(context.arch, [])
887989

888990

889-
__all__ = ['get_build_id_offsets', 'search_by_build_id', 'search_by_sha1', 'search_by_sha256', 'search_by_md5', 'unstrip_libc', 'search_by_symbol_offsets', 'download_libraries']
991+
__all__ = ['get_build_id_offsets', 'search_by_build_id', 'search_by_sha1', 'search_by_sha256', 'search_by_md5', 'search_by_libs_id', 'unstrip_libc', 'search_by_symbol_offsets', 'download_libraries']

0 commit comments

Comments
 (0)