 import pprint
 import shutil
 import tempfile
-from typing import Dict, List, Optional, Union
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Union
 
 import click
 
@@ -20,7 +21,10 @@
 from datahub.ingestion.source.source_registry import source_registry
 from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.telemetry import telemetry
-from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList
+from datahub.utilities.file_backed_collections import (
+    ConnectionWrapper,
+    FileBackedDict,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -391,29 +395,78 @@ def test_path_spec(config: str, input: str, path_spec_key: str) -> None:
         raise e
 
 
+def _jsonify(data: Any) -> Any:
+    if dataclasses.is_dataclass(data):
+        # dataclasses.asdict() is recursive. We're doing the recursion
+        # manually here via _jsonify calls, so we can't use
+        # dataclasses.asdict() here.
+        return {
+            f.name: _jsonify(getattr(data, f.name)) for f in dataclasses.fields(data)
+        }
+    elif isinstance(data, list):
+        return [_jsonify(item) for item in data]
+    elif isinstance(data, dict):
+        return {_jsonify(k): _jsonify(v) for k, v in data.items()}
+    elif isinstance(data, datetime):
+        return data.isoformat()
+    else:
+        return data
+
+
 @check.command()
-@click.argument("query-log-file", type=click.Path(exists=True, dir_okay=False))
-@click.option("--output", type=click.Path())
-def extract_sql_agg_log(query_log_file: str, output: Optional[str]) -> None:
+@click.argument("db-file", type=click.Path(exists=True, dir_okay=False))
+def extract_sql_agg_log(db_file: str) -> None:
     """Convert a sqlite db generated by the SqlParsingAggregator into a JSON."""
 
-    from datahub.sql_parsing.sql_parsing_aggregator import LoggedQuery
+    if pathlib.Path(db_file).suffix != ".db":
+        raise click.UsageError("DB file must be a sqlite db")
+
+    output_dir = pathlib.Path(db_file).with_suffix("")
+    output_dir.mkdir(exist_ok=True)
+
+    shared_connection = ConnectionWrapper(pathlib.Path(db_file))
+
+    tables: List[str] = [
+        row[0]
+        for row in shared_connection.execute(
+            """\
+SELECT
+    name
+FROM
+    sqlite_schema
+WHERE
+    type ='table' AND
+    name NOT LIKE 'sqlite_%';
+""",
+            parameters={},
+        )
+    ]
+    logger.info(f"Extracting {len(tables)} tables from {db_file}: {tables}")
+
+    for table in tables:
+        table_output_path = output_dir / f"{table}.json"
+        if table_output_path.exists():
+            logger.info(f"Skipping {table_output_path} because it already exists")
+            continue
 
-    assert dataclasses.is_dataclass(LoggedQuery)
+        # Some of the tables might actually be FileBackedList. Because
+        # the list is built on top of the FileBackedDict, we don't
+        # need to distinguish between the two cases.
 
-    shared_connection = ConnectionWrapper(pathlib.Path(query_log_file))
-    query_log = FileBackedList[LoggedQuery](
-        shared_connection=shared_connection, tablename="stored_queries"
-    )
-    logger.info(f"Extracting {len(query_log)} queries from {query_log_file}")
-    queries = [dataclasses.asdict(query) for query in query_log]
+        table_data: FileBackedDict[Any] = FileBackedDict(
+            shared_connection=shared_connection, tablename=table
+        )
 
-    if output:
-        with open(output, "w") as f:
-            json.dump(queries, f, indent=2, default=str)
-        logger.info(f"Extracted {len(queries)} queries to {output}")
-    else:
-        click.echo(json.dumps(queries, indent=2))
+        data = {}
+        with click.progressbar(
+            table_data.items(), length=len(table_data), label=f"Extracting {table}"
+        ) as items:
+            for k, v in items:
+                data[k] = _jsonify(v)
+
+        with open(table_output_path, "w") as f:
+            json.dump(data, f, indent=2, default=str)
+        logger.info(f"Extracted {len(data)} entries to {table_output_path}")
 
 
 @check.command()
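
Note on usage: after this change the command takes the aggregator's sqlite file directly and writes one <table>.json per table into a sibling directory named after the db file. Below is a minimal sketch of how the reworked command might be exercised; it assumes the check group is importable from datahub.cli.check_cli, that click derives the command name "extract-sql-agg-log" from the function name, and that aggregator_dump.db is a stand-in path, none of which is spelled out in the diff itself.

# Sketch only: exercising the reworked command through click's test runner.
# Assumptions (not shown in the diff): the check group lives in
# datahub.cli.check_cli, the command name is "extract-sql-agg-log", and
# aggregator_dump.db is a stand-in input file.
from click.testing import CliRunner

from datahub.cli.check_cli import check

runner = CliRunner()
result = runner.invoke(check, ["extract-sql-agg-log", "aggregator_dump.db"])
print(result.exit_code)
print(result.output)
# On success this creates an aggregator_dump/ directory next to the .db file,
# containing one <table>.json file per table found in the sqlite db.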
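
For reference, here is a self-contained sketch of what the new _jsonify helper yields for nested data. The helper body is copied from the diff above; the Snapshot dataclass and its values are invented purely for illustration.

# Self-contained sketch of the serialization behavior. The _jsonify body is
# copied from the diff above; the Snapshot dataclass and values are invented.
import dataclasses
from datetime import datetime
from typing import Any


def _jsonify(data: Any) -> Any:
    if dataclasses.is_dataclass(data):
        # Recurse manually instead of using dataclasses.asdict(), so nested
        # values (e.g. datetimes) also pass through _jsonify.
        return {
            f.name: _jsonify(getattr(data, f.name)) for f in dataclasses.fields(data)
        }
    elif isinstance(data, list):
        return [_jsonify(item) for item in data]
    elif isinstance(data, dict):
        return {_jsonify(k): _jsonify(v) for k, v in data.items()}
    elif isinstance(data, datetime):
        return data.isoformat()
    else:
        return data


@dataclasses.dataclass
class Snapshot:
    name: str
    seen_at: datetime
    tags: list


print(_jsonify([Snapshot("q1", datetime(2024, 5, 1, 9, 0), ["a", "b"]), {"count": 3}]))
# [{'name': 'q1', 'seen_at': '2024-05-01T09:00:00', 'tags': ['a', 'b']}, {'count': 3}]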
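
One aside on the table listing: sqlite_schema is the newer alias for sqlite_master (available since SQLite 3.33), so the same inventory can be taken by hand with the stdlib sqlite3 module when poking at an aggregator dump. The snippet below is a standalone illustration with a stand-in path, not datahub code.

# Standalone sketch (not datahub code): list the user tables in an aggregator
# sqlite file with the stdlib sqlite3 module. sqlite_master is used here
# because it works on every SQLite version; sqlite_schema is its newer alias.
import sqlite3

conn = sqlite3.connect("aggregator_dump.db")  # stand-in path
tables = [
    row[0]
    for row in conn.execute(
        "SELECT name FROM sqlite_master WHERE type = 'table' AND name NOT LIKE 'sqlite_%'"
    )
]
conn.close()
print(tables)  # the table names depend on what the aggregator persisted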