Skip to content

Commit ef3a814

Browse files
authored
fix(ingest): bump sqlglot + add some debug info to tests (#9867)
1 parent b15b352 commit ef3a814

File tree

46 files changed

+228
-87
lines changed

Some content is hidden

Large commits have some of their content hidden by default. Use the search box below to find content that may be hidden.

46 files changed

+228
-87
lines changed

metadata-ingestion/setup.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@
9494
sqlglot_lib = {
9595
# Using an Acryl fork of sqlglot.
9696
# https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:hsheth?expand=1
97-
"acryl-sqlglot==20.4.1.dev14",
97+
"acryl-sqlglot==21.1.2.dev9",
9898
}
9999

100100
sql_common = (
@@ -245,9 +245,7 @@
245245

246246
powerbi_report_server = {"requests", "requests_ntlm"}
247247

248-
slack = {
249-
"slack-sdk==3.18.1"
250-
}
248+
slack = {"slack-sdk==3.18.1"}
251249

252250
databricks = {
253251
# 0.1.11 appears to have authentication issues with azure databricks

metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ class KnownQueryLineageInfo:
105105
column_lineage: Optional[List[ColumnLineageInfo]] = None
106106

107107
timestamp: Optional[datetime] = None
108+
session_id: Optional[str] = None
108109
query_type: QueryType = QueryType.UNKNOWN
109110

110111

@@ -338,7 +339,7 @@ def add_known_query_lineage(
338339
QueryMetadata(
339340
query_id=query_fingerprint,
340341
formatted_query_string=known_query_lineage.query_text,
341-
session_id=_MISSING_SESSION_ID,
342+
session_id=known_query_lineage.session_id or _MISSING_SESSION_ID,
342343
query_type=known_query_lineage.query_type,
343344
lineage_type=models.DatasetLineageTypeClass.TRANSFORMED,
344345
latest_timestamp=known_query_lineage.timestamp,
@@ -809,7 +810,6 @@ def _gen_lineage_for_downstream(
809810
entityUrn=self._query_urn(query_id),
810811
aspects=[
811812
models.QueryPropertiesClass(
812-
dataPlatform=self.platform.urn(),
813813
statement=models.QueryStatementClass(
814814
value=query.formatted_query_string,
815815
language=models.QueryLanguageClass.SQL,

metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py

+11-8
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
from datahub.sql_parsing.sqlglot_utils import (
4040
DialectOrStr,
4141
get_dialect,
42-
get_query_fingerprint,
42+
get_query_fingerprint_debug,
4343
is_dialect_instance,
4444
parse_statement,
4545
)
@@ -181,11 +181,13 @@ class ColumnLineageInfo(_ParserBaseModel):
181181
class SqlParsingDebugInfo(_ParserBaseModel):
182182
confidence: float = 0.0
183183

184-
tables_discovered: int = 0
185-
table_schemas_resolved: int = 0
184+
tables_discovered: int = pydantic.Field(0, exclude=True)
185+
table_schemas_resolved: int = pydantic.Field(0, exclude=True)
186186

187-
table_error: Optional[Exception] = None
188-
column_error: Optional[Exception] = None
187+
generalized_statement: Optional[str] = None
188+
189+
table_error: Optional[Exception] = pydantic.Field(default=None, exclude=True)
190+
column_error: Optional[Exception] = pydantic.Field(default=None, exclude=True)
189191

190192
@property
191193
def error(self) -> Optional[Exception]:
@@ -206,8 +208,7 @@ class SqlParsingResult(_ParserBaseModel):
206208
# TODO include list of referenced columns
207209

208210
debug_info: SqlParsingDebugInfo = pydantic.Field(
209-
default_factory=lambda: SqlParsingDebugInfo(),
210-
exclude=True,
211+
default_factory=lambda: SqlParsingDebugInfo()
211212
)
212213

213214
@classmethod
@@ -887,7 +888,9 @@ def _sqlglot_lineage_inner(
887888
query_type, query_type_props = get_query_type_of_sql(
888889
original_statement, dialect=dialect
889890
)
890-
query_fingerprint = get_query_fingerprint(original_statement, dialect=dialect)
891+
query_fingerprint, debug_info.generalized_statement = get_query_fingerprint_debug(
892+
original_statement, dialect=dialect
893+
)
891894
return SqlParsingResult(
892895
query_type=query_type,
893896
query_type_props=query_type_props,

metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py

+19-13
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import hashlib
22
import logging
3-
from typing import Dict, Iterable, Optional, Union
3+
from typing import Dict, Iterable, Optional, Tuple, Union
44

55
import sqlglot
66
import sqlglot.errors
@@ -119,6 +119,23 @@ def generate_hash(text: str) -> str:
119119
return hashlib.sha256(text.encode("utf-8")).hexdigest()
120120

121121

122+
def get_query_fingerprint_debug(
123+
expression: sqlglot.exp.ExpOrStr, dialect: DialectOrStr
124+
) -> Tuple[str, str]:
125+
try:
126+
dialect = get_dialect(dialect)
127+
expression_sql = generalize_query(expression, dialect=dialect)
128+
except (ValueError, sqlglot.errors.SqlglotError) as e:
129+
if not isinstance(expression, str):
130+
raise
131+
132+
logger.debug("Failed to generalize query for fingerprinting: %s", e)
133+
expression_sql = expression
134+
135+
fingerprint = generate_hash(expression_sql)
136+
return fingerprint, expression_sql
137+
138+
122139
def get_query_fingerprint(
123140
expression: sqlglot.exp.ExpOrStr, dialect: DialectOrStr
124141
) -> str:
@@ -142,18 +159,7 @@ def get_query_fingerprint(
142159
The fingerprint for the SQL query.
143160
"""
144161

145-
try:
146-
dialect = get_dialect(dialect)
147-
expression_sql = generalize_query(expression, dialect=dialect)
148-
except (ValueError, sqlglot.errors.SqlglotError) as e:
149-
if not isinstance(expression, str):
150-
raise
151-
152-
logger.debug("Failed to generalize query for fingerprinting: %s", e)
153-
expression_sql = expression
154-
155-
fingerprint = generate_hash(expression_sql)
156-
return fingerprint
162+
return get_query_fingerprint_debug(expression, dialect)[0]
157163

158164

159165
def detach_ctes(

metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json

+3-4
Original file line numberDiff line numberDiff line change
@@ -75,14 +75,13 @@
7575
},
7676
"source": "SYSTEM",
7777
"created": {
78-
"time": 0,
78+
"time": 20000,
7979
"actor": "urn:li:corpuser:_ingestion"
8080
},
8181
"lastModified": {
82-
"time": 1707182625000,
82+
"time": 20000,
8383
"actor": "urn:li:corpuser:_ingestion"
84-
},
85-
"dataPlatform": "urn:li:dataPlatform:redshift"
84+
}
8685
}
8786
}
8887
},

metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_basic_lineage.json

+1-2
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,7 @@
6969
"lastModified": {
7070
"time": 1707182625000,
7171
"actor": "urn:li:corpuser:_ingestion"
72-
},
73-
"dataPlatform": "urn:li:dataPlatform:redshift"
72+
}
7473
}
7574
}
7675
},

metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_column_lineage_deduplication.json

+2-4
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,7 @@
8181
"lastModified": {
8282
"time": 1707182625000,
8383
"actor": "urn:li:corpuser:_ingestion"
84-
},
85-
"dataPlatform": "urn:li:dataPlatform:redshift"
84+
}
8685
}
8786
}
8887
},
@@ -123,8 +122,7 @@
123122
"lastModified": {
124123
"time": 1707182625000,
125124
"actor": "urn:li:corpuser:_ingestion"
126-
},
127-
"dataPlatform": "urn:li:dataPlatform:redshift"
125+
}
128126
}
129127
}
130128
},

metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts.json

+2-4
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,7 @@
106106
"lastModified": {
107107
"time": 25000,
108108
"actor": "urn:li:corpuser:_ingestion"
109-
},
110-
"dataPlatform": "urn:li:dataPlatform:redshift"
109+
}
111110
}
112111
}
113112
},
@@ -148,8 +147,7 @@
148147
"lastModified": {
149148
"time": 20000,
150149
"actor": "urn:li:corpuser:_ingestion"
151-
},
152-
"dataPlatform": "urn:li:dataPlatform:redshift"
150+
}
153151
}
154152
}
155153
},

metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_temp_table.json

+3-6
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,7 @@
6969
"lastModified": {
7070
"time": 1707182625000,
7171
"actor": "urn:li:corpuser:_ingestion"
72-
},
73-
"dataPlatform": "urn:li:dataPlatform:redshift"
72+
}
7473
}
7574
}
7675
},
@@ -163,8 +162,7 @@
163162
"lastModified": {
164163
"time": 1707182625000,
165164
"actor": "urn:li:corpuser:_ingestion"
166-
},
167-
"dataPlatform": "urn:li:dataPlatform:redshift"
165+
}
168166
}
169167
}
170168
},
@@ -231,8 +229,7 @@
231229
"lastModified": {
232230
"time": 1707182625000,
233231
"actor": "urn:li:corpuser:_ingestion"
234-
},
235-
"dataPlatform": "urn:li:dataPlatform:redshift"
232+
}
236233
}
237234
}
238235
},

metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_view_lineage.json

+1-2
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,7 @@
6969
"lastModified": {
7070
"time": 1707182625000,
7171
"actor": "urn:li:corpuser:_ingestion"
72-
},
73-
"dataPlatform": "urn:li:dataPlatform:redshift"
72+
}
7473
}
7574
}
7675
},

metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_create_view_with_cte.json

+5-1
Original file line numberDiff line numberDiff line change
@@ -85,5 +85,9 @@
8585
}
8686
]
8787
}
88-
]
88+
],
89+
"debug_info": {
90+
"confidence": 0.425,
91+
"generalized_statement": "CREATE VIEW `my-proj-2`.dataset.my_view AS WITH cte1 AS (SELECT * FROM dataset.table1 WHERE col1 = ?), cte2 AS (SELECT col3, col4 AS join_key FROM dataset.table2 WHERE col3 = ?) SELECT col5, cte1.*, col3 FROM dataset.table3 JOIN cte1 ON table3.col5 = cte1.col2 JOIN cte2 USING (join_key)"
92+
}
8993
}

metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_from_sharded_table_wildcard.json

+5-1
Original file line numberDiff line numberDiff line change
@@ -43,5 +43,9 @@
4343
}
4444
]
4545
}
46-
]
46+
],
47+
"debug_info": {
48+
"confidence": 0.9,
49+
"generalized_statement": "SELECT * FROM `bq-proj`.dataset.`table_2023*`"
50+
}
4751
}

metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_nested_subqueries.json

+5-1
Original file line numberDiff line numberDiff line change
@@ -43,5 +43,9 @@
4343
}
4444
]
4545
}
46-
]
46+
],
47+
"debug_info": {
48+
"confidence": 0.9,
49+
"generalized_statement": "SELECT * FROM (SELECT * FROM (SELECT * FROM `bq-proj`.dataset.table1))"
50+
}
4751
}

metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_sharded_table_normalization.json

+5-1
Original file line numberDiff line numberDiff line change
@@ -43,5 +43,9 @@
4343
}
4444
]
4545
}
46-
]
46+
],
47+
"debug_info": {
48+
"confidence": 0.9,
49+
"generalized_statement": "SELECT * FROM `bq-proj`.dataset.table_20230101"
50+
}
4751
}

metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_star_with_replace.json

+5-1
Original file line numberDiff line numberDiff line change
@@ -65,5 +65,9 @@
6565
}
6666
]
6767
}
68-
]
68+
],
69+
"debug_info": {
70+
"confidence": 0.35,
71+
"generalized_statement": "CREATE VIEW `my-project`.`my-dataset`.test_table AS SELECT * REPLACE (LOWER(something) AS something) FROM `my-project2`.`my-dataset2`.test_physical_table"
72+
}
6973
}

metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_view_from_union.json

+5-1
Original file line numberDiff line numberDiff line change
@@ -56,5 +56,9 @@
5656
}
5757
]
5858
}
59-
]
59+
],
60+
"debug_info": {
61+
"confidence": 0.4,
62+
"generalized_statement": "CREATE VIEW my_view AS SELECT * FROM my_project_2.my_dataset_2.sometable UNION DISTINCT SELECT * FROM my_project_2.my_dataset_2.sometable2 AS a"
63+
}
6064
}

metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_table_ddl.json

+5-1
Original file line numberDiff line numberDiff line change
@@ -59,5 +59,9 @@
5959
},
6060
"upstreams": []
6161
}
62-
]
62+
],
63+
"debug_info": {
64+
"confidence": 0.2,
65+
"generalized_statement": "CREATE TABLE IF NOT EXISTS costs (id INTEGER PRIMARY KEY, month TEXT NOT NULL, total_cost REAL NOT NULL, area REAL NOT NULL)"
66+
}
6367
}

metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_view_as_select.json

+5-1
Original file line numberDiff line numberDiff line change
@@ -52,5 +52,9 @@
5252
}
5353
]
5454
}
55-
]
55+
],
56+
"debug_info": {
57+
"confidence": 0.2,
58+
"generalized_statement": "CREATE VIEW vsal AS SELECT a.deptno AS \"Department\", a.num_emp / b.total_count AS \"Employees\", a.sal_sum / b.total_sal AS \"Salary\" FROM (SELECT deptno, COUNT() AS num_emp, SUM(sal) AS sal_sum FROM scott.emp WHERE city = ? GROUP BY deptno) a, (SELECT COUNT() AS total_count, SUM(sal) AS total_sal FROM scott.emp WHERE city = ?) b"
59+
}
5660
}

metadata-ingestion/tests/unit/sql_parsing/goldens/test_expand_select_star_basic.json

+5-1
Original file line numberDiff line numberDiff line change
@@ -187,5 +187,9 @@
187187
}
188188
]
189189
}
190-
]
190+
],
191+
"debug_info": {
192+
"confidence": 0.9,
193+
"generalized_statement": "SELECT SUM(totalprice) AS total_agg, * FROM snowflake_sample_data.tpch_sf1.orders WHERE orderdate = ?"
194+
}
191195
}

metadata-ingestion/tests/unit/sql_parsing/goldens/test_insert_as_select.json

+5-1
Original file line numberDiff line numberDiff line change
@@ -98,5 +98,9 @@
9898
},
9999
"upstreams": []
100100
}
101-
]
101+
],
102+
"debug_info": {
103+
"confidence": 0.2,
104+
"generalized_statement": "INSERT INTO query72 SELECT i_item_desc, w_warehouse_name, d1.d_week_seq, SUM(CASE WHEN promotion.p_promo_sk IS NULL THEN ? ELSE ? END) AS no_promo, SUM(CASE WHEN NOT promotion.p_promo_sk IS NULL THEN ? ELSE ? END) AS promo, COUNT(*) AS total_cnt FROM catalog_sales JOIN inventory ON (cs_item_sk = inv_item_sk) JOIN warehouse ON (w_warehouse_sk = inv_warehouse_sk) JOIN item ON (i_item_sk = cs_item_sk) JOIN customer_demographics ON (cs_bill_cdemo_sk = cd_demo_sk) JOIN household_demographics ON (cs_bill_hdemo_sk = hd_demo_sk) JOIN date_dim AS d1 ON (cs_sold_date_sk = d1.d_date_sk) JOIN date_dim AS d2 ON (inv_date_sk = d2.d_date_sk) JOIN date_dim AS d3 ON (cs_ship_date_sk = d3.d_date_sk) LEFT OUTER JOIN promotion ON (cs_promo_sk = p_promo_sk) LEFT OUTER JOIN catalog_returns ON (cr_item_sk = cs_item_sk AND cr_order_number = cs_order_number) WHERE d1.d_week_seq = d2.d_week_seq AND inv_quantity_on_hand < cs_quantity AND hd_buy_potential = ? AND cd_marital_status = ? GROUP BY i_item_desc, w_warehouse_name, d1.d_week_seq ORDER BY total_cnt DESC, i_item_desc, w_warehouse_name, d_week_seq LIMIT ?"
105+
}
102106
}

metadata-ingestion/tests/unit/sql_parsing/goldens/test_insert_with_column_list.json

+5-1
Original file line numberDiff line numberDiff line change
@@ -37,5 +37,9 @@
3737
}
3838
]
3939
}
40-
]
40+
],
41+
"debug_info": {
42+
"confidence": 0.2,
43+
"generalized_statement": "INSERT INTO downstream (a, c) SELECT a, c FROM upstream2"
44+
}
4145
}

0 commit comments

Comments (0)