Skip to content

Commit bc291ea

Browse files
fix(ingest/bigquery): Allow View generation when queries_v2 is turned off
1 parent d0b4f7a commit bc291ea

File tree

3 files changed

+30
-27
lines changed

3 files changed

+30
-27
lines changed

metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py

+17-11
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import functools
33
import logging
44
import os
5-
from typing import Iterable, List, Optional
5+
from typing import Iterable, List, Optional, Set
66

77
from datahub.ingestion.api.common import PipelineContext
88
from datahub.ingestion.api.decorators import (
@@ -255,10 +255,8 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
255255
for project in projects:
256256
yield from self.bq_schema_extractor.get_project_workunits(project)
257257

258-
if self.config.use_queries_v2:
259-
# Always ingest View and Snapshot lineage with schema ingestion
258+
if self.config.include_view_lineage:
260259
self.report.set_ingestion_stage("*", "View and Snapshot Lineage")
261-
262260
yield from self.lineage_extractor.get_lineage_workunits_for_views_and_snapshots(
263261
[p.id for p in projects],
264262
self.bq_schema_extractor.view_refs_by_project,
@@ -267,6 +265,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
267265
self.bq_schema_extractor.snapshots_by_ref,
268266
)
269267

268+
if self.config.use_queries_v2:
270269
# if both usage and lineage are disabled then skip queries extractor piece
271270
if (
272271
not self.config.include_usage_statistics
@@ -276,13 +275,21 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
276275

277276
self.report.set_ingestion_stage("*", QUERIES_EXTRACTION)
278277

278+
discovered_tables: Set[str] = set()
279+
if self.config.include_table_lineage:
280+
discovered_tables.update(self.bq_schema_extractor.table_refs)
281+
282+
if self.config.include_view_lineage:
283+
discovered_tables.update(self.bq_schema_extractor.view_snapshot_refs)
284+
279285
with BigQueryQueriesExtractor(
280286
connection=self.config.get_bigquery_client(),
281287
schema_api=self.bq_schema_extractor.schema_api,
282288
config=BigQueryQueriesExtractorConfig(
283289
window=self.config,
284290
user_email_pattern=self.config.usage.user_email_pattern,
285-
include_lineage=self.config.include_table_lineage,
291+
include_lineage=self.config.include_table_lineage
292+
or self.config.include_view_lineage,
286293
include_usage_statistics=self.config.include_usage_statistics,
287294
include_operations=self.config.usage.include_operational_stats,
288295
top_n_queries=self.config.usage.top_n_queries,
@@ -292,24 +299,23 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
292299
filters=self.filters,
293300
identifiers=self.identifiers,
294301
schema_resolver=self.sql_parser_schema_resolver,
295-
discovered_tables=self.bq_schema_extractor.table_refs,
302+
discovered_tables=discovered_tables,
296303
) as queries_extractor:
297304
self.report.queries_extractor = queries_extractor.report
298305
yield from queries_extractor.get_workunits_internal()
299306

300307
else:
301308
if self.config.include_usage_statistics:
302309
yield from self.usage_extractor.get_usage_workunits(
303-
[p.id for p in projects], self.bq_schema_extractor.table_refs
310+
[p.id for p in projects],
311+
self.bq_schema_extractor.table_refs.union(
312+
self.bq_schema_extractor.view_snapshot_refs
313+
),
304314
)
305315

306316
if self.config.include_table_lineage:
307317
yield from self.lineage_extractor.get_lineage_workunits(
308318
[p.id for p in projects],
309-
self.bq_schema_extractor.view_refs_by_project,
310-
self.bq_schema_extractor.view_definitions,
311-
self.bq_schema_extractor.snapshot_refs_by_project,
312-
self.bq_schema_extractor.snapshots_by_ref,
313319
self.bq_schema_extractor.table_refs,
314320
)
315321

metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py

+13-4
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ def __init__(
195195

196196
# Global store of table identifiers for lineage filtering
197197
self.table_refs: Set[str] = set()
198+
self.view_snapshot_refs: Set[str] = set()
198199

199200
# Maps project -> view_ref, so we can find all views in a project
200201
self.view_refs_by_project: Dict[str, Set[str]] = defaultdict(set)
@@ -233,6 +234,14 @@ def store_table_refs(self):
233234
or self.config.use_queries_v2
234235
)
235236

237+
@property
238+
def store_view_refs(self):
239+
return (
240+
self.config.include_view_lineage
241+
or self.config.include_usage_statistics
242+
or self.config.use_queries_v2
243+
)
244+
236245
def modified_base32decode(self, text_to_decode: str) -> str:
237246
# When we sync from DataHub to BigQuery, we encode the tags as modified base32 strings.
238247
# BigQuery labels only support lowercase letters, international characters, numbers, or underscores.
@@ -653,11 +662,11 @@ def _process_view(
653662
self.report.report_dropped(table_identifier.raw_table_name())
654663
return
655664

656-
if self.store_table_refs:
665+
if self.store_view_refs:
657666
table_ref = str(
658667
BigQueryTableRef(table_identifier).get_sanitized_table_ref()
659668
)
660-
self.table_refs.add(table_ref)
669+
self.view_snapshot_refs.add(table_ref)
661670
if self.config.lineage_parse_view_ddl and view.view_definition:
662671
self.view_refs_by_project[project_id].add(table_ref)
663672
self.view_definitions[table_ref] = view.view_definition
@@ -701,11 +710,11 @@ def _process_snapshot(
701710
f"Snapshot doesn't have any column or unable to get columns for snapshot: {table_identifier}"
702711
)
703712

704-
if self.store_table_refs:
713+
if self.store_view_refs:
705714
table_ref = str(
706715
BigQueryTableRef(table_identifier).get_sanitized_table_ref()
707716
)
708-
self.table_refs.add(table_ref)
717+
self.view_snapshot_refs.add(table_ref)
709718
if snapshot.base_table_identifier:
710719
self.snapshot_refs_by_project[project_id].add(table_ref)
711720
self.snapshots_by_ref[table_ref] = snapshot

metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py

-12
Original file line numberDiff line numberDiff line change
@@ -322,23 +322,11 @@ def get_lineage_workunits_for_views_and_snapshots(
322322
def get_lineage_workunits(
323323
self,
324324
projects: List[str],
325-
view_refs_by_project: Dict[str, Set[str]],
326-
view_definitions: FileBackedDict[str],
327-
snapshot_refs_by_project: Dict[str, Set[str]],
328-
snapshots_by_ref: FileBackedDict[BigqueryTableSnapshot],
329325
table_refs: Set[str],
330326
) -> Iterable[MetadataWorkUnit]:
331327
if not self._should_ingest_lineage():
332328
return
333329

334-
yield from self.get_lineage_workunits_for_views_and_snapshots(
335-
projects,
336-
view_refs_by_project,
337-
view_definitions,
338-
snapshot_refs_by_project,
339-
snapshots_by_ref,
340-
)
341-
342330
if self.config.use_exported_bigquery_audit_metadata:
343331
projects = ["*"] # project_id not used when using exported metadata
344332

0 commit comments

Comments
 (0)