
Commit ea0ae8c

feat(airflow) Override datajob external_url (#9681)

Authored by gp1105739 (Peng Gao) and Peng Gao
Co-authored-by: Peng Gao <[email protected]>
1 parent e6d7066 · commit ea0ae8c

File tree

7 files changed: +49 −5 lines changed

docs/lineage/airflow.md

+2
```diff
@@ -135,6 +135,8 @@ conn_id = datahub_rest_default # or datahub_kafka_default
 | capture_ownership_info | true | If true, the owners field of the DAG will be captured as a DataHub corpuser. |
 | capture_tags_info | true | If true, the tags field of the DAG will be captured as DataHub tags. |
 | capture_executions | true | If true, we'll capture task runs in DataHub in addition to DAG definitions. |
+| datajob_url_link | taskinstance | If set to taskinstance, the DataJob URL will be the Airflow task instance link. It can also be set to grid. |
 | graceful_exceptions | true | If set to true, most runtime errors in the lineage backend will be suppressed and will not cause the overall task to fail. Note that configuration issues will still throw exceptions. |

 #### Validate that the plugin is working
```
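
For context, the new option sits alongside the existing keys in the `[datahub]` section of `airflow.cfg`. A minimal sketch of enabling the grid-view link, assuming the section and key names implied by the `conf.get("datahub", "datajob_url_link", ...)` call added in this commit:

```ini
[datahub]
conn_id = datahub_rest_default
# New in this commit: link DataJobs to Airflow's grid view instead of
# the default task-instance list view.
datajob_url_link = grid
```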

metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_config.py

+12
```diff
@@ -1,3 +1,4 @@
+from enum import Enum
 from typing import TYPE_CHECKING, Optional
 
 import datahub.emitter.mce_builder as builder
@@ -8,6 +9,11 @@
     from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook
 
 
+class DatajobUrl(Enum):
+    GRID = "grid"
+    TASKINSTANCE = "taskinstance"
+
+
 class DatahubLineageConfig(ConfigModel):
     # This class is shared between the lineage backend and the Airflow plugin.
     # The defaults listed here are only relevant for the lineage backend.
@@ -41,6 +47,8 @@ class DatahubLineageConfig(ConfigModel):
     # The Airflow plugin behaves as if it were set to True.
     graceful_exceptions: bool = True
 
+    datajob_url_link: DatajobUrl = DatajobUrl.TASKINSTANCE
+
     def make_emitter_hook(self) -> "DatahubGenericHook":
         # This is necessary to avoid issues with circular imports.
         from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook
@@ -65,6 +73,9 @@ def get_lineage_config() -> DatahubLineageConfig:
     disable_openlineage_plugin = conf.get(
         "datahub", "disable_openlineage_plugin", fallback=True
     )
+    datajob_url_link = conf.get(
+        "datahub", "datajob_url_link", fallback=DatajobUrl.TASKINSTANCE.value
+    )
 
     return DatahubLineageConfig(
         enabled=enabled,
@@ -77,4 +88,5 @@ def get_lineage_config() -> DatahubLineageConfig:
         log_level=log_level,
         debug_emitter=debug_emitter,
         disable_openlineage_plugin=disable_openlineage_plugin,
+        datajob_url_link=datajob_url_link,
     )
```
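
Since `datajob_url_link` is read from `airflow.cfg` as a plain string but typed as `DatajobUrl` on the `ConfigModel`, the string is coerced to the enum by value. A minimal standalone sketch of that mapping (plain `Enum`, no plugin imports):

```python
from enum import Enum


class DatajobUrl(Enum):
    GRID = "grid"
    TASKINSTANCE = "taskinstance"


# Value-based lookup: the string from airflow.cfg selects the enum member,
# and an unknown value raises ValueError rather than passing silently.
assert DatajobUrl("grid") is DatajobUrl.GRID
assert DatajobUrl("taskinstance") is DatajobUrl.TASKINSTANCE

try:
    DatajobUrl("graph")
except ValueError as e:
    print(e)  # 'graph' is not a valid DatajobUrl
```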

metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py

+21 −4
```diff
@@ -13,6 +13,7 @@
 from datahub.utilities.urns.data_job_urn import DataJobUrn
 
 from datahub_airflow_plugin._airflow_compat import AIRFLOW_PATCHED
+from datahub_airflow_plugin._config import DatahubLineageConfig, DatajobUrl
 
 assert AIRFLOW_PATCHED
 
@@ -208,6 +209,7 @@ def generate_datajob(
         set_dependencies: bool = True,
         capture_owner: bool = True,
         capture_tags: bool = True,
+        config: Optional[DatahubLineageConfig] = None,
     ) -> DataJob:
         """
@@ -217,6 +219,7 @@ def generate_datajob(
         :param set_dependencies: bool - whether to extract dependencies from airflow task
         :param capture_owner: bool - whether to extract owner from airflow task
         :param capture_tags: bool - whether to set tags automatically from airflow task
+        :param config: DatahubLineageConfig
         :return: DataJob - returns the generated DataJob object
         """
         dataflow_urn = DataFlowUrn.create_from_ids(
@@ -267,7 +270,11 @@ def generate_datajob(
 
         datajob.properties = job_property_bag
         base_url = conf.get("webserver", "base_url")
-        datajob.url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={datajob.flow_urn.get_flow_id()}&_flt_3_task_id={task.task_id}"
+
+        if config and config.datajob_url_link == DatajobUrl.GRID:
+            datajob.url = f"{base_url}/dags/{datajob.flow_urn.get_flow_id()}/grid?task_id={task.task_id}"
+        else:
+            datajob.url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={datajob.flow_urn.get_flow_id()}&_flt_3_task_id={task.task_id}"
 
         if capture_owner and dag.owner:
             datajob.owners.add(dag.owner)
@@ -290,9 +297,12 @@ def create_datajob_instance(
         task: "Operator",
         dag: "DAG",
         data_job: Optional[DataJob] = None,
+        config: Optional[DatahubLineageConfig] = None,
     ) -> DataProcessInstance:
         if data_job is None:
-            data_job = AirflowGenerator.generate_datajob(cluster, task=task, dag=dag)
+            data_job = AirflowGenerator.generate_datajob(
+                cluster, task=task, dag=dag, config=config
+            )
         dpi = DataProcessInstance.from_datajob(
             datajob=data_job, id=task.task_id, clone_inlets=True, clone_outlets=True
         )
@@ -407,9 +417,12 @@ def run_datajob(
         datajob: Optional[DataJob] = None,
         attempt: Optional[int] = None,
         emit_templates: bool = True,
+        config: Optional[DatahubLineageConfig] = None,
     ) -> DataProcessInstance:
         if datajob is None:
-            datajob = AirflowGenerator.generate_datajob(cluster, ti.task, dag)
+            datajob = AirflowGenerator.generate_datajob(
+                cluster, ti.task, dag, config=config
+            )
 
         assert dag_run.run_id
         dpi = DataProcessInstance.from_datajob(
@@ -480,6 +493,7 @@ def complete_datajob(
         end_timestamp_millis: Optional[int] = None,
         result: Optional[InstanceRunResult] = None,
         datajob: Optional[DataJob] = None,
+        config: Optional[DatahubLineageConfig] = None,
     ) -> DataProcessInstance:
         """
@@ -491,10 +505,13 @@ def complete_datajob(
         :param end_timestamp_millis: Optional[int]
         :param result: Optional[str] One of the result from datahub.metadata.schema_class.RunResultTypeClass
         :param datajob: Optional[DataJob]
+        :param config: Optional[DatahubLineageConfig]
         :return: DataProcessInstance
         """
         if datajob is None:
-            datajob = AirflowGenerator.generate_datajob(cluster, ti.task, dag)
+            datajob = AirflowGenerator.generate_datajob(
+                cluster, ti.task, dag, config=config
+            )
 
         if end_timestamp_millis is None:
             if ti.end_date:
```
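
The core behavior change is the branch on `config.datajob_url_link` when building `datajob.url`. A minimal sketch of that logic with the plugin's config and URN objects replaced by plain parameters (`build_datajob_url`, `dag_id`, and `task_id` are illustrative names, not plugin APIs):

```python
from enum import Enum
from typing import Optional


class DatajobUrl(Enum):
    GRID = "grid"
    TASKINSTANCE = "taskinstance"


def build_datajob_url(
    base_url: str, dag_id: str, task_id: str, link: Optional[DatajobUrl] = None
) -> str:
    # New behavior: deep-link to the task in Airflow's grid view.
    if link == DatajobUrl.GRID:
        return f"{base_url}/dags/{dag_id}/grid?task_id={task_id}"
    # Default (and pre-commit) behavior: the filtered task-instance list view.
    return (
        f"{base_url}/taskinstance/list/"
        f"?flt1_dag_id_equals={dag_id}&_flt_3_task_id={task_id}"
    )


print(build_datajob_url("http://localhost:8080", "my_dag", "my_task"))
print(build_datajob_url("http://localhost:8080", "my_dag", "my_task", DatajobUrl.GRID))
```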

metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py

+4
```diff
@@ -376,6 +376,7 @@ def on_task_instance_running(
             dag=dag,
             capture_tags=self.config.capture_tags_info,
             capture_owner=self.config.capture_ownership_info,
+            config=self.config,
         )
 
         # TODO: Make use of get_task_location to extract github urls.
@@ -397,6 +398,7 @@ def on_task_instance_running(
             dag_run=dagrun,
             datajob=datajob,
             emit_templates=False,
+            config=self.config,
         )
         logger.debug(f"Emitted DataHub DataProcess Instance start: {dpi}")
 
@@ -419,6 +421,7 @@ def on_task_instance_finish(
             dag=dag,
             capture_tags=self.config.capture_tags_info,
             capture_owner=self.config.capture_ownership_info,
+            config=self.config,
         )
 
         # Add lineage info.
@@ -436,6 +439,7 @@ def on_task_instance_finish(
             dag_run=dagrun,
             datajob=datajob,
             result=status,
+            config=self.config,
         )
         logger.debug(
             f"Emitted DataHub DataProcess Instance with status {status}: {dpi}"
```

metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin_v22.py

+4
```diff
@@ -120,6 +120,7 @@ def datahub_task_status_callback(context, status):
         dag=dag,
         capture_tags=config.capture_tags_info,
         capture_owner=config.capture_ownership_info,
+        config=config,
     )
     datajob.inlets.extend(
         entities_to_dataset_urn_list([let.urn for let in task_inlets])
@@ -143,6 +144,7 @@ def datahub_task_status_callback(context, status):
         dag_run=context["dag_run"],
         datajob=datajob,
         start_timestamp_millis=int(ti.start_date.timestamp() * 1000),
+        config=config,
     )
 
     task.log.info(f"Emitted Start Datahub Dataprocess Instance: {dpi}")
@@ -185,6 +187,7 @@ def datahub_pre_execution(context):
         dag=dag,
         capture_tags=config.capture_tags_info,
         capture_owner=config.capture_ownership_info,
+        config=config,
     )
     datajob.inlets.extend(
         entities_to_dataset_urn_list([let.urn for let in task_inlets])
@@ -208,6 +211,7 @@ def datahub_pre_execution(context):
         dag_run=context["dag_run"],
         datajob=datajob,
         start_timestamp_millis=int(ti.start_date.timestamp() * 1000),
+        config=config,
     )
 
     task.log.info(f"Emitting Datahub Dataprocess Instance: {dpi}")
```

metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/_lineage_core.py

+1
```diff
@@ -51,6 +51,7 @@ def send_lineage_to_datahub(
         dag=dag,
         capture_tags=config.capture_tags_info,
         capture_owner=config.capture_ownership_info,
+        config=config,
     )
     datajob.inlets.extend(entities_to_dataset_urn_list([let.urn for let in inlets]))
     datajob.outlets.extend(entities_to_dataset_urn_list([let.urn for let in outlets]))
```

metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/datahub.py

+5 −1
```diff
@@ -71,7 +71,11 @@ def send_lineage(
         try:
             context = context or {}  # ensure not None to satisfy mypy
             send_lineage_to_datahub(
-                config, operator, operator.inlets, operator.outlets, context
+                config,
+                operator,
+                operator.inlets,
+                operator.outlets,
+                context,
             )
         except Exception as e:
             operator.log.error(e)
```
