|
| 1 | +import time |
1 | 2 | import uuid
|
2 | 3 |
|
| 4 | +import datahub.metadata.schema_classes as models |
3 | 5 | from datahub.emitter.mce_builder import make_data_job_urn, make_dataset_urn
|
4 | 6 | from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
5 | 7 | from datahub.metadata.schema_classes import (
|
@@ -136,3 +138,95 @@ def test_datajob_inputoutput_dataset_patch(graph_client):
|
136 | 138 | inputoutput_lineage_read.inputDatasetEdges[0].destinationUrn
|
137 | 139 | == other_dataset_urn
|
138 | 140 | )
|
| 141 | + |
| 142 | + |
| 143 | +def test_datajob_multiple_inputoutput_dataset_patch(graph_client): |
| 144 | + """Test creating a data job with multiple input and output datasets and verifying the aspects.""" |
| 145 | + # Create the data job |
| 146 | + datajob_urn = "urn:li:dataJob:(urn:li:dataFlow:(airflow,training,default),training)" |
| 147 | + |
| 148 | + # Create input and output dataset URNs |
| 149 | + input_datasets = ["input_data_1", "input_data_2"] |
| 150 | + output_datasets = ["output_data_1", "output_data_2"] |
| 151 | + |
| 152 | + input_dataset_urns = [ |
| 153 | + make_dataset_urn(platform="s3", name=f"test_patch_{dataset}", env="PROD") |
| 154 | + for dataset in input_datasets |
| 155 | + ] |
| 156 | + output_dataset_urns = [ |
| 157 | + make_dataset_urn(platform="s3", name=f"test_patch_{dataset}", env="PROD") |
| 158 | + for dataset in output_datasets |
| 159 | + ] |
| 160 | + |
| 161 | + # Create edges for datasets |
| 162 | + def make_edge(urn, generate_auditstamp=False): |
| 163 | + audit_stamp = models.AuditStampClass( |
| 164 | + time=int(time.time() * 1000.0), |
| 165 | + actor="urn:li:corpuser:datahub", |
| 166 | + ) |
| 167 | + return EdgeClass( |
| 168 | + destinationUrn=str(urn), |
| 169 | + lastModified=audit_stamp if generate_auditstamp else None, |
| 170 | + ) |
| 171 | + |
| 172 | + # Initialize empty input/output lineage |
| 173 | + initial_lineage = DataJobInputOutputClass( |
| 174 | + inputDatasets=[], outputDatasets=[], inputDatasetEdges=[], outputDatasetEdges=[] |
| 175 | + ) |
| 176 | + |
| 177 | + # Emit initial lineage |
| 178 | + mcpw = MetadataChangeProposalWrapper(entityUrn=datajob_urn, aspect=initial_lineage) |
| 179 | + graph_client.emit_mcp(mcpw) |
| 180 | + |
| 181 | + # Create patches for input and output datasets |
| 182 | + patch_builder = DataJobPatchBuilder(datajob_urn) |
| 183 | + for input_urn in input_dataset_urns: |
| 184 | + patch_builder.add_input_dataset(make_edge(input_urn)) |
| 185 | + for output_urn in output_dataset_urns: |
| 186 | + patch_builder.add_output_dataset(make_edge(output_urn)) |
| 187 | + |
| 188 | + # Apply patches |
| 189 | + for patch_mcp in patch_builder.build(): |
| 190 | + graph_client.emit_mcp(patch_mcp) |
| 191 | + |
| 192 | + # Verify the lineage was correctly applied |
| 193 | + lineage_aspect = graph_client.get_aspect( |
| 194 | + entity_urn=datajob_urn, |
| 195 | + aspect_type=DataJobInputOutputClass, |
| 196 | + ) |
| 197 | + |
| 198 | + # Assert lineage was created |
| 199 | + assert lineage_aspect is not None |
| 200 | + assert lineage_aspect.inputDatasetEdges is not None |
| 201 | + assert lineage_aspect.outputDatasetEdges is not None |
| 202 | + |
| 203 | + # Verify input datasets |
| 204 | + assert len(lineage_aspect.inputDatasetEdges) == len(input_datasets) |
| 205 | + input_urns = {edge.destinationUrn for edge in lineage_aspect.inputDatasetEdges} |
| 206 | + expected_input_urns = {str(urn) for urn in input_dataset_urns} |
| 207 | + assert input_urns == expected_input_urns |
| 208 | + |
| 209 | + # Verify output datasets |
| 210 | + assert len(lineage_aspect.outputDatasetEdges) == len(output_datasets) |
| 211 | + output_urns = {edge.destinationUrn for edge in lineage_aspect.outputDatasetEdges} |
| 212 | + expected_output_urns = {str(urn) for urn in output_dataset_urns} |
| 213 | + assert output_urns == expected_output_urns |
| 214 | + |
| 215 | + # Test updating the same datasets again (idempotency) |
| 216 | + patch_builder = DataJobPatchBuilder(datajob_urn) |
| 217 | + for input_urn in input_dataset_urns: |
| 218 | + patch_builder.add_input_dataset(make_edge(input_urn)) |
| 219 | + for output_urn in output_dataset_urns: |
| 220 | + patch_builder.add_output_dataset(make_edge(output_urn)) |
| 221 | + |
| 222 | + for patch_mcp in patch_builder.build(): |
| 223 | + graph_client.emit_mcp(patch_mcp) |
| 224 | + |
| 225 | + # Verify the aspect hasn't changed |
| 226 | + updated_lineage_aspect = graph_client.get_aspect( |
| 227 | + entity_urn=datajob_urn, |
| 228 | + aspect_type=DataJobInputOutputClass, |
| 229 | + ) |
| 230 | + |
| 231 | + assert updated_lineage_aspect is not None |
| 232 | + assert updated_lineage_aspect.to_obj() == lineage_aspect.to_obj() |
0 commit comments