
Commit 32bd577 (parent 5938387)

feat(dataset_cli): add dry-run support
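
The commit threads a new `-n/--dry-run` flag through the `upsert` and `sync` subcommands of the dataset CLI group, so planned changes are reported but nothing is emitted to DataHub or written back to the local YAML file. A minimal, self-contained sketch of the same click flag-plus-guard pattern; the command name and body below are illustrative stand-ins, not the DataHub CLI itself:

    # Sketch only: a click flag that turns side effects into prefixed messages.
    import click


    @click.command()
    @click.option("-f", "--file", required=True, type=click.Path(exists=True))
    @click.option("-n", "--dry-run", is_flag=True, default=False, help="Perform a dry run")
    def sync(file: str, dry_run: bool) -> None:
        """Report what would be synced; skip side effects when --dry-run is set."""
        dry_run_prefix = "[dry-run]: " if dry_run else ""
        if not dry_run:
            pass  # real work (e.g. emitting metadata change proposals) would go here
        click.secho(f"{dry_run_prefix}Update succeeded for {file}.", fg="green")


    if __name__ == "__main__":
        sync()

With `-n/--dry-run` the command only prints the prefixed message; without the flag, the guarded branch runs.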

File tree: 4 files changed (+112, -10 lines)


metadata-ingestion/src/datahub/cli/specific/dataset_cli.py (+26, -10)
@@ -29,13 +29,16 @@ def dataset() -> None:
     name="upsert",
 )
 @click.option("-f", "--file", required=True, type=click.Path(exists=True))
+@click.option(
+    "-n", "--dry-run", type=bool, is_flag=True, default=False, help="Perform a dry run"
+)
 @upgrade.check_upgrade
 @telemetry.with_telemetry()
-def upsert(file: Path) -> None:
+def upsert(file: Path, dry_run: bool) -> None:
     """Upsert attributes to a Dataset in DataHub."""
     # Call the sync command with to_datahub=True to perform the upsert operation
     ctx = click.get_current_context()
-    ctx.invoke(sync, file=str(file), to_datahub=True)
+    ctx.invoke(sync, file=str(file), dry_run=dry_run, to_datahub=True)


 @dataset.command(
@@ -167,11 +170,16 @@ def file(lintcheck: bool, lintfix: bool, file: str) -> None:
 )
 @click.option("-f", "--file", required=True, type=click.Path(exists=True))
 @click.option("--to-datahub/--from-datahub", required=True, is_flag=True)
+@click.option(
+    "-n", "--dry-run", type=bool, is_flag=True, default=False, help="Perform a dry run"
+)
 @upgrade.check_upgrade
 @telemetry.with_telemetry()
-def sync(file: str, to_datahub: bool) -> None:
+def sync(file: str, to_datahub: bool, dry_run: bool) -> None:
     """Sync a Dataset file to/from DataHub"""

+    dry_run_prefix = "[dry-run]: " if dry_run else ""  # prefix to use in messages
+
     failures: List[str] = []
     with get_default_graph() as graph:
         datasets = Dataset.from_yaml(file)
@@ -189,7 +197,7 @@ def sync(file: str, to_datahub: bool) -> None:
                 click.secho(
                     "\n\t- ".join(
                         [
-                            f"Skipping Dataset {dataset.urn} due to missing entity references: "
+                            f"{dry_run_prefix}Skipping Dataset {dataset.urn} due to missing entity references: "
                         ]
                         + missing_entity_references
                     ),
@@ -199,13 +207,18 @@ def sync(file: str, to_datahub: bool) -> None:
                     continue
                 try:
                     for mcp in dataset.generate_mcp():
-                        graph.emit(mcp)
-                    click.secho(f"Update succeeded for urn {dataset.urn}.", fg="green")
+                        if not dry_run:
+                            graph.emit(mcp)
+                    click.secho(
+                        f"{dry_run_prefix}Update succeeded for urn {dataset.urn}.",
+                        fg="green",
+                    )
                 except Exception as e:
                     click.secho(
-                        f"Update failed for id {id}. due to {e}",
+                        f"{dry_run_prefix}Update failed for id {id}. due to {e}",
                         fg="red",
                     )
+                    failures.append(dataset.urn)
             else:
                 # Sync from DataHub
                 if graph.exists(dataset.urn):
@@ -215,13 +228,16 @@ def sync(file: str, to_datahub: bool) -> None:
                     existing_dataset: Dataset = Dataset.from_datahub(
                         graph=graph, urn=dataset.urn, config=dataset_get_config
                     )
-                    existing_dataset.to_yaml(Path(file))
+                    if not dry_run:
+                        existing_dataset.to_yaml(Path(file))
+                    else:
+                        click.secho(f"{dry_run_prefix}Will update file {file}")
                 else:
-                    click.secho(f"Dataset {dataset.urn} does not exist")
+                    click.secho(f"{dry_run_prefix}Dataset {dataset.urn} does not exist")
                     failures.append(dataset.urn)
     if failures:
         click.secho(
-            f"\nFailed to sync the following Datasets: {', '.join(failures)}",
+            f"\n{dry_run_prefix}Failed to sync the following Datasets: {', '.join(failures)}",
             fg="red",
         )
         raise click.Abort()
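
Note how `upsert` does not duplicate any logic: it forwards the new flag to `sync` through `click.Context.invoke`. A minimal sketch of that forwarding pattern, with hypothetical command names:

    # Sketch only: one click command delegating to another, passing the flag along.
    import click


    @click.group()
    def cli() -> None:
        pass


    @cli.command()
    @click.option("-n", "--dry-run", is_flag=True, default=False)
    def child(dry_run: bool) -> None:
        click.echo(f"child ran with dry_run={dry_run}")


    @cli.command()
    @click.option("-n", "--dry-run", is_flag=True, default=False)
    def parent(dry_run: bool) -> None:
        # Forward the flag to another command in the same group, mirroring
        # how upsert delegates to sync in the diff above.
        ctx = click.get_current_context()
        ctx.invoke(child, dry_run=dry_run)


    if __name__ == "__main__":
        cli()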

metadata-ingestion/tests/unit/cli/dataset/test_dataset_cmd.py (+60)
@@ -217,3 +217,63 @@ def test_multiple_datasets_in_file(self, mock_dataset, test_yaml_file):
         # Verify both dataset instances had to_yaml called
         mock_dataset1.to_yaml.assert_called_once()
         mock_dataset2.to_yaml.assert_called_once()
+
+    @patch("datahub.ingestion.graph.client.get_default_graph")
+    def test_dry_run_sync(self, mock_get_default_graph):
+        mock_graph = MagicMock()
+        mock_get_default_graph.return_value.__enter__.return_value = mock_graph
+
+        test_file = TEST_RESOURCES_DIR / "dataset_dry_run.yaml"
+
+        runner = CliRunner()
+        result = runner.invoke(
+            dataset, ["sync", "--dry-run", "--to-datahub", "-f", str(test_file)]
+        )
+
+        # Verify
+        assert result.exit_code == 0
+        assert not mock_get_default_graph.emit.called
+
+    @patch("datahub.ingestion.graph.client.get_default_graph")
+    def test_dry_run_sync_fail(self, mock_get_default_graph):
+        mock_graph = MagicMock()
+        mock_get_default_graph.return_value.__enter__.return_value = mock_graph
+
+        test_file = TEST_RESOURCES_DIR / "dataset_dry_run_bad_type.yaml"
+        runner = CliRunner()
+        result = runner.invoke(
+            dataset, ["sync", "--dry-run", "--to-datahub", "-f", str(test_file)]
+        )
+
+        # Verify
+        assert result.exit_code != 0
+        assert not mock_get_default_graph.emit.called
+        assert "Type bad_type is not a valid primitive type" in result.output
+
+    @patch("datahub.cli.specific.dataset_cli.get_default_graph")
+    def test_run_sync(self, mock_get_default_graph):
+        mock_graph = MagicMock()
+        mock_get_default_graph.return_value.__enter__.return_value = mock_graph
+
+        test_file = TEST_RESOURCES_DIR / "dataset_dry_run.yaml"
+
+        runner = CliRunner()
+        result = runner.invoke(dataset, ["sync", "--to-datahub", "-f", str(test_file)])
+
+        # Verify
+        assert result.exit_code == 0
+        assert mock_graph.emit.called
+
+    @patch("datahub.cli.specific.dataset_cli.get_default_graph")
+    def test_run_sync_fail(self, mock_get_default_graph):
+        mock_graph = MagicMock()
+        mock_get_default_graph.return_value.__enter__.return_value = mock_graph
+
+        test_file = TEST_RESOURCES_DIR / "dataset_dry_run_bad_type.yaml"
+        runner = CliRunner()
+        result = runner.invoke(dataset, ["sync", "--to-datahub", "-f", str(test_file)])
+
+        # Verify
+        assert result.exit_code != 0
+        assert not mock_get_default_graph.emit.called
+        assert "is not a valid primitive type" in result.output
dataset_dry_run.yaml (+13; referenced as TEST_RESOURCES_DIR / "dataset_dry_run.yaml" in the tests above)

@@ -0,0 +1,13 @@
+## This file is used to define the dataset schema for the dataset `foobar`
+- id: user.clicksv3
+  platform: hive
+  urn: urn:li:dataset:(urn:li:dataPlatform:hive,user.clicksv3,PROD) # use urn instead of id and platform
+  subtype: View
+  schema:
+    fields:
+      - id: ip
+        type: string
+        description: The IP address of the user
+      - id: user_id
+        type: string
+        description: The user ID of the user

dataset_dry_run_bad_type.yaml (+13; referenced as TEST_RESOURCES_DIR / "dataset_dry_run_bad_type.yaml" in the tests above)

@@ -0,0 +1,13 @@
+## This file is used to define the dataset schema for the dataset `foobar`
+- id: user.clicksv3
+  platform: hive
+  urn: urn:li:dataset:(urn:li:dataPlatform:hive,user.clicksv3,PROD) # use urn instead of id and platform
+  subtype: View
+  schema:
+    fields:
+      - id: ip
+        type: string
+        description: The IP address of the user
+      - id: user_id
+        type: bad_type
+        description: The user ID of the user
