Skip to content

Commit da85dc0

Browse files
hsheth2 authored and sleeperdeep committed
feat(ingest): add urn validation test files (datahub-project#12036)
1 parent 26e0596 commit da85dc0

12 files changed

+115
-136
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Basic URN format tests
2+
urn:li:abc
3+
urn:li:abc:
4+
urn:li:abc:()
5+
urn:li:abc:(abc,)
6+
urn:li:corpuser:abc)
7+
8+
# Reserved characters
9+
urn:li:corpuser:foo␟bar
10+
urn:li:tag:a,b,c
11+
12+
# CorpUser URN tests
13+
urn:li:corpuser:(part1,part2)
14+
15+
# Dataset URN tests
16+
urn:li:dataset:(urn:li:user:abc,dataset,prod)
17+
urn:li:dataset:(urn:li:user:abc,dataset)
18+
urn:li:dataset:(urn:li:user:abc,dataset,invalidEnv)
19+
20+
# DataFlow URN tests
21+
urn:li:dataFlow:(airflow,flow_id)
22+
23+
# DataJob URN tests
24+
urn:li:dataJob:(urn:li:user:abc,job_id)
25+
urn:li:dataJob:(urn:li:dataFlow:(airflow,flow_id,prod))
26+
27+
# Domain URN tests
28+
urn:li:domain:(part1,part2)
29+
30+
# Tag URN tests
31+
urn:li:tag:(part1,part2)
32+
33+
# Notebook URN tests
34+
urn:li:notebook:(part1,part2,part3)
35+
36+
# CorpGroup URN tests
37+
urn:li:corpGroup:(part1,part2)
38+
39+
# DataProcessInstance URN tests
40+
urn:li:dataProcessInstance:(part1,part2)

metadata-ingestion/tests/unit/urns/test_corp_group_urn.py

-10
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import pytest
44

55
from datahub.utilities.urns.corp_group_urn import CorpGroupUrn
6-
from datahub.utilities.urns.error import InvalidUrnError
76

87

98
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
@@ -17,12 +16,3 @@ def test_parse_urn(self) -> None:
1716
assert str(corp_group_urn) == corp_group_urn_str
1817
assert corp_group_urn == CorpGroupUrn(name="abc")
1918
assert corp_group_urn == CorpGroupUrn.create_from_id("abc")
20-
21-
def test_invalid_urn(self) -> None:
22-
with self.assertRaises(InvalidUrnError):
23-
CorpGroupUrn.create_from_string(
24-
"urn:li:abc:(urn:li:dataPlatform:abc,def,prod)"
25-
)
26-
27-
with self.assertRaises(InvalidUrnError):
28-
CorpGroupUrn.create_from_string("urn:li:corpGroup:(part1,part2)")

metadata-ingestion/tests/unit/urns/test_corpuser_urn.py

-10
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import pytest
44

55
from datahub.utilities.urns.corpuser_urn import CorpuserUrn
6-
from datahub.utilities.urns.error import InvalidUrnError
76

87

98
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
@@ -17,12 +16,3 @@ def test_parse_urn(self) -> None:
1716
assert str(corpuser_urn) == corpuser_urn_str
1817
assert corpuser_urn == CorpuserUrn("abc")
1918
assert corpuser_urn == CorpuserUrn.create_from_id("abc")
20-
21-
def test_invalid_urn(self) -> None:
22-
with self.assertRaises(InvalidUrnError):
23-
CorpuserUrn.create_from_string(
24-
"urn:li:abc:(urn:li:dataPlatform:abc,def,prod)"
25-
)
26-
27-
with self.assertRaises(InvalidUrnError):
28-
CorpuserUrn.create_from_string("urn:li:corpuser:(part1,part2)")

metadata-ingestion/tests/unit/urns/test_data_flow_urn.py

-8
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import pytest
44

55
from datahub.utilities.urns.data_flow_urn import DataFlowUrn
6-
from datahub.utilities.urns.error import InvalidUrnError
76

87

98
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
@@ -16,10 +15,3 @@ def test_parse_urn(self) -> None:
1615
assert data_flow_urn.get_env() == "prod"
1716
assert data_flow_urn.__str__() == "urn:li:dataFlow:(airflow,def,prod)"
1817
assert data_flow_urn == DataFlowUrn("airflow", "def", "prod")
19-
20-
def test_invalid_urn(self) -> None:
21-
with self.assertRaises(InvalidUrnError):
22-
DataFlowUrn.create_from_string("urn:li:abc:(airflow,def,prod)")
23-
24-
with self.assertRaises(InvalidUrnError):
25-
DataFlowUrn.create_from_string("urn:li:dataFlow:(airflow,flow_id)")

metadata-ingestion/tests/unit/urns/test_data_job_urn.py

-15
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
from datahub.utilities.urns.data_flow_urn import DataFlowUrn
66
from datahub.utilities.urns.data_job_urn import DataJobUrn
7-
from datahub.utilities.urns.error import InvalidUrnError
87

98

109
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
@@ -22,17 +21,3 @@ def test_parse_urn(self) -> None:
2221
assert data_job_urn == DataJobUrn(
2322
"urn:li:dataFlow:(airflow,flow_id,prod)", "job_id"
2423
)
25-
26-
def test_invalid_urn(self) -> None:
27-
with self.assertRaises(InvalidUrnError):
28-
DataJobUrn.create_from_string(
29-
"urn:li:abc:(urn:li:dataFlow:(airflow,flow_id,prod),job_id)"
30-
)
31-
32-
with self.assertRaises(InvalidUrnError):
33-
DataJobUrn.create_from_string("urn:li:dataJob:(urn:li:user:abc,job_id)")
34-
35-
with self.assertRaises(InvalidUrnError):
36-
DataJobUrn.create_from_string(
37-
"urn:li:dataJob:(urn:li:dataFlow:(airflow,flow_id,prod))"
38-
)

metadata-ingestion/tests/unit/urns/test_data_process_instance_urn.py

-10
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import pytest
44

55
from datahub.utilities.urns.data_process_instance_urn import DataProcessInstanceUrn
6-
from datahub.utilities.urns.error import InvalidUrnError
76

87

98
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
@@ -20,12 +19,3 @@ def test_parse_urn(self) -> None:
2019
assert dataprocessinstance_urn == DataProcessInstanceUrn("abc")
2120
assert dataprocessinstance_urn == DataProcessInstanceUrn.create_from_id("abc")
2221
assert "abc" == dataprocessinstance_urn.get_dataprocessinstance_id()
23-
24-
def test_invalid_urn(self) -> None:
25-
with self.assertRaises(InvalidUrnError):
26-
DataProcessInstanceUrn.create_from_string("urn:li:abc:dataProcessInstance")
27-
28-
with self.assertRaises(InvalidUrnError):
29-
DataProcessInstanceUrn.create_from_string(
30-
"urn:li:dataProcessInstance:(part1,part2)"
31-
)

metadata-ingestion/tests/unit/urns/test_dataset_urn.py

-20
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
from datahub.utilities.urns.data_platform_urn import DataPlatformUrn
66
from datahub.utilities.urns.dataset_urn import DatasetUrn
7-
from datahub.utilities.urns.error import InvalidUrnError
87

98

109
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
@@ -20,22 +19,3 @@ def test_parse_urn(self) -> None:
2019
assert dataset_urn.get_env() == "PROD"
2120
assert dataset_urn.__str__() == dataset_urn_str
2221
assert dataset_urn == DatasetUrn("urn:li:dataPlatform:abc", "def", "prod")
23-
24-
def test_invalid_urn(self) -> None:
25-
with self.assertRaises(InvalidUrnError):
26-
DatasetUrn.create_from_string(
27-
"urn:li:abc:(urn:li:dataPlatform:abc,def,prod)"
28-
)
29-
30-
with self.assertRaises(InvalidUrnError):
31-
DatasetUrn.create_from_string(
32-
"urn:li:dataset:(urn:li:user:abc,dataset,prod)"
33-
)
34-
35-
with self.assertRaises(InvalidUrnError):
36-
DatasetUrn.create_from_string("urn:li:dataset:(urn:li:user:abc,dataset)")
37-
38-
with self.assertRaises(InvalidUrnError):
39-
DatasetUrn.create_from_string(
40-
"urn:li:dataset:(urn:li:user:abc,dataset,invalidEnv)"
41-
)

metadata-ingestion/tests/unit/urns/test_domain_urn.py

-8
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import pytest
44

55
from datahub.utilities.urns.domain_urn import DomainUrn
6-
from datahub.utilities.urns.error import InvalidUrnError
76

87

98
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
@@ -17,10 +16,3 @@ def test_parse_urn(self) -> None:
1716
assert str(domain_urn) == domain_urn_str
1817
assert domain_urn == DomainUrn("abc")
1918
assert domain_urn == DomainUrn.create_from_id("abc")
20-
21-
def test_invalid_urn(self) -> None:
22-
with self.assertRaises(InvalidUrnError):
23-
DomainUrn.create_from_string("urn:li:abc:domain")
24-
25-
with self.assertRaises(InvalidUrnError):
26-
DomainUrn.create_from_string("urn:li:domain:(part1,part2)")

metadata-ingestion/tests/unit/urns/test_notebook_urn.py

-10
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import pytest
44

5-
from datahub.utilities.urns.error import InvalidUrnError
65
from datahub.utilities.urns.notebook_urn import NotebookUrn
76

87

@@ -16,12 +15,3 @@ def test_parse_urn(self) -> None:
1615
assert str(notebook_urn) == notebook_urn_str
1716

1817
assert notebook_urn == NotebookUrn("querybook", "123")
19-
20-
def test_invalid_urn(self) -> None:
21-
with self.assertRaises(InvalidUrnError):
22-
NotebookUrn.create_from_string(
23-
"urn:li:abc:(urn:li:dataPlatform:abc,def,prod)"
24-
)
25-
26-
with self.assertRaises(InvalidUrnError):
27-
NotebookUrn.create_from_string("urn:li:notebook:(part1,part2,part3)")

metadata-ingestion/tests/unit/urns/test_tag_urn.py

-8
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import pytest
44

5-
from datahub.utilities.urns.error import InvalidUrnError
65
from datahub.utilities.urns.tag_urn import TagUrn
76

87

@@ -17,10 +16,3 @@ def test_parse_urn(self) -> None:
1716
assert str(tag_urn) == tag_urn_str
1817
assert tag_urn == TagUrn("abc")
1918
assert tag_urn == TagUrn.create_from_id("abc")
20-
21-
def test_invalid_urn(self) -> None:
22-
with self.assertRaises(InvalidUrnError):
23-
TagUrn.create_from_string("urn:li:abc:tag_id")
24-
25-
with self.assertRaises(InvalidUrnError):
26-
TagUrn.create_from_string("urn:li:tag:(part1,part2)")
+51-37
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
1+
import logging
2+
import pathlib
3+
from typing import List
4+
15
import pytest
26

3-
from datahub.metadata.urns import (
4-
CorpUserUrn,
5-
DashboardUrn,
6-
DataPlatformUrn,
7-
DatasetUrn,
8-
Urn,
9-
)
7+
from datahub.metadata.urns import CorpUserUrn, DatasetUrn, Urn
108
from datahub.utilities.urns.error import InvalidUrnError
119

1210
pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
1311

12+
_CURRENT_DIR = pathlib.Path(__file__).parent
13+
logger = logging.getLogger(__name__)
14+
1415

1516
def test_parse_urn() -> None:
1617
simple_urn_str = "urn:li:dataPlatform:abc"
@@ -40,38 +41,12 @@ def test_url_encode_urn() -> None:
4041
)
4142

4243

43-
def test_invalid_urn() -> None:
44-
with pytest.raises(InvalidUrnError):
45-
Urn.from_string("urn:li:abc")
46-
47-
with pytest.raises(InvalidUrnError):
48-
Urn.from_string("urn:li:abc:")
49-
50-
with pytest.raises(InvalidUrnError):
51-
Urn.from_string("urn:li:abc:()")
52-
53-
with pytest.raises(InvalidUrnError):
54-
Urn.from_string("urn:li:abc:(abc,)")
55-
56-
with pytest.raises(InvalidUrnError):
57-
Urn.from_string("urn:li:corpuser:abc)")
58-
59-
6044
def test_urn_colon() -> None:
61-
# Colon characters are valid in urns, and should not mess up parsing.
62-
63-
urn = Urn.from_string(
64-
"urn:li:dashboard:(looker,dashboards.thelook::customer_lookup)"
65-
)
66-
assert isinstance(urn, DashboardUrn)
67-
68-
assert DataPlatformUrn.from_string("urn:li:dataPlatform:abc:def")
69-
assert DatasetUrn.from_string(
70-
"urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,PROD)"
71-
)
72-
assert Urn.from_string("urn:li:corpuser:foo:[email protected]")
45+
# There's a bunch of other, simpler tests for special characters in the valid_urns test.
7346

47+
# This test ensures that the type dispatch and fields work fine here.
7448
# I'm not sure why you'd ever want this, but technically it's a valid urn.
49+
7550
urn = Urn.from_string("urn:li:corpuser::")
7651
assert isinstance(urn, CorpUserUrn)
7752
assert urn.username == ":"
@@ -85,9 +60,48 @@ def test_urn_coercion() -> None:
8560
assert urn == Urn.from_string(urn.urn())
8661

8762

88-
def test_urn_type_dispatch() -> None:
63+
def test_urn_type_dispatch_1() -> None:
8964
urn = Urn.from_string("urn:li:dataset:(urn:li:dataPlatform:abc,def,PROD)")
9065
assert isinstance(urn, DatasetUrn)
9166

9267
with pytest.raises(InvalidUrnError, match="Passed an urn of type corpuser"):
9368
DatasetUrn.from_string("urn:li:corpuser:foo")
69+
70+
71+
def test_urn_type_dispatch_2() -> None:
    """Round-trip a dataJob urn and check it is rejected by the wrong urn class.

    The string is parsed with ``Urn.from_string`` and must serialize back to
    the identical string via ``.urn()``. Passing the same string to
    ``CorpUserUrn.from_string`` is expected to raise ``InvalidUrnError`` with
    a message matching "Passed an urn of type dataJob".
    """
    data_job_urn_str = "urn:li:dataJob:(urn:li:dataFlow:(airflow,flow_id,prod),job_id)"

    round_tripped = Urn.from_string(data_job_urn_str).urn()
    assert round_tripped == data_job_urn_str

    with pytest.raises(InvalidUrnError, match="Passed an urn of type dataJob"):
        CorpUserUrn.from_string(data_job_urn_str)
77+
78+
79+
def _load_urns(file_name: pathlib.Path) -> List[str]:
    """Load the urns listed in a fixture file, one per line.

    Every line is stripped of surrounding whitespace. Blank lines and comment
    lines (first non-whitespace character is ``#``) are skipped.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The remaining lines, stripped, in file order.

    Raises:
        AssertionError: If the file contains no urns at all, which guards
            against a test silently passing on an empty fixture.
    """
    # Decode explicitly as UTF-8: the fixtures contain non-ASCII characters
    # (e.g. U+241F), and the locale default encoding is platform-dependent.
    text = file_name.read_text(encoding="utf-8")

    # Strip first, then filter, so that indented comment lines are skipped too.
    stripped_lines = (line.strip() for line in text.splitlines())
    urns = [line for line in stripped_lines if line and not line.startswith("#")]

    assert len(urns) > 0, f"No urns found in {file_name}"
    return urns
87+
88+
89+
def test_valid_urns() -> None:
    """Check that every urn in ``valid_urns.txt`` parses and round-trips.

    Each entry is parsed with ``Urn.from_string`` and must serialize back to
    the exact same string via ``.urn()``.
    """
    fixture_path = _CURRENT_DIR / "valid_urns.txt"

    for urn_str in _load_urns(fixture_path):
        logger.info(f"Testing valid URN: {urn_str}")
        assert Urn.from_string(urn_str).urn() == urn_str
97+
98+
99+
def test_invalid_urns() -> None:
    """Check that every urn in ``invalid_urns.txt`` is rejected.

    Each entry is passed to ``Urn.from_string``, which is expected to raise
    ``InvalidUrnError``.
    """
    invalid_urns_file = _CURRENT_DIR / "invalid_urns.txt"
    invalid_urns = _load_urns(invalid_urns_file)

    for invalid_urn in invalid_urns:
        # Log before entering the expectation so that the `pytest.raises`
        # block contains only the call under test.
        logger.info(f"Testing invalid URN: {invalid_urn}")
        with pytest.raises(InvalidUrnError):
            Urn.from_string(invalid_urn)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Unknown entity types become generic urns
2+
urn:li:abc:foo
3+
urn:li:abc:(foo,bar)
4+
urn:li:abc:(urn:li:dataPlatform:abc,def,prod)
5+
6+
# A bunch of pretty normal urns
7+
urn:li:corpuser:foo
8+
urn:li:corpGroup:bar
9+
urn:li:dataset:(urn:li:dataPlatform:abc,def/ghi,prod)
10+
urn:li:dataFlow:(airflow,def,prod)
11+
urn:li:dataJob:(urn:li:dataFlow:(airflow,flow_id,prod),job_id)
12+
urn:li:tag:abc
13+
urn:li:chart:(looker,chart_name)
14+
urn:li:dashboard:(looker,dashboard_name)
15+
urn:li:dataProcessInstance:abc
16+
urn:li:domain:abc
17+
urn:li:notebook:(querybook,123)
18+
19+
# Urns with colons and other special characters
20+
urn:li:tag:dbt:bar
21+
urn:li:tag::
22+
urn:li:dashboard:(looker,dashboards.thelook::customer_lookup)
23+
urn:li:dataPlatform:abc:def
24+
urn:li:corpuser:foo:bar@example.com

0 commit comments

Comments
 (0)