
Commit f9cc67d

feat(schematron): add java capabilities for schema translation (#11963)
1 parent fbc9851 commit f9cc67d

File tree

28 files changed: +3998 −3 lines changed


build.gradle

+1
@@ -350,6 +350,7 @@ allprojects {
 }
 }
 }
+
 }

 configure(subprojects.findAll {! it.name.startsWith('spark-lineage')}) {

docs-website/sidebars.js

+1
@@ -989,6 +989,7 @@ module.exports = {
 // "metadata-ingestion/examples/structured_properties/README"
 // "smoke-test/tests/openapi/README"
 // "docs/SECURITY_STANCE"
+// "metadata-integration/java/datahub-schematron/README"
 // ],
 ],
};

metadata-integration/java/datahub-client/build.gradle

+2 −1

@@ -19,6 +19,7 @@ jar {
 dependencies {
   api project(':entity-registry')
   api project(':metadata-integration:java:datahub-event')
+  implementation project(':metadata-integration:java:datahub-schematron:lib')
   implementation(externalDependency.kafkaAvroSerializer) {
     exclude group: "org.apache.avro"
   }
@@ -114,7 +115,7 @@ shadowJar {
   relocate 'org.checkerframework', 'datahub.shaded.org.checkerframework'
   relocate 'com.google.errorprone', 'datahub.shaded.com.google.errorprone'
   // Below jars added for kafka emitter only
-  relocate 'org.apache.avro', 'datahub.shaded.org.apache.avro'
+  // relocate 'org.apache.avro', 'datahub.shaded.org.apache.avro'
   relocate 'com.thoughtworks.paranamer', 'datahub.shaded.com.thoughtworks.paranamer'
   relocate 'org.xerial.snappy', 'datahub.shaded.org.xerial.snappy'
   relocate 'org.apache.kafka', 'datahub.shaded.org.apache.kafka'

metadata-integration/java/datahub-client/scripts/check_jar.sh

+4 −1

@@ -40,7 +40,10 @@ jar -tvf $jarFile |\
 grep -v "mozilla" |\
 grep -v "VersionInfo.java" |\
 grep -v "mime.types" |\
-grep -v "com/ibm/.*"
+grep -v "com/ibm/.*" |\
+grep -v "org/apache/avro" |\
+grep -v "org/apache"
+


 if [ $? -ne 0 ]; then
metadata-integration/java/datahub-schematron/README.md
@@ -0,0 +1,73 @@

# SchemaTron (Incubating)

> ⚠️ This is an incubating project in draft status. APIs and functionality may change significantly between releases.

SchemaTron is a schema translation toolkit that converts between various schema formats and DataHub's native schema representation. It currently provides robust support for Apache Avro schema translation with a focus on complex schema structures including unions, arrays, maps, and nested records.

## Modules

### CLI Module

Command-line interface for converting schemas and emitting them to DataHub.

```bash
# Execute from this directory
../../../gradlew :metadata-integration:java:datahub-schematron:cli:run --args="-i cli/src/test/resources/FlatUser.avsc"
```

#### CLI Options

- `-i, --input`: Input schema file or directory path
- `-p, --platform`: Data platform name (default: "avro")
- `-s, --server`: DataHub server URL (default: "http://localhost:8080")
- `-t, --token`: DataHub access token
- `--sink`: Output sink, either "rest" or "file" (default: "rest")
- `--output-file`: Output file path when using the file sink (default: "metadata.json"); an example follows below
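For example, to write the generated metadata to a local file instead of emitting it to a DataHub server, the `--sink` and `--output-file` options can be combined. A minimal sketch, reusing the bundled test schema (substitute your own schema path as needed):

```bash
# Convert the sample schema and write MetadataChangeProposals to a local JSON file
../../../gradlew :metadata-integration:java:datahub-schematron:cli:run \
  --args="-i cli/src/test/resources/FlatUser.avsc --sink file --output-file metadata.json"
```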
### Library Module

Core translation logic and models for schema conversion. Features include:

- Support for complex Avro schema structures (see the example schema after this list):
  - Union types with multiple record options
  - Nested records and arrays
  - Optional fields with defaults
  - Logical types (date, timestamp, etc.)
  - Maps with various value types
  - Enum types
  - Custom metadata and documentation
- Comprehensive path handling for schema fields
- DataHub-compatible metadata generation
- Schema fingerprinting and versioning
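As an illustration, here is a hypothetical Avro record (not one of the bundled test schemas) that exercises several of these features: a logical `date` type, an enum, a nullable nested record expressed as a union with a default, and an array field.

```json
{
  "type": "record",
  "name": "CustomerProfile",
  "namespace": "io.example.schematron",
  "doc": "Illustrative schema only; not part of the test resources.",
  "fields": [
    { "name": "id", "type": "string" },
    { "name": "signupDate", "type": { "type": "int", "logicalType": "date" } },
    { "name": "tier", "type": { "type": "enum", "name": "Tier", "symbols": ["FREE", "PRO"] } },
    {
      "name": "address",
      "type": [
        "null",
        {
          "type": "record",
          "name": "Address",
          "fields": [
            { "name": "street", "type": "string" },
            { "name": "city", "type": "string" }
          ]
        }
      ],
      "default": null
    },
    { "name": "tags", "type": { "type": "array", "items": "string" }, "default": [] }
  ]
}
```

A schema like this exercises the union, nested-record, logical-type, and enum handling listed above.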
## Example Schema Support

The library can handle sophisticated schema structures including:

- Customer profiles with multiple identification types (passport, driver's license, national ID)
- Contact information with primary and alternative contact methods
- Address validation with verification metadata
- Subscription history tracking
- Flexible preference and metadata storage
- Tagged customer attributes
## Development

The project includes extensive test coverage through:

- Unit tests for field path handling
- Schema translation comparison tests
- Integration tests against a Python reference implementation

Test resources include example schemas demonstrating various Avro schema features and edge cases.
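The comparison tests rely on the Python implementation in `metadata-ingestion`, so they expect that module's virtual environment to be available (the CLI module's `build.gradle` wires this up via the `python.venv.path` property and a dependency on `:metadata-ingestion:installDev`). A sketch of a typical invocation from the repository root; the explicit venv path is an assumption and may need adjusting:

```bash
# Run the CLI module's TestNG suite; the metadata-ingestion venv is prepared
# automatically via the :metadata-ingestion:installDev dependency in the build.
./gradlew :metadata-integration:java:datahub-schematron:cli:test

# Point the tests at an existing venv instead (path is illustrative)
./gradlew :metadata-integration:java:datahub-schematron:cli:test -Dpython.venv.path=/path/to/metadata-ingestion/venv
```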
## Contributing

As this is an incubating project, we welcome contributions and feedback on:

- Additional schema format support
- Improved handling of complex schema patterns
- Enhanced metadata translation
- Documentation and examples
- Test coverage
metadata-integration/java/datahub-schematron/cli/build.gradle
@@ -0,0 +1,110 @@

plugins {
  id "application"
}
apply plugin: 'java'
apply plugin: 'jacoco'

ext {
  javaMainClass = "io.datahubproject.schematron.cli.SchemaTron"
}

application {
  mainClassName = javaMainClass
}

dependencies {
  // Existing dependencies remain unchanged
  implementation 'info.picocli:picocli:4.7.5'
  annotationProcessor 'info.picocli:picocli-codegen:4.7.5'
  implementation 'ch.qos.logback:logback-classic:1.2.11'
  implementation 'ch.qos.logback:logback-core:1.2.11'
  implementation project(':metadata-integration:java:datahub-client')
  implementation project(':metadata-integration:java:datahub-schematron:lib')
  implementation externalDependency.avro
  compileOnly externalDependency.lombok
  annotationProcessor externalDependency.lombok

  // Test dependencies
  testImplementation externalDependency.testng
  testImplementation externalDependency.mockito
}

test {
  useTestNG()

  testLogging {
    events "passed", "skipped", "failed"
    exceptionFormat "full"
    showStandardStreams = true
  }

  systemProperty 'python.venv.path', System.getProperty('python.venv.path', '../venv')
}

task validatePythonEnv {
  doFirst {
    def venvPath = System.getProperty('python.venv.path', '../../../../metadata-ingestion/venv')
    def isWindows = System.getProperty('os.name').toLowerCase().contains('windows')
    def pythonExe = isWindows ? "${venvPath}/Scripts/python.exe" : "${venvPath}/bin/python"

    def result = exec {
      commandLine pythonExe, "-c", "import sys; print(sys.executable)"
      ignoreExitValue = true
      standardOutput = new ByteArrayOutputStream()
      errorOutput = new ByteArrayOutputStream()
    }

    if (result.exitValue != 0) {
      throw new GradleException("Python virtual environment not properly set up at ${venvPath}")
    }
  }
}

test.dependsOn tasks.getByPath(":metadata-ingestion:installDev")

jacocoTestReport {
  dependsOn test
}

test.finalizedBy jacocoTestReport

task updateGoldenFiles {
  dependsOn validatePythonEnv
  doLast {
    def venvPath = System.getProperty('python.venv.path', '../../../../metadata-ingestion/venv')
    def isWindows = System.getProperty('os.name').toLowerCase().contains('windows')
    def pythonExe = isWindows ? "${venvPath}/Scripts/python.exe" : "${venvPath}/bin/python"
    def diffsDir = new File('src/test/resources/diffs')

    if (!diffsDir.exists()) {
      throw new GradleException("Diffs directory not found at ${diffsDir.absolutePath}")
    }

    // Find all json files in the diffs directory
    diffsDir.listFiles().findAll { it.name.endsWith('_diff.json') }.each { diffFile ->
      def baseName = diffFile.name.replace('_diff.json', '')
      def pythonOutput = "build/test-outputs/${baseName}_python.json"
      def javaOutput = "build/test-outputs/${baseName}_java.json"

      println "Updating golden file for ${baseName}..."

      exec {
        commandLine pythonExe,
          'scripts/mce_diff.py',
          '--update-golden-diff',
          '--golden-diff-file',
          diffFile.absolutePath,
          pythonOutput,
          javaOutput
        ignoreExitValue = true
        standardOutput = new ByteArrayOutputStream()
        errorOutput = new ByteArrayOutputStream()
      }
    }
  }
}

configurations {
  provided
  implementation.extendsFrom provided
}
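A sketch of how the `updateGoldenFiles` task defined above can be invoked from the repository root, assuming the comparison outputs in `build/test-outputs` already exist from a prior test run; the explicit venv path is illustrative:

```bash
# Regenerate the golden diff files under src/test/resources/diffs.
# validatePythonEnv runs first and fails if the metadata-ingestion venv is not set up.
./gradlew :metadata-integration:java:datahub-schematron:cli:updateGoldenFiles

# Override the virtual environment location if it lives elsewhere (illustrative path)
./gradlew :metadata-integration:java:datahub-schematron:cli:updateGoldenFiles -Dpython.venv.path=/path/to/metadata-ingestion/venv
```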
(new file: Python reference script; file path not shown in this view)
@@ -0,0 +1,94 @@

from datahub.ingestion.extractor.schema_util import AvroToMceSchemaConverter
from avro.schema import parse as parse_avro, RecordSchema
from datahub.emitter.synchronized_file_emitter import SynchronizedFileEmitter
import datahub.metadata.schema_classes as models
import click
from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
import os
import hashlib
from datahub.ingestion.graph.client import get_default_graph


def get_schema_hash(schema):
    # Convert schema to string if it isn't already
    schema_str = str(schema)

    # Create MD5 hash
    schema_hash = hashlib.md5(schema_str.encode("utf-8")).hexdigest()

    return schema_hash


@click.command(name="avro2datahub")
@click.option("--input-file", "-i", type=click.Path(exists=True), required=True)
@click.option("--platform", type=str, required=True)
@click.option("--output-file", "-o", type=click.Path(), default="metadata.py.json")
@click.option("--to-file", "-f", is_flag=True, default=True)
@click.option("--to-server", "-s", is_flag=True, default=False)
def generate_schema_file_from_avro_schema(
    input_file: str, platform: str, output_file: str, to_file: bool, to_server: bool
):
    avro_schema_file = input_file
    output_file_name = output_file
    platform_urn = make_data_platform_urn(platform)
    converter = AvroToMceSchemaConverter(is_key_schema=False)

    # Delete the output file if it exists
    if os.path.exists(output_file_name):
        os.remove(output_file_name)

    with open(avro_schema_file) as f:
        raw_string = f.read()
        avro_schema = parse_avro(raw_string)
        # Get fingerprint bytes
        canonical_form = avro_schema.canonical_form
        print(
            f"Schema canonical form: Length ({len(canonical_form)}); {canonical_form}"
        )
        md5_bytes = avro_schema.fingerprint("md5")
        # Convert to hex string
        avro_schema_hash = md5_bytes.hex()
        assert isinstance(
            avro_schema, RecordSchema
        ), "This command only works for Avro records"
        dataset_urn = make_dataset_urn(
            platform=platform_urn,
            name=(
                f"{avro_schema.namespace}.{avro_schema.name}"
                if avro_schema.namespace
                else avro_schema.name
            ),
        )
        schema_fields = [
            f for f in converter.to_mce_fields(avro_schema, is_key_schema=False)
        ]
        schema_metadata = models.SchemaMetadataClass(
            schemaName=avro_schema.name,
            platform=platform_urn,
            version=0,
            hash=avro_schema_hash,
            platformSchema=models.OtherSchemaClass(rawSchema=raw_string),
            fields=schema_fields,
        )
        assert schema_metadata.validate()
        if to_file:
            with SynchronizedFileEmitter(output_file_name) as file_emitter:
                file_emitter.emit(
                    MetadataChangeProposalWrapper(
                        entityUrn=dataset_urn, aspect=schema_metadata
                    )
                )
        if to_server:
            with get_default_graph() as graph:
                graph.emit(
                    MetadataChangeProposalWrapper(
                        entityUrn=dataset_urn, aspect=schema_metadata
                    )
                )

    print(f"Wrote metadata to {output_file}")


if __name__ == "__main__":
    generate_schema_file_from_avro_schema()
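For reference, a sketch of how this Click command would be invoked from a metadata-ingestion virtual environment. The script's file name is not shown in this view, so `avro_to_mce.py` is a placeholder, and the schema path is the test resource referenced in the README:

```bash
# Convert an Avro schema with the Python reference implementation and write
# the result to metadata.py.json (the default --output-file value).
# "avro_to_mce.py" is a placeholder name for the script above.
python avro_to_mce.py -i cli/src/test/resources/FlatUser.avsc --platform avro

# Emit to the DataHub server configured for the default graph client instead
python avro_to_mce.py -i cli/src/test/resources/FlatUser.avsc --platform avro --to-server
```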
