Commit 4a12262

Merge branch 'master' into cus3379-tableau-ingestion-node-limit-exceeded
2 parents: 5653fef + 48b5a62

45 files changed: +1232 -369 lines


.github/workflows/build-and-test.yml (+2)

@@ -75,6 +75,8 @@ jobs:
           path: |
             ~/.cache/uv
           key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }}
+      - name: Install dependencies
+        run: ./metadata-ingestion/scripts/install_deps.sh
       - name: Set up JDK 17
         uses: actions/setup-java@v4
         with:

.github/workflows/pr-labeler.yml (+4 -4)

@@ -29,24 +29,24 @@ jobs:
             "swaroopjagadish",
             "treff7es",
             "yoonhyejin",
-            "eboneil",
             "gabe-lyons",
             "hsheth2",
             "jjoyce0510",
             "maggiehays",
             "pedro93",
             "RyanHolstien",
             "sakethvarma397",
-            "Kunal-kankriya",
             "purnimagarg1",
-            "dushayntAW",
             "sagar-salvi-apptware",
             "kushagra-apptware",
             "Salman-Apptware",
             "mayurinehate",
             "noggi",
             "skrydal",
-            "kevinkarchacryl"
+            "kevinkarchacryl",
+            "sgomezvillamor",
+            "acrylJonny",
+            "chakru-r"
           ]'),
           github.actor
         )

datahub-web-react/src/app/ingest/source/executions/reporting/StructuredReportItem.tsx (+1)

@@ -16,6 +16,7 @@ const StyledCollapse = styled(Collapse)<{ color: string }>`
     .ant-collapse-header {
         display: flex;
         align-items: center;
+        overflow: auto;
     }

     .ant-collapse-item {

entity-registry/src/main/java/com/linkedin/metadata/models/StructuredPropertyUtils.java (+1 -1)

@@ -178,7 +178,7 @@ public static String toElasticsearchFieldName(
   /**
    * Return an elasticsearch type from structured property type
    *
-   * @param fieldName filter or facet field name
+   * @param fieldName filter or facet field name - must match actual FQN of structured prop
    * @param aspectRetriever aspect retriever
    * @return elasticsearch type
    */

entity-registry/src/main/java/com/linkedin/metadata/models/extractor/FieldExtractor.java (+24 -5)

@@ -14,7 +14,7 @@
 import java.util.Optional;
 import java.util.function.Function;
 import java.util.stream.Collectors;
-import javax.annotation.Nonnull;
+import javax.annotation.Nullable;

 /** Extracts fields from a RecordTemplate based on the appropriate {@link FieldSpec}. */
 public class FieldExtractor {
@@ -30,15 +30,34 @@ private static long getNumArrayWildcards(PathSpec pathSpec) {

   // Extract the value of each field in the field specs from the input record
   public static <T extends FieldSpec> Map<T, List<Object>> extractFields(
-      @Nonnull RecordTemplate record, List<T> fieldSpecs) {
-    return extractFields(record, fieldSpecs, MAX_VALUE_LENGTH);
+      @Nullable RecordTemplate record, List<T> fieldSpecs) {
+    return extractFields(record, fieldSpecs, false);
   }

   public static <T extends FieldSpec> Map<T, List<Object>> extractFields(
-      @Nonnull RecordTemplate record, List<T> fieldSpecs, int maxValueLength) {
+      @Nullable RecordTemplate record, List<T> fieldSpecs, boolean requiredFieldExtract) {
+    return extractFields(record, fieldSpecs, MAX_VALUE_LENGTH, requiredFieldExtract);
+  }
+
+  public static <T extends FieldSpec> Map<T, List<Object>> extractFields(
+      @Nullable RecordTemplate record, List<T> fieldSpecs, int maxValueLength) {
+    return extractFields(record, fieldSpecs, maxValueLength, false);
+  }
+
+  public static <T extends FieldSpec> Map<T, List<Object>> extractFields(
+      @Nullable RecordTemplate record,
+      List<T> fieldSpecs,
+      int maxValueLength,
+      boolean requiredFieldExtract) {
     final Map<T, List<Object>> extractedFields = new HashMap<>();
     for (T fieldSpec : fieldSpecs) {
-      Optional<Object> value = RecordUtils.getFieldValue(record, fieldSpec.getPath());
+      if (requiredFieldExtract && record == null) {
+        throw new IllegalArgumentException(
+            "Field extraction is required and the RecordTemplate is null");
+      }
+      Optional<Object> value =
+          Optional.ofNullable(record)
+              .flatMap(maybeRecord -> RecordUtils.getFieldValue(maybeRecord, fieldSpec.getPath()));
       if (!value.isPresent()) {
         extractedFields.put(fieldSpec, Collections.emptyList());
       } else {

metadata-ingestion/setup.py (+9)

@@ -142,6 +142,15 @@
     # datahub does not depend on traitlets directly but great expectations does.
     # https://github.com/ipython/traitlets/issues/741
     "traitlets!=5.2.2",
+    # GE depends on IPython - we have no direct dependency on it.
+    # IPython 8.22.0 added a dependency on traitlets 5.13.x, but only declared a
+    # version requirement of traitlets>5.
+    # See https://github.com/ipython/ipython/issues/14352.
+    # This issue was fixed by https://github.com/ipython/ipython/pull/14353,
+    # which first appeared in IPython 8.22.1.
+    # As such, we just need to avoid that version in order to get the
+    # dependencies that we need. IPython probably should've yanked 8.22.0.
+    "IPython!=8.22.0",
     "greenlet",
     *cachetools_lib,
 }

metadata-ingestion/src/datahub/cli/delete_cli.py (+37 -4)

@@ -214,14 +214,47 @@ def references(urn: str, dry_run: bool, force: bool) -> None:


 @delete.command()
-@click.option("--urn", required=True, type=str, help="the urn of the entity")
-def undo_by_filter(urn: str) -> None:
+@click.option("--urn", required=False, type=str, help="the urn of the entity")
+@click.option(
+    "-p",
+    "--platform",
+    required=False,
+    type=str,
+    help="Platform filter (e.g. snowflake)",
+)
+@click.option(
+    "-b",
+    "--batch-size",
+    required=False,
+    default=3000,
+    type=int,
+    help="Batch size when querying for entities to un-soft delete."
+    "Maximum 10000. Large batch sizes may cause timeouts.",
+)
+def undo_by_filter(
+    urn: Optional[str], platform: Optional[str], batch_size: int
+) -> None:
     """
-    Undo a soft deletion of an entity
+    Undo soft deletion by filters
     """
     graph = get_default_graph()
     logger.info(f"Using {graph}")
-    graph.set_soft_delete_status(urn=urn, delete=False)
+    if urn:
+        graph.set_soft_delete_status(urn=urn, delete=False)
+    else:
+        urns = list(
+            graph.get_urns_by_filter(
+                platform=platform,
+                query="*",
+                status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+                batch_size=batch_size,
+            )
+        )
+        logger.info(f"Going to un-soft delete {len(urns)} urns")
+        urns_iter = progressbar.progressbar(urns, redirect_stdout=True)
+        for urn in urns_iter:
+            assert urn
+            graph.set_soft_delete_status(urn=urn, delete=False)


 @delete.command(no_args_is_help=True)
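
For orientation, a minimal sketch of the new filter path in undo_by_filter, driven directly from Python rather than the CLI. The import paths and the pre-configured connection for get_default_graph() are assumptions; the calls themselves (get_urns_by_filter, set_soft_delete_status, RemovedStatusFilter) are the ones used in the diff.

# Sketch only: mirrors the new --platform/--batch-size code path above.
# Assumptions: these import paths, and a DataHub connection already configured
# for get_default_graph() (e.g. via `datahub init`).
from datahub.ingestion.graph.client import RemovedStatusFilter, get_default_graph

graph = get_default_graph()

# Find soft-deleted urns for one platform and restore them.
urns = list(
    graph.get_urns_by_filter(
        platform="snowflake",
        query="*",
        status=RemovedStatusFilter.ONLY_SOFT_DELETED,
        batch_size=3000,
    )
)
for urn in urns:
    graph.set_soft_delete_status(urn=urn, delete=False)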

metadata-ingestion/src/datahub/configuration/kafka_consumer_config.py (+31 -1)

@@ -1,3 +1,4 @@
+import inspect
 import logging
 from typing import Any, Dict, Optional

@@ -34,5 +35,34 @@ def _resolve_oauth_callback(self) -> None:
                 "oauth_cb must be a string representing python function reference "
                 "in the format <python-module>:<function-name>."
             )
+
+        call_back_fn = import_path(call_back)
+        self._validate_call_back_fn_signature(call_back_fn)
+
         # Set the callback
-        self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] = import_path(call_back)
+        self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] = call_back_fn
+
+    def _validate_call_back_fn_signature(self, call_back_fn: Any) -> None:
+        sig = inspect.signature(call_back_fn)
+
+        num_positional_args = len(
+            [
+                param
+                for param in sig.parameters.values()
+                if param.kind
+                in (
+                    inspect.Parameter.POSITIONAL_ONLY,
+                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                )
+                and param.default == inspect.Parameter.empty
+            ]
+        )
+
+        has_variadic_args = any(
+            param.kind == inspect.Parameter.VAR_POSITIONAL
+            for param in sig.parameters.values()
+        )
+
+        assert num_positional_args == 1 or (
+            has_variadic_args and num_positional_args <= 1
+        ), "oauth_cb function must accept single positional argument."
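
The new _validate_call_back_fn_signature only inspects the callback's signature, so any oauth_cb target with exactly one required positional parameter passes. A sketch of such a callback; the function name and token values are placeholders, not part of the codebase.

import inspect

def fetch_oauth_token(oauth_config):
    # Placeholder token logic; a confluent-kafka oauth_cb conventionally returns
    # a (token, expiry-in-epoch-seconds) pair.
    return "my-access-token", 1893456000.0

# One required positional parameter -> satisfies the assertion above.
assert len(inspect.signature(fetch_oauth_token).parameters) == 1

# It would be referenced from the consumer config as
# "<python-module>:fetch_oauth_token", matching the format in the error message.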

metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py (+47 -23)

@@ -208,22 +208,28 @@ def fetch_dpis(self, job_urn: str, batch_size: int) -> List[dict]:
         dpis = []
         start = 0
         while True:
-            job_query_result = self.ctx.graph.execute_graphql(
-                DATA_PROCESS_INSTANCES_QUERY,
-                {"dataJobUrn": job_urn, "start": start, "count": batch_size},
-            )
-            job_data = job_query_result.get("dataJob")
-            if not job_data:
-                raise ValueError(f"Error getting job {job_urn}")
-
-            runs_data = job_data.get("runs")
-            if not runs_data:
-                raise ValueError(f"Error getting runs for {job_urn}")
-
-            runs = runs_data.get("runs")
-            dpis.extend(runs)
-            start += batch_size
-            if len(runs) < batch_size:
+            try:
+                job_query_result = self.ctx.graph.execute_graphql(
+                    DATA_PROCESS_INSTANCES_QUERY,
+                    {"dataJobUrn": job_urn, "start": start, "count": batch_size},
+                )
+                job_data = job_query_result.get("dataJob")
+                if not job_data:
+                    logger.error(f"Error getting job {job_urn}")
+                    break
+
+                runs_data = job_data.get("runs")
+                if not runs_data:
+                    logger.error(f"Error getting runs for {job_urn}")
+                    break
+
+                runs = runs_data.get("runs")
+                dpis.extend(runs)
+                start += batch_size
+                if len(runs) < batch_size:
+                    break
+            except Exception as e:
+                logger.error(f"Exception while fetching DPIs for job {job_urn}: {e}")
                 break
         return dpis

@@ -243,8 +249,12 @@ def keep_last_n_dpi(
                 futures[future] = dpi

             for future in as_completed(futures):
-                deleted_count_last_n += 1
-                futures[future]["deleted"] = True
+                try:
+                    future.result()
+                    deleted_count_last_n += 1
+                    futures[future]["deleted"] = True
+                except Exception as e:
+                    logger.error(f"Exception while deleting DPI: {e}")

                 if deleted_count_last_n % self.config.batch_size == 0:
                     logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
@@ -279,7 +289,7 @@ def delete_dpi_from_datajobs(self, job: DataJobEntity) -> None:
         dpis = self.fetch_dpis(job.urn, self.config.batch_size)
         dpis.sort(
             key=lambda x: x["created"]["time"]
-            if x["created"] and x["created"]["time"]
+            if "created" in x and "time" in x["created"]
            else 0,
            reverse=True,
        )
@@ -314,15 +324,23 @@ def remove_old_dpis(
             if dpi.get("deleted"):
                 continue

-            if dpi["created"]["time"] < retention_time * 1000:
+            if (
+                "created" not in dpi
+                or "time" not in dpi["created"]
+                or dpi["created"]["time"] < retention_time * 1000
+            ):
                 future = executor.submit(
                     self.delete_entity, dpi["urn"], "dataprocessInstance"
                 )
                 futures[future] = dpi

             for future in as_completed(futures):
-                deleted_count_retention += 1
-                futures[future]["deleted"] = True
+                try:
+                    future.result()
+                    deleted_count_retention += 1
+                    futures[future]["deleted"] = True
+                except Exception as e:
+                    logger.error(f"Exception while deleting DPI: {e}")

                 if deleted_count_retention % self.config.batch_size == 0:
                     logger.info(
@@ -378,8 +396,11 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
             dataFlows[flow.urn] = flow

         scroll_id: Optional[str] = None
+        previous_scroll_id: Optional[str] = None
+
         dataJobs: Dict[str, List[DataJobEntity]] = defaultdict(list)
         deleted_jobs: int = 0
+
         while True:
             result = self.ctx.graph.execute_graphql(
                 DATAJOB_QUERY,
@@ -426,9 +447,11 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
             else:
                 dataJobs[datajob_entity.flow_urn].append(datajob_entity)

-            if not scroll_id:
+            if not scroll_id or previous_scroll_id == scroll_id:
                 break

+            previous_scroll_id = scroll_id
+
         logger.info(f"Deleted {deleted_jobs} DataJobs")
         # Delete empty dataflows if needed
         if self.config.delete_empty_data_flows:
@@ -443,4 +466,5 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
             if deleted_jobs % self.config.batch_size == 0:
                 logger.info(f"Deleted {deleted_data_flows} DataFlows")
         logger.info(f"Deleted {deleted_data_flows} DataFlows")
+
         return []
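
The scroll guard added to get_workunits_internal (stop when the scroll id is empty or has not advanced) is a general defense against a paginated API that keeps returning the same cursor. A standalone sketch of the same pattern, with fetch_page standing in for the GraphQL call in the diff:

from typing import Callable, List, Optional, Tuple

def scroll_all(
    fetch_page: Callable[[Optional[str]], Tuple[List[str], Optional[str]]]
) -> List[str]:
    """Drain a scroll-based API, stopping on an empty or repeated scroll id."""
    scroll_id: Optional[str] = None
    previous_scroll_id: Optional[str] = None
    results: List[str] = []
    while True:
        entities, scroll_id = fetch_page(scroll_id)
        results.extend(entities)
        # Same guard as in the diff: break when the server returns no scroll id,
        # or keeps returning the same one (which would otherwise loop forever).
        if not scroll_id or previous_scroll_id == scroll_id:
            break
        previous_scroll_id = scroll_id
    return results

# Fake two-page source whose second page repeats the scroll id:
pages = {None: (["a", "b"], "s1"), "s1": (["c"], "s1")}
assert scroll_all(lambda sid: pages[sid]) == ["a", "b", "c"]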

metadata-ingestion/src/datahub/ingestion/source/pulsar.py (+11 -1)

@@ -78,7 +78,17 @@ class PulsarSchema:
     def __init__(self, schema):
         self.schema_version = schema.get("version")

-        avro_schema = json.loads(schema.get("data"))
+        schema_data = schema.get("data")
+        if not schema_data:
+            logger.warning("Schema data is empty or None. Using default empty schema.")
+            schema_data = "{}"
+
+        try:
+            avro_schema = json.loads(schema_data)
+        except json.JSONDecodeError as e:
+            logger.error(f"Invalid JSON schema: {schema_data}. Error: {str(e)}")
+            avro_schema = {}
+
         self.schema_name = avro_schema.get("namespace") + "." + avro_schema.get("name")
         self.schema_description = avro_schema.get("doc")
         self.schema_type = schema.get("type")
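
For illustration, the guarded parsing added to PulsarSchema.__init__ can be exercised on its own. parse_avro_schema_data below is a hypothetical standalone mirror of that logic, not a function in the source:

import json
import logging

logger = logging.getLogger(__name__)

def parse_avro_schema_data(schema_data):
    """Mirror of the guarded parsing above: empty or malformed data degrades to {}."""
    if not schema_data:
        logger.warning("Schema data is empty or None. Using default empty schema.")
        schema_data = "{}"
    try:
        return json.loads(schema_data)
    except json.JSONDecodeError as e:
        logger.error(f"Invalid JSON schema: {schema_data}. Error: {str(e)}")
        return {}

# Missing and malformed payloads no longer raise inside the ingestion source:
assert parse_avro_schema_data(None) == {}
assert parse_avro_schema_data("not-json") == {}
assert parse_avro_schema_data('{"namespace": "ns", "name": "topic"}')["name"] == "topic"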
