
Commit 7930212

improvements
1 parent cec74f4 commit 7930212

3 files changed: +238 -74 lines

metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py

+73 -60
@@ -99,45 +99,76 @@ def _extend_lineage(
         if destination_details is None:
             destination_details = self._get_destination_details(connector)
 
+        # Ensure platform is set to avoid URN creation issues
+        if not source_details.platform:
+            source_details.platform = self._detect_source_platform(connector)
+
+        if not destination_details.platform:
+            destination_details.platform = "snowflake"  # Default to snowflake
+
+        # Log the lineage information for debugging
+        logger.info(
+            f"Processing lineage for connector {connector.connector_id}: "
+            f"source_platform={source_details.platform}, "
+            f"destination_platform={destination_details.platform}, "
+            f"{len(connector.lineage)} table lineage entries"
+        )
+
         # Handle lineage truncation if needed
         if len(connector.lineage) >= MAX_TABLE_LINEAGE_PER_CONNECTOR:
             self._report_lineage_truncation(connector)
 
         # Process each table lineage entry
         for lineage in connector.lineage:
-            # Create source and destination URNs
-            source_urn = self._create_dataset_urn(
-                lineage.source_table,
-                source_details,
-                is_source=True,
-            )
-
-            dest_urn = self._create_dataset_urn(
-                lineage.destination_table,
-                destination_details,
-                is_source=False,
-            )
-
-            # Skip if either URN creation failed
-            if not source_urn or not dest_urn:
-                continue
+            try:
+                # Create source and destination URNs
+                source_urn = self._create_dataset_urn(
+                    lineage.source_table,
+                    source_details,
+                    is_source=True,
+                )
 
-            # Add URNs to lists (avoiding duplicates)
-            if source_urn not in input_dataset_urn_list:
-                input_dataset_urn_list.append(source_urn)
+                dest_urn = self._create_dataset_urn(
+                    lineage.destination_table,
+                    destination_details,
+                    is_source=False,
+                )
 
-            if dest_urn not in output_dataset_urn_list:
-                output_dataset_urn_list.append(dest_urn)
+                # Skip if either URN creation failed
+                if not source_urn or not dest_urn:
+                    logger.warning(
+                        f"Skipping lineage for {lineage.source_table} -> {lineage.destination_table}: "
+                        f"Failed to create URNs"
+                    )
+                    continue
+
+                # Add URNs to lists (avoiding duplicates)
+                if str(source_urn) not in [str(u) for u in input_dataset_urn_list]:
+                    input_dataset_urn_list.append(source_urn)
+
+                if str(dest_urn) not in [str(u) for u in output_dataset_urn_list]:
+                    output_dataset_urn_list.append(dest_urn)
+
+                # Create column lineage if enabled
+                if self.config.include_column_lineage:
+                    self._create_column_lineage(
+                        lineage=lineage,
+                        source_urn=source_urn,
+                        dest_urn=dest_urn,
+                        fine_grained_lineage=fine_grained_lineage,
+                    )
 
-            # Create column lineage if enabled
-            if self.config.include_column_lineage:
-                self._create_column_lineage(
-                    lineage=lineage,
-                    source_urn=source_urn,
-                    dest_urn=dest_urn,
-                    fine_grained_lineage=fine_grained_lineage,
+                logger.debug(f"Created lineage from {source_urn} to {dest_urn}")
+            except Exception as e:
+                logger.warning(
+                    f"Error creating lineage for table {lineage.source_table} -> {lineage.destination_table}: {e}"
                 )
 
+        # Log the lineage that was created for debugging
+        logger.info(
+            f"Created lineage with {len(input_dataset_urn_list)} input URNs and {len(output_dataset_urn_list)} output URNs"
+        )
+
         # Add URNs and lineage to the datajob
         datajob.inlets.extend(input_dataset_urn_list)
         datajob.outlets.extend(output_dataset_urn_list)
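
Note: the switch from `if source_urn not in input_dataset_urn_list` to comparing `str(...)` forms suggests the URN objects were not reliably comparable by value, so the loop now de-duplicates on the canonical string form. A minimal sketch of the same idea; the set-based variant is a suggestion rather than part of the commit, trading a little memory for linear-time behavior on connectors with many tables:

from typing import Iterable, List, Set


def dedupe_urns(urns: Iterable[object]) -> List[object]:
    """Keep the first occurrence of each URN, keyed on its canonical string form."""
    seen: Set[str] = set()
    unique: List[object] = []
    for urn in urns:
        key = str(urn)
        if key not in seen:
            seen.add(key)
            unique.append(urn)
    return unique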
@@ -150,22 +181,6 @@ def _extend_lineage(
             destination_details=destination_details,
         )
 
-        # Add source and destination platform information to properties
-        if source_details.platform:
-            lineage_properties["source.platform"] = source_details.platform
-        if destination_details.platform:
-            lineage_properties["destination.platform"] = destination_details.platform
-
-        # Add database information if available
-        if source_details.database:
-            lineage_properties["source.database"] = source_details.database
-        if destination_details.database:
-            lineage_properties["destination.database"] = destination_details.database
-
-        # Add environment information
-        lineage_properties["source.env"] = source_details.env or "PROD"
-        lineage_properties["destination.env"] = destination_details.env or "PROD"
-
         return lineage_properties
 
     def _get_source_details(self, connector: Connector) -> PlatformDetail:
@@ -241,17 +256,24 @@ def _create_dataset_urn(
             platform = details.platform
             if not platform:
                 platform = "snowflake" if not is_source else "external"
+                logger.info(
+                    f"Using default platform {platform} for {'source' if is_source else 'destination'} table {table_name}"
+                )
 
-            # Include database in the table name if available
-            full_table_name = (
-                f"{details.database.lower()}.{table_name}"
-                if details.database
-                else table_name
-            )
+            # Include database in the table name if available and ensure it's lowercase
+            database = details.database.lower() if details.database else ""
+            full_table_name = f"{database}.{table_name}" if database else table_name
 
             # Ensure environment is set
             env = details.env or "PROD"
 
+            # Log the URN creation details for debugging
+            logger.debug(
+                f"Creating {'source' if is_source else 'destination'} URN with: "
+                f"platform={platform}, table_name={full_table_name}, env={env}, "
+                f"platform_instance={details.platform_instance}"
+            )
+
             return DatasetUrn.create_from_ids(
                 platform_id=platform,
                 table_name=full_table_name,
@@ -260,19 +282,10 @@ def _create_dataset_urn(
             )
         except Exception as e:
             logger.warning(
-                f"Failed to create {'source' if is_source else 'destination'} URN: {e}"
+                f"Failed to create {'source' if is_source else 'destination'} URN for {table_name}: {e}"
            )
             return None
 
-    def _report_lineage_truncation(self, connector: Connector) -> None:
-        """Report warning about truncated lineage."""
-        self.report.warning(
-            title="Table lineage truncated",
-            message=f"The connector had more than {MAX_TABLE_LINEAGE_PER_CONNECTOR} table lineage entries. "
-            f"Only the most recent {MAX_TABLE_LINEAGE_PER_CONNECTOR} entries were ingested.",
-            context=f"{connector.connector_name} (connector_id: {connector.connector_id})",
-        )
-
     def _create_column_lineage(
         self,
         lineage,
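
Taken together, the `_create_dataset_urn` changes amount to a small set of naming rules: default the platform ("external" for sources, "snowflake" for destinations), lowercase the database prefix, and fall back to the PROD environment. A standalone sketch of those rules under stated assumptions: the function name is mine, and the `datahub.utilities.urns.dataset_urn` import path is an assumption, since the diff only shows the call site of `DatasetUrn.create_from_ids`:

from typing import Optional

from datahub.utilities.urns.dataset_urn import DatasetUrn  # assumed import path


def build_dataset_urn(
    table_name: str,
    platform: Optional[str],
    database: Optional[str],
    env: Optional[str],
    is_source: bool,
) -> Optional[DatasetUrn]:
    """Apply the diff's naming rules before constructing the URN."""
    # Default platform: sources fall back to "external", destinations to "snowflake".
    platform = platform or ("external" if is_source else "snowflake")
    # Lowercase the database prefix when present.
    database = (database or "").lower()
    full_table_name = f"{database}.{table_name}" if database else table_name
    try:
        return DatasetUrn.create_from_ids(
            platform_id=platform,
            table_name=full_table_name,
            env=env or "PROD",
        )
    except Exception:
        return None


# e.g. build_dataset_urn("public.orders", None, "analytics", None, is_source=False)
# would yield urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics.public.orders,PROD)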

metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_api_client.py

+31 -5
@@ -206,16 +206,42 @@ def list_connector_schemas(self, connector_id: str) -> List[Dict]:
 
         try:
             response = self._make_request("GET", f"/connectors/{connector_id}/schemas")
+            # Log the raw response format for debugging
+            logger.debug(
+                f"Schema response format for connector {connector_id}: {type(response)}"
+            )
+
             schemas = response.get("data", {}).get("schemas", [])
+            logger.debug(f"Schemas format: {type(schemas)}, value: {schemas}")
 
-            # Ensure schemas is a list of dictionaries
-            if not isinstance(schemas, list):
-                logger.warning(
-                    f"Unexpected schema format for connector {connector_id}: {schemas}"
-                )
+            # Handle various schema response formats
+            if schemas is None:
                 schemas = []
+            elif isinstance(schemas, str):
+                # Some APIs return a JSON string that needs to be parsed
+                try:
+                    import json
+
+                    parsed = json.loads(schemas)
+                    if isinstance(parsed, list):
+                        schemas = parsed
+                    else:
+                        logger.warning(f"Parsed schema is not a list: {parsed}")
+                        schemas = []
+                except Exception as e:
+                    logger.warning(f"Failed to parse schema string: {e}")
+                    schemas = []
+            elif not isinstance(schemas, list):
+                logger.warning(f"Unexpected schema type: {type(schemas)}")
+                schemas = []
+
+            # Filter out non-dict entries
+            schemas = [s for s in schemas if isinstance(s, dict)]
 
             self._schema_cache[connector_id] = schemas
+            logger.info(
+                f"Retrieved {len(schemas)} schemas for connector {connector_id}"
+            )
             return schemas
         except Exception as e:
             logger.warning(f"Error fetching schemas for connector {connector_id}: {e}")

metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_standard_api.py

+134 -9
@@ -3,7 +3,11 @@
 
 from datahub.configuration.common import AllowDenyPattern
 from datahub.ingestion.source.fivetran.config import FivetranSourceReport
-from datahub.ingestion.source.fivetran.data_classes import Connector
+from datahub.ingestion.source.fivetran.data_classes import (
+    ColumnLineage,
+    Connector,
+    TableLineage,
+)
 from datahub.ingestion.source.fivetran.fivetran_access import FivetranAccessInterface
 from datahub.ingestion.source.fivetran.fivetran_api_client import FivetranAPIClient
 from datahub.ingestion.source.fivetran.fivetran_query import (
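
For orientation, a sketch of what the two newly imported data classes plausibly look like, reconstructed only from the keyword arguments used later in this diff; the real definitions in data_classes.py may differ or carry extra fields:

from dataclasses import dataclass, field
from typing import List


@dataclass
class ColumnLineage:
    source_column: str
    destination_column: str


@dataclass
class TableLineage:
    source_table: str  # "schema.table" as reported by the source
    destination_table: str  # case-adjusted "SCHEMA.TABLE" on the warehouse side
    column_lineage: List[ColumnLineage] = field(default_factory=list)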
@@ -130,20 +134,141 @@ def _fill_connectors_lineage(self, connectors: List[Connector]) -> None:
         """
         for connector in connectors:
             try:
-                # Extract table lineage for this connector
-                lineage = self.api_client.extract_table_lineage(connector.connector_id)
+                logger.info(
+                    f"Extracting lineage for connector {connector.connector_id}"
+                )
+
+                # Get destination platform from connector properties
+                destination_platform = connector.additional_properties.get(
+                    "destination_platform", "snowflake"
+                )
+
+                # Try to get schema information with detailed logging
+                schemas = self.api_client.list_connector_schemas(connector.connector_id)
+                logger.info(
+                    f"Retrieved {len(schemas)} schemas for connector {connector.connector_id}"
+                )
+
+                lineage_list = []
+
+                # Process each schema
+                for schema in schemas:
+                    try:
+                        schema_name = schema.get("name", "")
+                        if not schema_name:
+                            logger.warning(f"Skipping schema with no name: {schema}")
+                            continue
+
+                        tables = schema.get("tables", [])
+                        if not isinstance(tables, list):
+                            logger.warning(
+                                f"Schema {schema_name} has non-list tables: {tables}"
+                            )
+                            continue
+
+                        # Log the number of tables found
+                        logger.info(
+                            f"Processing {len(tables)} tables in schema {schema_name}"
+                        )
+
+                        # Process each table in the schema
+                        for table in tables:
+                            try:
+                                if not isinstance(table, dict):
+                                    continue
+
+                                table_name = table.get("name", "")
+                                enabled = table.get("enabled", False)
+
+                                if not enabled or not table_name:
+                                    continue
 
-                # Check if we need to truncate the lineage
-                if len(lineage) > MAX_TABLE_LINEAGE_PER_CONNECTOR:
+                                # Create source and destination table identifiers
+                                source_table = f"{schema_name}.{table_name}"
+
+                                # Adjust case based on destination platform
+                                dest_schema = (
+                                    schema_name.upper()
+                                    if destination_platform != "bigquery"
+                                    else schema_name
+                                )
+                                dest_table = (
+                                    table_name.upper()
+                                    if destination_platform != "bigquery"
+                                    else table_name
+                                )
+                                destination_table = f"{dest_schema}.{dest_table}"
+
+                                # Process columns for lineage
+                                column_lineage = []
+                                columns = table.get("columns", [])
+
+                                if isinstance(columns, list):
+                                    for column in columns:
+                                        try:
+                                            if not isinstance(column, dict):
+                                                continue
+
+                                            col_name = column.get("name", "")
+                                            if not col_name:
+                                                continue
+
+                                            # Destination column name follows same case convention as table
+                                            dest_col_name = (
+                                                col_name.upper()
+                                                if destination_platform != "bigquery"
+                                                else col_name
+                                            )
+
+                                            column_lineage.append(
+                                                ColumnLineage(
+                                                    source_column=col_name,
+                                                    destination_column=dest_col_name,
+                                                )
+                                            )
+                                        except Exception as col_e:
+                                            logger.warning(
+                                                f"Error processing column in table {table_name}: {col_e}"
+                                            )
+
+                                # Add this table's lineage
+                                lineage_list.append(
+                                    TableLineage(
+                                        source_table=source_table,
+                                        destination_table=destination_table,
+                                        column_lineage=column_lineage,
+                                    )
+                                )
+
+                                logger.debug(
+                                    f"Added lineage: {source_table} -> {destination_table} with {len(column_lineage)} columns"
+                                )
+                            except Exception as table_e:
+                                logger.warning(
+                                    f"Error processing table {table.get('name', 'unknown')}: {table_e}"
+                                )
+                    except Exception as schema_e:
+                        logger.warning(
+                            f"Error processing schema {schema.get('name', 'unknown')}: {schema_e}"
+                        )
+
+                # Truncate if necessary
+                if len(lineage_list) > MAX_TABLE_LINEAGE_PER_CONNECTOR:
                     logger.warning(
-                        f"Connector {connector.connector_name} has {len(lineage)} tables, "
+                        f"Connector {connector.connector_name} has {len(lineage_list)} tables, "
                         f"truncating to {MAX_TABLE_LINEAGE_PER_CONNECTOR}"
                     )
-                    lineage = lineage[:MAX_TABLE_LINEAGE_PER_CONNECTOR]
+                    lineage_list = lineage_list[:MAX_TABLE_LINEAGE_PER_CONNECTOR]
+
+                connector.lineage = lineage_list
+
+                logger.info(
+                    f"Successfully extracted {len(lineage_list)} table lineages for connector {connector.connector_id}"
+                )
 
-                connector.lineage = lineage
             except Exception as e:
                 logger.error(
-                    f"Failed to extract lineage for connector {connector.connector_name}: {e}"
+                    f"Failed to extract lineage for connector {connector.connector_name}: {e}",
+                    exc_info=True,
                 )
                 connector.lineage = []
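
The core naming rule in this loop is the case convention: BigQuery preserves identifier case, while other destinations (Snowflake being the default) store unquoted identifiers uppercased. A minimal sketch of that rule as a standalone helper; the helper name is mine rather than the commit's:

def destination_identifier(schema_name: str, table_name: str, destination_platform: str) -> str:
    """Mirror the diff's convention: BigQuery keeps source casing, others uppercase."""
    if destination_platform == "bigquery":
        return f"{schema_name}.{table_name}"
    return f"{schema_name.upper()}.{table_name.upper()}"


# Example: the same source table landing in Snowflake vs. BigQuery.
assert destination_identifier("public", "orders", "snowflake") == "PUBLIC.ORDERS"
assert destination_identifier("public", "orders", "bigquery") == "public.orders"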
