
Commit adf82be

BQ improvements
1 parent 7930212 · commit adf82be

3 files changed: +122 -52

metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py

+28 -9
@@ -530,10 +530,21 @@ def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow:
         properties["paused"] = str(connector.paused)
         properties["destination_id"] = connector.destination_id

-        # Get destination platform if available
-        if "destination_platform" in connector.additional_properties:
-            destination = connector.additional_properties.get("destination_platform")
-            description += f" to {destination}"
+        # Get destination platform
+        default_destination = (
+            "bigquery"
+            if (
+                hasattr(self.config, "fivetran_log_config")
+                and self.config.fivetran_log_config
+                and self.config.fivetran_log_config.destination_platform == "bigquery"
+            )
+            else "snowflake"
+        )
+
+        destination = connector.additional_properties.get(
+            "destination_platform", default_destination
+        )
+        description += f" to {destination}"

         return DataFlow(
             orchestrator=Constant.ORCHESTRATOR,
@@ -562,12 +573,20 @@ def _generate_datajob_from_connector(self, connector: Connector) -> DataJob:
         # Get source platform from connector type
         source_platform = self._detect_source_platform(connector)

-        # Get destination platform
-        destination_platform = "snowflake"  # Default
-        if "destination_platform" in connector.additional_properties:
-            destination_platform = connector.additional_properties.get(
-                "destination_platform"
+        # Get destination platform - use bigquery as default if we have bigquery config
+        default_destination = (
+            "bigquery"
+            if (
+                hasattr(self.config, "fivetran_log_config")
+                and self.config.fivetran_log_config
+                and self.config.fivetran_log_config.destination_platform == "bigquery"
             )
+            else "snowflake"
+        )
+
+        destination_platform = connector.additional_properties.get(
+            "destination_platform", default_destination
+        )

         # Create job description
         description = f"Fivetran data pipeline from {connector.connector_type} to {destination_platform}"
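
Both hunks in this file apply the same pattern: derive a default destination from the ingestion config, then let a value reported on the connector override it. A minimal standalone sketch of that resolution logic follows; the SimpleNamespace stand-ins and sample values are hypothetical, not the real DataHub config classes.

# Minimal sketch of the destination-resolution pattern above.
# SimpleNamespace stands in for the real config/connector objects;
# the sample values are hypothetical.
from types import SimpleNamespace

config = SimpleNamespace(
    fivetran_log_config=SimpleNamespace(destination_platform="bigquery")
)
additional_properties = {}  # connector reported no destination_platform

default_destination = (
    "bigquery"
    if (
        hasattr(config, "fivetran_log_config")
        and config.fivetran_log_config
        and config.fivetran_log_config.destination_platform == "bigquery"
    )
    else "snowflake"
)

destination = additional_properties.get("destination_platform", default_destination)
print(destination)  # -> "bigquery"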

metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_api_client.py

+70 -30
@@ -206,45 +206,85 @@ def list_connector_schemas(self, connector_id: str) -> List[Dict]:

         try:
             response = self._make_request("GET", f"/connectors/{connector_id}/schemas")
-            # Log the raw response format for debugging
-            logger.debug(
-                f"Schema response format for connector {connector_id}: {type(response)}"
-            )

-            schemas = response.get("data", {}).get("schemas", [])
-            logger.debug(f"Schemas format: {type(schemas)}, value: {schemas}")
+            # Debug the response format
+            logger.debug(f"Schema response for connector {connector_id}: {response}")

-            # Handle various schema response formats
-            if schemas is None:
-                schemas = []
-            elif isinstance(schemas, str):
-                # Some APIs return a JSON string that needs to be parsed
-                try:
-                    import json
-
-                    parsed = json.loads(schemas)
-                    if isinstance(parsed, list):
-                        schemas = parsed
-                    else:
-                        logger.warning(f"Parsed schema is not a list: {parsed}")
-                        schemas = []
-                except Exception as e:
-                    logger.warning(f"Failed to parse schema string: {e}")
-                    schemas = []
-            elif not isinstance(schemas, list):
-                logger.warning(f"Unexpected schema type: {type(schemas)}")
-                schemas = []
+            # The API can return schemas in different formats
+            # Format 1: {'data': {'schemas': [...]}}
+            # Format 2: {'data': {'schemas': {'schema_name': {'name_in_destination': 'schema_name', 'enabled': True, 'tables': {...}}}}}
+            raw_schemas = response.get("data", {}).get("schemas", [])

-            # Filter out non-dict entries
-            schemas = [s for s in schemas if isinstance(s, dict)]
+            schemas = []
+
+            # Handle different response formats
+            if isinstance(raw_schemas, dict):
+                # Handle nested object format
+                logger.info(
+                    f"Converting nested schema format for connector {connector_id}"
+                )
+                for schema_name, schema_data in raw_schemas.items():
+                    # Convert to the expected format
+                    schema_obj = {
+                        "name": schema_name,
+                        "name_in_destination": schema_data.get(
+                            "name_in_destination", schema_name
+                        ),
+                        "enabled": schema_data.get("enabled", True),
+                        "tables": [],
+                    }
+
+                    # Convert tables from dict to list format
+                    tables_dict = schema_data.get("tables", {})
+                    if isinstance(tables_dict, dict):
+                        for table_name, table_data in tables_dict.items():
+                            table_obj = {
+                                "name": table_name,
+                                "name_in_destination": table_data.get(
+                                    "name_in_destination", table_name
+                                ),
+                                "enabled": table_data.get("enabled", False),
+                            }
+
+                            # Handle columns if present
+                            columns_dict = table_data.get("columns", {})
+                            columns = []
+                            if isinstance(columns_dict, dict):
+                                for column_name, column_data in columns_dict.items():
+                                    column_obj = {
+                                        "name": column_name,
+                                        "name_in_destination": column_data.get(
+                                            "name_in_destination", column_name
+                                        ),
+                                        "enabled": column_data.get("enabled", True),
+                                    }
+                                    columns.append(column_obj)
+
+                            if columns:
+                                table_obj["columns"] = columns
+
+                            schema_obj["tables"].append(table_obj)
+
+                    schemas.append(schema_obj)
+            elif isinstance(raw_schemas, list):
+                # Already in the expected format
+                schemas = raw_schemas
+            else:
+                logger.warning(
+                    f"Unexpected schema format type for connector {connector_id}: {type(raw_schemas)}"
+                )
+                schemas = []

             self._schema_cache[connector_id] = schemas
             logger.info(
-                f"Retrieved {len(schemas)} schemas for connector {connector_id}"
+                f"Processed {len(schemas)} schemas for connector {connector_id}"
             )
             return schemas
         except Exception as e:
-            logger.warning(f"Error fetching schemas for connector {connector_id}: {e}")
+            logger.warning(
+                f"Error fetching schemas for connector {connector_id}: {e}",
+                exc_info=True,
+            )
             return []

     def list_users(self) -> List[Dict]:
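
For illustration, here is a self-contained run of the dict-to-list conversion above on a made-up "Format 2" payload; the schema, table, and column names are invented for this example, and the logic is condensed from the diff.

# Hypothetical "Format 2" payload; all names are invented for this example.
raw_schemas = {
    "public": {
        "name_in_destination": "public",
        "enabled": True,
        "tables": {
            "orders": {
                "name_in_destination": "orders",
                "enabled": True,
                "columns": {"id": {"name_in_destination": "id", "enabled": True}},
            }
        },
    }
}

# Condensed version of the dict-to-list conversion in the diff above.
schemas = []
for schema_name, schema_data in raw_schemas.items():
    schema_obj = {
        "name": schema_name,
        "name_in_destination": schema_data.get("name_in_destination", schema_name),
        "enabled": schema_data.get("enabled", True),
        "tables": [],
    }
    for table_name, table_data in schema_data.get("tables", {}).items():
        table_obj = {
            "name": table_name,
            "name_in_destination": table_data.get("name_in_destination", table_name),
            "enabled": table_data.get("enabled", False),
        }
        columns = [
            {
                "name": c,
                "name_in_destination": d.get("name_in_destination", c),
                "enabled": d.get("enabled", True),
            }
            for c, d in table_data.get("columns", {}).items()
        ]
        if columns:
            table_obj["columns"] = columns
        schema_obj["tables"].append(table_obj)
    schemas.append(schema_obj)

print(schemas[0]["tables"][0]["name"])  # -> "orders"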

metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_standard_api.py

+24 -13
@@ -138,15 +138,29 @@ def _fill_connectors_lineage(self, connectors: List[Connector]) -> None:
                 f"Extracting lineage for connector {connector.connector_id}"
             )

-            # Get destination platform from connector properties
-            destination_platform = connector.additional_properties.get(
-                "destination_platform", "snowflake"
-            )
+            # Make sure we're using the correct destination platform from config
+            # The destination_platform was incorrectly set to "snowflake" as the default
+            if "destination_platform" in connector.additional_properties:
+                destination_platform = connector.additional_properties.get(
+                    "destination_platform"
+                )
+            else:
+                # Use the platform from the config if available
+                destination_platform = (
+                    "bigquery"  # Default to bigquery based on your config
+                )
+                # Update the connector properties
+                connector.additional_properties["destination_platform"] = (
+                    destination_platform
+                )
+                logger.info(
+                    f"Setting destination platform to {destination_platform} for connector {connector.connector_id}"
+                )

-            # Try to get schema information with detailed logging
+            # Get schema information
             schemas = self.api_client.list_connector_schemas(connector.connector_id)
             logger.info(
-                f"Retrieved {len(schemas)} schemas for connector {connector.connector_id}"
+                f"Got {len(schemas)} schemas for connector {connector.connector_id}"
             )

             lineage_list = []
@@ -156,21 +170,18 @@ def _fill_connectors_lineage(self, connectors: List[Connector]) -> None:
                 try:
                     schema_name = schema.get("name", "")
                     if not schema_name:
-                        logger.warning(f"Skipping schema with no name: {schema}")
+                        logger.warning(
+                            f"Skipping schema with no name in connector {connector.connector_id}"
+                        )
                         continue

                     tables = schema.get("tables", [])
                     if not isinstance(tables, list):
                         logger.warning(
-                            f"Schema {schema_name} has non-list tables: {tables}"
+                            f"Schema {schema_name} has non-list tables: {type(tables)}"
                         )
                         continue

-                    # Log the number of tables found
-                    logger.info(
-                        f"Processing {len(tables)} tables in schema {schema_name}"
-                    )
-
                     # Process each table in the schema
                     for table in tables:
                         try:
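
The new fallback branch also writes the resolved value back into the connector's additional_properties, so downstream stages see a consistent destination. A small sketch of that behavior follows; the Connector stand-in and connector_id are hypothetical.

# Sketch of the fallback branch; the Connector stand-in is hypothetical.
from types import SimpleNamespace

connector = SimpleNamespace(connector_id="abc123", additional_properties={})

if "destination_platform" in connector.additional_properties:
    destination_platform = connector.additional_properties["destination_platform"]
else:
    destination_platform = "bigquery"  # hard-coded default from this commit
    # Write the resolved value back so later stages see it.
    connector.additional_properties["destination_platform"] = destination_platform

print(connector.additional_properties)  # -> {'destination_platform': 'bigquery'}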
