
Commit 0e62e8c

feat(ingestion) Adding vertexAI ingestion source (v1 - model group and model) (#12632)
1 parent f507e2c commit 0e62e8c

15 files changed: +1914 -48 lines

datahub-web-react/src/app/ingest/source/builder/sources.json (+7)

@@ -333,5 +333,12 @@
     "description": "Import Nodes and Relationships from Neo4j.",
     "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/neo4j/",
     "recipe": "source:\n type: 'neo4j'\n config:\n uri: 'neo4j+ssc://host:7687'\n username: 'neo4j'\n password: 'password'\n env: 'PROD'\n\nsink:\n type: \"datahub-rest\"\n config:\n server: 'http://localhost:8080'"
+  },
+  {
+    "urn": "urn:li:dataPlatform:vertexai",
+    "name": "vertexai",
+    "displayName": "VertexAI",
+    "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/vertexai/",
+    "recipe": "source:\n type: vertexai\n config:\n project_id: # your GCP project ID\n region: # region where your GCP project resides\n # Credentials\n # Add GCP credentials"
   }
 ]
(Binary file added, 11.8 KB; not rendered)
New file: Vertex AI source documentation (+48)

@@ -0,0 +1,48 @@
Ingesting metadata from Vertex AI requires the **Vertex AI** module.

#### Prerequisites
Please refer to the [Vertex AI documentation](https://cloud.google.com/vertex-ai/docs) for basic information on Vertex AI.

#### Credentials to access GCP
Please read the [GCP docs](https://cloud.google.com/docs/authentication/provide-credentials-adc#how-to) to understand how to set up Application Default Credentials for GCP.

#### Create a service account and assign roles

1. Set up a service account as per the [GCP docs](https://cloud.google.com/iam/docs/creating-managing-service-accounts#iam-service-accounts-create-console) and grant it the roles needed to read Vertex AI metadata (for example, the Vertex AI Viewer role).
2. Download a service account JSON keyfile.
   - Example credential file:

```json
{
  "type": "service_account",
  "project_id": "project-id-1234567",
  "private_key_id": "d0121d0000882411234e11166c6aaa23ed5d74e0",
  "private_key": "-----BEGIN PRIVATE KEY-----\nMIIyourkey\n-----END PRIVATE KEY-----",
  "client_email": "[email protected]",
  "client_id": "113545814931671546333",
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/test%suppproject-id-1234567.iam.gserviceaccount.com"
}
```

3. To provide credentials to the source, you can either:

   - Set an environment variable:

```sh
$ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"
```

   _or_

   - Set the `credential` config in your source based on the credential JSON file. For example:

```yml
credential:
  private_key_id: "d0121d0000882411234e11166c6aaa23ed5d74e0"
  private_key: "-----BEGIN PRIVATE KEY-----\nMIIyourkey\n-----END PRIVATE KEY-----\n"
  client_email: "[email protected]"
  client_id: "123456678890"
```
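
As a quick way to verify a downloaded keyfile before wiring it into a recipe, you can load it with the `google-auth` library (pulled in as a dependency of `google-cloud-aiplatform`). This is a minimal sketch; the keyfile path is a placeholder:

```python
# Minimal sanity check for a downloaded service account keyfile.
# Assumes the google-auth package is installed; the path below is a placeholder.
from google.oauth2 import service_account

credentials = service_account.Credentials.from_service_account_file(
    "/path/to/keyfile.json"
)
# Should print the client_email from the keyfile.
print(credentials.service_account_email)
```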
New file: sample Vertex AI recipe (+16)

@@ -0,0 +1,16 @@
source:
  type: vertexai
  config:
    project_id: "acryl-poc"
    region: "us-west2"
    # You must either set GOOGLE_APPLICATION_CREDENTIALS or provide credential as shown below
    # credential:
    #   private_key: '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'
    #   private_key_id: "project_key_id"
    #   client_email: "client_email"
    #   client_id: "client_id"

sink:
  type: "datahub-rest"
  config:
    server: "http://localhost:8080"
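
A recipe like this can be run with the DataHub CLI (`datahub ingest -c <recipe>.yml`) or programmatically. The sketch below assumes `acryl-datahub[vertexai]` is installed, GCP credentials are available via `GOOGLE_APPLICATION_CREDENTIALS`, and a DataHub GMS is reachable at `http://localhost:8080`; the project and region values are the same placeholders as in the sample recipe:

```python
# Minimal sketch: run the sample Vertex AI recipe with DataHub's programmatic Pipeline API.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "vertexai",
            "config": {"project_id": "acryl-poc", "region": "us-west2"},
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()  # fail loudly if the run reported errors
```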

metadata-ingestion/setup.py (+4)

@@ -532,6 +532,7 @@
     "sigma": sqlglot_lib | {"requests"},
     "sac": sac,
     "neo4j": {"pandas", "neo4j"},
+    "vertexai": {"google-cloud-aiplatform>=1.80.0"},
 }

 # This is mainly used to exclude plugins from the Docker image.
@@ -677,6 +678,7 @@
     "sac",
     "cassandra",
     "neo4j",
+    "vertexai",
 ]
 if plugin
 for dependency in plugins[plugin]
@@ -710,6 +712,7 @@
     "mariadb",
     "redash",
     "vertica",
+    "vertexai"
 ]
 if plugin
 for dependency in plugins[plugin]
@@ -799,6 +802,7 @@
     "sac = datahub.ingestion.source.sac.sac:SACSource",
     "cassandra = datahub.ingestion.source.cassandra.cassandra:CassandraSource",
     "neo4j = datahub.ingestion.source.neo4j.neo4j_source:Neo4jSource",
+    "vertexai = datahub.ingestion.source.vertexai:VertexAISource",
 ],
 "datahub.ingestion.transformer.plugins": [
     "pattern_cleanup_ownership = datahub.ingestion.transformer.pattern_cleanup_ownership:PatternCleanUpOwnership",

metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py (+2 -46)

@@ -1,8 +1,6 @@
-import json
 import logging
 import os
 import re
-import tempfile
 from datetime import timedelta
 from typing import Any, Dict, List, Optional, Union

@@ -17,10 +15,10 @@
     PlatformInstanceConfigMixin,
 )
 from datahub.configuration.validate_field_removal import pydantic_removed_field
-from datahub.configuration.validate_multiline_string import pydantic_multiline_string
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
+from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
 from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (
@@ -107,50 +105,8 @@ class BigQueryUsageConfig(BaseUsageConfig):
     )


-class BigQueryCredential(ConfigModel):
-    project_id: str = Field(description="Project id to set the credentials")
-    private_key_id: str = Field(description="Private key id")
-    private_key: str = Field(
-        description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'"
-    )
-    client_email: str = Field(description="Client email")
-    client_id: str = Field(description="Client Id")
-    auth_uri: str = Field(
-        default="https://accounts.google.com/o/oauth2/auth",
-        description="Authentication uri",
-    )
-    token_uri: str = Field(
-        default="https://oauth2.googleapis.com/token", description="Token uri"
-    )
-    auth_provider_x509_cert_url: str = Field(
-        default="https://www.googleapis.com/oauth2/v1/certs",
-        description="Auth provider x509 certificate url",
-    )
-    type: str = Field(default="service_account", description="Authentication type")
-    client_x509_cert_url: Optional[str] = Field(
-        default=None,
-        description="If not set it will be default to https://www.googleapis.com/robot/v1/metadata/x509/client_email",
-    )
-
-    _fix_private_key_newlines = pydantic_multiline_string("private_key")
-
-    @root_validator(skip_on_failure=True)
-    def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-        if values.get("client_x509_cert_url") is None:
-            values["client_x509_cert_url"] = (
-                f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}"
-            )
-        return values
-
-    def create_credential_temp_file(self) -> str:
-        with tempfile.NamedTemporaryFile(delete=False) as fp:
-            cred_json = json.dumps(self.dict(), indent=4, separators=(",", ": "))
-            fp.write(cred_json.encode())
-        return fp.name
-
-
 class BigQueryConnectionConfig(ConfigModel):
-    credential: Optional[BigQueryCredential] = Field(
+    credential: Optional[GCPCredential] = Field(
         default=None, description="BigQuery credential informations"
     )

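Since `GCPCredential` keeps the same field names and defaults as the removed `BigQueryCredential`, existing BigQuery recipes that set `credential:` should parse unchanged. A minimal sketch of that, assuming the remaining `BigQueryConnectionConfig` fields keep their defaults; all key material below is a placeholder:

```python
# Minimal sketch: an existing BigQuery credential config still parses after the refactor.
# Field values are placeholders, not real credentials.
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryConnectionConfig

connection = BigQueryConnectionConfig.parse_obj(
    {
        "credential": {
            "project_id": "project-id-1234567",
            "private_key_id": "d0121d0000882411234e11166c6aaa23ed5d74e0",
            "private_key": "-----BEGIN PRIVATE KEY-----\nMIIyourkey\n-----END PRIVATE KEY-----\n",
            "client_email": "[email protected]",
            "client_id": "113545814931671546333",
        }
    }
)
print(type(connection.credential).__name__)  # GCPCredential
```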
New file: metadata-ingestion/src/datahub/ingestion/source/common/gcp_credentials_config.py (+53)

@@ -0,0 +1,53 @@
import json
import tempfile
from typing import Any, Dict, Optional

from pydantic import Field, root_validator

from datahub.configuration import ConfigModel
from datahub.configuration.validate_multiline_string import pydantic_multiline_string


class GCPCredential(ConfigModel):
    project_id: Optional[str] = Field(description="Project id to set the credentials")
    private_key_id: str = Field(description="Private key id")
    private_key: str = Field(
        description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'"
    )
    client_email: str = Field(description="Client email")
    client_id: str = Field(description="Client Id")
    auth_uri: str = Field(
        default="https://accounts.google.com/o/oauth2/auth",
        description="Authentication uri",
    )
    token_uri: str = Field(
        default="https://oauth2.googleapis.com/token", description="Token uri"
    )
    auth_provider_x509_cert_url: str = Field(
        default="https://www.googleapis.com/oauth2/v1/certs",
        description="Auth provider x509 certificate url",
    )
    type: str = Field(default="service_account", description="Authentication type")
    client_x509_cert_url: Optional[str] = Field(
        default=None,
        description="If not set it will be default to https://www.googleapis.com/robot/v1/metadata/x509/client_email",
    )

    _fix_private_key_newlines = pydantic_multiline_string("private_key")

    @root_validator(skip_on_failure=True)
    def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        if values.get("client_x509_cert_url") is None:
            values["client_x509_cert_url"] = (
                f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}"
            )
        return values

    def create_credential_temp_file(self, project_id: Optional[str] = None) -> str:
        configs = self.dict()
        if project_id:
            configs["project_id"] = project_id
        with tempfile.NamedTemporaryFile(delete=False) as fp:
            cred_json = json.dumps(configs, indent=4, separators=(",", ": "))
            fp.write(cred_json.encode())
        return fp.name
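
For illustration, the sketch below shows how a GCP-backed source could use this shared model: build a `GCPCredential`, materialize it to a temporary keyfile (injecting a project id), and point `GOOGLE_APPLICATION_CREDENTIALS` at it so Google client libraries pick it up. All key material is a placeholder, and this is only one plausible use of the helper, not necessarily the exact code path of the new source:

```python
# Minimal sketch of using the shared GCPCredential helper; values are placeholders.
import os

from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential

credential = GCPCredential(
    private_key_id="d0121d0000882411234e11166c6aaa23ed5d74e0",
    private_key="-----BEGIN PRIVATE KEY-----\nMIIyourkey\n-----END PRIVATE KEY-----\n",
    client_email="[email protected]",
    client_id="123456678890",
)
# Write the credential to a temporary keyfile, filling in the project id,
# and expose it through the standard GCP environment variable.
keyfile_path = credential.create_credential_temp_file(project_id="project-id-1234567")
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = keyfile_path
```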
