Skip to content

Commit 31f9368

Browse files
committed
Merge branch 'gc_source_doc' of github.com:treff7es/datahub into gc_source_doc
2 parents 9901423 + 3ee0273 commit 31f9368

File tree

48 files changed

+737
-310
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+737
-310
lines changed

.github/workflows/airflow-plugin.yml

+5
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,11 @@ jobs:
8787
flags: airflow-${{ matrix.python-version }}-${{ matrix.extra_pip_extras }}
8888
name: pytest-airflow
8989
verbose: true
90+
- name: Upload test results to Codecov
91+
if: ${{ !cancelled() }}
92+
uses: codecov/test-results-action@v1
93+
with:
94+
token: ${{ secrets.CODECOV_TOKEN }}
9095

9196
event-file:
9297
runs-on: ubuntu-latest

.github/workflows/build-and-test.yml

+7-2
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ jobs:
113113
if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }}
114114
run: |
115115
./gradlew -PjavaClassVersionDefault=8 :metadata-integration:java:spark-lineage:compileJava
116-
- uses: actions/upload-artifact@v3
116+
- uses: actions/upload-artifact@v4
117117
if: always()
118118
with:
119119
name: Test Results (build)
@@ -134,6 +134,11 @@ jobs:
134134
flags: ${{ matrix.timezone }}
135135
name: ${{ matrix.command }}
136136
verbose: true
137+
- name: Upload test results to Codecov
138+
if: ${{ !cancelled() }}
139+
uses: codecov/test-results-action@v1
140+
with:
141+
token: ${{ secrets.CODECOV_TOKEN }}
137142

138143
quickstart-compose-validation:
139144
runs-on: ubuntu-latest
@@ -152,7 +157,7 @@ jobs:
152157
runs-on: ubuntu-latest
153158
steps:
154159
- name: Upload
155-
uses: actions/upload-artifact@v3
160+
uses: actions/upload-artifact@v4
156161
with:
157162
name: Event File
158163
path: ${{ github.event_path }}

.github/workflows/close-stale-issues.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ jobs:
1010
issues: write
1111
pull-requests: write
1212
steps:
13-
- uses: actions/stale@v6
13+
- uses: actions/stale@v9
1414
with:
1515
ascending: true
1616
operations-per-run: 100

.github/workflows/contributor-open-pr-comment.yml

+3-3
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,12 @@ jobs:
1717
- name: Get and Format Username (PR only)
1818
if: github.event_name == 'pull_request'
1919
run: |
20-
formatted_username=$(echo "${{ github.event.pull_request.user.login }}" | tr '[:upper:]' '[:lower:]' | sed 's/ /-/g')
21-
echo "FORMATTED_USERNAME=$formatted_username" >> $GITHUB_ENV
20+
formatted_username="$(echo "${{ github.event.pull_request.user.login }}" | tr '[:upper:]' '[:lower:]' | sed 's/ /-/g')"
21+
echo "FORMATTED_USERNAME=${formatted_username}" >> "$GITHUB_ENV"
2222
2323
- name: Create Comment (PR only)
2424
if: github.event_name == 'pull_request'
25-
uses: actions/github-script@v6
25+
uses: actions/github-script@v7
2626
with:
2727
script: |
2828
if (context.payload.pull_request) {

.github/workflows/dagster-plugin.yml

+5
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,11 @@ jobs:
7474
flags: dagster-${{ matrix.python-version }}-${{ matrix.extraPythonRequirement }}
7575
name: pytest-dagster
7676
verbose: true
77+
- name: Upload test results to Codecov
78+
if: ${{ !cancelled() }}
79+
uses: codecov/test-results-action@v1
80+
with:
81+
token: ${{ secrets.CODECOV_TOKEN }}
7782

7883
event-file:
7984
runs-on: ubuntu-latest

.github/workflows/docker-unified.yml

+3-3
Original file line numberDiff line numberDiff line change
@@ -1253,19 +1253,19 @@ jobs:
12531253
TEST_STRATEGY="-${{ matrix.test_strategy }}-${{ matrix.batch }}"
12541254
source .github/scripts/docker_logs.sh
12551255
- name: Upload logs
1256-
uses: actions/upload-artifact@v3
1256+
uses: actions/upload-artifact@v4
12571257
if: failure()
12581258
with:
12591259
name: docker-logs-${{ matrix.test_strategy }}-${{ matrix.batch }}
12601260
path: "docker_logs/*.log"
12611261
retention-days: 5
12621262
- name: Upload screenshots
1263-
uses: actions/upload-artifact@v3
1263+
uses: actions/upload-artifact@v4
12641264
if: failure()
12651265
with:
12661266
name: cypress-snapshots-${{ matrix.test_strategy }}-${{ matrix.batch }}
12671267
path: smoke-test/tests/cypress/cypress/screenshots/
1268-
- uses: actions/upload-artifact@v3
1268+
- uses: actions/upload-artifact@v4
12691269
if: always()
12701270
with:
12711271
name: Test Results (smoke tests) ${{ matrix.test_strategy }} ${{ matrix.batch }}

.github/workflows/gx-plugin.yml

+5
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,11 @@ jobs:
7878
flags: gx-${{ matrix.python-version }}-${{ matrix.extraPythonRequirement }}
7979
name: pytest-gx
8080
verbose: true
81+
- name: Upload test results to Codecov
82+
if: ${{ !cancelled() }}
83+
uses: codecov/test-results-action@v1
84+
with:
85+
token: ${{ secrets.CODECOV_TOKEN }}
8186

8287
event-file:
8388
runs-on: ubuntu-latest

.github/workflows/metadata-ingestion.yml

+5
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,11 @@ jobs:
9898
flags: ingestion-${{ matrix.python-version }}-${{ matrix.command }}
9999
name: pytest-ingestion
100100
verbose: true
101+
- name: Upload test results to Codecov
102+
if: ${{ !cancelled() }}
103+
uses: codecov/test-results-action@v1
104+
with:
105+
token: ${{ secrets.CODECOV_TOKEN }}
101106

102107
event-file:
103108
runs-on: ubuntu-latest

.github/workflows/metadata-io.yml

+7-2
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ jobs:
7070
- name: Gradle build (and test)
7171
run: |
7272
./gradlew :metadata-io:test
73-
- uses: actions/upload-artifact@v3
73+
- uses: actions/upload-artifact@v4
7474
if: always()
7575
with:
7676
name: Test Results (metadata-io)
@@ -90,12 +90,17 @@ jobs:
9090
fail_ci_if_error: false
9191
name: metadata-io-test
9292
verbose: true
93+
- name: Upload test results to Codecov
94+
if: ${{ !cancelled() }}
95+
uses: codecov/test-results-action@v1
96+
with:
97+
token: ${{ secrets.CODECOV_TOKEN }}
9398

9499
event-file:
95100
runs-on: ubuntu-latest
96101
steps:
97102
- name: Upload
98-
uses: actions/upload-artifact@v3
103+
uses: actions/upload-artifact@v4
99104
with:
100105
name: Event File
101106
path: ${{ github.event_path }}

.github/workflows/prefect-plugin.yml

+5
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,11 @@ jobs:
7070
flags: prefect-${{ matrix.python-version }}
7171
name: pytest-prefect
7272
verbose: true
73+
- name: Upload test results to Codecov
74+
if: ${{ !cancelled() }}
75+
uses: codecov/test-results-action@v1
76+
with:
77+
token: ${{ secrets.CODECOV_TOKEN }}
7378

7479
event-file:
7580
runs-on: ubuntu-latest

.github/workflows/spark-smoke-test.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -72,14 +72,14 @@ jobs:
7272
docker logs elasticsearch >& elasticsearch-${{ matrix.test_strategy }}.log || true
7373
docker logs datahub-frontend-react >& frontend-${{ matrix.test_strategy }}.log || true
7474
- name: Upload logs
75-
uses: actions/upload-artifact@v3
75+
uses: actions/upload-artifact@v4
7676
if: failure()
7777
with:
7878
name: docker logs
7979
path: |
8080
"**/build/container-logs/*.log"
8181
"*.log"
82-
- uses: actions/upload-artifact@v3
82+
- uses: actions/upload-artifact@v4
8383
if: always()
8484
with:
8585
name: Test Results (smoke tests)

docs/businessattributes.md

+10-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
1+
import FeatureAvailability from '@site/src/components/FeatureAvailability';
2+
13
# Business Attributes
24

5+
<FeatureAvailability ossOnly />
6+
7+
>**Note:** This is a <b>BETA</b> feature
38
49
## What are Business Attributes
510
A Business Attribute, as its name implies, is an attribute with a business focus. It embodies the traits or properties of an entity within a business framework. This attribute is a crucial piece of data for a business, utilised to define or control the entity throughout the organisation. If a business process or concept is depicted as a comprehensive logical model, then each Business Attribute can be considered as an individual component within that model. While business names and descriptions are generally managed through glossary terms, Business Attributes encompass additional characteristics such as data quality rules/assertions, data privacy markers, data usage protocols, standard tags, and supplementary documentation, alongside Names and Descriptions.
@@ -70,9 +75,11 @@ Description inherited from business attribute is greyed out to differentiate bet
7075
</p>
7176

7277
### Enable Business Attributes Feature
73-
By default, business attribute is disabled. To enable Business Attributes feature, set the following configuration in [application.yaml](../metadata-service/configuration/src/main/resources/application.yaml)
74-
75-
businessAttributeEntityEnabled : true
78+
By default, business attribute is disabled. To enable the Business Attributes feature, export the environment variable
79+
(may be done via `extraEnvs` for GMS deployment):
80+
```shell
81+
BUSINESS_ATTRIBUTE_ENTITY_ENABLED=true
82+
```
7683

7784
### What updates are planned for the Business Attributes feature?
7885

metadata-ingestion-modules/airflow-plugin/setup.py

+1
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ def get_long_description():
119119
"pendulum<3.0",
120120
"Flask-Session<0.6.0",
121121
"connexion<3.0",
122+
"marshmallow<3.24.0",
122123
},
123124
}
124125

metadata-ingestion/docs/dev_guides/classification.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@ The classification feature enables sources to be configured to automatically pre
77
Note that a `.` is used to denote nested fields in the YAML recipe.
88

99
| Field | Required | Type | Description | Default |
10-
| ------------------------- | -------- | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- |
10+
| ------------------------- | -------- | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |------------------------------------------------------------|
1111
| enabled | | boolean | Whether classification should be used to auto-detect glossary terms | False |
1212
| sample_size | | int | Number of sample values used for classification. | 100 |
13-
| max_workers | | int | Number of worker processes to use for classification. Set to 1 to disable. | Number of cpu cores or 4 |
13+
| max_workers | | int | Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable. | 1 |
1414
| info_type_to_term | | Dict[str,string] | Optional mapping to provide glossary term identifier for info type. | By default, info type is used as glossary term identifier. |
1515
| classifiers | | Array of object | Classifiers to use to auto-detect glossary terms. If more than one classifier, infotype predictions from the classifier defined later in sequence take precedence. | [{'type': 'datahub', 'config': None}] |
1616
| table_pattern | | AllowDenyPattern (see below for fields) | Regex patterns to filter tables for classification. This is used in combination with other patterns in parent config. Specify regex to match the entire table name in `database.schema.table` format. e.g. to match all tables starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*' | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} |

metadata-ingestion/docs/sources/tableau/tableau_pre.md

+23-7
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,24 @@
33
In order to ingest metadata from Tableau, you will need:
44

55
- Tableau Server Version 2021.1.10 and above. It may also work for older versions.
6-
- [Enable the Tableau Metadata API](https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_start.html#enable-the-tableau-metadata-api-for-tableau-server) for Tableau Server, if its not already enabled.
7-
- Tableau Credentials (Username/Password or [Personal Access Token](https://help.tableau.com/current/pro/desktop/en-us/useracct.htm#create-and-revoke-personal-access-tokens))
8-
- The user or token must have **Site Administrator Explorer** permissions.
6+
- [Enable the Tableau Metadata API](https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_start.html#enable-the-tableau-metadata-api-for-tableau-server) for Tableau Server, if it's not already enabled. This is always enabled for Tableau Cloud.
7+
8+
### Authentication
9+
10+
DataHub supports two authentication methods:
11+
12+
1. Username/Password
13+
2. [Personal Access Token](https://help.tableau.com/current/pro/desktop/en-us/useracct.htm#create-and-revoke-personal-access-tokens)
14+
15+
Either way, the user/token must have the **Site Administrator Explorer** site role.
16+
17+
:::info
18+
19+
We need the `Site Administrator Explorer` site role in order to get complete metadata from Tableau.
20+
21+
With any lower role, the Tableau Metadata API returns missing/partial metadata. This particularly affects data source fields and definitions, which impacts our ability to extract columns and generate column lineage. As such, other site roles like `Viewer` are insufficient with the current Tableau Metadata API.
22+
23+
:::
924

1025
### Ingestion through UI
1126

@@ -46,8 +61,8 @@ This ingestion source maps the following Source System Concepts to DataHub Conce
4661

4762
| Source Concept | DataHub Concept | Notes |
4863
| --------------------------- | ------------------------------------------------------------- | --------------------------------- |
49-
| `"Tableau"` | [Data Platform](../../metamodel/entities/dataPlatform.md) |
50-
| Project | [Container](../../metamodel/entities/container.md) | SubType `"Project"` |
64+
| `"Tableau"` | [Data Platform](../../metamodel/entities/dataPlatform.md) |
65+
| Project | [Container](../../metamodel/entities/container.md) | SubType `"Project"` |
5166
| Embedded DataSource | [Dataset](../../metamodel/entities/dataset.md) | SubType `"Embedded Data Source"` |
5267
| Published DataSource | [Dataset](../../metamodel/entities/dataset.md) | SubType `"Published Data Source"` |
5368
| Custom SQL Table | [Dataset](../../metamodel/entities/dataset.md) | SubTypes `"View"`, `"Custom SQL"` |
@@ -75,14 +90,15 @@ Lineage is emitted as received from Tableau's metadata API for
7590

7691
### Troubleshooting
7792

78-
### Why are only some workbooks/custom SQLs/published datasources ingested from the specified project?
93+
#### Why are only some workbooks/custom SQLs/published datasources ingested from the specified project?
7994

8095
This may happen when the Tableau API returns NODE_LIMIT_EXCEEDED error in response to metadata query and returns partial results with message "Showing partial results. , The request exceeded the ‘n’ node limit. Use pagination, additional filtering, or both in the query to adjust results." To resolve this, consider
8196

8297
- reducing the page size using the `page_size` config param in datahub recipe (Defaults to 10).
8398
- increasing tableau configuration [metadata query node limit](https://help.tableau.com/current/server/en-us/cli_configuration-set_tsm.htm#metadata_nodelimit) to higher value.
8499

85-
### `PERMISSIONS_MODE_SWITCHED` error in ingestion report
100+
#### `PERMISSIONS_MODE_SWITCHED` error in ingestion report
101+
86102
This error occurs if the Tableau site is using external assets. For more detail, refer to the Tableau documentation [Manage Permissions for External Assets](https://help.tableau.com/current/online/en-us/dm_perms_assets.htm).
87103

88104
Follow the below steps to enable the derived permissions:

metadata-ingestion/setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,7 @@
461461
"mssql-odbc": sql_common | mssql_common | {"pyodbc"},
462462
"mysql": mysql,
463463
# mariadb should have same dependency as mysql
464-
"mariadb": sql_common | {"pymysql>=1.0.2"},
464+
"mariadb": sql_common | mysql,
465465
"okta": {"okta~=1.7.0", "nest-asyncio"},
466466
"oracle": sql_common | {"oracledb"},
467467
"postgres": sql_common | postgres_common,

metadata-ingestion/src/datahub/cli/cli_utils.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import time
44
import typing
55
from datetime import datetime
6-
from typing import Any, Dict, List, Optional, Tuple, Type, Union
6+
from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
77

88
import click
99
import requests
@@ -33,6 +33,15 @@ def first_non_null(ls: List[Optional[str]]) -> Optional[str]:
3333
return next((el for el in ls if el is not None and el.strip() != ""), None)
3434

3535

36+
_T = TypeVar("_T")
37+
38+
39+
def get_or_else(value: Optional[_T], default: _T) -> _T:
40+
# Normally we'd use `value or default`. However, that runs into issues
41+
# when value is falsey but not None.
42+
return value if value is not None else default
43+
44+
3645
def parse_run_restli_response(response: requests.Response) -> dict:
3746
response_json = response.json()
3847
if response.status_code != 200:
@@ -321,6 +330,8 @@ def get_frontend_session_login_as(
321330
def _ensure_valid_gms_url_acryl_cloud(url: str) -> str:
322331
if "acryl.io" not in url:
323332
return url
333+
if url.endswith(":8080"):
334+
url = url.replace(":8080", "")
324335
if url.startswith("http://"):
325336
url = url.replace("http://", "https://")
326337
if url.endswith("acryl.io"):

0 commit comments

Comments
 (0)