Skip to content

Commit 5dd7576

Browse files
committed
Merge branch 'master' into ab-08-jan-2025-fixup-gms
2 parents 622a707 + c0b13f0 commit 5dd7576

File tree

17 files changed

+224
-113
lines changed

17 files changed

+224
-113
lines changed

.github/workflows/airflow-plugin.yml

+5
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,11 @@ jobs:
8787
flags: airflow-${{ matrix.python-version }}-${{ matrix.extra_pip_extras }}
8888
name: pytest-airflow
8989
verbose: true
90+
- name: Upload test results to Codecov
91+
if: ${{ !cancelled() }}
92+
uses: codecov/test-results-action@v1
93+
with:
94+
token: ${{ secrets.CODECOV_TOKEN }}
9095

9196
event-file:
9297
runs-on: ubuntu-latest

.github/workflows/build-and-test.yml

+5
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,11 @@ jobs:
134134
flags: ${{ matrix.timezone }}
135135
name: ${{ matrix.command }}
136136
verbose: true
137+
- name: Upload test results to Codecov
138+
if: ${{ !cancelled() }}
139+
uses: codecov/test-results-action@v1
140+
with:
141+
token: ${{ secrets.CODECOV_TOKEN }}
137142

138143
quickstart-compose-validation:
139144
runs-on: ubuntu-latest

.github/workflows/dagster-plugin.yml

+5
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,11 @@ jobs:
7474
flags: dagster-${{ matrix.python-version }}-${{ matrix.extraPythonRequirement }}
7575
name: pytest-dagster
7676
verbose: true
77+
- name: Upload test results to Codecov
78+
if: ${{ !cancelled() }}
79+
uses: codecov/test-results-action@v1
80+
with:
81+
token: ${{ secrets.CODECOV_TOKEN }}
7782

7883
event-file:
7984
runs-on: ubuntu-latest

.github/workflows/gx-plugin.yml

+5
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,11 @@ jobs:
7878
flags: gx-${{ matrix.python-version }}-${{ matrix.extraPythonRequirement }}
7979
name: pytest-gx
8080
verbose: true
81+
- name: Upload test results to Codecov
82+
if: ${{ !cancelled() }}
83+
uses: codecov/test-results-action@v1
84+
with:
85+
token: ${{ secrets.CODECOV_TOKEN }}
8186

8287
event-file:
8388
runs-on: ubuntu-latest

.github/workflows/metadata-ingestion.yml

+5
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,11 @@ jobs:
9898
flags: ingestion-${{ matrix.python-version }}-${{ matrix.command }}
9999
name: pytest-ingestion
100100
verbose: true
101+
- name: Upload test results to Codecov
102+
if: ${{ !cancelled() }}
103+
uses: codecov/test-results-action@v1
104+
with:
105+
token: ${{ secrets.CODECOV_TOKEN }}
101106

102107
event-file:
103108
runs-on: ubuntu-latest

.github/workflows/metadata-io.yml

+5
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,11 @@ jobs:
9090
fail_ci_if_error: false
9191
name: metadata-io-test
9292
verbose: true
93+
- name: Upload test results to Codecov
94+
if: ${{ !cancelled() }}
95+
uses: codecov/test-results-action@v1
96+
with:
97+
token: ${{ secrets.CODECOV_TOKEN }}
9398

9499
event-file:
95100
runs-on: ubuntu-latest

.github/workflows/prefect-plugin.yml

+5
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,11 @@ jobs:
7070
flags: prefect-${{ matrix.python-version }}
7171
name: pytest-prefect
7272
verbose: true
73+
- name: Upload test results to Codecov
74+
if: ${{ !cancelled() }}
75+
uses: codecov/test-results-action@v1
76+
with:
77+
token: ${{ secrets.CODECOV_TOKEN }}
7378

7479
event-file:
7580
runs-on: ubuntu-latest

metadata-ingestion/src/datahub/cli/cli_utils.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import time
44
import typing
55
from datetime import datetime
6-
from typing import Any, Dict, List, Optional, Tuple, Type, Union
6+
from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
77

88
import click
99
import requests
@@ -33,6 +33,15 @@ def first_non_null(ls: List[Optional[str]]) -> Optional[str]:
3333
return next((el for el in ls if el is not None and el.strip() != ""), None)
3434

3535

36+
_T = TypeVar("_T")
37+
38+
39+
def get_or_else(value: Optional[_T], default: _T) -> _T:
40+
# Normally we'd use `value or default`. However, that runs into issues
41+
# when value is falsey but not None.
42+
return value if value is not None else default
43+
44+
3645
def parse_run_restli_response(response: requests.Response) -> dict:
3746
response_json = response.json()
3847
if response.status_code != 200:

metadata-ingestion/src/datahub/emitter/rest_emitter.py

+125-84
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,21 @@
1+
from __future__ import annotations
2+
13
import functools
24
import json
35
import logging
46
import os
57
from json.decoder import JSONDecodeError
6-
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Union
8+
from typing import (
9+
TYPE_CHECKING,
10+
Any,
11+
Callable,
12+
Dict,
13+
List,
14+
Optional,
15+
Sequence,
16+
Tuple,
17+
Union,
18+
)
719

820
import requests
921
from deprecated import deprecated
@@ -12,9 +24,13 @@
1224

1325
from datahub import nice_version_name
1426
from datahub.cli import config_utils
15-
from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url
27+
from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
1628
from datahub.cli.env_utils import get_boolean_env_variable
17-
from datahub.configuration.common import ConfigurationError, OperationalError
29+
from datahub.configuration.common import (
30+
ConfigModel,
31+
ConfigurationError,
32+
OperationalError,
33+
)
1834
from datahub.emitter.generic_emitter import Emitter
1935
from datahub.emitter.mcp import MetadataChangeProposalWrapper
2036
from datahub.emitter.request_helper import make_curl_command
@@ -31,10 +47,8 @@
3147

3248
logger = logging.getLogger(__name__)
3349

34-
_DEFAULT_CONNECT_TIMEOUT_SEC = 30 # 30 seconds should be plenty to connect
35-
_DEFAULT_READ_TIMEOUT_SEC = (
36-
30 # Any ingest call taking longer than 30 seconds should be abandoned
37-
)
50+
_DEFAULT_TIMEOUT_SEC = 30 # 30 seconds should be plenty to connect
51+
_TIMEOUT_LOWER_BOUND_SEC = 1 # if below this, we log a warning
3852
_DEFAULT_RETRY_STATUS_CODES = [ # Additional status codes to retry on
3953
429,
4054
500,
@@ -63,15 +77,76 @@
6377
)
6478

6579

80+
class RequestsSessionConfig(ConfigModel):
81+
timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC
82+
83+
retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES
84+
retry_methods: List[str] = _DEFAULT_RETRY_METHODS
85+
retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES
86+
87+
extra_headers: Dict[str, str] = {}
88+
89+
ca_certificate_path: Optional[str] = None
90+
client_certificate_path: Optional[str] = None
91+
disable_ssl_verification: bool = False
92+
93+
def build_session(self) -> requests.Session:
94+
session = requests.Session()
95+
96+
if self.extra_headers:
97+
session.headers.update(self.extra_headers)
98+
99+
if self.client_certificate_path:
100+
session.cert = self.client_certificate_path
101+
102+
if self.ca_certificate_path:
103+
session.verify = self.ca_certificate_path
104+
105+
if self.disable_ssl_verification:
106+
session.verify = False
107+
108+
try:
109+
# Set raise_on_status to False to propagate errors:
110+
# https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception
111+
# Must call `raise_for_status` after making a request, which we do
112+
retry_strategy = Retry(
113+
total=self.retry_max_times,
114+
status_forcelist=self.retry_status_codes,
115+
backoff_factor=2,
116+
allowed_methods=self.retry_methods,
117+
raise_on_status=False,
118+
)
119+
except TypeError:
120+
# Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`.
121+
retry_strategy = Retry(
122+
total=self.retry_max_times,
123+
status_forcelist=self.retry_status_codes,
124+
backoff_factor=2,
125+
method_whitelist=self.retry_methods,
126+
raise_on_status=False,
127+
)
128+
129+
adapter = HTTPAdapter(
130+
pool_connections=100, pool_maxsize=100, max_retries=retry_strategy
131+
)
132+
session.mount("http://", adapter)
133+
session.mount("https://", adapter)
134+
135+
if self.timeout is not None:
136+
# Shim session.request to apply default timeout values.
137+
# Via https://stackoverflow.com/a/59317604.
138+
session.request = functools.partial( # type: ignore
139+
session.request,
140+
timeout=self.timeout,
141+
)
142+
143+
return session
144+
145+
66146
class DataHubRestEmitter(Closeable, Emitter):
67147
_gms_server: str
68148
_token: Optional[str]
69149
_session: requests.Session
70-
_connect_timeout_sec: float = _DEFAULT_CONNECT_TIMEOUT_SEC
71-
_read_timeout_sec: float = _DEFAULT_READ_TIMEOUT_SEC
72-
_retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES
73-
_retry_methods: List[str] = _DEFAULT_RETRY_METHODS
74-
_retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES
75150

76151
def __init__(
77152
self,
@@ -102,15 +177,13 @@ def __init__(
102177

103178
self._session = requests.Session()
104179

105-
self._session.headers.update(
106-
{
107-
"X-RestLi-Protocol-Version": "2.0.0",
108-
"X-DataHub-Py-Cli-Version": nice_version_name(),
109-
"Content-Type": "application/json",
110-
}
111-
)
180+
headers = {
181+
"X-RestLi-Protocol-Version": "2.0.0",
182+
"X-DataHub-Py-Cli-Version": nice_version_name(),
183+
"Content-Type": "application/json",
184+
}
112185
if token:
113-
self._session.headers.update({"Authorization": f"Bearer {token}"})
186+
headers["Authorization"] = f"Bearer {token}"
114187
else:
115188
# HACK: When no token is provided but system auth env variables are set, we use them.
116189
# Ideally this should simply get passed in as config, instead of being sneakily injected
@@ -119,75 +192,43 @@ def __init__(
119192
# rest emitter, and the rest sink uses the rest emitter under the hood.
120193
system_auth = config_utils.get_system_auth()
121194
if system_auth is not None:
122-
self._session.headers.update({"Authorization": system_auth})
123-
124-
if extra_headers:
125-
self._session.headers.update(extra_headers)
126-
127-
if client_certificate_path:
128-
self._session.cert = client_certificate_path
129-
130-
if ca_certificate_path:
131-
self._session.verify = ca_certificate_path
132-
133-
if disable_ssl_verification:
134-
self._session.verify = False
135-
136-
self._connect_timeout_sec = (
137-
connect_timeout_sec or timeout_sec or _DEFAULT_CONNECT_TIMEOUT_SEC
138-
)
139-
self._read_timeout_sec = (
140-
read_timeout_sec or timeout_sec or _DEFAULT_READ_TIMEOUT_SEC
141-
)
142-
143-
if self._connect_timeout_sec < 1 or self._read_timeout_sec < 1:
144-
logger.warning(
145-
f"Setting timeout values lower than 1 second is not recommended. Your configuration is connect_timeout:{self._connect_timeout_sec}s, read_timeout:{self._read_timeout_sec}s"
146-
)
147-
148-
if retry_status_codes is not None: # Only if missing. Empty list is allowed
149-
self._retry_status_codes = retry_status_codes
150-
151-
if retry_methods is not None:
152-
self._retry_methods = retry_methods
153-
154-
if retry_max_times:
155-
self._retry_max_times = retry_max_times
195+
headers["Authorization"] = system_auth
156196

157-
try:
158-
# Set raise_on_status to False to propagate errors:
159-
# https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception
160-
# Must call `raise_for_status` after making a request, which we do
161-
retry_strategy = Retry(
162-
total=self._retry_max_times,
163-
status_forcelist=self._retry_status_codes,
164-
backoff_factor=2,
165-
allowed_methods=self._retry_methods,
166-
raise_on_status=False,
167-
)
168-
except TypeError:
169-
# Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`.
170-
retry_strategy = Retry(
171-
total=self._retry_max_times,
172-
status_forcelist=self._retry_status_codes,
173-
backoff_factor=2,
174-
method_whitelist=self._retry_methods,
175-
raise_on_status=False,
197+
timeout: float | tuple[float, float]
198+
if connect_timeout_sec is not None or read_timeout_sec is not None:
199+
timeout = (
200+
connect_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC,
201+
read_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC,
176202
)
203+
if (
204+
timeout[0] < _TIMEOUT_LOWER_BOUND_SEC
205+
or timeout[1] < _TIMEOUT_LOWER_BOUND_SEC
206+
):
207+
logger.warning(
208+
f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is (connect_timeout, read_timeout) = {timeout} seconds"
209+
)
210+
else:
211+
timeout = get_or_else(timeout_sec, _DEFAULT_TIMEOUT_SEC)
212+
if timeout < _TIMEOUT_LOWER_BOUND_SEC:
213+
logger.warning(
214+
f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is timeout = {timeout} seconds"
215+
)
177216

178-
adapter = HTTPAdapter(
179-
pool_connections=100, pool_maxsize=100, max_retries=retry_strategy
180-
)
181-
self._session.mount("http://", adapter)
182-
self._session.mount("https://", adapter)
183-
184-
# Shim session.request to apply default timeout values.
185-
# Via https://stackoverflow.com/a/59317604.
186-
self._session.request = functools.partial( # type: ignore
187-
self._session.request,
188-
timeout=(self._connect_timeout_sec, self._read_timeout_sec),
217+
self._session_config = RequestsSessionConfig(
218+
timeout=timeout,
219+
retry_status_codes=get_or_else(
220+
retry_status_codes, _DEFAULT_RETRY_STATUS_CODES
221+
),
222+
retry_methods=get_or_else(retry_methods, _DEFAULT_RETRY_METHODS),
223+
retry_max_times=get_or_else(retry_max_times, _DEFAULT_RETRY_MAX_TIMES),
224+
extra_headers={**headers, **(extra_headers or {})},
225+
ca_certificate_path=ca_certificate_path,
226+
client_certificate_path=client_certificate_path,
227+
disable_ssl_verification=disable_ssl_verification,
189228
)
190229

230+
self._session = self._session_config.build_session()
231+
191232
def test_connection(self) -> None:
192233
url = f"{self._gms_server}/config"
193234
response = self._session.get(url)

0 commit comments

Comments
 (0)