Skip to content

Commit 02ee2a2

Browse files
authored
Merge pull request #3 from vantage6/change/indicate-numerical-columns-as-list-of-columns
Indicate numerical columns as list of columns instead of list of bool…
2 parents 32bba58 + 33a2fd0 commit 02ee2a2

File tree

7 files changed

+96
-102
lines changed

7 files changed

+96
-102
lines changed

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ image:
1919
@echo "Building ${REGISTRY}/algorithms/summary:latest"
2020
docker buildx build \
2121
--tag ${REGISTRY}/algorithms/summary:${TAG}-v6-${VANTAGE6_VERSION} \
22-
--tag ${REGISTRY}/algorithms/summary:latest \
22+
--tag ${REGISTRY}/algorithms/summary:${TAG} \
2323
--platform ${PLATFORMS} \
2424
-f ./Dockerfile \
2525
$(if ${_condition_push},--push .,.)

algorithm_store.json

+43-31
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,87 @@
11
{
2+
"image": "harbor2.vantage6.ai/algorithms/summary",
3+
"vantage6_version": "4.9",
24
"description": "Compute common statistics such as mean, SD, min, and max",
5+
"code_url": "https://github.com/vantage6/v6-summary-py.git",
36
"functions": [
47
{
8+
"type": "central",
59
"arguments": [
610
{
7-
"default_value": null,
8-
"conditional_value": null,
9-
"name": "columns",
11+
"type": "column_list",
1012
"is_frontend_only": false,
1113
"description": "The columns to include in the summary. If not given, all columns are included.",
1214
"conditional_operator": null,
13-
"type": "column_list",
15+
"conditional_value": null,
16+
"name": "columns",
17+
"default_value": null,
1418
"display_name": "Columns to include",
1519
"has_default_value": true
1620
},
1721
{
18-
"default_value": null,
22+
"type": "column_list",
23+
"is_frontend_only": false,
24+
"description": "Indicate which columns are to be treated numerical. If not given, this will be inferred from the data",
25+
"conditional_operator": null,
1926
"conditional_value": null,
20-
"name": "organizations_to_include",
27+
"name": "numeric_columns",
28+
"default_value": null,
29+
"display_name": "Numerical columns",
30+
"has_default_value": true
31+
},
32+
{
33+
"type": "organization_list",
2134
"is_frontend_only": false,
2235
"description": "The organizations to include in the task. If not given, all organizations in the collaboration are included.",
2336
"conditional_operator": null,
24-
"type": "organization_list",
37+
"conditional_value": null,
38+
"name": "organizations_to_include",
39+
"default_value": null,
2540
"display_name": "Organizations to include",
2641
"has_default_value": true
2742
}
2843
],
29-
"name": "summary",
30-
"description": "Run summary algorithm and obtain statistics combined for all nodes",
31-
"type": "central",
3244
"ui_visualizations": [
3345
{
46+
"type": "table",
47+
"description": "Table with summary statistics of all numerical columns",
48+
"name": "Summary of numerical columns",
3449
"schema": {
3550
"location": ["numeric"],
3651
"columns": []
37-
},
38-
"name": "Summary of numerical columns",
39-
"description": "Table with summary statistics of all numerical columns",
40-
"type": "table"
52+
}
4153
},
4254
{
55+
"type": "table",
56+
"description": "Table with summary statistics of all categorical columns",
57+
"name": "Summary of categorical columns",
4358
"schema": {
4459
"location": ["categorical"],
4560
"columns": []
46-
},
47-
"name": "Summary of categorical columns",
48-
"description": "Table with summary statistics of all categorical columns",
49-
"type": "table"
61+
}
5062
},
5163
{
64+
"type": "table",
65+
"description": "Table with counts of unique values in the categorical columns",
66+
"name": "Unique value counts",
5267
"schema": {
5368
"location": ["counts_unique_values"],
5469
"columns": []
55-
},
56-
"name": "Unique value counts",
57-
"description": "Table with counts of unique values in the categorical columns",
58-
"type": "table"
70+
}
5971
},
6072
{
73+
"type": "table",
74+
"description": "Table with the number of complete data rows per data station",
75+
"name": "Number of complete data rows",
6176
"schema": {
6277
"location": ["num_complete_rows_per_node"],
6378
"columns": []
64-
},
65-
"name": "Number of complete data rows",
66-
"description": "Table with the number of complete data rows per data station",
67-
"type": "table"
79+
}
6880
}
6981
],
82+
"standalone": true,
83+
"description": "Run summary algorithm and obtain statistics combined for all nodes",
84+
"name": "summary",
7085
"display_name": "Summary",
7186
"databases": [
7287
{
@@ -76,11 +91,8 @@
7691
]
7792
}
7893
],
79-
"vantage6_version": "4.9",
80-
"documentation_url": "https://algorithms.vantage6.ai/en/latest/v6-summary-py/docs/index.html",
8194
"name": "Summary statistics",
82-
"partitioning": "horizontal",
83-
"image": "harbor2.vantage6.ai/algorithms/summary",
8495
"developer_id": 1,
85-
"code_url": "https://github.com/vantage6/v6-summary-py.git"
96+
"partitioning": "horizontal",
97+
"documentation_url": "https://algorithms.vantage6.ai/en/latest/v6-summary-py/docs/index.html"
8698
}

docs/v6-summary-py/usage.rst

+3-3
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@ Input arguments
1515
- List of column names (strings)
1616
- The columns for which to get a data summary. If not provided, all columns will
1717
be used.
18-
* - ``is_numeric``
19-
- List of booleans
20-
- Indicate whether the columns are numeric or not. If not provided, the algorithm
18+
* - ``numeric_columns``
19+
- List of column names (strings)
20+
- Indicate which of the columns are numeric. If not provided, the algorithm
2121
will infer the type of the columns.
2222
* - ``organizations_to_include``
2323
- List of integers

test/test.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ def test_convert_categorical_to_numeric():
247247
"method": "summary",
248248
"kwargs": {
249249
"columns": ["D"],
250-
"is_numeric": [True],
250+
"numeric_columns": ["D"],
251251
},
252252
},
253253
organizations=[org_ids[0]],

v6-summary-py/central.py

+9-7
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
def summary(
2020
client: AlgorithmClient,
2121
columns: list[str] | None = None,
22-
is_numeric: list[bool] | None = None,
22+
numeric_columns: list[str] | None = None,
2323
organizations_to_include: list[int] | None = None,
2424
) -> Any:
2525
"""
@@ -32,16 +32,18 @@ def summary(
3232
The client object used to communicate with the server.
3333
columns : list[str] | None
3434
The columns to include in the summary. If not given, all columns are included.
35-
is_numeric : list[bool] | None
36-
Whether each of the columns is numeric or not. If not given, the algorithm will
37-
try to infer the type of the columns.
35+
numeric_columns : list[str] | None
36+
Which of the columns are numeric. If not given, it will be inferred from the
37+
data.
3838
organizations_to_include : list[int] | None
3939
The organizations to include in the task. If not given, all organizations
4040
in the collaboration are included.
4141
"""
42-
if is_numeric and len(is_numeric) != len(columns):
42+
if columns and numeric_columns and not set(numeric_columns).issubset(set(columns)):
43+
numeric_not_in_columns = set(numeric_columns) - set(columns)
4344
raise InputError(
44-
"Length of is_numeric list does not match the length of columns list"
45+
"The 'numeric_columns' should be a subset of 'columns'. The following "
46+
f"columns are not in 'columns': {numeric_not_in_columns}"
4547
)
4648

4749
# get all organizations (ids) within the collaboration so you can send a
@@ -58,7 +60,7 @@ def summary(
5860
"method": "summary_per_data_station",
5961
"kwargs": {
6062
"columns": columns,
61-
"is_numeric": is_numeric,
63+
"numeric_columns": numeric_columns,
6264
},
6365
}
6466

v6-summary-py/partial_summary.py

+12-17
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,14 @@
1717
ENVVAR_PRIVACY_THRESHOLD,
1818
EnvVarsAllowed,
1919
)
20-
from .utils import check_privacy, check_match_inferred_is_numeric
20+
from .utils import check_privacy, check_match_inferred_numeric
2121

2222

2323
@data(1)
2424
def summary_per_data_station(
2525
df: pd.DataFrame,
2626
columns: list[str] | None = None,
27-
is_numeric: list[bool] | None = None,
27+
numeric_columns: list[str] | None = None,
2828
) -> dict:
2929
"""
3030
Compute the summary statistics for a single data station to share with the
@@ -37,23 +37,23 @@ def summary_per_data_station(
3737
columns : list[str] | None
3838
The columns to compute the summary statistics for. If not provided, all columns
3939
are included.
40-
is_numeric : list[bool]
41-
Whether the columns are numeric or not. For non-numeric columns, other summary
42-
statistics are computed. If not provided, it will be inferred.
40+
numeric_columns : list[str] | None
41+
List of columns that are numeric. If not provided, it will be inferred from the
42+
data.
4343
4444
Returns
4545
-------
4646
dict | None
4747
The summary statistics for the data station. If the summary statistics cannot
4848
be computed, None is returned
4949
"""
50-
return _summary_per_data_station(df, columns, is_numeric)
50+
return _summary_per_data_station(df, columns, numeric_columns)
5151

5252

5353
def _summary_per_data_station(
5454
df: pd.DataFrame,
5555
columns: list[str] | None = None,
56-
is_numeric: list[bool] | None = None,
56+
numeric_columns: list[str] | None = None,
5757
) -> dict:
5858
if not columns:
5959
columns = df.columns
@@ -73,19 +73,14 @@ def _summary_per_data_station(
7373
check_privacy(df, columns)
7474

7575
# Split the data in numeric and non-numeric columns
76-
inferred_is_numeric = [df[col].dtype in [int, float] for col in df.columns]
77-
if is_numeric is None:
78-
is_numeric = inferred_is_numeric
76+
# Infer the numeric columns from their dtypes. Collect the column *names* (not a
# list of booleans) so the result can be compared with, or used in place of, the
# user-provided ``numeric_columns`` list.
inferred_numeric_columns = [
    col for col in df.columns if df[col].dtype in [int, float]
]
77+
if numeric_columns is None:
78+
numeric_columns = inferred_numeric_columns
7979
else:
80-
df = check_match_inferred_is_numeric(
81-
is_numeric, inferred_is_numeric, columns, df
82-
)
80+
df = check_match_inferred_numeric(numeric_columns, inferred_numeric_columns, df)
8381

8482
# set numeric and non-numeric columns
85-
numeric_columns = [col for col, is_num in zip(columns, is_numeric) if is_num]
86-
non_numeric_columns = [
87-
col for col, is_num in zip(columns, is_numeric) if not is_num
88-
]
83+
# Filter instead of using a set difference: this keeps the order of ``columns``,
# so the resulting column order is deterministic across runs.
non_numeric_columns = [col for col in columns if col not in numeric_columns]
8984
df_numeric = df[numeric_columns]
9085
df_non_numeric = df[non_numeric_columns]
9186

v6-summary-py/utils.py

+27-42
Original file line numberDiff line numberDiff line change
@@ -65,58 +65,43 @@ def check_privacy(df: pd.DataFrame, requested_columns: list[str]) -> None:
6565
)
6666

6767

68-
def check_match_inferred_is_numeric(
69-
is_numeric: list[bool],
70-
inferred_is_numeric: list[bool],
71-
columns: list[str],
68+
def check_match_inferred_numeric(
69+
numeric_columns: list[str],
70+
inferred_numeric_columns: list[str],
7271
df: pd.DataFrame,
73-
):
72+
) -> pd.DataFrame:
7473
"""
75-
Check if the provided is_numeric list matches the inferred is_numeric list
74+
Check if the provided numeric_columns list matches the inferred numerical columns
7675
7776
Parameters
7877
----------
79-
is_numeric : list[bool]
80-
The provided is_numeric list
81-
inferred_is_numeric : list[bool]
82-
The inferred is_numeric list
83-
columns : list[str]
84-
The columns for which the is_numeric list is provided
78+
numeric_columns : list[str]
79+
The user-provided list of columns to be treated as numeric. If the user did not
80+
provide this list, it is equal to the inferred_numeric_columns
81+
inferred_numeric_columns : list[str]
82+
The inferred list of numerical columns
8583
df: pd.DataFrame
8684
The original data. The type of the data may be modified if possible
85+
86+
Returns
87+
-------
88+
pd.DataFrame
89+
The data with the columns cast to numeric if possible
90+
91+
Raises
92+
------
93+
ValueError
94+
If a column in numeric_columns was not inferred as numeric and cannot be cast to numeric
8795
"""
88-
if len(is_numeric) != len(columns):
89-
raise ValueError(
90-
"Length of is_numeric list does not match the length of columns list"
91-
)
92-
if not all(
93-
[is_numeric[i] == inferred_is_numeric[i] for i in range(len(is_numeric))]
94-
):
95-
# check which columns do not match
96-
wrongly_numeric_columns = [
97-
columns[i]
98-
for i in range(len(columns))
99-
if is_numeric[i] and not inferred_is_numeric[i]
100-
]
101-
wrongly_non_numeric_columns = [
102-
columns[i]
103-
for i in range(len(columns))
104-
if not is_numeric[i] and inferred_is_numeric[i]
105-
]
106-
msg = ""
107-
if wrongly_numeric_columns:
108-
# try to cast the columns to numeric
96+
error_msg = ""
97+
for col in numeric_columns:
98+
if col not in inferred_numeric_columns:
10999
try:
110-
df = cast_df_to_numeric(df, wrongly_numeric_columns)
100+
df = cast_df_to_numeric(df, [col])
111101
except ValueError as exc:
112-
msg += str(exc)
113-
if wrongly_non_numeric_columns:
114-
msg += (
115-
f"Columns {wrongly_non_numeric_columns} are numeric, but is_numeric is "
116-
"set to False"
117-
)
118-
if msg:
119-
raise ValueError(msg)
102+
error_msg += str(exc)
103+
if error_msg:
104+
raise ValueError(error_msg)
120105
return df
121106

122107

0 commit comments

Comments
 (0)