Merge pull request #3 from vantage6/change/indicate-numerical-columns-as-list-of-columns

bartvanb · web-flow · commit 02ee2a2bf1f1 · 2025-02-21T14:43:48.000+01:00
Indicate numerical columns as list of columns instead of list of booleans
diff --git a/Makefile b/Makefile
@@ -19,7 +19,7 @@ image:
 	@echo "Building ${REGISTRY}/algorithms/summary:latest"
 	docker buildx build \
 		--tag ${REGISTRY}/algorithms/summary:${TAG}-v6-${VANTAGE6_VERSION} \
-		--tag ${REGISTRY}/algorithms/summary:latest \
+		--tag ${REGISTRY}/algorithms/summary:${TAG} \
 		--platform ${PLATFORMS} \
 		-f ./Dockerfile \
 		$(if ${_condition_push},--push .,.)
diff --git a/algorithm_store.json b/algorithm_store.json
@@ -1,72 +1,87 @@
 {
+  "image": "harbor2.vantage6.ai/algorithms/summary",
+  "vantage6_version": "4.9",
   "description": "Compute common statistics such as mean, SD, min, and max",
+  "code_url": "https://github.com/vantage6/v6-summary-py.git",
   "functions": [
     {
+      "type": "central",
       "arguments": [
         {
-          "default_value": null,
-          "conditional_value": null,
-          "name": "columns",
+          "type": "column_list",
           "is_frontend_only": false,
           "description": "The columns to include in the summary. If not given, all columns are included.",
           "conditional_operator": null,
-          "type": "column_list",
+          "conditional_value": null,
+          "name": "columns",
+          "default_value": null,
           "display_name": "Columns to include",
           "has_default_value": true
         },
         {
-          "default_value": null,
+          "type": "column_list",
+          "is_frontend_only": false,
+          "description": "Indicate which columns are to be treated numerical. If not given, this will be inferred from the data",
+          "conditional_operator": null,
           "conditional_value": null,
-          "name": "organizations_to_include",
+          "name": "numeric_columns",
+          "default_value": null,
+          "display_name": "Numerical columns",
+          "has_default_value": true
+        },
+        {
+          "type": "organization_list",
           "is_frontend_only": false,
           "description": "The organizations to include in the task. If not given, all organizations in the collaboration are included.",
           "conditional_operator": null,
-          "type": "organization_list",
+          "conditional_value": null,
+          "name": "organizations_to_include",
+          "default_value": null,
           "display_name": "Organizations to include",
           "has_default_value": true
         }
       ],
-      "name": "summary",
-      "description": "Run summary algorithm and obtain statistics combined for all nodes",
-      "type": "central",
       "ui_visualizations": [
         {
+          "type": "table",
+          "description": "Table with summary statistics of all numerical columns",
+          "name": "Summary of numerical columns",
           "schema": {
             "location": ["numeric"],
             "columns": []
-          },
-          "name": "Summary of numerical columns",
-          "description": "Table with summary statistics of all numerical columns",
-          "type": "table"
+          }
         },
         {
+          "type": "table",
+          "description": "Table with summary statistics of all categorical columns",
+          "name": "Summary of categorical columns",
           "schema": {
             "location": ["categorical"],
             "columns": []
-          },
-          "name": "Summary of categorical columns",
-          "description": "Table with summary statistics of all categorical columns",
-          "type": "table"
+          }
         },
         {
+          "type": "table",
+          "description": "Table with counts of unique values in the categorical columns",
+          "name": "Unique value counts",
           "schema": {
             "location": ["counts_unique_values"],
             "columns": []
-          },
-          "name": "Unique value counts",
-          "description": "Table with counts of unique values in the categorical columns",
-          "type": "table"
+          }
         },
         {
+          "type": "table",
+          "description": "Table with the number of complete data rows per data station",
+          "name": "Number of complete data rows",
           "schema": {
             "location": ["num_complete_rows_per_node"],
             "columns": []
-          },
-          "name": "Number of complete data rows",
-          "description": "Table with the number of complete data rows per data station",
-          "type": "table"
+          }
         }
       ],
+      "standalone": true,
+      "description": "Run summary algorithm and obtain statistics combined for all nodes",
+      "name": "summary",
       "display_name": "Summary",
       "databases": [
         {
@@ -76,11 +91,8 @@
       ]
     }
   ],
-  "vantage6_version": "4.9",
-  "documentation_url": "https://algorithms.vantage6.ai/en/latest/v6-summary-py/docs/index.html",
   "name": "Summary statistics",
-  "partitioning": "horizontal",
-  "image": "harbor2.vantage6.ai/algorithms/summary",
   "developer_id": 1,
-  "code_url": "https://github.com/vantage6/v6-summary-py.git"
+  "partitioning": "horizontal",
+  "documentation_url": "https://algorithms.vantage6.ai/en/latest/v6-summary-py/docs/index.html"
 }
diff --git a/docs/v6-summary-py/usage.rst b/docs/v6-summary-py/usage.rst
@@ -15,9 +15,9 @@ Input arguments
      - List of column names (strings)
      - The columns for which to get a data summary. If not provided, all columns will
        be used.
-   * - ``is_numeric``
-     - List of booleans
-     - Indicate whether the columns are numeric or not. If not provided, the algorithm
+   * - ``numeric_columns``
+     - List of column names (strings)
+     - Indicate which of the columns are numeric. If not provided, the algorithm
        will infer the type of the columns.
    * - ``organizations_to_include``
      - List of integers
diff --git a/test/test.py b/test/test.py
@@ -247,7 +247,7 @@ def test_convert_categorical_to_numeric():
             "method": "summary",
             "kwargs": {
                 "columns": ["D"],
-                "is_numeric": [True],
+                "numeric_columns": ["D"],
             },
         },
         organizations=[org_ids[0]],
diff --git a/v6-summary-py/central.py b/v6-summary-py/central.py
@@ -19,7 +19,7 @@
 def summary(
     client: AlgorithmClient,
     columns: list[str] | None = None,
-    is_numeric: list[bool] | None = None,
+    numeric_columns: list[str] | None = None,
     organizations_to_include: list[int] | None = None,
 ) -> Any:
     """
@@ -32,16 +32,18 @@ def summary(
         The client object used to communicate with the server.
     columns : list[str] | None
         The columns to include in the summary. If not given, all columns are included.
-    is_numeric : list[bool] | None
-        Whether each of the columns is numeric or not. If not given, the algorithm will
-        try to infer the type of the columns.
+    numeric_columns : list[str] | None
+        Which of the columns are numeric. If not given, it will be inferred from the
+        data.
     organizations_to_include : list[int] | None
         The organizations to include in the task. If not given, all organizations
         in the collaboration are included.
     """
-    if is_numeric and len(is_numeric) != len(columns):
+    if columns and numeric_columns and not set(numeric_columns).issubset(set(columns)):
+        numeric_not_in_columns = set(numeric_columns) - set(columns)
         raise InputError(
-            "Length of is_numeric list does not match the length of columns list"
+            "The 'numeric_columns' should be a subset of 'columns'. The following "
+            f"columns are not in 'columns': {numeric_not_in_columns}"
         )
 
     # get all organizations (ids) within the collaboration so you can send a
@@ -58,7 +60,7 @@ def summary(
         "method": "summary_per_data_station",
         "kwargs": {
             "columns": columns,
-            "is_numeric": is_numeric,
+            "numeric_columns": numeric_columns,
         },
     }
 
diff --git a/v6-summary-py/partial_summary.py b/v6-summary-py/partial_summary.py
@@ -17,14 +17,14 @@
     ENVVAR_PRIVACY_THRESHOLD,
     EnvVarsAllowed,
 )
-from .utils import check_privacy, check_match_inferred_is_numeric
+from .utils import check_privacy, check_match_inferred_numeric
 
 
 @data(1)
 def summary_per_data_station(
     df: pd.DataFrame,
     columns: list[str] | None = None,
-    is_numeric: list[bool] | None = None,
+    numeric_columns: list[str] | None = None,
 ) -> dict:
     """
     Compute the summary statistics for a single data station to share with the
@@ -37,23 +37,23 @@ def summary_per_data_station(
     columns : list[str] | None
         The columns to compute the summary statistics for. If not provided, all columns
         are included.
-    is_numeric : list[bool]
-        Whether the columns are numeric or not. For non-numeric columns, other summary
-        statistics are computed. If not provided, it will be inferred.
+    numeric_columns : list[str] | None
+        List of columns that are numeric. If not provided, it will be inferred from the
+        data.
 
     Returns
     -------
     dict | None
         The summary statistics for the data station. If the summary statistics cannot
         be computed, None is returned
     """
-    return _summary_per_data_station(df, columns, is_numeric)
+    return _summary_per_data_station(df, columns, numeric_columns)
 
 
 def _summary_per_data_station(
     df: pd.DataFrame,
     columns: list[str] | None = None,
-    is_numeric: list[bool] | None = None,
+    numeric_columns: list[str] | None = None,
 ) -> dict:
     if not columns:
         columns = df.columns
@@ -73,19 +73,14 @@ def _summary_per_data_station(
     check_privacy(df, columns)
 
     # Split the data in numeric and non-numeric columns
-    inferred_is_numeric = [df[col].dtype in [int, float] for col in df.columns]
-    if is_numeric is None:
-        is_numeric = inferred_is_numeric
+    inferred_numeric_columns = [c for c in df.columns if df[c].dtype in [int, float]]
+    if numeric_columns is None:
+        numeric_columns = inferred_numeric_columns
     else:
-        df = check_match_inferred_is_numeric(
-            is_numeric, inferred_is_numeric, columns, df
-        )
+        df = check_match_inferred_numeric(numeric_columns, inferred_numeric_columns, df)
 
     # set numeric and non-numeric columns
-    numeric_columns = [col for col, is_num in zip(columns, is_numeric) if is_num]
-    non_numeric_columns = [
-        col for col, is_num in zip(columns, is_numeric) if not is_num
-    ]
+    non_numeric_columns = [c for c in columns if c not in numeric_columns]
     df_numeric = df[numeric_columns]
     df_non_numeric = df[non_numeric_columns]
 
diff --git a/v6-summary-py/utils.py b/v6-summary-py/utils.py
@@ -65,58 +65,43 @@ def check_privacy(df: pd.DataFrame, requested_columns: list[str]) -> None:
                 )
 
 
-def check_match_inferred_is_numeric(
-    is_numeric: list[bool],
-    inferred_is_numeric: list[bool],
-    columns: list[str],
+def check_match_inferred_numeric(
+    numeric_columns: list[str],
+    inferred_numeric_columns: list[str],
     df: pd.DataFrame,
-):
+) -> pd.DataFrame:
     """
-    Check if the provided is_numeric list matches the inferred is_numeric list
+    Check if the provided numeric_columns list matches the inferred numerical columns
 
     Parameters
     ----------
-    is_numeric : list[bool]
-        The provided is_numeric list
-    inferred_is_numeric : list[bool]
-        The inferred is_numeric list
-    columns : list[str]
-        The columns for which the is_numeric list is provided
+    numeric_columns : list[str]
+        The user-provided list of columns to be treated as numeric. If user did not
+        provide this list, it is equal to the inferred_numeric_columns
+    inferred_numeric_columns : list[str]
+        The inferred list of numerical columns
     df: pd.DataFrame
         The original data. The type of the data may be modified if possible
+
+    Returns
+    -------
+    pd.DataFrame
+        The data with the columns cast to numeric if possible
+
+    Raises
+    ------
+    ValueError
+        If a column in numeric_columns is not inferred numeric and cannot be cast
     """
-    if len(is_numeric) != len(columns):
-        raise ValueError(
-            "Length of is_numeric list does not match the length of columns list"
-        )
-    if not all(
-        [is_numeric[i] == inferred_is_numeric[i] for i in range(len(is_numeric))]
-    ):
-        # check which columns do not match
-        wrongly_numeric_columns = [
-            columns[i]
-            for i in range(len(columns))
-            if is_numeric[i] and not inferred_is_numeric[i]
-        ]
-        wrongly_non_numeric_columns = [
-            columns[i]
-            for i in range(len(columns))
-            if not is_numeric[i] and inferred_is_numeric[i]
-        ]
-        msg = ""
-        if wrongly_numeric_columns:
-            # try to cast the columns to numeric
+    error_msg = ""
+    for col in numeric_columns:
+        if col not in inferred_numeric_columns:
             try:
-                df = cast_df_to_numeric(df, wrongly_numeric_columns)
+                df = cast_df_to_numeric(df, [col])
             except ValueError as exc:
-                msg += str(exc)
-        if wrongly_non_numeric_columns:
-            msg += (
-                f"Columns {wrongly_non_numeric_columns} are numeric, but is_numeric is "
-                "set to False"
-            )
-        if msg:
-            raise ValueError(msg)
+                error_msg += str(exc)
+    if error_msg:
+        raise ValueError(error_msg)
     return df