Added test data locally #2098

Closed
wants to merge 11 commits
5 changes: 4 additions & 1 deletion .gitattributes
@@ -1,3 +1,6 @@
# Set the default behavior, in case people don't have core.autocrlf set.
* text=auto
docs/** linguist-documentation
docs/** linguist-documentation
*.csv.tar.gz filter=lfs diff=lfs merge=lfs -text
*.json.tar.gz filter=lfs diff=lfs merge=lfs -text
*.txt.tar.gz filter=lfs diff=lfs merge=lfs -text
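These new attributes route the compressed test datasets through Git LFS, so the regular Git history only stores small pointer files while the actual archives live in LFS storage. As a rough, illustrative sketch (not part of this PR), a dataset could be packed into a single-member archive matching these patterns as below; the file names are placeholders only:

```python
import tarfile
from pathlib import Path


def pack_single_csv(csv_path: Path, out_dir: Path) -> Path:
    """Pack one CSV into a .csv.tar.gz with exactly one member (placeholder helper)."""
    out_dir.mkdir(parents=True, exist_ok=True)
    archive_path = out_dir / f"{csv_path.name}.tar.gz"
    with tarfile.open(archive_path, "w:gz") as tar:
        # Store only the bare file name so the archive has a single member
        # that pandas can later read directly.
        tar.add(csv_path, arcname=csv_path.name)
    return archive_path


# Hypothetical usage: pack_single_csv(Path("drug200.csv"), Path("tests/test_data"))
```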
1 change: 1 addition & 0 deletions .github/workflows/build-python.yml
@@ -115,6 +115,7 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
lfs: true

- name: Setup PDM
uses: pdm-project/setup-pdm@v4
@@ -135,8 +135,8 @@
"TARGET_NAME = \"isHelpful\"\n",
"\n",
"# Paths.\n",
"DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/amazon_review_dataset/reviews.json\"\n",
"DATA_PATH = Path.home() / \".giskard\" / \"amazon_review_dataset\" / \"reviews.json\""
"DATA_URL = \"https://github.com/Giskard-AI/giskard/raw/a501a92b215bf730f6647be4d118a0c1ce67eb6e/tests/test_data/amazon_reviews.json.tar.gz\"\n",
"DATA_PATH = Path.home() / \".giskard\" / \"amazon_review_dataset\" / \"reviews.json.tar.gz\""
]
},
{
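For context on the change above: the notebook now pulls a Git LFS tarball over HTTPS instead of an FTP resource and reads it without unpacking. A minimal sketch of that flow, assuming pandas 1.5 or newer (which can read single-member tar archives directly); the constants mirror the cell above, while the download-and-read steps are an assumption about how the notebook uses them:

```python
from pathlib import Path
from urllib.request import urlretrieve

import pandas as pd

DATA_URL = (
    "https://github.com/Giskard-AI/giskard/raw/"
    "a501a92b215bf730f6647be4d118a0c1ce67eb6e/tests/test_data/amazon_reviews.json.tar.gz"
)
DATA_PATH = Path.home() / ".giskard" / "amazon_review_dataset" / "reviews.json.tar.gz"

if not DATA_PATH.exists():
    DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
    # urlretrieve follows the HTTP redirects GitHub issues for raw/LFS content.
    urlretrieve(DATA_URL, DATA_PATH)

# pandas infers tar + gzip from the extension and reads the single JSON-lines member.
reviews = pd.read_json(DATA_PATH, lines=True, nrows=5000)
print(reviews.shape)
```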
10 changes: 5 additions & 5 deletions docs/reference/notebooks/drug_classification_sklearn.ipynb
@@ -76,12 +76,12 @@
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.svm import SVC\n",
"from imblearn.over_sampling import SMOTE\n",
"from imblearn.pipeline import Pipeline as PipelineImb\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.pipeline import Pipeline as PipelineImb\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.svm import SVC\n",
"\n",
"from giskard import Dataset, Model, scan, testing"
]
@@ -121,8 +121,8 @@
"NA_TO_K_CATEGORIES = ['<10', '10-20', '20-30', '>30']\n",
"\n",
"# Paths.\n",
"DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/drug_classification_dataset/drug200.csv\"\n",
"DATA_PATH = Path.home() / \".giskard\" / \"drug_classification_dataset\" / \"drug200.csv\""
"DATA_URL = \"https://github.com/Giskard-AI/giskard/raw/abac244d63e5deac5f42009f26971ee9ee6592a9/tests/test_data/drug_classification.csv.tar.gz\"\n",
"DATA_PATH = Path.home() / \".giskard\" / \"drug_classification_dataset\" / \"drug200.csv.tar.gz\""
]
},
{
6 changes: 3 additions & 3 deletions docs/reference/notebooks/hotel_text_regression.ipynb
@@ -67,6 +67,7 @@
"outputs": [],
"source": [
"from pathlib import Path\n",
"from typing import Iterable\n",
"from urllib.request import urlretrieve\n",
"\n",
"import pandas as pd\n",
@@ -76,7 +77,6 @@
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import FunctionTransformer\n",
"from typing import Iterable\n",
"\n",
"from giskard import Model, Dataset, scan, testing"
]
@@ -107,8 +107,8 @@
"TARGET_COLUMN_NAME = \"Reviewer_Score\"\n",
"\n",
"# Paths.\n",
"DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/hotel_text_regression_dataset/Hotel_Reviews.csv\"\n",
"DATA_PATH = Path.home() / \".giskard\" / \"hotel_text_regression_dataset\" / \"Hotel_Reviews.csv\""
"DATA_URL = \"https://github.com/Giskard-AI/giskard/raw/abac244d63e5deac5f42009f26971ee9ee6592a9/tests/test_data/hotel_reviews.csv.tar.gz\"\n",
"DATA_PATH = Path.home() / \".giskard\" / \"hotel_text_regression_dataset\" / \"Hotel_Reviews.csv.tar.gz\""
]
},
{
@@ -118,7 +118,11 @@
"IDX_LABEL = 'TransactionID'\n",
"\n",
"# Paths.\n",
"DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/fraud_detection_classification_dataset/{}\"\n",
"TEST_IDENTITY_DATA_URL = \"https://github.com/Giskard-AI/giskard/raw/abac244d63e5deac5f42009f26971ee9ee6592a9/tests/test_data/test_identity.csv.tar.gz\"\n",
"TEST_TRANSACTION_DATA_URL = \"https://github.com/Giskard-AI/giskard/raw/abac244d63e5deac5f42009f26971ee9ee6592a9/tests/test_data/test_transaction.csv.tar.gz\"\n",
"TRAIN_IDENTITY_DATA_URL = \"https://github.com/Giskard-AI/giskard/raw/abac244d63e5deac5f42009f26971ee9ee6592a9/tests/test_data/train_identity.csv.tar.gz\"\n",
"TRAIN_TRANSACTION_DATA_URL = \"https://github.com/Giskard-AI/giskard/raw/abac244d63e5deac5f42009f26971ee9ee6592a9/tests/test_data/train_transaction.csv.tar.gz\"\n",
"\n",
"DATA_PATH = Path.home() / \".giskard\" / \"fraud_detection_classification_dataset\""
]
},
@@ -165,9 +169,9 @@
"\n",
"\n",
"def fetch_dataset():\n",
" files_to_fetch = [\"train_transaction.csv\", \"train_identity.csv\", \"test_transaction.csv\", \"test_identity.csv\"]\n",
" files_to_fetch = [TEST_IDENTITY_DATA_URL, TEST_TRANSACTION_DATA_URL, TRAIN_IDENTITY_DATA_URL, TRAIN_TRANSACTION_DATA_URL]\n",
" for file_name in files_to_fetch:\n",
" fetch_from_ftp(DATA_URL.format(file_name), DATA_PATH / file_name)\n",
" fetch_from_ftp(file_name, DATA_PATH / file_name)\n",
"\n",
"\n",
"# Define data-types of transactions features.\n",
@@ -225,11 +229,11 @@
"def read_set(_type):\n",
" \"\"\"Read both transactions and identity data.\"\"\"\n",
" print(f\"Reading transactions data...\")\n",
" _df = pd.read_csv(os.path.join(DATA_PATH, f'{_type}_transaction.csv'),\n",
" _df = pd.read_csv(os.path.join(DATA_PATH, f'{_type}_transaction.csv.tar.gz'),\n",
" index_col=IDX_LABEL, dtype=DATA_TYPES_TRANSACTION, nrows=250)\n",
"\n",
" print(f\"Reading identity data...\")\n",
" _df = _df.join(pd.read_csv(os.path.join(DATA_PATH, f'{_type}_identity.csv'),\n",
" _df = _df.join(pd.read_csv(os.path.join(DATA_PATH, f'{_type}_identity.csv.tar.gz'),\n",
" index_col=IDX_LABEL, dtype=DATA_TYPES_ID))\n",
" return _df\n",
"\n",
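The `fetch_from_ftp` helper that `fetch_dataset` calls is not shown in this diff. Judging from its call sites, it is a download-if-missing wrapper around `urlretrieve`; the following is a hedged sketch of that assumed behavior, not the repository's actual code:

```python
from pathlib import Path
from urllib.request import urlretrieve


def fetch_from_ftp(url: str, file_path: Path) -> None:
    """Download `url` to `file_path` unless the file already exists (assumed behavior)."""
    if file_path.exists():
        return
    file_path.parent.mkdir(parents=True, exist_ok=True)
    urlretrieve(url, file_path)
```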
@@ -95,6 +95,7 @@
"source": [
"import string\n",
"from pathlib import Path\n",
"from typing import Iterable\n",
"from urllib.request import urlretrieve\n",
"\n",
"import nltk\n",
@@ -107,7 +108,6 @@
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import FunctionTransformer\n",
"from typing import Iterable\n",
"\n",
"from giskard import Dataset, Model, scan, testing"
]
@@ -154,8 +154,8 @@
"RANDOM_SEED = 8888\n",
"\n",
"# Data.\n",
"DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/medical_transcript_classification_dataset/mtsamples.csv\"\n",
"DATA_PATH = Path.home() / \".giskard\" / \"medical_transcript_classification_dataset\" / \"mtsamples.csv\""
"DATA_URL = \"https://github.com/Giskard-AI/giskard/raw/abac244d63e5deac5f42009f26971ee9ee6592a9/tests/test_data/medical_transcript_classification_dataset_mtsamples.csv.tar.gz\"\n",
"DATA_PATH = Path.home() / \".giskard\" / \"medical_transcript_classification_dataset\" / \"mtsamples.csv.tar.gz\""
]
},
{
@@ -75,6 +75,7 @@
"import string\n",
"from dataclasses import dataclass\n",
"from pathlib import Path\n",
"from typing import Union, List\n",
"from urllib.request import urlretrieve\n",
"\n",
"import nltk\n",
@@ -85,7 +86,6 @@
"from torch.utils.data import DataLoader\n",
"from torch.utils.data import TensorDataset\n",
"from transformers import DistilBertForSequenceClassification, DistilBertTokenizer\n",
"from typing import Union, List\n",
"\n",
"from giskard import Dataset, Model, scan, testing"
]
@@ -123,9 +123,9 @@
"STOP_WORDS = set(stopwords.words('english'))\n",
"RANDOM_SEED = 0\n",
"\n",
"DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/tripadvisor_reviews_dataset/{}\"\n",
"DATA_URL = \"https://github.com/Giskard-AI/giskard/raw/abac244d63e5deac5f42009f26971ee9ee6592a9/tests/test_data/{}\"\n",
"DATA_PATH = Path.home() / \".giskard\" / \"tripadvisor_reviews_dataset\"\n",
"DATA_FILE_NAME = \"tripadvisor_hotel_reviews.csv\""
"DATA_FILE_NAME = \"tripadvisor_hotel_reviews.csv.tar.gz\""
]
},
{
14 changes: 3 additions & 11 deletions tests/fixtures/amazon_review__binary_classification.py
@@ -1,5 +1,4 @@
import string
from pathlib import Path

import numpy as np
import pandas as pd
@@ -12,7 +11,7 @@

from giskard import Dataset
from giskard.models.sklearn import SKLearnModel
from tests.url_utils import fetch_from_ftp
from tests import path

# Constants.
RANDOM_SEED = 0
@@ -24,14 +23,7 @@
FEATURE_COLUMN_NAME = "reviewText"

# Data.
DATA_URL = "ftp://sys.giskard.ai/pub/unit_test_resources/amazon_review_dataset/reviews.json"
DATA_PATH = Path.home() / ".giskard" / "amazon_review_dataset" / "reviews.json"


def download_data(**kwargs) -> pd.DataFrame:
fetch_from_ftp(DATA_URL, DATA_PATH)
_df = pd.read_json(DATA_PATH, lines=True, **kwargs)
return _df
DATA_PATH = path("test_data/amazon_reviews.json.tar.gz")


def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
@@ -58,7 +50,7 @@ def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:

@pytest.fixture(scope="session")
def amazon_review_raw_data() -> pd.DataFrame:
return preprocess_data(download_data(nrows=5000))
return preprocess_data(pd.read_json(DATA_PATH, lines=True, nrows=5000))


@pytest.fixture()
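Instead of downloading into `~/.giskard`, the fixtures now resolve the committed archives with a `path` helper imported from the `tests` package. Its definition is not part of this diff; presumably it just joins a relative name onto the `tests/` directory, roughly along these lines (a hypothetical sketch, the real helper may differ):

```python
# Hypothetical sketch of the `path` helper (e.g. in tests/__init__.py).
from pathlib import Path

TESTS_ROOT = Path(__file__).parent


def path(relative: str) -> Path:
    """Resolve names such as 'test_data/amazon_reviews.json.tar.gz' under tests/."""
    return TESTS_ROOT / relative
```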
14 changes: 3 additions & 11 deletions tests/fixtures/drug_classification__multiclass_classification.py
@@ -1,7 +1,5 @@
from typing import List, Tuple

from pathlib import Path

import pandas as pd
import pytest
from imblearn.over_sampling import SMOTE
@@ -11,11 +9,10 @@

from giskard import Dataset
from giskard.models.sklearn import SKLearnModel
from tests.url_utils import fetch_from_ftp
from tests import path

# Data.
DATA_URL = "ftp://sys.giskard.ai/pub/unit_test_resources/drug_classification_dataset/drug200.csv"
DATA_PATH = Path.home() / ".giskard" / "drug_classification_dataset" / "drug200.csv"
DATA_PATH = path("test_data/drug_classification.csv.tar.gz")

# Constants.
TARGET_NAME = "Drug"
@@ -49,12 +46,7 @@ def _bin_na_to_k(_df: pd.DataFrame) -> pd.DataFrame:

@pytest.fixture(scope="session")
def drug_classification_raw_data() -> pd.DataFrame:
# Download data.
fetch_from_ftp(DATA_URL, DATA_PATH)

# Load and wrap data.
raw_data = bin_numerical(pd.read_csv(DATA_PATH))
return raw_data
return bin_numerical(pd.read_csv(DATA_PATH))


@pytest.fixture()
19 changes: 4 additions & 15 deletions tests/fixtures/fraud_detection__binary_classification.py
@@ -1,19 +1,16 @@
from typing import List, Tuple

from pathlib import Path

import pandas as pd
import pytest
from lightgbm import LGBMClassifier
from pandas.api.types import union_categoricals
from sklearn.model_selection import train_test_split

from giskard import Dataset, Model
from tests.url_utils import fetch_from_ftp
from tests import path

# Data.
DATA_URL = "ftp://sys.giskard.ai/pub/unit_test_resources/fraud_detection_classification_dataset/{}"
DATA_PATH = Path.home() / ".giskard" / "fraud_detection_classification_dataset"
DATA_URL = path("test_data/{}")

# Constants.
TARGET_COLUMN = "isTest"
@@ -85,20 +82,12 @@
]


def fetch_dataset():
files_to_fetch = ["train_transaction.csv", "train_identity.csv", "test_transaction.csv", "test_identity.csv"]
for file_name in files_to_fetch:
fetch_from_ftp(DATA_URL.format(file_name), DATA_PATH / file_name)


def read_set(_type, nrows=150):
"""Read both transactions and identity data."""
fetch_dataset()

_df = pd.read_csv(
DATA_PATH / f"{_type}_transaction.csv", index_col=IDX_LABEL, dtype=DATA_TYPES_TRANSACTION, nrows=nrows
DATA_URL / f"{_type}_transaction.csv", index_col=IDX_LABEL, dtype=DATA_TYPES_TRANSACTION, nrows=nrows
)
_df = _df.join(pd.read_csv(DATA_PATH / f"{_type}_identity.csv", index_col=IDX_LABEL, dtype=DATA_TYPES_ID))
_df = _df.join(pd.read_csv(DATA_URL / f"{_type}_identity.csv", index_col=IDX_LABEL, dtype=DATA_TYPES_ID))

return _df

9 changes: 2 additions & 7 deletions tests/fixtures/hotel_text__regression.py
@@ -1,7 +1,5 @@
from typing import Iterable

from pathlib import Path

import pandas as pd
import pytest
from sklearn.ensemble import GradientBoostingRegressor
@@ -11,11 +9,10 @@

from giskard import Dataset
from giskard.models.sklearn import SKLearnModel
from tests.url_utils import fetch_from_ftp
from tests import path

# Data.
DATA_URL = "ftp://sys.giskard.ai/pub/unit_test_resources/hotel_text_regression_dataset/Hotel_Reviews.csv"
DATA_PATH = Path.home() / ".giskard" / "hotel_text_regression_dataset" / "Hotel_Reviews.csv"
DATA_PATH = path("test_data/hotel_reviews.csv.tar.gz")

# Constants.
FEATURE_COLUMN_NAME = "Full_Review"
@@ -33,8 +30,6 @@ def load_data(**kwargs) -> pd.DataFrame:

@pytest.fixture(scope="session")
def hotel_text_raw_data():
fetch_from_ftp(DATA_URL, DATA_PATH)

raw_data = load_data(nrows=105)[[FEATURE_COLUMN_NAME, TARGET_COLUMN_NAME]]
return raw_data

@@ -1,7 +1,6 @@
from typing import Iterable

import string
from pathlib import Path

import pandas as pd
import pytest
@@ -12,7 +11,7 @@

from giskard import Dataset
from giskard.models.sklearn import SKLearnModel
from tests.url_utils import fetch_from_ftp
from tests import path

# Constants.
LABELS_LIST = [
@@ -29,13 +28,11 @@
LANGUAGE = "english"

# Paths.
DATA_URL = "ftp://sys.giskard.ai/pub/unit_test_resources/medical_transcript_classification_dataset/mtsamples.csv"
DATA_PATH = Path.home() / ".giskard" / "medical_transcript_classification_dataset" / "mtsamples.csv"
DATA_PATH = path("test_data/medical_transcript_classification_dataset_mtsamples.csv.tar.gz")


def load_data() -> pd.DataFrame:
# Download dataset.
fetch_from_ftp(DATA_URL, DATA_PATH)
df = pd.read_csv(DATA_PATH)

# Drop useless columns.
10 changes: 3 additions & 7 deletions tests/fixtures/tripadvisor_text_classification_torch.py
@@ -3,7 +3,6 @@
import re
import string
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd
@@ -13,12 +12,10 @@
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

from giskard import Dataset, Model, models
from tests.url_utils import fetch_from_ftp
from tests import path

# Data
DATA_URL = "ftp://sys.giskard.ai/pub/unit_test_resources/tripadvisor_reviews_dataset/{}"
DATA_PATH = Path.home() / ".giskard" / "tripadvisor_reviews_dataset"
DATA_FILE_NAME = "tripadvisor_hotel_reviews.csv"
DATA_PATH = path("test_data/tripadvisor_hotel_reviews.csv.tar.gz")

# Constants
PRETRAINED_WEIGHTS_NAME = "distilbert-base-uncased"
@@ -115,8 +112,7 @@ def text_preprocessor(df: pd.DataFrame) -> pd.DataFrame:

def load_dataset() -> pd.DataFrame:
# Download dataset
fetch_from_ftp(DATA_URL.format(DATA_FILE_NAME), DATA_PATH / DATA_FILE_NAME)
df = pd.read_csv(DATA_PATH / DATA_FILE_NAME, nrows=MAX_NUM_ROWS)
df = pd.read_csv(DATA_PATH, nrows=MAX_NUM_ROWS)
# Obtain labels for our task.
df[TARGET_COLUMN_NAME] = df.Rating.apply(lambda x: create_label(x))
df.drop(columns="Rating", inplace=True)
3 changes: 3 additions & 0 deletions tests/test_data/adult_wage_classification.csv.tar.gz
Git LFS file not shown
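Because the new archives under `tests/test_data/` are LFS objects, the diff only records pointers. On a checkout where LFS objects were not pulled (for example, `actions/checkout` without `lfs: true`), each of these files is a small text pointer that pandas cannot read. A quick, illustrative check, assuming it is run from the repository root after `git lfs pull`:

```python
from pathlib import Path

GZIP_MAGIC = b"\x1f\x8b"  # real .tar.gz archives start with the gzip magic bytes

for archive in sorted(Path("tests/test_data").glob("*.tar.gz")):
    with archive.open("rb") as f:
        fetched = f.read(2) == GZIP_MAGIC
    print(f"{archive.name}: {'LFS object present' if fetched else 'pointer file only'}")
```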