
Commit 246a026

Moved test datasets to s3
1 parent 6206028 commit 246a026

18 files changed: +114 −162 lines
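Across the notebooks the change is uniform: the `ftp://sys.giskard.ai` URLs are replaced by the S3 bucket, the cached filenames gain a `.tar.gz` suffix, and the download helper is renamed from `fetch_from_ftp` to `fetch_demo_data`. The hunks below only show the first lines of the renamed helper; here is a minimal sketch of the full function, assuming it keeps the `urlretrieve`-based body (the import appears in fake_real_news_classification.ipynb) — the cache check is an assumption, not shown in the diff:

```python
from pathlib import Path
from urllib.request import urlretrieve


def fetch_demo_data(url: str, file: Path) -> None:
    """Download `url` into `file`, creating parent directories as needed."""
    if not file.parent.exists():
        file.parent.mkdir(parents=True, exist_ok=True)
    if not file.exists():  # assumed: skip the download when the archive is already cached
        print(f"Downloading data from {url}")
        urlretrieve(url, file)
```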

docs/reference/notebooks/amazon_review_classification_sklearn.ipynb (+10 −21)

```diff
@@ -117,15 +117,10 @@
 ]
 },
 {
+"metadata": {},
 "cell_type": "code",
-"execution_count": 3,
-"metadata": {
-"ExecuteTime": {
-"end_time": "2023-11-08T20:55:58.032511Z",
-"start_time": "2023-11-08T20:55:57.792680Z"
-}
-},
 "outputs": [],
+"execution_count": null,
 "source": [
 "# Constants.\n",
 "RANDOM_SEED = 0\n",
@@ -135,8 +130,8 @@
 "TARGET_NAME = \"isHelpful\"\n",
 "\n",
 "# Paths.\n",
-"DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/amazon_review_dataset/reviews.json\"\n",
-"DATA_PATH = Path.home() / \".giskard\" / \"amazon_review_dataset\" / \"reviews.json\""
+"DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/amazon_review_dataset-reviews.json.tar.gz\"\n",
+"DATA_PATH = Path.home() / \".giskard\" / \"amazon_review_dataset\" / \"reviews.json.tar.gz\""
 ]
 },
 {
@@ -156,18 +151,12 @@
 ]
 },
 {
+"metadata": {},
 "cell_type": "code",
-"execution_count": 4,
-"metadata": {
-"ExecuteTime": {
-"end_time": "2023-11-08T20:56:00.293536Z",
-"start_time": "2023-11-08T20:56:00.234306Z"
-},
-"collapsed": false
-},
 "outputs": [],
+"execution_count": null,
 "source": [
-"def fetch_from_ftp(url: str, file: Path) -> None:\n",
+"def fetch_demo_data(url: str, file: Path) -> None:\n",
 "    \"\"\"Helper to fetch data from the FTP server.\"\"\"\n",
 "    if not file.parent.exists():\n",
 "        file.parent.mkdir(parents=True, exist_ok=True)\n",
@@ -181,7 +170,7 @@
 "\n",
 "def download_data(**kwargs) -> pd.DataFrame:\n",
 "    \"\"\"Download the dataset using URL.\"\"\"\n",
-"    fetch_from_ftp(DATA_URL, DATA_PATH)\n",
+"    fetch_demo_data(DATA_URL, DATA_PATH)\n",
 "    _df = pd.read_json(DATA_PATH, lines=True, **kwargs)\n",
 "    return _df\n",
 "\n",
@@ -215,10 +204,10 @@
 ]
 },
 {
-"cell_type": "code",
-"execution_count": null,
 "metadata": {},
+"cell_type": "code",
 "outputs": [],
+"execution_count": null,
 "source": [
 "reviews_df = download_data()\n",
 "reviews_df = preprocess_data(reviews_df)"
```

docs/reference/notebooks/drug_classification_sklearn.ipynb (+8 −14)

```diff
@@ -97,17 +97,10 @@
 ]
 },
 {
+"metadata": {},
 "cell_type": "code",
-"execution_count": 2,
-"id": "d44430add2918aa1",
-"metadata": {
-"ExecuteTime": {
-"end_time": "2024-02-09T09:29:15.513819Z",
-"start_time": "2024-02-09T09:29:15.470284Z"
-},
-"collapsed": false
-},
 "outputs": [],
+"execution_count": null,
 "source": [
 "# Constants.\n",
 "RANDOM_SEED = 0\n",
@@ -121,9 +114,10 @@
 "NA_TO_K_CATEGORIES = ['<10', '10-20', '20-30', '>30']\n",
 "\n",
 "# Paths.\n",
-"DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/drug_classification_dataset/drug200.csv\"\n",
-"DATA_PATH = Path.home() / \".giskard\" / \"drug_classification_dataset\" / \"drug200.csv\""
-]
+"DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/drug_classification_dataset-drug200.csv.tar.gz\"\n",
+"DATA_PATH = Path.home() / \".giskard\" / \"drug_classification_dataset\" / \"drug200.csv.tar.gz\""
+],
+"id": "a161e40415287e1f"
 },
 {
 "cell_type": "markdown",
@@ -158,7 +152,7 @@
 },
 "outputs": [],
 "source": [
-"def fetch_from_ftp(url: str, file: Path) -> None:\n",
+"def fetch_demo_data(url: str, file: Path) -> None:\n",
 "    \"\"\"Helper to fetch data from the FTP server.\"\"\"\n",
 "    if not file.parent.exists():\n",
 "        file.parent.mkdir(parents=True, exist_ok=True)\n",
@@ -172,7 +166,7 @@
 "\n",
 "def load_data() -> pd.DataFrame:\n",
 "    \"\"\"Load data.\"\"\"\n",
-"    fetch_from_ftp(DATA_URL, DATA_PATH)\n",
+"    fetch_demo_data(DATA_URL, DATA_PATH)\n",
 "    df = pd.read_csv(DATA_PATH)\n",
 "    return df\n",
 "\n",
```

docs/reference/notebooks/fake_real_news_classification.ipynb (+8 −8)

```diff
@@ -67,6 +67,7 @@
 "import os\n",
 "import string\n",
 "from pathlib import Path\n",
+"from typing import Tuple, Callable\n",
 "from urllib.request import urlretrieve\n",
 "\n",
 "import numpy as np\n",
@@ -79,7 +80,6 @@
 "from nltk.corpus import stopwords\n",
 "from sklearn.metrics import accuracy_score\n",
 "from sklearn.model_selection import train_test_split\n",
-"from typing import Tuple, Callable\n",
 "\n",
 "from giskard import Dataset, Model, scan, testing"
 ]
@@ -142,7 +142,7 @@
 "RANDOM_SEED = 0\n",
 "\n",
 "# Paths.\n",
-"DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/fake_real_news_dataset/{}\"\n",
+"DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/fake_real_news_dataset-{}\"\n",
 "DATA_PATH = Path.home() / \".giskard\" / \"fake_real_news_dataset\""
 ]
 },
@@ -170,7 +170,7 @@
 },
 "outputs": [],
 "source": [
-"def fetch_from_ftp(url: str, file: Path) -> None:\n",
+"def fetch_demo_data(url: str, file: Path) -> None:\n",
 "    \"\"\"Helper to fetch data from the FTP server.\"\"\"\n",
 "    if not file.parent.exists():\n",
 "        file.parent.mkdir(parents=True, exist_ok=True)\n",
@@ -184,15 +184,15 @@
 "\n",
 "def fetch_dataset() -> None:\n",
 "    \"\"\"Gradually fetch all necessary files from the FTP server.\"\"\"\n",
-"    files_to_fetch = (\"Fake.csv\", \"True.csv\", \"glove_100d.txt\")\n",
+"    files_to_fetch = (\"Fake.csv.tar.gz\", \"True.csv.tar.gz\", \"glove_100d.txt.tar.gz\")\n",
 "    for file_name in files_to_fetch:\n",
-"        fetch_from_ftp(DATA_URL.format(file_name), DATA_PATH / file_name)\n",
+"        fetch_demo_data(DATA_URL.format(file_name), DATA_PATH / file_name)\n",
 "\n",
 "\n",
 "def load_data(**kwargs) -> pd.DataFrame:\n",
 "    \"\"\"Load data.\"\"\"\n",
-"    real_df = pd.read_csv(DATA_PATH / \"True.csv\", **kwargs)\n",
-"    fake_df = pd.read_csv(DATA_PATH / \"Fake.csv\", **kwargs)\n",
+"    real_df = pd.read_csv(DATA_PATH / \"True.csv.tar.gz\", **kwargs)\n",
+"    fake_df = pd.read_csv(DATA_PATH / \"Fake.csv.tar.gz\", **kwargs)\n",
 "\n",
 "    # Create target column.\n",
 "    real_df[TARGET_COLUMN_NAME] = 0\n",
@@ -380,7 +380,7 @@
 "def get_embeddings_matrix() -> np.ndarray:\n",
 "    \"\"\"Create matrix, where each row is an embedding of a specific word.\"\"\"\n",
 "    # Load glove embeddings.\n",
-"    embeddings_dict = dict(parse_line(*line.rstrip().rsplit(' ')) for line in open(DATA_PATH / \"glove_100d.txt\"))\n",
+"    embeddings_dict = dict(parse_line(*line.rstrip().rsplit(' ')) for line in open(DATA_PATH / \"glove_100d.txt.tar.gz\"))\n",
 "\n",
 "    # Create embeddings matrix with glove word vectors.\n",
 "    embeddings_matrix = init_embeddings_matrix(embeddings_dict)\n",
```

docs/reference/notebooks/hotel_text_regression.ipynb (+6 −12)

```diff
@@ -91,24 +91,18 @@
 ]
 },
 {
+"metadata": {},
 "cell_type": "code",
-"execution_count": 2,
-"metadata": {
-"ExecuteTime": {
-"end_time": "2023-11-09T12:12:05.303464Z",
-"start_time": "2023-11-09T12:12:05.254149Z"
-},
-"collapsed": false
-},
 "outputs": [],
+"execution_count": null,
 "source": [
 "# Constants.\n",
 "FEATURE_COLUMN_NAME = \"Full_Review\"\n",
 "TARGET_COLUMN_NAME = \"Reviewer_Score\"\n",
 "\n",
 "# Paths.\n",
-"DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/hotel_text_regression_dataset/Hotel_Reviews.csv\"\n",
-"DATA_PATH = Path.home() / \".giskard\" / \"hotel_text_regression_dataset\" / \"Hotel_Reviews.csv\""
+"DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/hotel_text_regression_dataset-Hotel_Reviews.csv.tar.gz\"\n",
+"DATA_PATH = Path.home() / \".giskard\" / \"hotel_text_regression_dataset\" / \"Hotel_Reviews.csv.tar.gz\""
 ]
 },
 {
@@ -142,7 +136,7 @@
 },
 "outputs": [],
 "source": [
-"def fetch_from_ftp(url: str, file: Path) -> None:\n",
+"def fetch_demo_data(url: str, file: Path) -> None:\n",
 "    \"\"\"Helper to fetch data from the FTP server.\"\"\"\n",
 "    if not file.parent.exists():\n",
 "        file.parent.mkdir(parents=True, exist_ok=True)\n",
@@ -155,7 +149,7 @@
 "\n",
 "\n",
 "def load_data(**kwargs) -> pd.DataFrame:\n",
-"    fetch_from_ftp(DATA_URL, DATA_PATH)\n",
+"    fetch_demo_data(DATA_URL, DATA_PATH)\n",
 "    df = pd.read_csv(DATA_PATH, **kwargs)\n",
 "\n",
 "    # Create target column.\n",
```

docs/reference/notebooks/ieee_fraud_detection_adversarial_validation.ipynb (+12 −30)

```diff
@@ -102,23 +102,17 @@
 ]
 },
 {
+"metadata": {},
 "cell_type": "code",
-"execution_count": 2,
-"metadata": {
-"ExecuteTime": {
-"end_time": "2023-11-09T12:17:44.751420Z",
-"start_time": "2023-11-09T12:17:44.719440Z"
-},
-"collapsed": false
-},
 "outputs": [],
+"execution_count": null,
 "source": [
 "# Constants.\n",
 "TARGET_COLUMN = 'isTest'\n",
 "IDX_LABEL = 'TransactionID'\n",
 "\n",
 "# Paths.\n",
-"DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/fraud_detection_classification_dataset/{}\"\n",
+"DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/fraud_detection_classification_dataset-{}\"\n",
 "DATA_PATH = Path.home() / \".giskard\" / \"fraud_detection_classification_dataset\""
 ]
 },
@@ -141,18 +135,12 @@
 ]
 },
 {
+"metadata": {},
 "cell_type": "code",
-"execution_count": 3,
-"metadata": {
-"ExecuteTime": {
-"end_time": "2023-11-09T12:17:45.925766Z",
-"start_time": "2023-11-09T12:17:45.904823Z"
-},
-"collapsed": false
-},
 "outputs": [],
+"execution_count": null,
 "source": [
-"def fetch_from_ftp(url: str, file: Path) -> None:\n",
+"def fetch_demo_data(url: str, file: Path) -> None:\n",
 "    \"\"\"Helper to fetch data from the FTP server.\"\"\"\n",
 "    if not file.parent.exists():\n",
 "        file.parent.mkdir(parents=True, exist_ok=True)\n",
@@ -165,9 +153,9 @@
 "\n",
 "\n",
 "def fetch_dataset():\n",
-"    files_to_fetch = [\"train_transaction.csv\", \"train_identity.csv\", \"test_transaction.csv\", \"test_identity.csv\"]\n",
+"    files_to_fetch = [\"train_transaction.csv.tar.gz\", \"train_identity.csv.tar.gz\", \"test_transaction.csv.tar.gz\", \"test_identity.csv.tar.gz\"]\n",
 "    for file_name in files_to_fetch:\n",
-"        fetch_from_ftp(DATA_URL.format(file_name), DATA_PATH / file_name)\n",
+"        fetch_demo_data(DATA_URL.format(file_name), DATA_PATH / file_name)\n",
 "\n",
 "\n",
 "# Define data-types of transactions features.\n",
@@ -225,11 +213,11 @@
 "def read_set(_type):\n",
 "    \"\"\"Read both transactions and identity data.\"\"\"\n",
 "    print(f\"Reading transactions data...\")\n",
-"    _df = pd.read_csv(os.path.join(DATA_PATH, f'{_type}_transaction.csv'),\n",
+"    _df = pd.read_csv(os.path.join(DATA_PATH, f'{_type}_transaction.csv.tar.gz'),\n",
 "                      index_col=IDX_LABEL, dtype=DATA_TYPES_TRANSACTION, nrows=250)\n",
 "\n",
 "    print(f\"Reading identity data...\")\n",
-"    _df = _df.join(pd.read_csv(os.path.join(DATA_PATH, f'{_type}_identity.csv'),\n",
+"    _df = _df.join(pd.read_csv(os.path.join(DATA_PATH, f'{_type}_identity.csv.tar.gz'),\n",
 "                               index_col=IDX_LABEL, dtype=DATA_TYPES_ID))\n",
 "    return _df\n",
 "\n",
@@ -248,16 +236,10 @@
 ]
 },
 {
+"metadata": {},
 "cell_type": "code",
-"execution_count": 4,
-"metadata": {
-"ExecuteTime": {
-"end_time": "2023-11-09T12:17:46.316557Z",
-"start_time": "2023-11-09T12:17:46.290804Z"
-},
-"collapsed": false
-},
 "outputs": [],
+"execution_count": null,
 "source": [
 "def preprocess_dataset(train_set, test_set):\n",
 "    \"\"\"Unite train and test into common dataframe.\"\"\"\n",
```

docs/reference/notebooks/insurance_prediction_lgbm.ipynb (+4 −4)

```diff
@@ -179,8 +179,8 @@
 "CATEGORICAL_COLS = [\"sex\", \"smoker\", \"region\"]\n",
 "\n",
 "# Paths.\n",
-"DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/insurance_prediction_dataset/us_health_insurance_dataset.csv\"\n",
-"DATA_PATH = Path.home() / \".giskard\" / \"insurance_prediction_dataset\" / \"us_health_insurance_dataset.csv\""
+"DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/insurance_prediction_dataset-us_health_insurance_dataset.csv.tar.gz\"\n",
+"DATA_PATH = Path.home() / \".giskard\" / \"insurance_prediction_dataset\" / \"us_health_insurance_dataset.csv.tar.gz\""
 ]
 },
 {
@@ -216,7 +216,7 @@
 },
 "outputs": [],
 "source": [
-"def fetch_from_ftp(url: str, file: Path) -> None:\n",
+"def fetch_demo_data(url: str, file: Path) -> None:\n",
 "    \"\"\"Helper to fetch data from the FTP server.\"\"\"\n",
 "    if not file.parent.exists():\n",
 "        file.parent.mkdir(parents=True, exist_ok=True)\n",
@@ -230,7 +230,7 @@
 "\n",
 "def download_data(**kwargs) -> pd.DataFrame:\n",
 "    \"\"\"Download the dataset using URL.\"\"\"\n",
-"    fetch_from_ftp(DATA_URL, DATA_PATH)\n",
+"    fetch_demo_data(DATA_URL, DATA_PATH)\n",
 "    _df = pd.read_csv(DATA_PATH, **kwargs)\n",
 "    return _df"
 ]
```
