|
102 | 102 | ]
|
103 | 103 | },
|
104 | 104 | {
|
| 105 | + "metadata": {}, |
105 | 106 | "cell_type": "code",
|
106 |
| - "execution_count": 2, |
107 |
| - "metadata": { |
108 |
| - "ExecuteTime": { |
109 |
| - "end_time": "2023-11-09T12:17:44.751420Z", |
110 |
| - "start_time": "2023-11-09T12:17:44.719440Z" |
111 |
| - }, |
112 |
| - "collapsed": false |
113 |
| - }, |
114 | 107 | "outputs": [],
|
| 108 | + "execution_count": null, |
115 | 109 | "source": [
|
116 | 110 | "# Constants.\n",
|
117 | 111 | "TARGET_COLUMN = 'isTest'\n",
|
118 | 112 | "IDX_LABEL = 'TransactionID'\n",
|
119 | 113 | "\n",
|
120 | 114 | "# Paths.\n",
|
121 |
| - "DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/fraud_detection_classification_dataset/{}\"\n", |
| 115 | + "DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/fraud_detection_classification_dataset-{}\"\n", |
122 | 116 | "DATA_PATH = Path.home() / \".giskard\" / \"fraud_detection_classification_dataset\""
|
123 | 117 | ]
|
124 | 118 | },
|
|
141 | 135 | ]
|
142 | 136 | },
|
143 | 137 | {
|
| 138 | + "metadata": {}, |
144 | 139 | "cell_type": "code",
|
145 |
| - "execution_count": 3, |
146 |
| - "metadata": { |
147 |
| - "ExecuteTime": { |
148 |
| - "end_time": "2023-11-09T12:17:45.925766Z", |
149 |
| - "start_time": "2023-11-09T12:17:45.904823Z" |
150 |
| - }, |
151 |
| - "collapsed": false |
152 |
| - }, |
153 | 140 | "outputs": [],
|
| 141 | + "execution_count": null, |
154 | 142 | "source": [
|
155 |
| - "def fetch_from_ftp(url: str, file: Path) -> None:\n", |
| 143 | + "def fetch_demo_data(url: str, file: Path) -> None:\n", |
156 | 144 | " \"\"\"Helper to fetch demo data from the given URL.\"\"\"\n",
|
157 | 145 | " if not file.parent.exists():\n",
|
158 | 146 | " file.parent.mkdir(parents=True, exist_ok=True)\n",
|
|
165 | 153 | "\n",
|
166 | 154 | "\n",
|
167 | 155 | "def fetch_dataset():\n",
|
168 |
| - " files_to_fetch = [\"train_transaction.csv\", \"train_identity.csv\", \"test_transaction.csv\", \"test_identity.csv\"]\n", |
| 156 | + " files_to_fetch = [\"train_transaction.csv.tar.gz\", \"train_identity.csv.tar.gz\", \"test_transaction.csv.tar.gz\", \"test_identity.csv.tar.gz\"]\n", |
169 | 157 | " for file_name in files_to_fetch:\n",
|
170 |
| - " fetch_from_ftp(DATA_URL.format(file_name), DATA_PATH / file_name)\n", |
| 158 | + " fetch_demo_data(DATA_URL.format(file_name), DATA_PATH / file_name)\n", |
171 | 159 | "\n",
|
172 | 160 | "\n",
|
173 | 161 | "# Define data-types of transactions features.\n",
|
|
225 | 213 | "def read_set(_type):\n",
|
226 | 214 | " \"\"\"Read both transactions and identity data.\"\"\"\n",
|
227 | 215 | " print(f\"Reading transactions data...\")\n",
|
228 |
| - " _df = pd.read_csv(os.path.join(DATA_PATH, f'{_type}_transaction.csv'),\n", |
| 216 | + " _df = pd.read_csv(os.path.join(DATA_PATH, f'{_type}_transaction.csv.tar.gz'),\n", |
229 | 217 | " index_col=IDX_LABEL, dtype=DATA_TYPES_TRANSACTION, nrows=250)\n",
|
230 | 218 | "\n",
|
231 | 219 | " print(f\"Reading identity data...\")\n",
|
232 |
| - " _df = _df.join(pd.read_csv(os.path.join(DATA_PATH, f'{_type}_identity.csv'),\n", |
| 220 | + " _df = _df.join(pd.read_csv(os.path.join(DATA_PATH, f'{_type}_identity.csv.tar.gz'),\n", |
233 | 221 | " index_col=IDX_LABEL, dtype=DATA_TYPES_ID))\n",
|
234 | 222 | " return _df\n",
|
235 | 223 | "\n",
|
|
248 | 236 | ]
|
249 | 237 | },
|
250 | 238 | {
|
| 239 | + "metadata": {}, |
251 | 240 | "cell_type": "code",
|
252 |
| - "execution_count": 4, |
253 |
| - "metadata": { |
254 |
| - "ExecuteTime": { |
255 |
| - "end_time": "2023-11-09T12:17:46.316557Z", |
256 |
| - "start_time": "2023-11-09T12:17:46.290804Z" |
257 |
| - }, |
258 |
| - "collapsed": false |
259 |
| - }, |
260 | 241 | "outputs": [],
|
| 242 | + "execution_count": null, |
261 | 243 | "source": [
|
262 | 244 | "def preprocess_dataset(train_set, test_set):\n",
|
263 | 245 | " \"\"\"Unite train and test into common dataframe.\"\"\"\n",
|
|
0 commit comments