34
34
if TYPE_CHECKING :
35
35
from mlflow import MlflowClient
36
36
37
- SAMPLE_SIZE = 1000
38
-
39
37
# Module-level logger, named after this module via __name__.
logger = logging .getLogger (__name__ )
40
38
41
39
@@ -526,10 +524,22 @@ def cast_column_to_dtypes(df, column_dtypes):
526
524
527
525
@classmethod
528
526
def load(cls, local_path: str):
    """Rebuild a dataset from a directory laid out the way ``save`` writes it.

    Two files are read from ``local_path``:

    * ``giskard-dataset-meta.yaml`` -- the dataset metadata;
    * ``data.csv.zst`` -- the zstd-compressed CSV holding the rows.

    Args:
        local_path: Directory containing both files.

    Returns:
        A new ``cls`` instance built from the decoded dataframe and the
        stored ``name``, ``target``, ``column_types`` and ``id`` values.
    """
    root = Path(local_path)

    # load metadata
    with open(root / "giskard-dataset-meta.yaml", "r") as meta_f:
        meta = yaml.safe_load(meta_f)

    # load data
    with open(root / "data.csv.zst", "rb") as ds_stream:
        df = pd.read_csv(
            ZstdDecompressor().stream_reader(ds_stream),
            keep_default_na=False,
            # "_GSK_NA_" is the only token treated as a missing value.
            na_values=["_GSK_NA_"],
        )

    # A missing or null "category_features" entry means "no categorical
    # columns"; read it as leniently as the other metadata fields instead
    # of failing on a hard index.
    category_features = meta.get("category_features") or {}

    return cls(
        df,
        name=meta.get("name"),
        target=meta.get("target"),
        cat_columns=list(category_features),
        column_types=meta.get("column_types"),
        original_id=meta.get("id"),
    )
533
543
534
544
@staticmethod
535
545
def _cat_columns (meta ):
@@ -543,21 +553,17 @@ def _cat_columns(meta):
543
553
def cat_columns(self):
    """Return the categorical columns for this dataset.

    Delegates to ``_cat_columns``, passing it ``self.meta``.
    """
    meta = self.meta
    return self._cat_columns(meta)
545
555
546
- def save (self , local_path : Path , dataset_id ):
547
- with open (local_path / "data.csv.zst" , "wb" ) as f , open ( local_path / "data.sample.csv.zst" , "wb" ) as f_sample :
556
+ def save (self , local_path : str ):
557
+ with ( open (Path ( local_path ) / "data.csv.zst" , "wb" ) as f ,) :
548
558
uncompressed_bytes = save_df (self .df )
549
559
compressed_bytes = compress (uncompressed_bytes )
550
560
f .write (compressed_bytes )
551
561
original_size_bytes , compressed_size_bytes = len (uncompressed_bytes ), len (compressed_bytes )
552
562
553
- uncompressed_bytes = save_df (self .df .sample (min (SAMPLE_SIZE , len (self .df .index ))))
554
- compressed_bytes = compress (uncompressed_bytes )
555
- f_sample .write (compressed_bytes )
556
-
557
563
with open (Path (local_path ) / "giskard-dataset-meta.yaml" , "w" ) as meta_f :
558
564
yaml .dump (
559
565
{
560
- "id" : dataset_id ,
566
+ "id" : str ( self . id ) ,
561
567
"name" : self .meta .name ,
562
568
"target" : self .meta .target ,
563
569
"column_types" : self .meta .column_types ,
0 commit comments