2
2
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
#
4
4
5
- from dataclasses import InitVar , dataclass
6
- from typing import Any , List , Mapping , Optional , Tuple
7
5
import re
6
+ from dataclasses import InitVar , dataclass
8
7
from datetime import datetime as dt
8
+ from typing import Any , List , Mapping , Optional , Tuple
9
9
10
10
from airbyte_cdk .sources .declarative .schema import JsonFileSchemaLoader
11
11
from airbyte_cdk .sources .declarative .transformations import RecordTransformation
14
14
15
15
class ParserError(Exception):
    """Signal that a date/datetime string could not be parsed.

    Replacement for pendulum's ParserError, so callers can keep catching an
    exception with the same name.
    """
18
19
19
20
@@ -30,7 +31,7 @@ def __post_init__(self, parameters: Optional[Mapping[str, Any]] = None):
30
31
# Handle the case when parameters is None
31
32
parameters = parameters or {}
32
33
self .name = parameters .get ("name" )
33
-
34
+
34
35
# Skip schema loading if name is None
35
36
if self .name is None :
36
37
self ._schema = {}
@@ -43,7 +44,7 @@ def _get_schema_root_properties(self):
43
44
# Only call this if self.name is not None
44
45
if not self .name :
45
46
return {}
46
-
47
+
47
48
schema_loader = JsonFileSchemaLoader (config = self .config , parameters = {"name" : self .name })
48
49
schema = schema_loader .get_json_schema ()
49
50
return schema .get ("properties" , {})
@@ -60,75 +61,71 @@ def parse(self, text):
60
61
Handles various date formats including those with timezone information.
61
62
"""
62
63
# Reject dates with zeros like '0000-00-00' or '0000-00-00 00:00:00'
63
- if re .match (r' ^0+[-]0+[-]0+' , text ):
64
+ if re .match (r" ^0+[-]0+[-]0+" , text ):
64
65
raise ParserError ("Zero date not allowed" )
65
-
66
+
66
67
# Comprehensive list of formats to try
67
68
formats = [
68
69
# Basic formats
69
- '%Y-%m-%d' ,
70
- '%Y/%m/%d' ,
71
- '%d-%m-%Y' ,
72
- '%d/%m/%Y' ,
73
-
70
+ "%Y-%m-%d" ,
71
+ "%Y/%m/%d" ,
72
+ "%d-%m-%Y" ,
73
+ "%d/%m/%Y" ,
74
74
# Date and time formats
75
- '%Y-%m-%d %H:%M:%S' ,
76
- '%Y-%m-%d %H:%M:%S.%f' ,
77
- '%Y/%m/%d %H:%M:%S' ,
78
- '%Y/%m/%d %H:%M:%S.%f' ,
79
-
75
+ "%Y-%m-%d %H:%M:%S" ,
76
+ "%Y-%m-%d %H:%M:%S.%f" ,
77
+ "%Y/%m/%d %H:%M:%S" ,
78
+ "%Y/%m/%d %H:%M:%S.%f" ,
80
79
# ISO formats
81
- '%Y-%m-%dT%H:%M:%S' ,
82
- '%Y-%m-%dT%H:%M:%S.%f' ,
83
-
80
+ "%Y-%m-%dT%H:%M:%S" ,
81
+ "%Y-%m-%dT%H:%M:%S.%f" ,
84
82
# With timezone
85
- '%Y-%m-%d %H:%M:%S%z' ,
86
- '%Y-%m-%d %H:%M:%S.%f%z' ,
87
- '%Y-%m-%dT%H:%M:%S%z' ,
88
- '%Y-%m-%dT%H:%M:%S.%f%z' ,
89
-
83
+ "%Y-%m-%d %H:%M:%S%z" ,
84
+ "%Y-%m-%d %H:%M:%S.%f%z" ,
85
+ "%Y-%m-%dT%H:%M:%S%z" ,
86
+ "%Y-%m-%dT%H:%M:%S.%f%z" ,
90
87
# Using Z for UTC
91
- ' %Y-%m-%dT%H:%M:%SZ' ,
92
- ' %Y-%m-%dT%H:%M:%S.%fZ' ,
88
+ " %Y-%m-%dT%H:%M:%SZ" ,
89
+ " %Y-%m-%dT%H:%M:%S.%fZ" ,
93
90
]
94
91
95
92
# Try parsing with different formats
96
93
for fmt in formats :
97
94
try :
98
95
# Handle 'Z' timezone indicator for UTC
99
96
text_to_parse = text
100
- if fmt .endswith ('Z' ) and not text .endswith ('Z' ):
97
+ if fmt .endswith ("Z" ) and not text .endswith ("Z" ):
101
98
continue
102
- if not fmt .endswith ('Z' ) and text .endswith ('Z' ):
99
+ if not fmt .endswith ("Z" ) and text .endswith ("Z" ):
103
100
text_to_parse = text [:- 1 ] # Remove Z
104
- fmt = fmt + 'Z' if 'Z' not in fmt else fmt
105
-
101
+ fmt = fmt + "Z" if "Z" not in fmt else fmt
102
+
106
103
date_obj = dt .strptime (text_to_parse , fmt )
107
104
# In pendulum, dates with zero components are rejected
108
105
if date_obj .year == 0 or date_obj .month == 0 or date_obj .day == 0 :
109
106
raise ParserError ("Date with zero components" )
110
107
return date_obj
111
108
except ValueError :
112
109
continue
113
-
110
+
114
111
# Try ISO format as a last resort
115
112
try :
116
113
# Replace Z with +00:00 for ISO format parsing
117
- iso_text = text .replace ('Z' , ' +00:00' )
118
-
114
+ iso_text = text .replace ("Z" , " +00:00" )
115
+
119
116
# For Python < 3.11 compatibility, remove microseconds if they have more than 6 digits
120
- microseconds_match = re .search (r' \.(\d{7,})(?=[+-Z]|$)' , iso_text )
117
+ microseconds_match = re .search (r" \.(\d{7,})(?=[+-Z]|$)" , iso_text )
121
118
if microseconds_match :
122
119
fixed_micro = microseconds_match .group (1 )[:6 ]
123
- iso_text = iso_text .replace (microseconds_match .group (0 ), f' .{ fixed_micro } ' )
124
-
120
+ iso_text = iso_text .replace (microseconds_match .group (0 ), f" .{ fixed_micro } " )
121
+
125
122
date_obj = dt .fromisoformat (iso_text )
126
123
if date_obj .year == 0 or date_obj .month == 0 or date_obj .day == 0 :
127
124
raise ParserError ("Date with zero components" )
128
125
return date_obj
129
126
except (ValueError , AttributeError ):
130
127
pass
131
-
128
+
132
129
# If nothing worked, raise the error like pendulum would
133
130
raise ParserError (f"Unable to parse: { text } " )
134
131
@@ -142,12 +139,11 @@ def transform(
142
139
# If we don't have any fields to check, just return the record as is
143
140
if not self ._date_and_date_time_fields :
144
141
return record
145
-
142
+
146
143
for item in record :
147
144
if item in self ._date_and_date_time_fields and record .get (item ):
148
145
try :
149
146
self .parse (record [item ])
150
147
except ParserError :
151
148
record [item ] = None
152
149
return record
153
-
0 commit comments