6
6
7
7
8
8
class InputFormat(Enum):
    """Selectable input date formats and the parser options each one implies."""

    # NOTE(review): the leading space inside " auto" is reproduced verbatim
    # from the text under review; it looks like an extraction artifact --
    # confirm against the stored param values before changing it.
    AUTO = " auto"
    US = "us"
    EU = "eu"

    @property
    def kwargs(self):
        """Return the date-parser keyword arguments for this format.

        The result has two keys, ``infer_datetime_format`` and ``format``.
        The lookup table is rebuilt on every access, so each call hands back
        a fresh dict.
        """
        return {
            # AUTO: let the parser infer the layout; no explicit format.
            InputFormat.AUTO: {"infer_datetime_format": True, "format": None},
            # US: month first.
            InputFormat.US: {"infer_datetime_format": False, "format": "%m/%d/%Y"},
            # EU: day first.
            InputFormat.EU: {"infer_datetime_format": False, "format": "%d/%m/%Y"},
        }[self]
29
20
30
21
@@ -43,24 +34,26 @@ class ErrorCount:
43
34
total : int = 0
44
35
n_columns : int = 0
45
36
46
def __add__(self, rhs: "ErrorCount") -> "ErrorCount":
    """Add more errors to this ErrorCount.

    ``total`` and ``n_columns`` are summed. The example cell (``a_column``,
    ``a_row``, ``a_value``) is copied as a unit from a single operand:
    ``self`` when it already carries an example column, otherwise ``rhs``.
    """
    # Pick the three example fields from ONE operand. Merging each field
    # with its own `or` breaks when a field is legitimately falsy: __str__
    # prints `a_row + 1`, so row 0 is a valid value, and it would be
    # replaced by rhs's row while the column still came from self.
    example = self if self.a_column else rhs
    return ErrorCount(
        example.a_column,
        example.a_row,
        example.a_value,
        self.total + rhs.total,
        self.n_columns + rhs.n_columns,
    )
53
46
54
47
def __str__ (self ):
55
48
if self .total == 1 :
56
- n_errors_str = ' is 1 error'
49
+ n_errors_str = " is 1 error"
57
50
else :
58
- n_errors_str = f' are { self .total } errors'
51
+ n_errors_str = f" are { self .total } errors"
59
52
60
53
if self .n_columns == 1 :
61
- n_columns_str = ' 1 column'
54
+ n_columns_str = " 1 column"
62
55
else :
63
- n_columns_str = f' { self .n_columns } columns'
56
+ n_columns_str = f" { self .n_columns } columns"
64
57
65
58
return (
66
59
f"'{ self .a_value } ' in row { self .a_row + 1 } of "
@@ -76,7 +69,7 @@ def __len__(self):
76
69
return self .total
77
70
78
71
@staticmethod
79
- def from_diff (in_series , out_series ) -> ' ErrorCount' :
72
+ def from_diff (in_series , out_series ) -> " ErrorCount" :
80
73
in_na = in_series .isna ()
81
74
out_na = out_series .isna ()
82
75
out_errors = out_na .index [out_na & ~ in_na ]
@@ -92,27 +85,54 @@ def from_diff(in_series, out_series) -> 'ErrorCount':
92
85
93
86
def render (table , params ):
94
87
# No processing if no columns selected
95
- if not params [' colnames' ]:
88
+ if not params [" colnames" ]:
96
89
return table
97
90
98
- input_format = InputFormat (params [' input_format' ])
91
+ input_format = InputFormat (params [" input_format" ])
99
92
100
93
error_count = ErrorCount ()
101
94
102
- for column in params [' colnames' ]:
95
+ for column in params [" colnames" ]:
103
96
in_series = table [column ]
104
97
105
98
kwargs = {** input_format .kwargs }
106
99
107
100
if is_numeric_dtype (in_series ):
108
101
# For now, assume value is year and cast to string
109
- kwargs ['format' ] = '%Y'
110
-
111
- out_series = pd .to_datetime (in_series , errors = 'coerce' , exact = False ,
112
- cache = True , utc = True ,
113
- ** kwargs ).dt .tz_localize (None )
102
+ kwargs ["format" ] = "%Y"
103
+
104
+ # Build `out_series`, a pd.Series of datetime64[ns]
105
+ if hasattr (in_series , "cat" ):
106
+ # Pandas `to_datetime()` sometimes converts to Categorical; and
107
+ # when it does, `series.dt.tz_localize()` doesn't unwrap the
108
+ # Categorical. We can't blame `to_datetime()` for returning a
109
+ # Categorical but we _can_ blame `.dt.tz_localize()` for not
110
+ # unwrapping it.
111
+ #
112
+ # The bug: https://github.com/pandas-dev/pandas/issues/27952
113
+ #
114
+ # Workaround is to basically do what `pd.to_datetime()` does
115
+ # with its cache, using the assumption that categories are unique.
116
+ # We `tz_localize()` before caching, for speedup.
117
+ #
118
+ # Nix this if-statement and code path when the Pandas bug is fixed.
119
+ text_values = in_series .cat .categories
120
+ date_values = pd .to_datetime (
121
+ text_values ,
122
+ errors = "coerce" ,
123
+ exact = False ,
124
+ cache = False ,
125
+ utc = True ,
126
+ ** kwargs ,
127
+ ).tz_localize (None )
128
+ mapping = pd .Series (date_values , index = text_values )
129
+ out_series = in_series .map (mapping ).astype ("datetime64[ns]" )
130
+ else :
131
+ out_series = pd .to_datetime (
132
+ in_series , errors = "coerce" , exact = False , cache = True , utc = True , ** kwargs
133
+ ).dt .tz_localize (None )
114
134
115
- if not params [' error_means_null' ]:
135
+ if not params [" error_means_null" ]:
116
136
error_count += ErrorCount .from_diff (in_series , out_series )
117
137
118
138
table [column ] = out_series
@@ -130,9 +150,9 @@ def _migrate_params_v0_to_v1(params):
130
150
v1: 'error_means_null' (bool), 'input_format' (enum 'auto'|'us'|'eu')
131
151
"""
132
152
return {
133
- ' colnames' : params [' colnames' ],
134
- ' error_means_null' : params [' type_null' ],
135
- ' input_format' : [' auto' , 'us' , 'eu' ][params [' type_date' ]]
153
+ " colnames" : params [" colnames" ],
154
+ " error_means_null" : params [" type_null" ],
155
+ " input_format" : [" auto" , "us" , "eu" ][params [" type_date" ]],
136
156
}
137
157
138
158
@@ -144,16 +164,13 @@ def _migrate_params_v1_to_v2(params):
144
164
145
165
https://www.pivotaltracker.com/story/show/160463316
146
166
"""
147
- return {
148
- ** params ,
149
- 'colnames' : [c for c in params ['colnames' ].split (',' ) if c ],
150
- }
167
+ return {** params , "colnames" : [c for c in params ["colnames" ].split ("," ) if c ]}
151
168
152
169
153
170
def migrate_params(params):
    """Upgrade stored params to the newest layout and return them.

    Each migration step runs only when its marker is present, so params that
    are already current are returned unchanged (the same object).
    """
    # NOTE(review): the leading spaces inside these key strings are
    # reproduced verbatim from the text under review; they look like
    # extraction artifacts -- confirm before relying on them.

    # v0 params are recognised by a key that later versions no longer have.
    if " type_date" in params:
        params = _migrate_params_v0_to_v1(params)
    # v1 params still hold the column selection as a single string.
    if isinstance(params[" colnames"], str):
        params = _migrate_params_v1_to_v2(params)

    return params
0 commit comments