Skip to content

Commit 91e9640

Browse files
committed
Black-reformat; avoid .dt.tz_localize() bug
Found a Pandas bug, pandas-dev/pandas#27952, which made our module output wrong results when a column contains lots of duplicate values. We work around the wrong Pandas behavior with a special code path that avoids calling the buggy `.dt.tz_localize()`. [finishes #167858839]
1 parent 04bc8fe commit 91e9640

File tree

2 files changed

+244
-187
lines changed

2 files changed

+244
-187
lines changed

converttodate.py

+61-44
Original file line numberDiff line numberDiff line change
@@ -6,25 +6,16 @@
66

77

88
class InputFormat(Enum):
9-
AUTO = 'auto'
10-
US = 'us'
11-
EU = 'eu'
9+
AUTO = "auto"
10+
US = "us"
11+
EU = "eu"
1212

1313
@property
1414
def kwargs(self):
1515
return {
16-
InputFormat.AUTO: {
17-
'infer_datetime_format': True,
18-
'format': None
19-
},
20-
InputFormat.US: {
21-
'infer_datetime_format': False,
22-
'format': '%m/%d/%Y'
23-
},
24-
InputFormat.EU: {
25-
'infer_datetime_format': False,
26-
'format': '%d/%m/%Y'
27-
}
16+
InputFormat.AUTO: {"infer_datetime_format": True, "format": None},
17+
InputFormat.US: {"infer_datetime_format": False, "format": "%m/%d/%Y"},
18+
InputFormat.EU: {"infer_datetime_format": False, "format": "%d/%m/%Y"},
2819
}[self]
2920

3021

@@ -43,24 +34,26 @@ class ErrorCount:
4334
total: int = 0
4435
n_columns: int = 0
4536

46-
def __add__(self, rhs: 'ErrorCount') -> 'ErrorCount':
37+
def __add__(self, rhs: "ErrorCount") -> "ErrorCount":
4738
"""Add more errors to this ErrorCount."""
48-
return ErrorCount(self.a_column or rhs.a_column,
49-
self.a_row or rhs.a_row,
50-
self.a_value or rhs.a_value,
51-
self.total + rhs.total,
52-
self.n_columns + rhs.n_columns)
39+
return ErrorCount(
40+
self.a_column or rhs.a_column,
41+
self.a_row or rhs.a_row,
42+
self.a_value or rhs.a_value,
43+
self.total + rhs.total,
44+
self.n_columns + rhs.n_columns,
45+
)
5346

5447
def __str__(self):
5548
if self.total == 1:
56-
n_errors_str = 'is 1 error'
49+
n_errors_str = "is 1 error"
5750
else:
58-
n_errors_str = f'are {self.total} errors'
51+
n_errors_str = f"are {self.total} errors"
5952

6053
if self.n_columns == 1:
61-
n_columns_str = '1 column'
54+
n_columns_str = "1 column"
6255
else:
63-
n_columns_str = f'{self.n_columns} columns'
56+
n_columns_str = f"{self.n_columns} columns"
6457

6558
return (
6659
f"'{self.a_value}' in row {self.a_row + 1} of "
@@ -76,7 +69,7 @@ def __len__(self):
7669
return self.total
7770

7871
@staticmethod
79-
def from_diff(in_series, out_series) -> 'ErrorCount':
72+
def from_diff(in_series, out_series) -> "ErrorCount":
8073
in_na = in_series.isna()
8174
out_na = out_series.isna()
8275
out_errors = out_na.index[out_na & ~in_na]
@@ -92,27 +85,54 @@ def from_diff(in_series, out_series) -> 'ErrorCount':
9285

9386
def render(table, params):
9487
# No processing if no columns selected
95-
if not params['colnames']:
88+
if not params["colnames"]:
9689
return table
9790

98-
input_format = InputFormat(params['input_format'])
91+
input_format = InputFormat(params["input_format"])
9992

10093
error_count = ErrorCount()
10194

102-
for column in params['colnames']:
95+
for column in params["colnames"]:
10396
in_series = table[column]
10497

10598
kwargs = {**input_format.kwargs}
10699

107100
if is_numeric_dtype(in_series):
108101
# For now, assume a numeric value is a year: parse it with the "%Y" format
109-
kwargs['format'] = '%Y'
110-
111-
out_series = pd.to_datetime(in_series, errors='coerce', exact=False,
112-
cache=True, utc=True,
113-
**kwargs).dt.tz_localize(None)
102+
kwargs["format"] = "%Y"
103+
104+
# Build `out_series`, a pd.Series of datetime64[ns]
105+
if hasattr(in_series, "cat"):
106+
# Pandas `to_datetime()` sometimes converts to Categorical; and
107+
# when it does, `series.dt.tz_localize()` doesn't unwrap the
108+
# Categorical. We can't blame `to_datetime()` for returning a
109+
# Categorical but we _can_ blame `.dt.tz_localize()` for not
110+
# unwrapping it.
111+
#
112+
# The bug: https://github.com/pandas-dev/pandas/issues/27952
113+
#
114+
# Workaround is to basically do what `pd.to_datetime()` does
115+
# with its cache, using the assumption that categories are unique.
116+
# We `tz_localize()` before caching, for speedup.
117+
#
118+
# Nix this if-statement and code path when the Pandas bug is fixed.
119+
text_values = in_series.cat.categories
120+
date_values = pd.to_datetime(
121+
text_values,
122+
errors="coerce",
123+
exact=False,
124+
cache=False,
125+
utc=True,
126+
**kwargs,
127+
).tz_localize(None)
128+
mapping = pd.Series(date_values, index=text_values)
129+
out_series = in_series.map(mapping).astype("datetime64[ns]")
130+
else:
131+
out_series = pd.to_datetime(
132+
in_series, errors="coerce", exact=False, cache=True, utc=True, **kwargs
133+
).dt.tz_localize(None)
114134

115-
if not params['error_means_null']:
135+
if not params["error_means_null"]:
116136
error_count += ErrorCount.from_diff(in_series, out_series)
117137

118138
table[column] = out_series
@@ -130,9 +150,9 @@ def _migrate_params_v0_to_v1(params):
130150
v1: 'error_means_null' (bool), 'input_format' (enum 'auto'|'us'|'eu')
131151
"""
132152
return {
133-
'colnames': params['colnames'],
134-
'error_means_null': params['type_null'],
135-
'input_format': ['auto', 'us', 'eu'][params['type_date']]
153+
"colnames": params["colnames"],
154+
"error_means_null": params["type_null"],
155+
"input_format": ["auto", "us", "eu"][params["type_date"]],
136156
}
137157

138158

@@ -144,16 +164,13 @@ def _migrate_params_v1_to_v2(params):
144164
145165
https://www.pivotaltracker.com/story/show/160463316
146166
"""
147-
return {
148-
**params,
149-
'colnames': [c for c in params['colnames'].split(',') if c],
150-
}
167+
return {**params, "colnames": [c for c in params["colnames"].split(",") if c]}
151168

152169

153170
def migrate_params(params):
154-
if 'type_date' in params:
171+
if "type_date" in params:
155172
params = _migrate_params_v0_to_v1(params)
156-
if isinstance(params['colnames'], str):
173+
if isinstance(params["colnames"], str):
157174
params = _migrate_params_v1_to_v2(params)
158175

159176
return params

0 commit comments

Comments
 (0)