Skip to content

Commit e886af5

Browse files
miker985 and jreback
authored and committed
BUG: Fix parsing of stata dates (#17797) (#17990)
1 parent 6e11c10 commit e886af5

File tree

4 files changed

+37
-13
lines changed

4 files changed

+37
-13
lines changed

doc/source/whatsnew/v0.21.1.txt

+3
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,9 @@ Indexing
7575
I/O
7676
^^^
7777

78+
- Bug in :class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting (:issue:`17990`). Previously, columns with display formatting were normally left as ordinal numbers and not converted to datetime objects.
79+
80+
7881
Plotting
7982
^^^^^^^^
8083

pandas/io/stata.py

+13-13
Original file line numberDiff line numberDiff line change
@@ -306,39 +306,42 @@ def convert_delta_safe(base, deltas, unit):
306306
data_col[bad_locs] = 1.0 # Replace with NaT
307307
dates = dates.astype(np.int64)
308308

309-
if fmt in ["%tc", "tc"]: # Delta ms relative to base
309+
if fmt.startswith(("%tc", "tc")): # Delta ms relative to base
310310
base = stata_epoch
311311
ms = dates
312312
conv_dates = convert_delta_safe(base, ms, 'ms')
313-
elif fmt in ["%tC", "tC"]:
313+
elif fmt.startswith(("%tC", "tC")):
314314
from warnings import warn
315315

316316
warn("Encountered %tC format. Leaving in Stata Internal Format.")
317317
conv_dates = Series(dates, dtype=np.object)
318318
if has_bad_values:
319319
conv_dates[bad_locs] = pd.NaT
320320
return conv_dates
321-
elif fmt in ["%td", "td", "%d", "d"]: # Delta days relative to base
321+
# Delta days relative to base
322+
elif fmt.startswith(("%td", "td", "%d", "d")):
322323
base = stata_epoch
323324
days = dates
324325
conv_dates = convert_delta_safe(base, days, 'd')
325-
elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week
326+
# does not count leap days - 7 days is a week.
327+
# 52nd week may have more than 7 days
328+
elif fmt.startswith(("%tw", "tw")):
326329
year = stata_epoch.year + dates // 52
327330
days = (dates % 52) * 7
328331
conv_dates = convert_year_days_safe(year, days)
329-
elif fmt in ["%tm", "tm"]: # Delta months relative to base
332+
elif fmt.startswith(("%tm", "tm")): # Delta months relative to base
330333
year = stata_epoch.year + dates // 12
331334
month = (dates % 12) + 1
332335
conv_dates = convert_year_month_safe(year, month)
333-
elif fmt in ["%tq", "tq"]: # Delta quarters relative to base
336+
elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base
334337
year = stata_epoch.year + dates // 4
335338
month = (dates % 4) * 3 + 1
336339
conv_dates = convert_year_month_safe(year, month)
337-
elif fmt in ["%th", "th"]: # Delta half-years relative to base
340+
elif fmt.startswith(("%th", "th")): # Delta half-years relative to base
338341
year = stata_epoch.year + dates // 2
339342
month = (dates % 2) * 6 + 1
340343
conv_dates = convert_year_month_safe(year, month)
341-
elif fmt in ["%ty", "ty"]: # Years -- not delta
344+
elif fmt.startswith(("%ty", "ty")): # Years -- not delta
342345
year = dates
343346
month = np.ones_like(dates)
344347
conv_dates = convert_year_month_safe(year, month)
@@ -1029,10 +1032,6 @@ def _read_header(self):
10291032
# calculate size of a data record
10301033
self.col_sizes = lmap(lambda x: self._calcsize(x), self.typlist)
10311034

1032-
# remove format details from %td
1033-
self.fmtlist = ["%td" if x.startswith("%td") else x
1034-
for x in self.fmtlist]
1035-
10361035
def _read_new_header(self, first_char):
10371036
# The first part of the header is common to 117 and 118.
10381037
self.path_or_buf.read(27) # stata_dta><header><release>
@@ -1578,7 +1577,8 @@ def read(self, nrows=None, convert_dates=None,
15781577
self._do_convert_missing(data, convert_missing)
15791578

15801579
if convert_dates:
1581-
cols = np.where(lmap(lambda x: x in _date_formats,
1580+
cols = np.where(lmap(lambda x: any(x.startswith(fmt)
1581+
for fmt in _date_formats),
15821582
self.fmtlist))[0]
15831583
for i in cols:
15841584
col = data.columns[i]
3.31 KB
Binary file not shown.

pandas/tests/io/test_stata.py

+21
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ def setup_method(self, method):
9696

9797
self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')
9898

99+
self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')
100+
99101
def read_dta(self, file):
100102
# Legacy default reader configuration
101103
return read_stata(file, convert_dates=True)
@@ -1327,3 +1329,22 @@ def test_set_index(self):
13271329
df.to_stata(path)
13281330
reread = pd.read_stata(path, index_col='index')
13291331
tm.assert_frame_equal(df, reread)
1332+
1333+
@pytest.mark.parametrize(
1334+
'column', ['ms', 'day', 'week', 'month', 'qtr', 'half', 'yr'])
1335+
def test_date_parsing_ignores_format_details(self, column):
1336+
# GH 17797
1337+
#
1338+
# Test that display formats are ignored when determining if a numeric
1339+
# column is a date value.
1340+
#
1341+
# All date types are stored as numbers and format associated with the
1342+
# column denotes both the type of the date and the display format.
1343+
#
1344+
# STATA supports 9 date types which each have distinct units. We test 7
1345+
# of the 9 types, ignoring %tC and %tb. %tC is a variant of %tc that
1346+
# accounts for leap seconds and %tb relies on Stata's business calendar.
1347+
df = read_stata(self.stata_dates)
1348+
unformatted = df.loc[0, column]
1349+
formatted = df.loc[0, column + "_fmt"]
1350+
assert unformatted == formatted

0 commit comments

Comments
 (0)