Skip to content

BUG: Fix parsing of stata dates (#17797) #17990

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Oct 31, 2017
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.21.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ Indexing
I/O
^^^

- Bug in :class:`~pandas.io.stata.StataReader` where date/time columns whose format included display formatting details were not converted to dates (:issue:`17990`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you use :class:`~pandas.io.stata.StataReader`? Can you also make it a bit clearer what is being fixed here?



Plotting
^^^^^^^^

Expand Down
26 changes: 13 additions & 13 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,39 +306,42 @@ def convert_delta_safe(base, deltas, unit):
data_col[bad_locs] = 1.0 # Replace with NaT
dates = dates.astype(np.int64)

if fmt in ["%tc", "tc"]: # Delta ms relative to base
if fmt.startswith(("%tc", "tc")): # Delta ms relative to base
base = stata_epoch
ms = dates
conv_dates = convert_delta_safe(base, ms, 'ms')
elif fmt in ["%tC", "tC"]:
elif fmt.startswith(("%tC", "tC")):
from warnings import warn

warn("Encountered %tC format. Leaving in Stata Internal Format.")
conv_dates = Series(dates, dtype=np.object)
if has_bad_values:
conv_dates[bad_locs] = pd.NaT
return conv_dates
elif fmt in ["%td", "td", "%d", "d"]: # Delta days relative to base
# Delta days relative to base
elif fmt.startswith(("%td", "td", "%d", "d")):
base = stata_epoch
days = dates
conv_dates = convert_delta_safe(base, days, 'd')
elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week
# does not count leap days - 7 days is a week.
# 52nd week may have more than 7 days
elif fmt.startswith(("%tw", "tw")):
year = stata_epoch.year + dates // 52
days = (dates % 52) * 7
conv_dates = convert_year_days_safe(year, days)
elif fmt in ["%tm", "tm"]: # Delta months relative to base
elif fmt.startswith(("%tm", "tm")): # Delta months relative to base
year = stata_epoch.year + dates // 12
month = (dates % 12) + 1
conv_dates = convert_year_month_safe(year, month)
elif fmt in ["%tq", "tq"]: # Delta quarters relative to base
elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base
year = stata_epoch.year + dates // 4
month = (dates % 4) * 3 + 1
conv_dates = convert_year_month_safe(year, month)
elif fmt in ["%th", "th"]: # Delta half-years relative to base
elif fmt.startswith(("%th", "th")): # Delta half-years relative to base
year = stata_epoch.year + dates // 2
month = (dates % 2) * 6 + 1
conv_dates = convert_year_month_safe(year, month)
elif fmt in ["%ty", "ty"]: # Years -- not delta
elif fmt.startswith(("%ty", "ty")): # Years -- not delta
year = dates
month = np.ones_like(dates)
conv_dates = convert_year_month_safe(year, month)
Expand Down Expand Up @@ -1029,10 +1032,6 @@ def _read_header(self):
# calculate size of a data record
self.col_sizes = lmap(lambda x: self._calcsize(x), self.typlist)

# remove format details from %td
self.fmtlist = ["%td" if x.startswith("%td") else x
for x in self.fmtlist]

def _read_new_header(self, first_char):
# The first part of the header is common to 117 and 118.
self.path_or_buf.read(27) # stata_dta><header><release>
Expand Down Expand Up @@ -1578,7 +1577,8 @@ def read(self, nrows=None, convert_dates=None,
self._do_convert_missing(data, convert_missing)

if convert_dates:
cols = np.where(lmap(lambda x: x in _date_formats,
cols = np.where(lmap(lambda x: any(x.startswith(fmt)
for fmt in _date_formats),
self.fmtlist))[0]
for i in cols:
col = data.columns[i]
Expand Down
Binary file added pandas/tests/io/data/stata13_dates.dta
Binary file not shown.
21 changes: 21 additions & 0 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ def setup_method(self, method):

self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')

self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')

def read_dta(self, file):
# Legacy default reader configuration
return read_stata(file, convert_dates=True)
Expand Down Expand Up @@ -1327,3 +1329,22 @@ def test_set_index(self):
df.to_stata(path)
reread = pd.read_stata(path, index_col='index')
tm.assert_frame_equal(df, reread)

@pytest.mark.parametrize(
'column', ['ms', 'day', 'week', 'month', 'qtr', 'half', 'yr'])
def test_date_parsing_ignores_format_details(self, column):
# GH 17797
#
# Test that display formats are ignored when determining if a numeric
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: add a newline under the issue reference.

# column is a date value.
#
# All date types are stored as numbers and format associated with the
# column denotes both the type of the date and the display format.
#
# STATA supports 9 date types which each have distinct units. We test 7
# of the 9 types, ignoring %tC and %tb. %tC is a variant of %tc that
# accounts for leap seconds and %tb relies on STATA's business calendar.
df = read_stata(self.stata_dates)
unformatted = df.loc[0, column]
formatted = df.loc[0, column + "_fmt"]
assert unformatted == formatted
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are these supposed to be datetime64[ns] dtype?

what happens for the ignored formats? should raise?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are these supposed to be datetime64[ns] dtype?

At this point in the code, formatted and unformatted are pandas._libs.tslib.Timestamp objects. Every column in df has a dtype of datetime64[ns].

what happens for the ignored formats? should raise?

Ignored formats are not converted to dates (consistent with previous behavior) source