diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 422a239e86ece..dad212c9cb7a4 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -74,6 +74,9 @@ Indexing I/O ^^^ +- Bug in class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects. + + Plotting ^^^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index afc1631a947c8..aafe5f2ce76bd 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -306,11 +306,11 @@ def convert_delta_safe(base, deltas, unit): data_col[bad_locs] = 1.0 # Replace with NaT dates = dates.astype(np.int64) - if fmt in ["%tc", "tc"]: # Delta ms relative to base + if fmt.startswith(("%tc", "tc")): # Delta ms relative to base base = stata_epoch ms = dates conv_dates = convert_delta_safe(base, ms, 'ms') - elif fmt in ["%tC", "tC"]: + elif fmt.startswith(("%tC", "tC")): from warnings import warn warn("Encountered %tC format. Leaving in Stata Internal Format.") @@ -318,27 +318,30 @@ def convert_delta_safe(base, deltas, unit): if has_bad_values: conv_dates[bad_locs] = pd.NaT return conv_dates - elif fmt in ["%td", "td", "%d", "d"]: # Delta days relative to base + # Delta days relative to base + elif fmt.startswith(("%td", "td", "%d", "d")): base = stata_epoch days = dates conv_dates = convert_delta_safe(base, days, 'd') - elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week + # does not count leap days - 7 days is a week. + # 52nd week may have more than 7 days + elif fmt.startswith(("%tw", "tw")): year = stata_epoch.year + dates // 52 days = (dates % 52) * 7 conv_dates = convert_year_days_safe(year, days) - elif fmt in ["%tm", "tm"]: # Delta months relative to base + elif fmt.startswith(("%tm", "tm")): # Delta months relative to base year = stata_epoch.year + dates // 12 month = (dates % 12) + 1 conv_dates = convert_year_month_safe(year, month) - elif fmt in ["%tq", "tq"]: # Delta quarters relative to base + elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base year = stata_epoch.year + dates // 4 month = (dates % 4) * 3 + 1 conv_dates = convert_year_month_safe(year, month) - elif fmt in ["%th", "th"]: # Delta half-years relative to base + elif fmt.startswith(("%th", "th")): # Delta half-years relative to base year = stata_epoch.year + dates // 2 month = (dates % 2) * 6 + 1 conv_dates = convert_year_month_safe(year, month) - elif fmt in ["%ty", "ty"]: # Years -- not delta + elif fmt.startswith(("%ty", "ty")): # Years -- not delta year = dates month = np.ones_like(dates) conv_dates = convert_year_month_safe(year, month) @@ -1029,10 +1032,6 @@ def _read_header(self): # calculate size of a data record self.col_sizes = lmap(lambda x: self._calcsize(x), self.typlist) - # remove format details from %td - self.fmtlist = ["%td" if x.startswith("%td") else x - for x in self.fmtlist] - def _read_new_header(self, first_char): # The first part of the header is common to 117 and 118. self.path_or_buf.read(27) # stata_dta>
@@ -1578,7 +1577,8 @@ def read(self, nrows=None, convert_dates=None, self._do_convert_missing(data, convert_missing) if convert_dates: - cols = np.where(lmap(lambda x: x in _date_formats, + cols = np.where(lmap(lambda x: any(x.startswith(fmt) + for fmt in _date_formats), self.fmtlist))[0] for i in cols: col = data.columns[i] diff --git a/pandas/tests/io/data/stata13_dates.dta b/pandas/tests/io/data/stata13_dates.dta new file mode 100644 index 0000000000000..87b857559e501 Binary files /dev/null and b/pandas/tests/io/data/stata13_dates.dta differ diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 055a490bc6b5d..78b47960e1a04 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -96,6 +96,8 @@ def setup_method(self, method): self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta') + self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta') + def read_dta(self, file): # Legacy default reader configuration return read_stata(file, convert_dates=True) @@ -1327,3 +1329,22 @@ def test_set_index(self): df.to_stata(path) reread = pd.read_stata(path, index_col='index') tm.assert_frame_equal(df, reread) + + @pytest.mark.parametrize( + 'column', ['ms', 'day', 'week', 'month', 'qtr', 'half', 'yr']) + def test_date_parsing_ignores_format_details(self, column): + # GH 17797 + # + # Test that display formats are ignored when determining if a numeric + # column is a date value. + # + # All date types are stored as numbers and format associated with the + # column denotes both the type of the date and the display format. + # + # STATA supports 9 date types which each have distinct units. We test 7 + # of the 9 types, ignoring %tC and %tb. %tC is a variant of %tc that + # accounts for leap seconds and %tb relies on STATAs business calendar. + df = read_stata(self.stata_dates) + unformatted = df.loc[0, column] + formatted = df.loc[0, column + "_fmt"] + assert unformatted == formatted