From b5c2b48aa8144da912f5877b90a80aaf6955af20 Mon Sep 17 00:00:00 2001 From: Mike Richards Date: Thu, 26 Oct 2017 09:11:40 -0700 Subject: [PATCH 1/5] BUG: Fix parsing of stata dates (#17797) --- pandas/io/stata.py | 26 ++++++++++++------------- pandas/tests/io/data/stata13_dates.dta | Bin 0 -> 3386 bytes pandas/tests/io/test_stata.py | 22 +++++++++++++++++++++ 3 files changed, 35 insertions(+), 13 deletions(-) create mode 100644 pandas/tests/io/data/stata13_dates.dta diff --git a/pandas/io/stata.py b/pandas/io/stata.py index afc1631a947c8..aafe5f2ce76bd 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -306,11 +306,11 @@ def convert_delta_safe(base, deltas, unit): data_col[bad_locs] = 1.0 # Replace with NaT dates = dates.astype(np.int64) - if fmt in ["%tc", "tc"]: # Delta ms relative to base + if fmt.startswith(("%tc", "tc")): # Delta ms relative to base base = stata_epoch ms = dates conv_dates = convert_delta_safe(base, ms, 'ms') - elif fmt in ["%tC", "tC"]: + elif fmt.startswith(("%tC", "tC")): from warnings import warn warn("Encountered %tC format. Leaving in Stata Internal Format.") @@ -318,27 +318,30 @@ def convert_delta_safe(base, deltas, unit): if has_bad_values: conv_dates[bad_locs] = pd.NaT return conv_dates - elif fmt in ["%td", "td", "%d", "d"]: # Delta days relative to base + # Delta days relative to base + elif fmt.startswith(("%td", "td", "%d", "d")): base = stata_epoch days = dates conv_dates = convert_delta_safe(base, days, 'd') - elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week + # does not count leap days - 7 days is a week. 
+ # 52nd week may have more than 7 days + elif fmt.startswith(("%tw", "tw")): year = stata_epoch.year + dates // 52 days = (dates % 52) * 7 conv_dates = convert_year_days_safe(year, days) - elif fmt in ["%tm", "tm"]: # Delta months relative to base + elif fmt.startswith(("%tm", "tm")): # Delta months relative to base year = stata_epoch.year + dates // 12 month = (dates % 12) + 1 conv_dates = convert_year_month_safe(year, month) - elif fmt in ["%tq", "tq"]: # Delta quarters relative to base + elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base year = stata_epoch.year + dates // 4 month = (dates % 4) * 3 + 1 conv_dates = convert_year_month_safe(year, month) - elif fmt in ["%th", "th"]: # Delta half-years relative to base + elif fmt.startswith(("%th", "th")): # Delta half-years relative to base year = stata_epoch.year + dates // 2 month = (dates % 2) * 6 + 1 conv_dates = convert_year_month_safe(year, month) - elif fmt in ["%ty", "ty"]: # Years -- not delta + elif fmt.startswith(("%ty", "ty")): # Years -- not delta year = dates month = np.ones_like(dates) conv_dates = convert_year_month_safe(year, month) @@ -1029,10 +1032,6 @@ def _read_header(self): # calculate size of a data record self.col_sizes = lmap(lambda x: self._calcsize(x), self.typlist) - # remove format details from %td - self.fmtlist = ["%td" if x.startswith("%td") else x - for x in self.fmtlist] - def _read_new_header(self, first_char): # The first part of the header is common to 117 and 118. self.path_or_buf.read(27) # stata_dta>
@@ -1578,7 +1577,8 @@ def read(self, nrows=None, convert_dates=None, self._do_convert_missing(data, convert_missing) if convert_dates: - cols = np.where(lmap(lambda x: x in _date_formats, + cols = np.where(lmap(lambda x: any(x.startswith(fmt) + for fmt in _date_formats), self.fmtlist))[0] for i in cols: col = data.columns[i] diff --git a/pandas/tests/io/data/stata13_dates.dta b/pandas/tests/io/data/stata13_dates.dta new file mode 100644 index 0000000000000000000000000000000000000000..87b857559e501e555b74131bce5abc7b2b1d98ff GIT binary patch literal 3386 zcmeHKO=uHA6n?g1#Xl&Bco706;HBM75vy@nAq`TfjY{jGc-T&}lkVbfwwax@1`qAg zqZbd_i+F1f;=zMntXFS}XQ3bz5%Hi#4y9Q z!?1;1f*nFFM2SyuM666^2pM$)P1hI+sUTb(zk-a0iAJpxfQ(TK6|4gQyj!r4kEe-m z@xJOH<^_aH1+!M~;Q7?GB1;XLnW2<LIc&PED$j^@(8qakr zdq(5gL&`pLSmD!S3g2n$I0bzJIf=t2MXQ(g=cCi|< zFn~(6jRYsicxqG9mf&K)1em9{=dkZNn0ev9qYwm{!T~S@WyJmet%=uYueF|95%8_u4DozI6c{JqU1%w+Fm^;Oz_d!JdSLIR5@4 z{xc_@V>vc(JQd%O&!p;G3>|`~^Lx*OsBf(+-1s4s>mD`g-KXc>qD;JfvucZa2aoP| z342l7@5d^#jMVK*eaha(mrErsorIk^gwD2s&YRLGCQaYgy8fV|izzNLLiyL1SBxxV zZ1eEXL(B`+;^*AGYOap!x%J#8Y{C Date: Thu, 26 Oct 2017 11:24:12 -0700 Subject: [PATCH 2/5] Update comment per @gfyoung comment --- pandas/tests/io/test_stata.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index a0dbca2995e17..32623bda10192 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1333,18 +1333,16 @@ def test_set_index(self): @pytest.mark.parametrize( 'column', ['ms', 'day', 'week', 'month', 'qtr', 'half', 'yr']) def test_date_parsing_ignores_format_details(self, column): - """ - Test that display formats are ignored when determining if a numeric - column is a date value. - - All date types are stored as numbers and format associated with the - column denotes both the type of the date and the display format. - - STATA supports 9 date types which each have distinct units. 
We test 7 - of the 9 types, ignoring %tC and %tb. %tC is a variant of %tc that - accounts for leap seconds and %tb relies on STATAs business calendar. - """ # GH 17797 + # Test that display formats are ignored when determining if a numeric + # column is a date value. + # + # All date types are stored as numbers and format associated with the + # column denotes both the type of the date and the display format. + # + # STATA supports 9 date types which each have distinct units. We test 7 + # of the 9 types, ignoring %tC and %tb. %tC is a variant of %tc that + # accounts for leap seconds and %tb relies on STATAs business calendar. df = read_stata(self.stata_dates) unformatted = df.loc[0, column] formatted = df.loc[0, column + "_fmt"] From 16cfb8792b874539c0b6ef3d1327b6ea68802004 Mon Sep 17 00:00:00 2001 From: Mike Richards Date: Thu, 26 Oct 2017 11:39:17 -0700 Subject: [PATCH 3/5] Update comment per @gfyoung nit --- pandas/tests/io/test_stata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 32623bda10192..78b47960e1a04 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1334,6 +1334,7 @@ def test_set_index(self): 'column', ['ms', 'day', 'week', 'month', 'qtr', 'half', 'yr']) def test_date_parsing_ignores_format_details(self, column): # GH 17797 + # # Test that display formats are ignored when determining if a numeric # column is a date value. 
# From b3273c46b7dd998b87e8c4de187e642289bc9c1d Mon Sep 17 00:00:00 2001 From: Mike Richards Date: Fri, 27 Oct 2017 08:13:08 -0700 Subject: [PATCH 4/5] Add whatsnew note for 0.21.1 --- doc/source/whatsnew/v0.21.1.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 422a239e86ece..614cbab8f338c 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -74,6 +74,9 @@ Indexing I/O ^^^ +- Bug in `StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`) + + Plotting ^^^^^^^^ From 04ec86f385cf33ab2e5976ca8378e74f6be250de Mon Sep 17 00:00:00 2001 From: Mike Richards Date: Mon, 30 Oct 2017 07:58:54 -0700 Subject: [PATCH 5/5] Be clearer about what is being fixed per @jreback --- doc/source/whatsnew/v0.21.1.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 614cbab8f338c..dad212c9cb7a4 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -74,7 +74,7 @@ Indexing I/O ^^^ -- Bug in `StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`) +- Bug in :class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects. Plotting