diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index f9117253b61c1..405036174aef5 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -29,6 +29,7 @@ enhancement2
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
 - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
+- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the Stata format (:issue:`55642`)
 -
 
 .. ---------------------------------------------------------------------------
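A minimal sketch of the user-visible change (the `example.dta` file name is hypothetical): a column written with Stata's millisecond `%tc` format now reads back as `datetime64[ms]` rather than always `datetime64[ns]`, and the day/week/month/quarter/half-year/year formats read back as `datetime64[s]`.

```python
import pandas as pd

df = pd.DataFrame({"when": pd.to_datetime(["1960-01-01", "2021-06-15"])})
df.to_stata("example.dta", convert_dates={"when": "tc"}, write_index=False)

result = pd.read_stata("example.dta")
# Previously always datetime64[ns]; now the unit mirrors what Stata stores.
print(result["when"].dtype)  # datetime64[ms]
```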
- """ - index = getattr(deltas, "index", None) - if unit == "d": - if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA: - values = [base + timedelta(days=int(d)) for d in deltas] - return Series(values, index=index) - elif unit == "ms": - if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA: - values = [ - base + timedelta(microseconds=(int(d) * 1000)) for d in deltas - ] - return Series(values, index=index) - else: - raise ValueError("format not understood") - base = to_datetime(base) - deltas = to_timedelta(deltas, unit=unit) - return base + deltas + elif fmt.startswith(("%td", "td", "%d", "d")): + # Delta days relative to base + td = np.timedelta64(stata_epoch - unix_epoch, "D") + res = np.array(dates._values, dtype="M8[D]") + td + return Series(res, index=dates.index) + + elif fmt.startswith(("%tm", "tm")): + # Delta months relative to base + ordinals = dates + (stata_epoch.year - unix_epoch.year) * 12 + res = np.array(ordinals, dtype="M8[M]").astype("M8[s]") + return Series(res, index=dates.index) + + elif fmt.startswith(("%tq", "tq")): + # Delta quarters relative to base + ordinals = dates + (stata_epoch.year - unix_epoch.year) * 4 + res = np.array(ordinals, dtype="M8[3M]").astype("M8[s]") + return Series(res, index=dates.index) + + elif fmt.startswith(("%th", "th")): + # Delta half-years relative to base + ordinals = dates + (stata_epoch.year - unix_epoch.year) * 2 + res = np.array(ordinals, dtype="M8[6M]").astype("M8[s]") + return Series(res, index=dates.index) + + elif fmt.startswith(("%ty", "ty")): + # Years -- not delta + ordinals = dates - 1970 + res = np.array(ordinals, dtype="M8[Y]").astype("M8[s]") + return Series(res, index=dates.index) - # TODO(non-nano): If/when pandas supports more than datetime64[ns], this - # should be improved to use correct range, e.g. datetime[Y] for yearly bad_locs = np.isnan(dates) has_bad_values = False if bad_locs.any(): @@ -345,11 +324,7 @@ def convert_delta_safe(base, deltas, unit) -> Series: dates._values[bad_locs] = 1.0 # Replace with NaT dates = dates.astype(np.int64) - if fmt.startswith(("%tc", "tc")): # Delta ms relative to base - base = stata_epoch - ms = dates - conv_dates = convert_delta_safe(base, ms, "ms") - elif fmt.startswith(("%tC", "tC")): + if fmt.startswith(("%tC", "tC")): warnings.warn( "Encountered %tC format. Leaving in Stata Internal Format.", stacklevel=find_stack_level(), @@ -358,33 +333,18 @@ def convert_delta_safe(base, deltas, unit) -> Series: if has_bad_values: conv_dates[bad_locs] = NaT return conv_dates - # Delta days relative to base - elif fmt.startswith(("%td", "td", "%d", "d")): - base = stata_epoch - days = dates - conv_dates = convert_delta_safe(base, days, "d") # does not count leap days - 7 days is a week. 
@@ -409,6 +369,7 @@ def _datetime_to_stata_elapsed_vec(dates: Series, fmt: str) -> Series:
     index = dates.index
     NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000
     US_PER_DAY = NS_PER_DAY / 1000
+    MS_PER_DAY = NS_PER_DAY / 1_000_000
 
     def parse_dates_safe(
         dates: Series, delta: bool = False, year: bool = False, days: bool = False
@@ -416,17 +377,18 @@ def parse_dates_safe(
         d = {}
         if lib.is_np_dtype(dates.dtype, "M"):
             if delta:
-                time_delta = dates - Timestamp(stata_epoch).as_unit("ns")
-                d["delta"] = time_delta._values.view(np.int64) // 1000  # microseconds
+                time_delta = dates.dt.as_unit("ms") - Timestamp(stata_epoch).as_unit(
+                    "ms"
+                )
+                d["delta"] = time_delta._values.view(np.int64)
             if days or year:
                 date_index = DatetimeIndex(dates)
                 d["year"] = date_index._data.year
                 d["month"] = date_index._data.month
             if days:
-                days_in_ns = dates._values.view(np.int64) - to_datetime(
-                    d["year"], format="%Y"
-                )._values.view(np.int64)
-                d["days"] = days_in_ns // NS_PER_DAY
+                year_start = np.asarray(dates).astype("M8[Y]").astype(dates.dtype)
+                diff = dates - year_start
+                d["days"] = np.asarray(diff).astype("m8[D]").view("int64")
 
         elif infer_dtype(dates, skipna=False) == "datetime":
             if delta:
@@ -466,7 +428,7 @@ def g(x: datetime) -> int:
 
     if fmt in ["%tc", "tc"]:
         d = parse_dates_safe(dates, delta=True)
-        conv_dates = d.delta / 1000
+        conv_dates = d.delta
     elif fmt in ["%tC", "tC"]:
         warnings.warn(
             "Stata Internal Format tC not supported.",
@@ -475,7 +437,7 @@ def g(x: datetime) -> int:
         conv_dates = dates
     elif fmt in ["%td", "td"]:
         d = parse_dates_safe(dates, delta=True)
-        conv_dates = d.delta // US_PER_DAY
+        conv_dates = d.delta // MS_PER_DAY
     elif fmt in ["%tw", "tw"]:
         d = parse_dates_safe(dates, year=True, days=True)
         conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7
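On the write path the elapsed values are now computed at millisecond resolution, matching what `%tc` actually stores; `%td` then reduces the same delta to whole days. A public-API sketch of the equivalent arithmetic (not the internal `parse_dates_safe` helper):

```python
import pandas as pd

dates = pd.Series(pd.to_datetime(["1960-01-01", "2021-01-01"]))
delta = dates - pd.Timestamp("1960-01-01")

tc = delta // pd.Timedelta(milliseconds=1)  # %tc: ms since 1960-01-01
td = delta // pd.Timedelta(days=1)          # %td: days since 1960-01-01
print(td.tolist())  # [0, 22281]
```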
expected["date"].astype("M8[s]") + expected["weekly_date"] = expected["weekly_date"].astype("M8[s]") + expected["monthly_date"] = expected["monthly_date"].astype("M8[s]") + expected["quarterly_date"] = expected["quarterly_date"].astype("M8[s]") + expected["half_yearly_date"] = expected["half_yearly_date"].astype("M8[s]") + expected["yearly_date"] = ( + expected["yearly_date"].astype("Period[s]").array.view("M8[s]") + ) path1 = datapath("io", "data", "stata", "stata2_114.dta") path2 = datapath("io", "data", "stata", "stata2_115.dta") @@ -360,12 +369,15 @@ def test_read_write_dta10(self, version): with tm.ensure_clean() as path: original.to_stata(path, convert_dates={"datetime": "tc"}, version=version) written_and_read_again = self.read_dta(path) - # original.index is np.int32, read index is np.int64 - tm.assert_frame_equal( - written_and_read_again.set_index("index"), - original, - check_index_type=False, - ) + + expected = original[:] + # "tc" convert_dates means we store in ms + expected["datetime"] = expected["datetime"].astype("M8[ms]") + + tm.assert_frame_equal( + written_and_read_again.set_index("index"), + expected, + ) def test_stata_doc_examples(self): with tm.ensure_clean() as path: @@ -514,9 +526,10 @@ def test_read_write_reread_dta15(self, file, datapath): expected["long_"] = expected["long_"].astype(np.int32) expected["float_"] = expected["float_"].astype(np.float32) expected["double_"] = expected["double_"].astype(np.float64) - expected["date_td"] = expected["date_td"].apply( - datetime.strptime, args=("%Y-%m-%d",) - ) + + # TODO(GH#55564): directly cast to M8[s] + arr = expected["date_td"].astype("Period[D]")._values.asfreq("s", how="S") + expected["date_td"] = arr.view("M8[s]") file = datapath("io", "data", "stata", f"{file}.dta") parsed = self.read_dta(file) @@ -636,10 +649,11 @@ def test_dates_invalid_column(self): written_and_read_again = self.read_dta(path) - modified = original - modified.columns = ["_0"] - modified.index = original.index.astype(np.int32) - tm.assert_frame_equal(written_and_read_again.set_index("index"), modified) + expected = original.copy() + expected.columns = ["_0"] + expected.index = original.index.astype(np.int32) + expected["_0"] = expected["_0"].astype("M8[ms]") + tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) def test_105(self, datapath): # Data obtained from: @@ -684,7 +698,9 @@ def test_date_export_formats(self): [expected_values], index=pd.Index([0], dtype=np.int32, name="index"), columns=columns, + dtype="M8[s]", ) + expected["tc"] = expected["tc"].astype("M8[ms]") with tm.ensure_clean() as path: original.to_stata(path, convert_dates=conversions) @@ -881,6 +897,14 @@ def test_big_dates(self, datapath): expected[5][5] = expected[5][6] = datetime(1678, 1, 1) expected = DataFrame(expected, columns=columns, dtype=object) + expected["date_tc"] = expected["date_tc"].astype("M8[ms]") + expected["date_td"] = expected["date_td"].astype("M8[s]") + expected["date_tm"] = expected["date_tm"].astype("M8[s]") + expected["date_tw"] = expected["date_tw"].astype("M8[s]") + expected["date_tq"] = expected["date_tq"].astype("M8[s]") + expected["date_th"] = expected["date_th"].astype("M8[s]") + expected["date_ty"] = expected["date_ty"].astype("M8[s]") + parsed_115 = read_stata(datapath("io", "data", "stata", "stata9_115.dta")) parsed_117 = read_stata(datapath("io", "data", "stata", "stata9_117.dta")) tm.assert_frame_equal(expected, parsed_115, check_datetimelike_compat=True) @@ -906,9 +930,7 @@ def test_dtype_conversion(self, 
@@ -906,9 +930,7 @@ def test_dtype_conversion(self, datapath):
         expected["long_"] = expected["long_"].astype(np.int32)
         expected["float_"] = expected["float_"].astype(np.float32)
         expected["double_"] = expected["double_"].astype(np.float64)
-        expected["date_td"] = expected["date_td"].apply(
-            datetime.strptime, args=("%Y-%m-%d",)
-        )
+        expected["date_td"] = expected["date_td"].astype("M8[s]")
 
         no_conversion = read_stata(
             datapath("io", "data", "stata", "stata6_117.dta"), convert_dates=True
@@ -922,12 +944,10 @@ def test_dtype_conversion(self, datapath):
         )
 
         # read_csv types are the same
-        expected = self.read_csv(datapath("io", "data", "stata", "stata6.csv"))
-        expected["date_td"] = expected["date_td"].apply(
-            datetime.strptime, args=("%Y-%m-%d",)
-        )
+        expected2 = self.read_csv(datapath("io", "data", "stata", "stata6.csv"))
+        expected2["date_td"] = expected["date_td"]
 
-        tm.assert_frame_equal(expected, conversion)
+        tm.assert_frame_equal(expected2, conversion)
 
     def test_drop_column(self, datapath):
         expected = self.read_csv(datapath("io", "data", "stata", "stata6.csv"))
@@ -1392,10 +1412,14 @@ def test_default_date_conversion(self):
             }
         )
 
+        expected = original[:]
+        # "tc" for convert_dates below stores with "ms" resolution
+        expected["dates"] = expected["dates"].astype("M8[ms]")
+
         with tm.ensure_clean() as path:
             original.to_stata(path, write_index=False)
             reread = read_stata(path, convert_dates=True)
-            tm.assert_frame_equal(original, reread)
+            tm.assert_frame_equal(expected, reread)
 
             original.to_stata(path, write_index=False, convert_dates={"dates": "tc"})
             direct = read_stata(path, convert_dates=True)
@@ -1655,11 +1679,14 @@ def test_writer_117(self):
                 version=117,
             )
             written_and_read_again = self.read_dta(path)
-            # original.index is np.int32, read index is np.int64
+
+            expected = original[:]
+            # "tc" for convert_dates means we store with "ms" resolution
+            expected["datetime"] = expected["datetime"].astype("M8[ms]")
+
             tm.assert_frame_equal(
                 written_and_read_again.set_index("index"),
-                original,
-                check_index_type=False,
+                expected,
             )
             tm.assert_frame_equal(original, copy)
@@ -1932,7 +1959,8 @@ def test_read_write_ea_dtypes(self, dtype_backend):
             "b": ["a", "b", "c"],
             "c": [1.0, 0, np.nan],
             "d": [1.5, 2.5, 3.5],
-            "e": pd.date_range("2020-12-31", periods=3, freq="D"),
+            # stata stores with ms unit, so unit does not round-trip exactly
+            "e": pd.date_range("2020-12-31", periods=3, freq="D", unit="ms"),
         },
         index=pd.Index([0, 1, 2], name="index", dtype=np.int32),
     )
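Why the last hunk pins `unit="ms"`: Stata's `%tc` stores milliseconds, so a nanosecond-resolution column cannot round-trip its unit exactly; building the expected column at `ms` resolution up front keeps the comparison exact. A small illustration of the `unit` keyword:

```python
import pandas as pd

# Same timestamps, different storage resolution for the resulting index.
e_ns = pd.date_range("2020-12-31", periods=3, freq="D")
e_ms = pd.date_range("2020-12-31", periods=3, freq="D", unit="ms")
print(e_ns.dtype, e_ms.dtype)  # datetime64[ns] datetime64[ms]
```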