Skip to content

ENH: read_stata return non-nano #55642

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Feb 2, 2024
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ enhancement2
Other enhancements
^^^^^^^^^^^^^^^^^^
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the Stata format (:issue:`55642`)
-

.. ---------------------------------------------------------------------------
Expand Down
146 changes: 54 additions & 92 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@
Timestamp,
isna,
to_datetime,
to_timedelta,
)
from pandas.core.frame import DataFrame
from pandas.core.indexes.base import Index
Expand Down Expand Up @@ -232,6 +231,7 @@


stata_epoch: Final = datetime(1960, 1, 1)
unix_epoch: Final = datetime(1970, 1, 1)


def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series:
Expand All @@ -256,7 +256,7 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series:
>>> dates = pd.Series([52])
>>> _stata_elapsed_date_to_datetime_vec(dates , "%tw")
0 1961-01-01
dtype: datetime64[ns]
dtype: datetime64[s]

Notes
-----
Expand All @@ -280,76 +280,51 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series:
date - ty
years since 0000
"""
MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year
MAX_DAY_DELTA = (Timestamp.max - datetime(1960, 1, 1)).days
MIN_DAY_DELTA = (Timestamp.min - datetime(1960, 1, 1)).days
MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000
MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000

def convert_year_month_safe(year, month) -> Series:
"""
Convert year and month to datetimes, using pandas vectorized versions
when the date range falls within the range supported by pandas.
Otherwise it falls back to a slower but more robust method
using datetime.
"""
if year.max() < MAX_YEAR and year.min() > MIN_YEAR:
return to_datetime(100 * year + month, format="%Y%m")
else:
index = getattr(year, "index", None)
return Series([datetime(y, m, 1) for y, m in zip(year, month)], index=index)

def convert_year_days_safe(year, days) -> Series:
"""
Converts year (e.g. 1999) and days since the start of the year to a
datetime or datetime64 Series
"""
if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:
return to_datetime(year, format="%Y") + to_timedelta(days, unit="d")
else:
index = getattr(year, "index", None)
value = [
datetime(y, 1, 1) + timedelta(days=int(d)) for y, d in zip(year, days)
]
return Series(value, index=index)
if fmt.startswith(("%tc", "tc")):
# Delta ms relative to base
td = np.timedelta64(stata_epoch - unix_epoch, "ms")
res = np.array(dates._values, dtype="M8[ms]") + td
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not very familiar with this part of the code base, but this looks like it would overflow when dates._values are close to the minimum storage value. Does numpy catch that for us or return junk values?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. NumPy does not catch this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bashtage Am I right in thinking Stata doesn't support dates millions of years in the past/future, so we don't need to worry about these overflows?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just did a check of type tc and found that the maximum date it would represent is 31Dec9999 21:41:39

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This value is stored as 253717911699456

return Series(res, index=dates.index)

def convert_delta_safe(base, deltas, unit) -> Series:
"""
Convert base dates and deltas to datetimes, using pandas vectorized
versions if the deltas satisfy restrictions required to be expressed
as dates in pandas.
"""
index = getattr(deltas, "index", None)
if unit == "d":
if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:
values = [base + timedelta(days=int(d)) for d in deltas]
return Series(values, index=index)
elif unit == "ms":
if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:
values = [
base + timedelta(microseconds=(int(d) * 1000)) for d in deltas
]
return Series(values, index=index)
else:
raise ValueError("format not understood")
base = to_datetime(base)
deltas = to_timedelta(deltas, unit=unit)
return base + deltas
elif fmt.startswith(("%td", "td", "%d", "d")):
# Delta days relative to base
td = np.timedelta64(stata_epoch - unix_epoch, "D")
res = np.array(dates._values, dtype="M8[D]") + td
return Series(res, index=dates.index)

elif fmt.startswith(("%tm", "tm")):
# Delta months relative to base
ordinals = dates + (stata_epoch.year - unix_epoch.year) * 12
res = np.array(ordinals, dtype="M8[M]").astype("M8[s]")
return Series(res, index=dates.index)

elif fmt.startswith(("%tq", "tq")):
# Delta quarters relative to base
ordinals = dates + (stata_epoch.year - unix_epoch.year) * 4
res = np.array(ordinals, dtype="M8[3M]").astype("M8[s]")
return Series(res, index=dates.index)

elif fmt.startswith(("%th", "th")):
# Delta half-years relative to base
ordinals = dates + (stata_epoch.year - unix_epoch.year) * 2
res = np.array(ordinals, dtype="M8[6M]").astype("M8[s]")
return Series(res, index=dates.index)

elif fmt.startswith(("%ty", "ty")):
# Years -- not delta
ordinals = dates - 1970
res = np.array(ordinals, dtype="M8[Y]").astype("M8[s]")
return Series(res, index=dates.index)

# TODO(non-nano): If/when pandas supports more than datetime64[ns], this
# should be improved to use correct range, e.g. datetime[Y] for yearly
bad_locs = np.isnan(dates)
has_bad_values = False
if bad_locs.any():
has_bad_values = True
dates._values[bad_locs] = 1.0 # Replace with NaT
dates = dates.astype(np.int64)

if fmt.startswith(("%tc", "tc")): # Delta ms relative to base
base = stata_epoch
ms = dates
conv_dates = convert_delta_safe(base, ms, "ms")
elif fmt.startswith(("%tC", "tC")):
if fmt.startswith(("%tC", "tC")):
warnings.warn(
"Encountered %tC format. Leaving in Stata Internal Format.",
stacklevel=find_stack_level(),
Expand All @@ -358,33 +333,18 @@ def convert_delta_safe(base, deltas, unit) -> Series:
if has_bad_values:
conv_dates[bad_locs] = NaT
return conv_dates
# Delta days relative to base
elif fmt.startswith(("%td", "td", "%d", "d")):
base = stata_epoch
days = dates
conv_dates = convert_delta_safe(base, days, "d")
# does not count leap days - 7 days is a week.
# 52nd week may have more than 7 days
elif fmt.startswith(("%tw", "tw")):
year = stata_epoch.year + dates // 52
days = (dates % 52) * 7
conv_dates = convert_year_days_safe(year, days)
elif fmt.startswith(("%tm", "tm")): # Delta months relative to base
year = stata_epoch.year + dates // 12
month = (dates % 12) + 1
conv_dates = convert_year_month_safe(year, month)
elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base
year = stata_epoch.year + dates // 4
quarter_month = (dates % 4) * 3 + 1
conv_dates = convert_year_month_safe(year, quarter_month)
elif fmt.startswith(("%th", "th")): # Delta half-years relative to base
year = stata_epoch.year + dates // 2
month = (dates % 2) * 6 + 1
conv_dates = convert_year_month_safe(year, month)
elif fmt.startswith(("%ty", "ty")): # Years -- not delta
year = dates
first_month = np.ones_like(dates)
conv_dates = convert_year_month_safe(year, first_month)
per_y = (year - 1970).array.view("Period[Y]")
per_d = per_y.asfreq("D", how="S")
per_d_shifted = per_d + days._values
per_s = per_d_shifted.asfreq("s", how="S")
conv_dates_arr = per_s.view("M8[s]")
conv_dates = Series(conv_dates_arr, index=dates.index)

else:
raise ValueError(f"Date fmt {fmt} not understood")

Expand All @@ -409,24 +369,26 @@ def _datetime_to_stata_elapsed_vec(dates: Series, fmt: str) -> Series:
index = dates.index
NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000
US_PER_DAY = NS_PER_DAY / 1000
MS_PER_DAY = NS_PER_DAY / 1_000_000

def parse_dates_safe(
dates: Series, delta: bool = False, year: bool = False, days: bool = False
):
d = {}
if lib.is_np_dtype(dates.dtype, "M"):
if delta:
time_delta = dates - Timestamp(stata_epoch).as_unit("ns")
d["delta"] = time_delta._values.view(np.int64) // 1000 # microseconds
time_delta = dates.dt.as_unit("ms") - Timestamp(stata_epoch).as_unit(
"ms"
)
d["delta"] = time_delta._values.view(np.int64)
if days or year:
date_index = DatetimeIndex(dates)
d["year"] = date_index._data.year
d["month"] = date_index._data.month
if days:
days_in_ns = dates._values.view(np.int64) - to_datetime(
d["year"], format="%Y"
)._values.view(np.int64)
d["days"] = days_in_ns // NS_PER_DAY
year_start = np.asarray(dates).astype("M8[Y]").astype(dates.dtype)
diff = dates - year_start
d["days"] = np.asarray(diff).astype("m8[D]").view("int64")

elif infer_dtype(dates, skipna=False) == "datetime":
if delta:
Expand Down Expand Up @@ -466,7 +428,7 @@ def g(x: datetime) -> int:

if fmt in ["%tc", "tc"]:
d = parse_dates_safe(dates, delta=True)
conv_dates = d.delta / 1000
conv_dates = d.delta
elif fmt in ["%tC", "tC"]:
warnings.warn(
"Stata Internal Format tC not supported.",
Expand All @@ -475,7 +437,7 @@ def g(x: datetime) -> int:
conv_dates = dates
elif fmt in ["%td", "td"]:
d = parse_dates_safe(dates, delta=True)
conv_dates = d.delta // US_PER_DAY
conv_dates = d.delta // MS_PER_DAY
elif fmt in ["%tw", "tw"]:
d = parse_dates_safe(dates, year=True, days=True)
conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7
Expand Down
Loading