Skip to content

Commit 024a503

Browse files
jbrockmendel authored and pmhatre1 committed
ENH: read_stata return non-nano (pandas-dev#55642)
* ENH: read_stata return non-nano
* GH ref
* mypy fixup
* update doctest
* simplify
* avoid Series.view
* dont go through Series
* move whatsnew
* remove outdated whatsnew
1 parent 7c6414d commit 024a503

File tree

3 files changed

+110
-119
lines changed

3 files changed

+110
-119
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ enhancement2
2929
Other enhancements
3030
^^^^^^^^^^^^^^^^^^
3131
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
32+
- :func:`read_stata` now returns ``datetime64`` resolutions that better match those natively stored in the Stata format (:issue:`55642`)
3233
-
3334

3435
.. ---------------------------------------------------------------------------

pandas/io/stata.py

+54-92
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,6 @@
6262
Timestamp,
6363
isna,
6464
to_datetime,
65-
to_timedelta,
6665
)
6766
from pandas.core.frame import DataFrame
6867
from pandas.core.indexes.base import Index
@@ -232,6 +231,7 @@
232231

233232

234233
stata_epoch: Final = datetime(1960, 1, 1)
234+
unix_epoch: Final = datetime(1970, 1, 1)
235235

236236

237237
def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series:
@@ -256,7 +256,7 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series:
256256
>>> dates = pd.Series([52])
257257
>>> _stata_elapsed_date_to_datetime_vec(dates , "%tw")
258258
0 1961-01-01
259-
dtype: datetime64[ns]
259+
dtype: datetime64[s]
260260
261261
Notes
262262
-----
@@ -280,76 +280,51 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series:
280280
date - ty
281281
years since 0000
282282
"""
283-
MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year
284-
MAX_DAY_DELTA = (Timestamp.max - datetime(1960, 1, 1)).days
285-
MIN_DAY_DELTA = (Timestamp.min - datetime(1960, 1, 1)).days
286-
MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000
287-
MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000
288283

289-
def convert_year_month_safe(year, month) -> Series:
290-
"""
291-
Convert year and month to datetimes, using pandas vectorized versions
292-
when the date range falls within the range supported by pandas.
293-
Otherwise it falls back to a slower but more robust method
294-
using datetime.
295-
"""
296-
if year.max() < MAX_YEAR and year.min() > MIN_YEAR:
297-
return to_datetime(100 * year + month, format="%Y%m")
298-
else:
299-
index = getattr(year, "index", None)
300-
return Series([datetime(y, m, 1) for y, m in zip(year, month)], index=index)
301-
302-
def convert_year_days_safe(year, days) -> Series:
303-
"""
304-
Converts year (e.g. 1999) and days since the start of the year to a
305-
datetime or datetime64 Series
306-
"""
307-
if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:
308-
return to_datetime(year, format="%Y") + to_timedelta(days, unit="d")
309-
else:
310-
index = getattr(year, "index", None)
311-
value = [
312-
datetime(y, 1, 1) + timedelta(days=int(d)) for y, d in zip(year, days)
313-
]
314-
return Series(value, index=index)
284+
if fmt.startswith(("%tc", "tc")):
285+
# Delta ms relative to base
286+
td = np.timedelta64(stata_epoch - unix_epoch, "ms")
287+
res = np.array(dates._values, dtype="M8[ms]") + td
288+
return Series(res, index=dates.index)
315289

316-
def convert_delta_safe(base, deltas, unit) -> Series:
317-
"""
318-
Convert base dates and deltas to datetimes, using pandas vectorized
319-
versions if the deltas satisfy restrictions required to be expressed
320-
as dates in pandas.
321-
"""
322-
index = getattr(deltas, "index", None)
323-
if unit == "d":
324-
if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:
325-
values = [base + timedelta(days=int(d)) for d in deltas]
326-
return Series(values, index=index)
327-
elif unit == "ms":
328-
if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:
329-
values = [
330-
base + timedelta(microseconds=(int(d) * 1000)) for d in deltas
331-
]
332-
return Series(values, index=index)
333-
else:
334-
raise ValueError("format not understood")
335-
base = to_datetime(base)
336-
deltas = to_timedelta(deltas, unit=unit)
337-
return base + deltas
290+
elif fmt.startswith(("%td", "td", "%d", "d")):
291+
# Delta days relative to base
292+
td = np.timedelta64(stata_epoch - unix_epoch, "D")
293+
res = np.array(dates._values, dtype="M8[D]") + td
294+
return Series(res, index=dates.index)
295+
296+
elif fmt.startswith(("%tm", "tm")):
297+
# Delta months relative to base
298+
ordinals = dates + (stata_epoch.year - unix_epoch.year) * 12
299+
res = np.array(ordinals, dtype="M8[M]").astype("M8[s]")
300+
return Series(res, index=dates.index)
301+
302+
elif fmt.startswith(("%tq", "tq")):
303+
# Delta quarters relative to base
304+
ordinals = dates + (stata_epoch.year - unix_epoch.year) * 4
305+
res = np.array(ordinals, dtype="M8[3M]").astype("M8[s]")
306+
return Series(res, index=dates.index)
307+
308+
elif fmt.startswith(("%th", "th")):
309+
# Delta half-years relative to base
310+
ordinals = dates + (stata_epoch.year - unix_epoch.year) * 2
311+
res = np.array(ordinals, dtype="M8[6M]").astype("M8[s]")
312+
return Series(res, index=dates.index)
313+
314+
elif fmt.startswith(("%ty", "ty")):
315+
# Years -- not delta
316+
ordinals = dates - 1970
317+
res = np.array(ordinals, dtype="M8[Y]").astype("M8[s]")
318+
return Series(res, index=dates.index)
338319

339-
# TODO(non-nano): If/when pandas supports more than datetime64[ns], this
340-
# should be improved to use correct range, e.g. datetime[Y] for yearly
341320
bad_locs = np.isnan(dates)
342321
has_bad_values = False
343322
if bad_locs.any():
344323
has_bad_values = True
345324
dates._values[bad_locs] = 1.0 # Replace with NaT
346325
dates = dates.astype(np.int64)
347326

348-
if fmt.startswith(("%tc", "tc")): # Delta ms relative to base
349-
base = stata_epoch
350-
ms = dates
351-
conv_dates = convert_delta_safe(base, ms, "ms")
352-
elif fmt.startswith(("%tC", "tC")):
327+
if fmt.startswith(("%tC", "tC")):
353328
warnings.warn(
354329
"Encountered %tC format. Leaving in Stata Internal Format.",
355330
stacklevel=find_stack_level(),
@@ -358,33 +333,18 @@ def convert_delta_safe(base, deltas, unit) -> Series:
358333
if has_bad_values:
359334
conv_dates[bad_locs] = NaT
360335
return conv_dates
361-
# Delta days relative to base
362-
elif fmt.startswith(("%td", "td", "%d", "d")):
363-
base = stata_epoch
364-
days = dates
365-
conv_dates = convert_delta_safe(base, days, "d")
366336
# does not count leap days - 7 days is a week.
367337
# 52nd week may have more than 7 days
368338
elif fmt.startswith(("%tw", "tw")):
369339
year = stata_epoch.year + dates // 52
370340
days = (dates % 52) * 7
371-
conv_dates = convert_year_days_safe(year, days)
372-
elif fmt.startswith(("%tm", "tm")): # Delta months relative to base
373-
year = stata_epoch.year + dates // 12
374-
month = (dates % 12) + 1
375-
conv_dates = convert_year_month_safe(year, month)
376-
elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base
377-
year = stata_epoch.year + dates // 4
378-
quarter_month = (dates % 4) * 3 + 1
379-
conv_dates = convert_year_month_safe(year, quarter_month)
380-
elif fmt.startswith(("%th", "th")): # Delta half-years relative to base
381-
year = stata_epoch.year + dates // 2
382-
month = (dates % 2) * 6 + 1
383-
conv_dates = convert_year_month_safe(year, month)
384-
elif fmt.startswith(("%ty", "ty")): # Years -- not delta
385-
year = dates
386-
first_month = np.ones_like(dates)
387-
conv_dates = convert_year_month_safe(year, first_month)
341+
per_y = (year - 1970).array.view("Period[Y]")
342+
per_d = per_y.asfreq("D", how="S")
343+
per_d_shifted = per_d + days._values
344+
per_s = per_d_shifted.asfreq("s", how="S")
345+
conv_dates_arr = per_s.view("M8[s]")
346+
conv_dates = Series(conv_dates_arr, index=dates.index)
347+
388348
else:
389349
raise ValueError(f"Date fmt {fmt} not understood")
390350

@@ -409,24 +369,26 @@ def _datetime_to_stata_elapsed_vec(dates: Series, fmt: str) -> Series:
409369
index = dates.index
410370
NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000
411371
US_PER_DAY = NS_PER_DAY / 1000
372+
MS_PER_DAY = NS_PER_DAY / 1_000_000
412373

413374
def parse_dates_safe(
414375
dates: Series, delta: bool = False, year: bool = False, days: bool = False
415376
):
416377
d = {}
417378
if lib.is_np_dtype(dates.dtype, "M"):
418379
if delta:
419-
time_delta = dates - Timestamp(stata_epoch).as_unit("ns")
420-
d["delta"] = time_delta._values.view(np.int64) // 1000 # microseconds
380+
time_delta = dates.dt.as_unit("ms") - Timestamp(stata_epoch).as_unit(
381+
"ms"
382+
)
383+
d["delta"] = time_delta._values.view(np.int64)
421384
if days or year:
422385
date_index = DatetimeIndex(dates)
423386
d["year"] = date_index._data.year
424387
d["month"] = date_index._data.month
425388
if days:
426-
days_in_ns = dates._values.view(np.int64) - to_datetime(
427-
d["year"], format="%Y"
428-
)._values.view(np.int64)
429-
d["days"] = days_in_ns // NS_PER_DAY
389+
year_start = np.asarray(dates).astype("M8[Y]").astype(dates.dtype)
390+
diff = dates - year_start
391+
d["days"] = np.asarray(diff).astype("m8[D]").view("int64")
430392

431393
elif infer_dtype(dates, skipna=False) == "datetime":
432394
if delta:
@@ -466,7 +428,7 @@ def g(x: datetime) -> int:
466428

467429
if fmt in ["%tc", "tc"]:
468430
d = parse_dates_safe(dates, delta=True)
469-
conv_dates = d.delta / 1000
431+
conv_dates = d.delta
470432
elif fmt in ["%tC", "tC"]:
471433
warnings.warn(
472434
"Stata Internal Format tC not supported.",
@@ -475,7 +437,7 @@ def g(x: datetime) -> int:
475437
conv_dates = dates
476438
elif fmt in ["%td", "td"]:
477439
d = parse_dates_safe(dates, delta=True)
478-
conv_dates = d.delta // US_PER_DAY
440+
conv_dates = d.delta // MS_PER_DAY
479441
elif fmt in ["%tw", "tw"]:
480442
d = parse_dates_safe(dates, year=True, days=True)
481443
conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7

0 commit comments

Comments
 (0)