62
62
Timestamp ,
63
63
isna ,
64
64
to_datetime ,
65
- to_timedelta ,
66
65
)
67
66
from pandas .core .frame import DataFrame
68
67
from pandas .core .indexes .base import Index
232
231
233
232
234
233
stata_epoch : Final = datetime (1960 , 1 , 1 )
234
+ unix_epoch : Final = datetime (1970 , 1 , 1 )
235
235
236
236
237
237
def _stata_elapsed_date_to_datetime_vec (dates : Series , fmt : str ) -> Series :
@@ -256,7 +256,7 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series:
256
256
>>> dates = pd.Series([52])
257
257
>>> _stata_elapsed_date_to_datetime_vec(dates , "%tw")
258
258
0 1961-01-01
259
- dtype: datetime64[ns ]
259
+ dtype: datetime64[s ]
260
260
261
261
Notes
262
262
-----
@@ -280,76 +280,51 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series:
280
280
date - ty
281
281
years since 0000
282
282
"""
283
- MIN_YEAR , MAX_YEAR = Timestamp .min .year , Timestamp .max .year
284
- MAX_DAY_DELTA = (Timestamp .max - datetime (1960 , 1 , 1 )).days
285
- MIN_DAY_DELTA = (Timestamp .min - datetime (1960 , 1 , 1 )).days
286
- MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000
287
- MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000
288
283
289
- def convert_year_month_safe (year , month ) -> Series :
290
- """
291
- Convert year and month to datetimes, using pandas vectorized versions
292
- when the date range falls within the range supported by pandas.
293
- Otherwise it falls back to a slower but more robust method
294
- using datetime.
295
- """
296
- if year .max () < MAX_YEAR and year .min () > MIN_YEAR :
297
- return to_datetime (100 * year + month , format = "%Y%m" )
298
- else :
299
- index = getattr (year , "index" , None )
300
- return Series ([datetime (y , m , 1 ) for y , m in zip (year , month )], index = index )
301
-
302
- def convert_year_days_safe (year , days ) -> Series :
303
- """
304
- Converts year (e.g. 1999) and days since the start of the year to a
305
- datetime or datetime64 Series
306
- """
307
- if year .max () < (MAX_YEAR - 1 ) and year .min () > MIN_YEAR :
308
- return to_datetime (year , format = "%Y" ) + to_timedelta (days , unit = "d" )
309
- else :
310
- index = getattr (year , "index" , None )
311
- value = [
312
- datetime (y , 1 , 1 ) + timedelta (days = int (d )) for y , d in zip (year , days )
313
- ]
314
- return Series (value , index = index )
284
+ if fmt .startswith (("%tc" , "tc" )):
285
+ # Delta ms relative to base
286
+ td = np .timedelta64 (stata_epoch - unix_epoch , "ms" )
287
+ res = np .array (dates ._values , dtype = "M8[ms]" ) + td
288
+ return Series (res , index = dates .index )
315
289
316
- def convert_delta_safe (base , deltas , unit ) -> Series :
317
- """
318
- Convert base dates and deltas to datetimes, using pandas vectorized
319
- versions if the deltas satisfy restrictions required to be expressed
320
- as dates in pandas.
321
- """
322
- index = getattr (deltas , "index" , None )
323
- if unit == "d" :
324
- if deltas .max () > MAX_DAY_DELTA or deltas .min () < MIN_DAY_DELTA :
325
- values = [base + timedelta (days = int (d )) for d in deltas ]
326
- return Series (values , index = index )
327
- elif unit == "ms" :
328
- if deltas .max () > MAX_MS_DELTA or deltas .min () < MIN_MS_DELTA :
329
- values = [
330
- base + timedelta (microseconds = (int (d ) * 1000 )) for d in deltas
331
- ]
332
- return Series (values , index = index )
333
- else :
334
- raise ValueError ("format not understood" )
335
- base = to_datetime (base )
336
- deltas = to_timedelta (deltas , unit = unit )
337
- return base + deltas
290
+ elif fmt .startswith (("%td" , "td" , "%d" , "d" )):
291
+ # Delta days relative to base
292
+ td = np .timedelta64 (stata_epoch - unix_epoch , "D" )
293
+ res = np .array (dates ._values , dtype = "M8[D]" ) + td
294
+ return Series (res , index = dates .index )
295
+
296
+ elif fmt .startswith (("%tm" , "tm" )):
297
+ # Delta months relative to base
298
+ ordinals = dates + (stata_epoch .year - unix_epoch .year ) * 12
299
+ res = np .array (ordinals , dtype = "M8[M]" ).astype ("M8[s]" )
300
+ return Series (res , index = dates .index )
301
+
302
+ elif fmt .startswith (("%tq" , "tq" )):
303
+ # Delta quarters relative to base
304
+ ordinals = dates + (stata_epoch .year - unix_epoch .year ) * 4
305
+ res = np .array (ordinals , dtype = "M8[3M]" ).astype ("M8[s]" )
306
+ return Series (res , index = dates .index )
307
+
308
+ elif fmt .startswith (("%th" , "th" )):
309
+ # Delta half-years relative to base
310
+ ordinals = dates + (stata_epoch .year - unix_epoch .year ) * 2
311
+ res = np .array (ordinals , dtype = "M8[6M]" ).astype ("M8[s]" )
312
+ return Series (res , index = dates .index )
313
+
314
+ elif fmt .startswith (("%ty" , "ty" )):
315
+ # Years -- not delta
316
+ ordinals = dates - 1970
317
+ res = np .array (ordinals , dtype = "M8[Y]" ).astype ("M8[s]" )
318
+ return Series (res , index = dates .index )
338
319
339
- # TODO(non-nano): If/when pandas supports more than datetime64[ns], this
340
- # should be improved to use correct range, e.g. datetime[Y] for yearly
341
320
bad_locs = np .isnan (dates )
342
321
has_bad_values = False
343
322
if bad_locs .any ():
344
323
has_bad_values = True
345
324
dates ._values [bad_locs ] = 1.0 # Replace with NaT
346
325
dates = dates .astype (np .int64 )
347
326
348
- if fmt .startswith (("%tc" , "tc" )): # Delta ms relative to base
349
- base = stata_epoch
350
- ms = dates
351
- conv_dates = convert_delta_safe (base , ms , "ms" )
352
- elif fmt .startswith (("%tC" , "tC" )):
327
+ if fmt .startswith (("%tC" , "tC" )):
353
328
warnings .warn (
354
329
"Encountered %tC format. Leaving in Stata Internal Format." ,
355
330
stacklevel = find_stack_level (),
@@ -358,33 +333,18 @@ def convert_delta_safe(base, deltas, unit) -> Series:
358
333
if has_bad_values :
359
334
conv_dates [bad_locs ] = NaT
360
335
return conv_dates
361
- # Delta days relative to base
362
- elif fmt .startswith (("%td" , "td" , "%d" , "d" )):
363
- base = stata_epoch
364
- days = dates
365
- conv_dates = convert_delta_safe (base , days , "d" )
366
336
# does not count leap days - 7 days is a week.
367
337
# 52nd week may have more than 7 days
368
338
elif fmt .startswith (("%tw" , "tw" )):
369
339
year = stata_epoch .year + dates // 52
370
340
days = (dates % 52 ) * 7
371
- conv_dates = convert_year_days_safe (year , days )
372
- elif fmt .startswith (("%tm" , "tm" )): # Delta months relative to base
373
- year = stata_epoch .year + dates // 12
374
- month = (dates % 12 ) + 1
375
- conv_dates = convert_year_month_safe (year , month )
376
- elif fmt .startswith (("%tq" , "tq" )): # Delta quarters relative to base
377
- year = stata_epoch .year + dates // 4
378
- quarter_month = (dates % 4 ) * 3 + 1
379
- conv_dates = convert_year_month_safe (year , quarter_month )
380
- elif fmt .startswith (("%th" , "th" )): # Delta half-years relative to base
381
- year = stata_epoch .year + dates // 2
382
- month = (dates % 2 ) * 6 + 1
383
- conv_dates = convert_year_month_safe (year , month )
384
- elif fmt .startswith (("%ty" , "ty" )): # Years -- not delta
385
- year = dates
386
- first_month = np .ones_like (dates )
387
- conv_dates = convert_year_month_safe (year , first_month )
341
+ per_y = (year - 1970 ).array .view ("Period[Y]" )
342
+ per_d = per_y .asfreq ("D" , how = "S" )
343
+ per_d_shifted = per_d + days ._values
344
+ per_s = per_d_shifted .asfreq ("s" , how = "S" )
345
+ conv_dates_arr = per_s .view ("M8[s]" )
346
+ conv_dates = Series (conv_dates_arr , index = dates .index )
347
+
388
348
else :
389
349
raise ValueError (f"Date fmt { fmt } not understood" )
390
350
@@ -409,24 +369,26 @@ def _datetime_to_stata_elapsed_vec(dates: Series, fmt: str) -> Series:
409
369
index = dates .index
410
370
NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000
411
371
US_PER_DAY = NS_PER_DAY / 1000
372
+ MS_PER_DAY = NS_PER_DAY / 1_000_000
412
373
413
374
def parse_dates_safe (
414
375
dates : Series , delta : bool = False , year : bool = False , days : bool = False
415
376
):
416
377
d = {}
417
378
if lib .is_np_dtype (dates .dtype , "M" ):
418
379
if delta :
419
- time_delta = dates - Timestamp (stata_epoch ).as_unit ("ns" )
420
- d ["delta" ] = time_delta ._values .view (np .int64 ) // 1000 # microseconds
380
+ time_delta = dates .dt .as_unit ("ms" ) - Timestamp (stata_epoch ).as_unit (
381
+ "ms"
382
+ )
383
+ d ["delta" ] = time_delta ._values .view (np .int64 )
421
384
if days or year :
422
385
date_index = DatetimeIndex (dates )
423
386
d ["year" ] = date_index ._data .year
424
387
d ["month" ] = date_index ._data .month
425
388
if days :
426
- days_in_ns = dates ._values .view (np .int64 ) - to_datetime (
427
- d ["year" ], format = "%Y"
428
- )._values .view (np .int64 )
429
- d ["days" ] = days_in_ns // NS_PER_DAY
389
+ year_start = np .asarray (dates ).astype ("M8[Y]" ).astype (dates .dtype )
390
+ diff = dates - year_start
391
+ d ["days" ] = np .asarray (diff ).astype ("m8[D]" ).view ("int64" )
430
392
431
393
elif infer_dtype (dates , skipna = False ) == "datetime" :
432
394
if delta :
@@ -466,7 +428,7 @@ def g(x: datetime) -> int:
466
428
467
429
if fmt in ["%tc" , "tc" ]:
468
430
d = parse_dates_safe (dates , delta = True )
469
- conv_dates = d .delta / 1000
431
+ conv_dates = d .delta
470
432
elif fmt in ["%tC" , "tC" ]:
471
433
warnings .warn (
472
434
"Stata Internal Format tC not supported." ,
@@ -475,7 +437,7 @@ def g(x: datetime) -> int:
475
437
conv_dates = dates
476
438
elif fmt in ["%td" , "td" ]:
477
439
d = parse_dates_safe (dates , delta = True )
478
- conv_dates = d .delta // US_PER_DAY
440
+ conv_dates = d .delta // MS_PER_DAY
479
441
elif fmt in ["%tw" , "tw" ]:
480
442
d = parse_dates_safe (dates , year = True , days = True )
481
443
conv_dates = 52 * (d .year - stata_epoch .year ) + d .days // 7
0 commit comments