Skip to content

Commit 270a10b

Browse files
committed
ENH: more datetime64 integration in core data algorithms per #996, close #1035
1 parent 20f4527 commit 270a10b

11 files changed

+201
-74
lines changed

pandas/core/common.py

+80-19
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def isnull(obj):
6666
result = Series(result, index=obj.index, copy=False)
6767
elif obj.dtype == np.datetime64:
6868
# this is the NaT pattern
69-
result = obj.ravel().view('i8') == 0x8000000000000000
69+
result = obj.view('i8') == lib.NaT
7070
else:
7171
result = -np.isfinite(obj)
7272
return result
@@ -106,43 +106,59 @@ def _unpickle_array(bytes):
106106
arr = read_array(BytesIO(bytes))
107107
return arr
108108

109-
def _take_1d_bool(arr, indexer, out, fill_value=np.nan):
110-
view = arr.view(np.uint8)
111-
outview = out.view(np.uint8)
109+
def _take_1d_datetime(arr, indexer, out, fill_value=np.nan):
    """
    Take from a 1-d array through its int64 view, writing into ``out``.

    Parameters
    ----------
    arr : ndarray
        Source values; must be viewable as int64 (presumably datetime64).
    indexer : ndarray
        Positions to take, forwarded unchanged to the kernel.
    out : ndarray
        Destination buffer, filled in place through its int64 view.
    fill_value : scalar, default np.nan
        Fill value forwarded to the kernel. A NaN is replaced by ``lib.NaT``.

    Returns
    -------
    None
    """
    # NaN cannot live in an int64 buffer; use the NaT sentinel instead, the
    # same override the 'datetime64[us]' entry of _take1d_dict applies.
    try:
        if np.isnan(fill_value):
            fill_value = lib.NaT
    except TypeError:
        # Non-numeric fill values are passed through untouched.
        pass

    view = arr.view(np.int64)
    outview = out.view(np.int64)
    # The data is int64-backed, so the int64 kernel (not the uint8 "bool"
    # kernel) has to be used.
    lib.take_1d_int64(view, indexer, outview, fill_value=fill_value)
113113

114-
def _take_2d_axis0_bool(arr, indexer, out, fill_value=np.nan):
115-
view = arr.view(np.uint8)
116-
outview = out.view(np.uint8)
117-
lib.take_2d_axis0_bool(view, indexer, outview, fill_value=fill_value)
114+
def _take_2d_axis0_datetime(arr, indexer, out, fill_value=np.nan):
    """
    Take along axis 0 of a 2-d array through its int64 view, into ``out``.

    Parameters
    ----------
    arr : ndarray
        Source values; must be viewable as int64 (presumably datetime64).
    indexer : ndarray
        Positions to take, forwarded unchanged to the kernel.
    out : ndarray
        Destination buffer, filled in place through its int64 view.
    fill_value : scalar, default np.nan
        Fill value forwarded to the kernel. A NaN is replaced by ``lib.NaT``.

    Returns
    -------
    None
    """
    # NaN cannot live in an int64 buffer; use the NaT sentinel instead, the
    # same override the 'datetime64[us]' entry of _take2d_axis0_dict applies.
    try:
        if np.isnan(fill_value):
            fill_value = lib.NaT
    except TypeError:
        # Non-numeric fill values are passed through untouched.
        pass

    view = arr.view(np.int64)
    outview = out.view(np.int64)
    # Use the 2-d axis-0 int64 kernel; the 1-d bool kernel neither matches
    # the dimensionality nor the item size of the views.
    lib.take_2d_axis0_int64(view, indexer, outview, fill_value=fill_value)
118118

119-
def _take_2d_axis1_bool(arr, indexer, out, fill_value=np.nan):
119+
def _take_2d_axis1_datetime(arr, indexer, out, fill_value=np.nan):
    """
    Take along axis 1 of a 2-d array through its int64 view, into ``out``.

    Parameters
    ----------
    arr : ndarray
        Source values; must be viewable as int64 (presumably datetime64).
    indexer : ndarray
        Positions to take, forwarded unchanged to the kernel.
    out : ndarray
        Destination buffer, filled in place through its int64 view.
    fill_value : scalar, default np.nan
        Fill value forwarded to the kernel. A NaN is replaced by ``lib.NaT``.

    Returns
    -------
    None
    """
    # NaN cannot live in an int64 buffer; use the NaT sentinel instead, the
    # same override the 'datetime64[us]' entry of _take2d_axis1_dict applies.
    try:
        if np.isnan(fill_value):
            fill_value = lib.NaT
    except TypeError:
        # Non-numeric fill values are passed through untouched.
        pass

    # View as int64 (not uint8) so whole 8-byte elements are moved, matching
    # the two sibling datetime helpers.
    view = arr.view(np.int64)
    outview = out.view(np.int64)
    lib.take_2d_axis1_int64(view, indexer, outview, fill_value=fill_value)
123+
124+
def _view_wrapper(f, wrap_dtype, na_override=None):
125+
def wrapper(arr, indexer, out, fill_value=np.nan):
126+
if na_override is not None and np.isnan(fill_value):
127+
fill_value = na_override
128+
view = arr.view(wrap_dtype)
129+
outview = out.view(wrap_dtype)
130+
f(view, indexer, outview, fill_value=fill_value)
131+
return wrapper
132+
123133

124134
_take1d_dict = {
125135
'float64' : lib.take_1d_float64,
126136
'int32' : lib.take_1d_int32,
127137
'int64' : lib.take_1d_int64,
128138
'object' : lib.take_1d_object,
129-
'bool' : _take_1d_bool
139+
'bool' : _view_wrapper(lib.take_1d_bool, np.uint8),
140+
'datetime64[us]' : _view_wrapper(lib.take_1d_int64, np.int64,
141+
na_override=lib.NaT),
130142
}
131143

132144
_take2d_axis0_dict = {
133145
'float64' : lib.take_2d_axis0_float64,
134146
'int32' : lib.take_2d_axis0_int32,
135147
'int64' : lib.take_2d_axis0_int64,
136148
'object' : lib.take_2d_axis0_object,
137-
'bool' : _take_2d_axis0_bool
149+
'bool' : _view_wrapper(lib.take_2d_axis0_bool, np.uint8),
150+
'datetime64[us]' : _view_wrapper(lib.take_2d_axis0_int64, np.int64,
151+
na_override=lib.NaT),
138152
}
139153

140154
_take2d_axis1_dict = {
141155
'float64' : lib.take_2d_axis1_float64,
142156
'int32' : lib.take_2d_axis1_int32,
143157
'int64' : lib.take_2d_axis1_int64,
144158
'object' : lib.take_2d_axis1_object,
145-
'bool' : _take_2d_axis1_bool
159+
'bool' : _view_wrapper(lib.take_2d_axis1_bool, np.uint8),
160+
'datetime64[us]' : _view_wrapper(lib.take_2d_axis1_int64, np.int64,
161+
na_override=lib.NaT),
146162
}
147163

148164
def _get_take2d_function(dtype_str, axis=0):
@@ -164,14 +180,13 @@ def take_1d(arr, indexer, out=None, fill_value=np.nan):
164180
indexer = np.array(indexer, dtype=np.int32)
165181

166182
indexer = _ensure_int32(indexer)
167-
168183
out_passed = out is not None
184+
take_f = _take1d_dict.get(dtype_str)
169185

170186
if dtype_str in ('int32', 'int64', 'bool'):
171187
try:
172188
if out is None:
173189
out = np.empty(n, dtype=arr.dtype)
174-
take_f = _take1d_dict[dtype_str]
175190
take_f(arr, indexer, out=out, fill_value=fill_value)
176191
except ValueError:
177192
mask = indexer == -1
@@ -186,10 +201,9 @@ def take_1d(arr, indexer, out=None, fill_value=np.nan):
186201
out.dtype)
187202
out = _maybe_upcast(out)
188203
np.putmask(out, mask, fill_value)
189-
elif dtype_str in ('float64', 'object'):
204+
elif dtype_str in ('float64', 'object', 'datetime64[us]'):
190205
if out is None:
191206
out = np.empty(n, dtype=arr.dtype)
192-
take_f = _take1d_dict[dtype_str]
193207
take_f(arr, indexer, out=out, fill_value=fill_value)
194208
else:
195209
out = arr.take(indexer, out=out)
@@ -238,7 +252,7 @@ def take_2d(arr, indexer, out=None, mask=None, needs_masking=None, axis=0,
238252
take_f = _get_take2d_function(dtype_str, axis=axis)
239253
take_f(arr, indexer, out=out, fill_value=fill_value)
240254
return out
241-
elif dtype_str in ('float64', 'object'):
255+
elif dtype_str in ('float64', 'object', 'datetime64[us]'):
242256
if out is None:
243257
out = np.empty(out_shape, dtype=arr.dtype)
244258
take_f = _get_take2d_function(dtype_str, axis=axis)
@@ -301,9 +315,22 @@ def _need_upcast(values):
301315
return True
302316
return False
303317

318+
def _interp_wrapper(f, wrap_dtype, na_override=None):
319+
def wrapper(arr, mask, limit=None):
320+
view = arr.view(wrap_dtype)
321+
f(view, mask, limit=limit)
322+
return wrapper
323+
324+
_pad_1d_datetime = _interp_wrapper(lib.pad_inplace_int64, np.int64)
325+
_pad_2d_datetime = _interp_wrapper(lib.pad_2d_inplace_int64, np.int64)
326+
_backfill_1d_datetime = _interp_wrapper(lib.backfill_inplace_int64, np.int64)
327+
_backfill_2d_datetime = _interp_wrapper(lib.backfill_2d_inplace_int64, np.int64)
328+
304329
def pad_1d(values, limit=None):
305330
if is_float_dtype(values):
306331
_method = lib.pad_inplace_float64
332+
elif is_datetime64_dtype(values):
333+
_method = _pad_1d_datetime
307334
elif values.dtype == np.object_:
308335
_method = lib.pad_inplace_object
309336
else: # pragma: no cover
@@ -314,21 +341,48 @@ def pad_1d(values, limit=None):
314341
def backfill_1d(values, limit=None):
    """
    Backward-fill nulls of a 1-d array in place.

    The kernel is chosen from the dtype of ``values`` (float, datetime64 or
    object) and is called with the null mask viewed as uint8 and ``limit``.

    Raises
    ------
    ValueError
        If the dtype is none of the supported kinds.
    """
    if is_float_dtype(values):
        filler = lib.backfill_inplace_float64
    elif is_datetime64_dtype(values):
        filler = _backfill_1d_datetime
    elif values.dtype == np.object_:
        filler = lib.backfill_inplace_object
    else:  # pragma: no cover
        raise ValueError('Invalid dtype for padding')

    null_mask = isnull(values).view(np.uint8)
    filler(values, null_mask, limit=limit)
323352

353+
def pad_2d(values, limit=None):
    """
    Forward-fill ("pad") nulls of a 2-d array in place.

    The kernel is chosen from the dtype of ``values`` (float, datetime64 or
    object) and is called with the null mask viewed as uint8 and ``limit``.

    Raises
    ------
    ValueError
        If the dtype is none of the supported kinds.
    """
    if is_float_dtype(values):
        filler = lib.pad_2d_inplace_float64
    elif is_datetime64_dtype(values):
        filler = _pad_2d_datetime
    elif values.dtype == np.object_:
        filler = lib.pad_2d_inplace_object
    else:  # pragma: no cover
        raise ValueError('Invalid dtype for padding')

    null_mask = isnull(values).view(np.uint8)
    filler(values, null_mask, limit=limit)
364+
365+
def backfill_2d(values, limit=None):
    """
    Backward-fill nulls of a 2-d array in place.

    The kernel is chosen from the dtype of ``values`` (float, datetime64 or
    object) and is called with the null mask viewed as uint8 and ``limit``.

    Raises
    ------
    ValueError
        If the dtype is none of the supported kinds.
    """
    if is_float_dtype(values):
        filler = lib.backfill_2d_inplace_float64
    elif is_datetime64_dtype(values):
        filler = _backfill_2d_datetime
    elif values.dtype == np.object_:
        filler = lib.backfill_2d_inplace_object
    else:  # pragma: no cover
        raise ValueError('Invalid dtype for padding')

    null_mask = isnull(values).view(np.uint8)
    filler(values, null_mask, limit=limit)
376+
377+
324378
def _consensus_name_attr(objs):
325379
name = objs[0].name
326380
for obj in objs[1:]:
327381
if obj.name != name:
328382
return None
329383
return name
330384

331-
#-------------------------------------------------------------------------------
385+
#----------------------------------------------------------------------
332386
# Lots of little utilities
333387

334388
def _infer_dtype(value):
@@ -579,6 +633,13 @@ def is_integer_dtype(arr_or_dtype):
579633
return (issubclass(tipo, np.integer) and not
580634
issubclass(tipo, np.datetime64))
581635

636+
def is_datetime64_dtype(arr_or_dtype):
    """
    Report whether a dtype, or the dtype of an array-like, is datetime64.

    ``arr_or_dtype`` is either an ``np.dtype`` instance or any object that
    exposes a ``dtype`` attribute.
    """
    if isinstance(arr_or_dtype, np.dtype):
        dtype = arr_or_dtype
    else:
        dtype = arr_or_dtype.dtype
    return issubclass(dtype.type, np.datetime64)
642+
582643
def is_float_dtype(arr_or_dtype):
583644
if isinstance(arr_or_dtype, np.dtype):
584645
tipo = arr_or_dtype.type

pandas/core/format.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,8 @@ def format_array(values, formatter, float_format=None, na_rep='NaN',
413413
fmt_klass = FloatArrayFormatter
414414
elif com.is_integer_dtype(values.dtype):
415415
fmt_klass = IntArrayFormatter
416+
elif com.is_datetime64_dtype(values.dtype):
417+
fmt_klass = Datetime64Formatter
416418
else:
417419
fmt_klass = GenericArrayFormatter
418420

@@ -549,6 +551,23 @@ def get_result(self):
549551
return _make_fixed_width(fmt_values, self.justify)
550552

551553

554+
class Datetime64Formatter(GenericArrayFormatter):
    """Array formatter selected by format_array for datetime64 values."""

    def get_result(self):
        """
        Format every element of ``self.values`` as a string.

        ``self.formatter`` is used when it is truthy; otherwise nulls are
        rendered as 'NaT' and all other values through ``str``. The strings
        are then handed to ``_make_fixed_width`` with ``self.justify``.
        """
        def _default(x):
            return 'NaT' if isnull(x) else str(x)

        render = self.formatter or _default
        fmt_values = [render(x) for x in self.values]

        return _make_fixed_width(fmt_values, self.justify)
569+
570+
552571
def _make_fixed_width(strings, justify='right'):
553572
if len(strings) == 0:
554573
return strings
@@ -609,7 +628,7 @@ def _has_names(index):
609628

610629
def set_printoptions(precision=None, column_space=None, max_rows=None,
611630
max_columns=None, colheader_justify='right',
612-
notebook_repr_html=None,
631+
notebook_repr_html=None,
613632
date_dayfirst=None, date_yearfirst=None):
614633
"""
615634
Alter default behavior of DataFrame.toString

pandas/core/frame.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -2457,8 +2457,6 @@ def fillna(self, value=None, method='pad', axis=0, inplace=False,
24572457
-------
24582458
filled : DataFrame
24592459
"""
2460-
from pandas.core.internals import FloatBlock, ObjectBlock
2461-
24622460
self._consolidate_inplace()
24632461

24642462
if value is None:
@@ -2468,7 +2466,7 @@ def fillna(self, value=None, method='pad', axis=0, inplace=False,
24682466
new_blocks = []
24692467
method = com._clean_fill_method(method)
24702468
for block in self._data.blocks:
2471-
if isinstance(block, (FloatBlock, ObjectBlock)):
2469+
if block._can_hold_na:
24722470
newb = block.interpolate(method, axis=axis,
24732471
limit=limit, inplace=inplace)
24742472
else:

pandas/core/internals.py

+3-24
Original file line numberDiff line numberDiff line change
@@ -225,33 +225,12 @@ def interpolate(self, method='pad', axis=0, inplace=False, limit=None):
225225
transf = (lambda x: x) if axis == 0 else (lambda x: x.T)
226226

227227
if method == 'pad':
228-
_pad(transf(values), limit=limit)
228+
com.pad_2d(transf(values), limit=limit)
229229
else:
230-
_backfill(transf(values), limit=limit)
230+
com.backfill_2d(transf(values), limit=limit)
231231

232232
return make_block(values, self.items, self.ref_items)
233233

234-
def _pad(values, limit=None):
235-
if com.is_float_dtype(values):
236-
_method = lib.pad_2d_inplace_float64
237-
elif values.dtype == np.object_:
238-
_method = lib.pad_2d_inplace_object
239-
else: # pragma: no cover
240-
raise ValueError('Invalid dtype for padding')
241-
242-
_method(values, com.isnull(values).view(np.uint8),
243-
limit=limit)
244-
245-
def _backfill(values, limit=None):
246-
if com.is_float_dtype(values):
247-
_method = lib.backfill_2d_inplace_float64
248-
elif values.dtype == np.object_:
249-
_method = lib.backfill_2d_inplace_object
250-
else: # pragma: no cover
251-
raise ValueError('Invalid dtype for padding')
252-
253-
_method(values, com.isnull(values).view(np.uint8),
254-
limit=limit)
255234

256235
#-------------------------------------------------------------------------------
257236
# Is this even possible?
@@ -284,7 +263,7 @@ def should_store(self, value):
284263
(np.integer, np.floating, np.bool_))
285264

286265
class DatetimeBlock(IntBlock):
    # Flag read by DataFrame.fillna (``block._can_hold_na``) to decide
    # whether a block is interpolated; set for datetime blocks.
    _can_hold_na = True
288267

289268

290269
def make_block(values, items, ref_items, do_integrity_check=False):

pandas/core/series.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -1910,10 +1910,9 @@ def reindex(self, index=None, method=None, level=None, fill_value=np.nan,
19101910
if len(self.index) == 0:
19111911
return Series(nan, index=index, name=self.name)
19121912

1913-
new_index, fill_vec = self.index.reindex(index, method=method,
1913+
new_index, indexer = self.index.reindex(index, method=method,
19141914
level=level, limit=limit)
1915-
fill_vec = com._ensure_int32(fill_vec)
1916-
new_values = com.take_1d(self.values, fill_vec, fill_value=fill_value)
1915+
new_values = com.take_1d(self.values, indexer, fill_value=fill_value)
19171916
return Series(new_values, index=new_index, name=self.name)
19181917

19191918
def reindex_like(self, other, method=None, limit=None):

pandas/src/generate_code.py

+14-4
Original file line numberDiff line numberDiff line change
@@ -282,13 +282,12 @@ def pad_%(name)s(ndarray[%(c_type)s] old, ndarray[%(c_type)s] new,
282282
@cython.wraparound(False)
283283
def pad_inplace_%(name)s(ndarray[%(c_type)s] values,
284284
ndarray[uint8_t, cast=True] mask,
285-
limit=None):
285+
limit=None):
286286
cdef Py_ssize_t i, N
287287
cdef %(c_type)s val
288288
cdef int lim, fill_count = 0
289289
290290
N = len(values)
291-
val = np.nan
292291
293292
if limit is None:
294293
lim = N
@@ -307,6 +306,7 @@ def pad_inplace_%(name)s(ndarray[%(c_type)s] values,
307306
else:
308307
fill_count = 0
309308
val = values[i]
309+
310310
"""
311311

312312
pad_2d_template = """@cython.boundscheck(False)
@@ -320,8 +320,6 @@ def pad_2d_inplace_%(name)s(ndarray[%(c_type)s, ndim=2] values,
320320
321321
K, N = (<object> values).shape
322322
323-
val = np.nan
324-
325323
if limit is None:
326324
lim = N
327325
else:
@@ -795,6 +793,12 @@ def generate_from_template(template, ndim=1, exclude=None):
795793
templates_2d = [take_2d_axis0_template,
796794
take_2d_axis1_template]
797795

796+
797+
# templates_1d_datetime = [take_1d_template]
798+
# templates_2d_datetime = [take_2d_axis0_template,
799+
# take_2d_axis1_template]
800+
801+
798802
def generate_take_cython_file(path='generated.pyx'):
799803
with open(path, 'w') as f:
800804
for template in templates_1d:
@@ -803,6 +807,12 @@ def generate_take_cython_file(path='generated.pyx'):
803807
for template in templates_2d:
804808
print >> f, generate_from_template(template, ndim=2)
805809

810+
# for template in templates_1d_datetime:
811+
# print >> f, generate_from_template_datetime(template)
812+
813+
# for template in templates_2d_datetime:
814+
# print >> f, generate_from_template_datetime(template, ndim=2)
815+
806816
for template in nobool_1d_templates:
807817
print >> f, generate_from_template(template, exclude=['bool'])
808818

0 commit comments

Comments
 (0)