From d225df3fc9afd358e23dd60ca629e8b88ef2cc87 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 24 Dec 2012 23:33:18 -0500 Subject: [PATCH 1/4] BUG: provide for automatic conversion of object -> datetime64[ns] types upon creation (in make_block) this obviates the need to convert_objects (mostly) in addition, enabled setting of NaT in datetime64[ns] columns via np.nan (on-the-fly-conversion) --- pandas/core/indexing.py | 24 +++++++++++++++++++----- pandas/core/internals.py | 16 +++++++++++++++- pandas/tests/test_frame.py | 23 ++++++++++++++++++++++- pandas/tseries/tests/test_timeseries.py | 5 +++-- 4 files changed, 59 insertions(+), 9 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index f8c977a3b9015..6f191caa8f53d 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -113,11 +113,25 @@ def _setitem_with_indexer(self, indexer, value): if np.prod(values.shape): values[plane_indexer] = value except ValueError: - for item, v in zip(item_labels[het_idx], value): - data = self.obj[item] - values = data.values - if np.prod(values.shape): - values[plane_indexer] = v + + # convert nan to iNaT if possible + if data.dtype == 'M8[ns]': + mask = com._isnull(value) + if np.isscalar(value) and mask: + from pandas import tslib + value = tslib.iNaT + values[plane_indexer] = value + elif isinstance(value, np.array) and mask.any(): + from pandas import tslib + value = value.copy() + value.putmask(iNat,mask) + values[plane_indexer] = value + else: + for item, v in zip(item_labels[het_idx], value): + data = self.obj[item] + values = data.values + if np.prod(values.shape): + values[plane_indexer] = v else: if isinstance(indexer, tuple): indexer = _maybe_convert_ix(*indexer) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 639141e4edba6..67cda37578459 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -446,6 +446,7 @@ def get_values(self, dtype): def make_block(values, items, ref_items): dtype = 
values.dtype vtype = dtype.type + klass = None if issubclass(vtype, np.floating): klass = FloatBlock @@ -459,7 +460,20 @@ def make_block(values, items, ref_items): klass = IntBlock elif dtype == np.bool_: klass = BoolBlock - else: + + # try to infer a datetimeblock + if klass is None and np.prod(values.shape): + inferred_type = lib.infer_dtype(values.flatten()) + if inferred_type == 'datetime': + + # we have an object array that has been inferred as datetime, so convert it + try: + values = tslib.array_to_datetime(values.flatten()).reshape(values.shape) + klass = DatetimeBlock + except: # it already object, so leave it + pass + + if klass is None: klass = ObjectBlock return klass(values, items, ref_items, ndim=values.ndim) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index cf485f70ffbc8..623ee9bca257d 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -21,7 +21,7 @@ import pandas.core.format as fmt import pandas.core.datetools as datetools from pandas.core.api import (DataFrame, Index, Series, notnull, isnull, - MultiIndex, DatetimeIndex) + MultiIndex, DatetimeIndex, Timestamp) from pandas.io.parsers import read_csv from pandas.util.testing import (assert_almost_equal, @@ -1073,6 +1073,27 @@ def test_setitem_single_column_mixed(self): expected = [nan, 'qux', nan, 'qux', nan] assert_almost_equal(df['str'].values, expected) + def test_setitem_single_column_mixed_datetime(self): + df = DataFrame(randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], + columns=['foo', 'bar', 'baz']) + + df['timestamp'] = Timestamp('20010102') + + # check our dtypes + result = df.get_dtype_counts() + expected = Series({ 'float64' : 3, 'datetime64[ns]' : 1}) + assert_series_equal(result, expected) + + # set an allowable datetime64 type + from pandas import tslib + df.ix['b','timestamp'] = tslib.iNaT + + # this fails because nan is a float type + df.ix['b','timestamp'] = nan + + # prior to 0.10.1 this failed + #self.assertRaises(TypeError, 
df.ix.__setitem__, ('c','timestamp'), nan) + def test_setitem_frame(self): piece = self.frame.ix[:2, ['A', 'B']] self.frame.ix[-2:, ['A', 'B']] = piece.values diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 855bbd02489bb..7ba370ea52861 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -1235,9 +1235,9 @@ def test_append_concat(self): def test_set_dataframe_column_ns_dtype(self): x = DataFrame([datetime.now(), datetime.now()]) - self.assert_(x[0].dtype == object) + #self.assert_(x[0].dtype == object) - x[0] = to_datetime(x[0]) + #x[0] = to_datetime(x[0]) self.assert_(x[0].dtype == np.dtype('M8[ns]')) def test_groupby_count_dateparseerror(self): @@ -2066,6 +2066,7 @@ def test_get_level_values_box(self): def test_frame_apply_dont_convert_datetime64(self): from pandas.tseries.offsets import BDay df = DataFrame({'x1': [datetime(1996,1,1)]}) + df = df.applymap(lambda x: x+BDay()) df = df.applymap(lambda x: x+BDay()) From 777b5fadd0978258eb9b6400b16f6798af13b00c Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 25 Dec 2012 00:13:24 -0500 Subject: [PATCH 2/4] TST: fixed up a failing test --- pandas/core/indexing.py | 12 +++++------- pandas/tests/test_frame.py | 9 +++++++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6f191caa8f53d..c096df9155944 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -4,6 +4,7 @@ from pandas.core.index import Index, MultiIndex import pandas.core.common as com import pandas.lib as lib +import pandas.tslib as tslib import numpy as np @@ -117,15 +118,12 @@ def _setitem_with_indexer(self, indexer, value): # convert nan to iNaT if possible if data.dtype == 'M8[ns]': mask = com._isnull(value) - if np.isscalar(value) and mask: - from pandas import tslib + if np.isscalar(value) and com.isnull(value): value = tslib.iNaT values[plane_indexer] = value - elif 
isinstance(value, np.array) and mask.any(): - from pandas import tslib - value = value.copy() - value.putmask(iNat,mask) - values[plane_indexer] = value + else: + raise ValueError("Cannot set indexer value of datetime64[ns] with [%s]" % value) + else: for item, v in zip(item_labels[het_idx], value): data = self.obj[item] diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 623ee9bca257d..b6c38ab98f334 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1087,9 +1087,14 @@ def test_setitem_single_column_mixed_datetime(self): # set an allowable datetime64 type from pandas import tslib df.ix['b','timestamp'] = tslib.iNaT + self.assert_(com.isnull(df.ix['b','timestamp'])) - # this fails because nan is a float type - df.ix['b','timestamp'] = nan + # allow this syntax + df.ix['c','timestamp'] = nan + self.assert_(com.isnull(df.ix['c','timestamp'])) + + # try to set with a list like item + self.assertRaises(Exception, df.ix.__setitem__, ('d','timestamp'), [nan]) # prior to 0.10.1 this failed #self.assertRaises(TypeError, df.ix.__setitem__, ('c','timestamp'), nan) From add7ae773776797216d04b8896a74df0536c84aa Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 25 Dec 2012 12:51:32 -0500 Subject: [PATCH 3/4] BUG/ENH: explicitly support Series construction with a datetime64 dtype (and allow np.nan) to be passed e.g. 
Series(np.nan,index=range(5),dtype='M8[ns]') bugfix in core/frame for applymap, handle dtype=M8[ns] series explicitly (needed to cast datetime64 to Timestamp) --- pandas/core/common.py | 23 +++++++++++++++++++++++ pandas/core/frame.py | 9 ++++++++- pandas/core/indexing.py | 22 ++++++---------------- pandas/core/internals.py | 5 +++-- pandas/core/series.py | 7 +++++-- pandas/tests/test_frame.py | 4 ++++ pandas/tests/test_series.py | 20 ++++++++++++++++++++ pandas/tseries/tests/test_timeseries.py | 2 +- 8 files changed, 70 insertions(+), 22 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 15f6cb6412c78..ad91a96f46a71 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -628,6 +628,29 @@ def _consensus_name_attr(objs): #---------------------------------------------------------------------- # Lots of little utilities +def _possibly_cast_to_datetime(value, dtype): + """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ + + if dtype == 'M8[ns]': + import pandas.tslib as tslib + if np.isscalar(value): + if value == tslib.iNaT or isnull(value): + value = tslib.iNaT + else: + value = np.array(value) + + # have a scalar array-like (e.g. 
NaT) + if value.ndim == 0: + value = tslib.iNaT + + # we have an array of datetime & nulls + elif np.prod(value.shape): + try: + value = tslib.array_to_datetime(value) + except: + pass + + return value def _infer_dtype(value): if isinstance(value, (float, np.floating)): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b3fef8943baf3..020e4d0098d4c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4218,7 +4218,14 @@ def applymap(self, func): ------- applied : DataFrame """ - return self.apply(lambda x: lib.map_infer(x, func)) + + # if we have a dtype == 'M8[ns]', provide boxed values + def infer(x): + if x.dtype == 'M8[ns]': + from pandas import Timestamp + return [ func(Timestamp(e)) for e in x ] + return lib.map_infer(x, func) + return self.apply(infer) #---------------------------------------------------------------------- # Merging / joining methods diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c096df9155944..53eb18c12f172 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -4,7 +4,6 @@ from pandas.core.index import Index, MultiIndex import pandas.core.common as com import pandas.lib as lib -import pandas.tslib as tslib import numpy as np @@ -112,24 +111,15 @@ def _setitem_with_indexer(self, indexer, value): data = self.obj[item] values = data.values if np.prod(values.shape): + value = com._possibly_cast_to_datetime(value,getattr(data,'dtype',None)) values[plane_indexer] = value except ValueError: + for item, v in zip(item_labels[het_idx], value): + data = self.obj[item] + values = data.values + if np.prod(values.shape): + values[plane_indexer] = v - # convert nan to iNaT if possible - if data.dtype == 'M8[ns]': - mask = com._isnull(value) - if np.isscalar(value) and com.isnull(value): - value = tslib.iNaT - values[plane_indexer] = value - else: - raise ValueError("Cannot set indexer value of datetime64[ns] with [%s]" % value) - - else: - for item, v in zip(item_labels[het_idx], value): - data 
= self.obj[item] - values = data.values - if np.prod(values.shape): - values[plane_indexer] = v else: if isinstance(indexer, tuple): indexer = _maybe_convert_ix(*indexer) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 67cda37578459..57844656bf113 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -463,12 +463,13 @@ def make_block(values, items, ref_items): # try to infer a datetimeblock if klass is None and np.prod(values.shape): - inferred_type = lib.infer_dtype(values.flatten()) + flat = values.flatten() + inferred_type = lib.infer_dtype(flat) if inferred_type == 'datetime': # we have an object array that has been inferred as datetime, so convert it try: - values = tslib.array_to_datetime(values.flatten()).reshape(values.shape) + values = tslib.array_to_datetime(flat).reshape(values.shape) klass = DatetimeBlock except: # it already object, so leave it pass diff --git a/pandas/core/series.py b/pandas/core/series.py index 6cf511d32bfb3..7ffdc1051ee63 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2983,12 +2983,13 @@ def _sanitize_array(data, index, dtype=None, copy=False, def _try_cast(arr): try: - subarr = np.array(data, dtype=dtype, copy=copy) + arr = com._possibly_cast_to_datetime(arr, dtype) + subarr = np.array(arr, dtype=dtype, copy=copy) except (ValueError, TypeError): if dtype is not None and raise_cast_failure: raise else: # pragma: no cover - subarr = np.array(data, dtype=object, copy=copy) + subarr = np.array(arr, dtype=object, copy=copy) return subarr # GH #846 @@ -3047,6 +3048,8 @@ def _try_cast(arr): value, dtype = _dtype_from_scalar(value) subarr = np.empty(len(index), dtype=dtype) else: + # need to possibly convert the value here + value = com._possibly_cast_to_datetime(value, dtype) subarr = np.empty(len(index), dtype=dtype) subarr.fill(value) else: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index b6c38ab98f334..462812296c9da 100644 --- 
a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1093,6 +1093,10 @@ def test_setitem_single_column_mixed_datetime(self): df.ix['c','timestamp'] = nan self.assert_(com.isnull(df.ix['c','timestamp'])) + # allow this syntax + df.ix['d',:] = nan + self.assert_(com.isnull(df.ix['c',:]).all() == False) + # try to set with a list like item self.assertRaises(Exception, df.ix.__setitem__, ('d','timestamp'), [nan]) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 1b0065c18923f..111b1e69bb823 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -351,6 +351,26 @@ def test_constructor_dtype_nocast(self): s2[1] = 5 self.assertEquals(s[1], 5) + def test_constructor_dtype_datetime64(self): + import pandas.tslib as tslib + + s = Series(tslib.iNaT,dtype='M8[ns]',index=range(5)) + self.assert_(isnull(s).all() == True) + + s = Series(tslib.NaT,dtype='M8[ns]',index=range(5)) + self.assert_(isnull(s).all() == True) + + s = Series(nan,dtype='M8[ns]',index=range(5)) + self.assert_(isnull(s).all() == True) + + s = Series([ datetime(2001,1,2,0,0), tslib.iNaT ],dtype='M8[ns]') + self.assert_(isnull(s[1]) == True) + self.assert_(s.dtype == 'M8[ns]') + + s = Series([ datetime(2001,1,2,0,0), nan ],dtype='M8[ns]') + self.assert_(isnull(s[1]) == True) + self.assert_(s.dtype == 'M8[ns]') + def test_constructor_dict(self): d = {'a' : 0., 'b' : 1., 'c' : 2.} result = Series(d, index=['b', 'c', 'd', 'a']) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 7ba370ea52861..60b2e989ea683 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -2070,7 +2070,7 @@ def test_frame_apply_dont_convert_datetime64(self): df = df.applymap(lambda x: x+BDay()) df = df.applymap(lambda x: x+BDay()) - self.assertTrue(df.x1.dtype == object) + self.assertTrue(df.x1.dtype == 'M8[ns]') class TestLegacyCompat(unittest.TestCase): From 
25cf4dca75976ce7b177c257527d55688fb7801d Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 25 Dec 2012 15:25:14 -0500 Subject: [PATCH 4/4] cleaner Timestamp boxing in frame/applymap --- pandas/core/common.py | 1 - pandas/core/frame.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index ad91a96f46a71..c7a0dd6c6e179 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -632,7 +632,6 @@ def _possibly_cast_to_datetime(value, dtype): """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ if dtype == 'M8[ns]': - import pandas.tslib as tslib if np.isscalar(value): if value == tslib.iNaT or isnull(value): value = tslib.iNaT diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 020e4d0098d4c..49b667b7f0bb4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4222,8 +4222,7 @@ def applymap(self, func): # if we have a dtype == 'M8[ns]', provide boxed values def infer(x): if x.dtype == 'M8[ns]': - from pandas import Timestamp - return [ func(Timestamp(e)) for e in x ] + x = lib.map_infer(x, lib.Timestamp) return lib.map_infer(x, func) return self.apply(infer)