From 6b5ca31c53c0e9246a6441609e900b92664e79c1 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 13 May 2013 18:05:33 -0400 Subject: [PATCH] BUG: (GH3593) fixed a bug in the incorrect conversion of datetime64[ns] in combine_first --- RELEASE.rst | 2 ++ pandas/core/common.py | 34 +++++++++++++++++++++++++++ pandas/core/frame.py | 47 ++++++++++++++++++++++++++++++++----- pandas/core/internals.py | 16 ++++++++++--- pandas/core/series.py | 13 ++++++---- pandas/tests/test_frame.py | 19 +++++++++++++++ pandas/tests/test_series.py | 2 +- 7 files changed, 119 insertions(+), 14 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 4085d350f3766..862d458f34e22 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -104,6 +104,7 @@ pandas 0.11.1 - ``combine_first`` not returning the same dtype in cases where it can (GH3552_) - Fixed bug with ``Panel.transpose`` argument aliases (GH3556_) - Fixed platform bug in ``PeriodIndex.take`` (GH3579_) + - Fixed bug in incorrect conversion of datetime64[ns] in ``combine_first`` (GH3593_) - Fixed bug in reset_index with ``NaN`` in a multi-index (GH3586_) .. _GH3164: https://github.com/pydata/pandas/issues/3164 @@ -145,6 +146,7 @@ pandas 0.11.1 .. _GH3586: https://github.com/pydata/pandas/issues/3586 .. _GH3493: https://github.com/pydata/pandas/issues/3493 .. _GH3579: https://github.com/pydata/pandas/issues/3579 +.. _GH3593: https://github.com/pydata/pandas/issues/3593 .. 
_GH3556: https://github.com/pydata/pandas/issues/3556 diff --git a/pandas/core/common.py b/pandas/core/common.py index f71627be1296d..2da2db052cb93 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -921,6 +921,33 @@ def _possibly_downcast_to_dtype(result, dtype): return result +def _lcd_dtypes(a_dtype, b_dtype): + """ return the lcd dtype to hold these types """ + + if is_datetime64_dtype(a_dtype) or is_datetime64_dtype(b_dtype): + return _NS_DTYPE + elif is_timedelta64_dtype(a_dtype) or is_timedelta64_dtype(b_dtype): + return _TD_DTYPE + elif is_complex_dtype(a_dtype): + if is_complex_dtype(b_dtype): + return a_dtype + return np.float64 + elif is_integer_dtype(a_dtype): + if is_integer_dtype(b_dtype): + if a_dtype.itemsize == b_dtype.itemsize: + return a_dtype + return np.int64 + return np.float64 + elif is_float_dtype(a_dtype): + if is_float_dtype(b_dtype): + if a_dtype.itemsize == b_dtype.itemsize: + return a_dtype + else: + return np.float64 + elif is_integer(b_dtype): + return np.float64 + return np.object + def _interp_wrapper(f, wrap_dtype, na_override=None): def wrapper(arr, mask, limit=None): view = arr.view(wrap_dtype) @@ -1524,6 +1551,13 @@ def is_float_dtype(arr_or_dtype): tipo = arr_or_dtype.dtype.type return issubclass(tipo, np.floating) +def is_complex_dtype(arr_or_dtype): + if isinstance(arr_or_dtype, np.dtype): + tipo = arr_or_dtype.type + else: + tipo = arr_or_dtype.dtype.type + return issubclass(tipo, np.complexfloating) + def is_list_like(arg): return hasattr(arg, '__iter__') and not isinstance(arg, basestring) or hasattr(arg,'len') diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3df95b27f8736..1b01c92f03a32 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3738,8 +3738,11 @@ def combine(self, other, func, fill_value=None, overwrite=True): result = {} for col in new_columns: - series = this[col].values - otherSeries = other[col].values + series = this[col] + otherSeries = other[col] + + this_dtype = 
series.dtype + other_dtype = otherSeries.dtype this_mask = isnull(series) other_mask = isnull(otherSeries) @@ -3756,18 +3759,40 @@ def combine(self, other, func, fill_value=None, overwrite=True): series[this_mask] = fill_value otherSeries[other_mask] = fill_value - arr = func(series, otherSeries) + # if we have different dtypes, possibly promote + new_dtype = this_dtype + if this_dtype != other_dtype: + new_dtype = com._lcd_dtypes(this_dtype,other_dtype) + series = series.astype(new_dtype) + otherSeries = otherSeries.astype(new_dtype) + + # see if we need to be represented as i8 (datetimelike) + # try to keep us at this dtype + needs_i8_conversion = com.needs_i8_conversion(new_dtype) + if needs_i8_conversion: + this_dtype = new_dtype + arr = func(series, otherSeries, True) + else: + arr = func(series, otherSeries) if do_fill: arr = com.ensure_float(arr) arr[this_mask & other_mask] = NA + # try to downcast back to the original dtype + if needs_i8_conversion: + arr = com._possibly_cast_to_datetime(arr, this_dtype) + else: + arr = com._possibly_downcast_to_dtype(arr, this_dtype) + result[col] = arr # convert_objects just in case return self._constructor(result, index=new_index, - columns=new_columns).convert_objects(copy=False) + columns=new_columns).convert_objects( + convert_dates=True, + copy=False) def combine_first(self, other): """ @@ -3788,8 +3813,18 @@ def combine_first(self, other): ------- combined : DataFrame """ - def combiner(x, y): - return expressions.where(isnull(x), y, x, raise_on_error=True) + def combiner(x, y, needs_i8_conversion=False): + x_values = x.values if hasattr(x,'values') else x + y_values = y.values if hasattr(y,'values') else y + if needs_i8_conversion: + mask = isnull(x) + x_values = x_values.view('i8') + y_values = y_values.view('i8') + else: + mask = isnull(x_values) + + return expressions.where(mask, y_values, x_values, raise_on_error=True) + return self.combine(other, combiner, overwrite=False) def update(self, other, join='left', 
overwrite=True, filter_func=None, diff --git a/pandas/core/internals.py b/pandas/core/internals.py index b6459b0e461b4..d058d20427ad7 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -258,14 +258,15 @@ def downcast(self, dtypes = None): return blocks - def astype(self, dtype, copy = True, raise_on_error = True): + def astype(self, dtype, copy = True, raise_on_error = True, values = None): """ Coerce to the new type (if copy=True, return a new copy) raise on an except if raise == True """ try: - newb = make_block(com._astype_nansafe(self.values, dtype, copy = copy), - self.items, self.ref_items, fastpath=True) + if values is None: + values = com._astype_nansafe(self.values, dtype, copy = copy) + newb = make_block(values, self.items, self.ref_items, fastpath=True) except: if raise_on_error is True: raise @@ -708,6 +709,15 @@ def is_bool(self): """ we can be a bool if we have only bool values but are of type object """ return lib.is_bool_array(self.values.ravel()) + def astype(self, dtype, copy=True, raise_on_error=True, values=None): + """ allow astypes to datetime64[ns],timedelta64[ns] with coercion """ + dtype = np.dtype(dtype) + if dtype == _NS_DTYPE or dtype == _TD_DTYPE: + values = com._possibly_convert_datetime(self.values,dtype) + else: + values = None + return super(ObjectBlock, self).astype(dtype=dtype,copy=copy,raise_on_error=raise_on_error,values=values) + def convert(self, convert_dates = True, convert_numeric = True, copy = True): """ attempt to coerce any object types to better types return a copy of the block (if copy = True) diff --git a/pandas/core/series.py b/pandas/core/series.py index cebf2f4ef9d1f..8a3f353aa7c4a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -17,7 +17,8 @@ from pandas.core.common import (isnull, notnull, _is_bool_indexer, _default_index, _maybe_promote, _maybe_upcast, _asarray_tuplesafe, is_integer_dtype, - _infer_dtype_from_scalar, is_list_like) + _infer_dtype_from_scalar, is_list_like, 
+ _NS_DTYPE, _TD_DTYPE) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index, _handle_legacy_indexes) from pandas.core.indexing import _SeriesIndexer, _check_bool_indexer, _check_slice_bounds @@ -929,9 +930,13 @@ def astype(self, dtype): """ See numpy.ndarray.astype """ - casted = com._astype_nansafe(self.values, dtype) - return self._constructor(casted, index=self.index, name=self.name, - dtype=casted.dtype) + dtype = np.dtype(dtype) + if dtype == _NS_DTYPE or dtype == _TD_DTYPE: + values = com._possibly_cast_to_datetime(self.values,dtype) + else: + values = com._astype_nansafe(self.values, dtype) + return self._constructor(values, index=self.index, name=self.name, + dtype=values.dtype) def convert_objects(self, convert_dates=True, convert_numeric=True, copy=True): """ diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 7e7813e048bd1..ce24c72f75882 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -7907,6 +7907,25 @@ def test_combine_first_mixed_bug(self): expected = Series([True,True,False]) assert_series_equal(result,expected) + # GH 3593, converting datetime64[ns] incorrectly + df0 = DataFrame({"a":[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}) + df1 = DataFrame({"a":[None, None, None]}) + df2 = df1.combine_first(df0) + assert_frame_equal(df2,df0) + + df2 = df0.combine_first(df1) + assert_frame_equal(df2,df0) + + df0 = DataFrame({"a":[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}) + df1 = DataFrame({"a":[datetime(2000, 1, 2), None, None]}) + df2 = df1.combine_first(df0) + result = df0.copy() + result.iloc[0,:] = df1.iloc[0,:] + assert_frame_equal(df2,result) + + df2 = df0.combine_first(df1) + assert_frame_equal(df2,df0) + def test_update(self): df = DataFrame([[1.5, nan, 3.], [1.5, nan, 3.], diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 6fbce9df753d8..94d29e9233fb6 100644 --- a/pandas/tests/test_series.py +++ 
b/pandas/tests/test_series.py @@ -1856,7 +1856,7 @@ def test_operators_timedelta64(self): v1 = date_range('2012-1-1', periods=3, freq='D') v2 = date_range('2012-1-2', periods=3, freq='D') rs = Series(v2) - Series(v1) - xp = Series(1e9 * 3600 * 24, rs.index).astype('timedelta64[ns]') + xp = Series(1e9 * 3600 * 24, rs.index).astype('int64').astype('timedelta64[ns]') assert_series_equal(rs, xp) self.assert_(rs.dtype=='timedelta64[ns]')