From 2046cb57a5871c618829e1772f35bc63147d8d13 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 11 Aug 2016 23:00:12 +0900 Subject: [PATCH] BUG/DEPR: combine dtype fixes --- doc/source/whatsnew/v0.19.0.txt | 2 + pandas/core/frame.py | 26 +- pandas/tests/frame/test_combine_concat.py | 461 ++++++++++++++-------- pandas/tests/frame/test_operators.py | 58 +-- pandas/tests/types/test_cast.py | 40 +- pandas/types/cast.py | 21 +- pandas/types/common.py | 29 +- 7 files changed, 410 insertions(+), 227 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 5cbdbe6168bba..411b2b0abaf5a 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -788,6 +788,7 @@ Deprecations - ``pandas.tseries.frequencies.get_standard_freq`` is deprecated. Use ``pandas.tseries.frequencies.to_offset(freq).rule_code`` instead. (:issue:`13874`) - ``pandas.tseries.frequencies.to_offset``'s ``freqstr`` keyword is deprecated in favor of ``freq``. (:issue:`13874`) + .. _whatsnew_0190.prior_deprecations: Removal of prior version deprecations/changes @@ -939,6 +940,7 @@ Bug Fixes - Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) +- Bug in ``.combine_first`` may return incorrect ``dtype`` (:issue:`7630`, :issue:`10567`) - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) - Bug in ``groupby(..).nth()`` where the group key is included inconsistently if called after ``.head()/.tail()`` (:issue:`12839`) - Bug in ``.to_html``, ``.to_latex`` and ``.to_string`` silently ignore custom datetime formatter passed through the ``formatters`` key word (:issue:`10690`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4416213817ab4..ea83200465582 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -31,20 +31,22 @@ _possibly_downcast_to_dtype, _invalidate_string_dtypes, _coerce_to_dtypes, - _maybe_upcast_putmask) + _maybe_upcast_putmask, + _find_common_type) from pandas.types.common import (is_categorical_dtype, is_object_dtype, is_extension_type, is_datetimetz, is_datetime64_dtype, + is_datetime64tz_dtype, is_bool_dtype, is_integer_dtype, is_float_dtype, is_integer, is_scalar, + is_dtype_equal, needs_i8_conversion, _get_dtype_from_object, - _lcd_dtypes, _ensure_float, _ensure_float64, _ensure_int64, @@ -3700,17 +3702,20 @@ def combine(self, other, func, fill_value=None, overwrite=True): otherSeries[other_mask] = fill_value # if we have different dtypes, possibily promote - new_dtype = this_dtype - if this_dtype != other_dtype: - new_dtype = _lcd_dtypes(this_dtype, other_dtype) - series = series.astype(new_dtype) + if notnull(series).all(): + new_dtype = this_dtype otherSeries = otherSeries.astype(new_dtype) + else: + new_dtype = _find_common_type([this_dtype, other_dtype]) + if not is_dtype_equal(this_dtype, new_dtype): + series = series.astype(new_dtype) + if not is_dtype_equal(other_dtype, new_dtype): + otherSeries = otherSeries.astype(new_dtype) # see if we need to be represented as i8 (datetimelike) # try to keep us at this dtype needs_i8_conversion_i = needs_i8_conversion(new_dtype) if needs_i8_conversion_i: - this_dtype = new_dtype arr = func(series, otherSeries, True) else: arr = func(series, otherSeries) @@ -3721,7 +3726,12 @@ def combine(self, other, func, fill_value=None, overwrite=True): # try to downcast back to the original dtype if needs_i8_conversion_i: - arr = _possibly_cast_to_datetime(arr, this_dtype) + # ToDo: This conversion should be handled in + # _possibly_cast_to_datetime but the change affects lot... + if is_datetime64tz_dtype(new_dtype): + arr = DatetimeIndex._simple_new(arr, tz=new_dtype.tz) + else: + arr = _possibly_cast_to_datetime(arr, new_dtype) else: arr = _possibly_downcast_to_dtype(arr, this_dtype) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 7202915f13258..e5aaba26135e7 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -20,23 +20,11 @@ from pandas.tests.frame.common import TestData -class TestDataFrameCombineConcat(tm.TestCase, TestData): +class TestDataFrameConcatCommon(tm.TestCase, TestData): _multiprocess_can_split_ = True - def test_combine_first_mixed(self): - a = Series(['a', 'b'], index=lrange(2)) - b = Series(lrange(2), index=lrange(2)) - f = DataFrame({'A': a, 'B': b}) - - a = Series(['a', 'b'], index=lrange(5, 7)) - b = Series(lrange(2), index=lrange(5, 7)) - g = DataFrame({'A': a, 'B': b}) - - # TODO(wesm): no verification? - combined = f.combine_first(g) # noqa - - def test_combine_multiple_frames_dtypes(self): + def test_concat_multiple_frames_dtypes(self): # GH 2759 A = DataFrame(data=np.ones((10, 2)), columns=[ @@ -46,7 +34,7 @@ def test_combine_multiple_frames_dtypes(self): expected = Series(dict(float64=2, float32=2)) assert_series_equal(results, expected) - def test_combine_multiple_tzs(self): + def test_concat_multiple_tzs(self): # GH 12467 # combining datetime tz-aware and naive DataFrames ts1 = Timestamp('2015-01-01', tz=None) @@ -194,147 +182,6 @@ def test_append_dtypes(self): expected = DataFrame({'bar': Series([Timestamp('20130101'), 1])}) assert_frame_equal(result, expected) - def test_combine_first(self): - # disjoint - head, tail = self.frame[:5], self.frame[5:] - - combined = head.combine_first(tail) - reordered_frame = self.frame.reindex(combined.index) - assert_frame_equal(combined, reordered_frame) - self.assertTrue(tm.equalContents(combined.columns, self.frame.columns)) - assert_series_equal(combined['A'], reordered_frame['A']) - - # same index - fcopy = self.frame.copy() - fcopy['A'] = 1 - del fcopy['C'] - - fcopy2 = self.frame.copy() - fcopy2['B'] = 0 - del fcopy2['D'] - - combined = fcopy.combine_first(fcopy2) - - self.assertTrue((combined['A'] == 1).all()) - assert_series_equal(combined['B'], fcopy['B']) - assert_series_equal(combined['C'], fcopy2['C']) - assert_series_equal(combined['D'], fcopy['D']) - - # overlap - head, tail = reordered_frame[:10].copy(), reordered_frame - head['A'] = 1 - - combined = head.combine_first(tail) - self.assertTrue((combined['A'][:10] == 1).all()) - - # reverse overlap - tail['A'][:10] = 0 - combined = tail.combine_first(head) - self.assertTrue((combined['A'][:10] == 0).all()) - - # no overlap - f = self.frame[:10] - g = self.frame[10:] - combined = f.combine_first(g) - assert_series_equal(combined['A'].reindex(f.index), f['A']) - assert_series_equal(combined['A'].reindex(g.index), g['A']) - - # corner cases - comb = self.frame.combine_first(self.empty) - assert_frame_equal(comb, self.frame) - - comb = self.empty.combine_first(self.frame) - assert_frame_equal(comb, self.frame) - - comb = self.frame.combine_first(DataFrame(index=["faz", "boo"])) - self.assertTrue("faz" in comb.index) - - # #2525 - df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)]) - df2 = DataFrame({}, columns=['b']) - result = df.combine_first(df2) - self.assertTrue('b' in result) - - def test_combine_first_mixed_bug(self): - idx = Index(['a', 'b', 'c', 'e']) - ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx) - ser2 = Series(['a', 'b', 'c', 'e'], index=idx) - ser3 = Series([12, 4, 5, 97], index=idx) - - frame1 = DataFrame({"col0": ser1, - "col2": ser2, - "col3": ser3}) - - idx = Index(['a', 'b', 'c', 'f']) - ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx) - ser2 = Series(['a', 'b', 'c', 'f'], index=idx) - ser3 = Series([12, 4, 5, 97], index=idx) - - frame2 = DataFrame({"col1": ser1, - "col2": ser2, - "col5": ser3}) - - combined = frame1.combine_first(frame2) - self.assertEqual(len(combined.columns), 5) - - # gh 3016 (same as in update) - df = DataFrame([[1., 2., False, True], [4., 5., True, False]], - columns=['A', 'B', 'bool1', 'bool2']) - - other = DataFrame([[45, 45]], index=[0], columns=['A', 'B']) - result = df.combine_first(other) - assert_frame_equal(result, df) - - df.ix[0, 'A'] = np.nan - result = df.combine_first(other) - df.ix[0, 'A'] = 45 - assert_frame_equal(result, df) - - # doc example - df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan], - 'B': [np.nan, 2., 3., np.nan, 6.]}) - - df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.], - 'B': [np.nan, np.nan, 3., 4., 6., 8.]}) - - result = df1.combine_first(df2) - expected = DataFrame( - {'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]}) - assert_frame_equal(result, expected) - - # GH3552, return object dtype with bools - df1 = DataFrame( - [[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]]) - df2 = DataFrame( - [[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2]) - - result = df1.combine_first(df2)[2] - expected = Series([True, True, False], name=2) - assert_series_equal(result, expected) - - # GH 3593, converting datetime64[ns] incorrecly - df0 = DataFrame({"a": [datetime(2000, 1, 1), - datetime(2000, 1, 2), - datetime(2000, 1, 3)]}) - df1 = DataFrame({"a": [None, None, None]}) - df2 = df1.combine_first(df0) - assert_frame_equal(df2, df0) - - df2 = df0.combine_first(df1) - assert_frame_equal(df2, df0) - - df0 = DataFrame({"a": [datetime(2000, 1, 1), - datetime(2000, 1, 2), - datetime(2000, 1, 3)]}) - df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]}) - df2 = df1.combine_first(df0) - result = df0.copy() - result.iloc[0, :] = df1.iloc[0, :] - assert_frame_equal(df2, result) - - df2 = df0.combine_first(df1) - assert_frame_equal(df2, df0) - def test_update(self): df = DataFrame([[1.5, nan, 3.], [1.5, nan, 3.], @@ -476,3 +323,305 @@ def test_join_multiindex_leftright(self): assert_frame_equal(df1.join(df2, how='right'), exp) assert_frame_equal(df2.join(df1, how='left'), exp[['value2', 'value1']]) + + +class TestDataFrameCombineFirst(tm.TestCase, TestData): + + _multiprocess_can_split_ = True + + def test_combine_first_mixed(self): + a = Series(['a', 'b'], index=lrange(2)) + b = Series(lrange(2), index=lrange(2)) + f = DataFrame({'A': a, 'B': b}) + + a = Series(['a', 'b'], index=lrange(5, 7)) + b = Series(lrange(2), index=lrange(5, 7)) + g = DataFrame({'A': a, 'B': b}) + + exp = pd.DataFrame({'A': list('abab'), 'B': [0., 1., 0., 1.]}, + index=[0, 1, 5, 6]) + combined = f.combine_first(g) + tm.assert_frame_equal(combined, exp) + + def test_combine_first(self): + # disjoint + head, tail = self.frame[:5], self.frame[5:] + + combined = head.combine_first(tail) + reordered_frame = self.frame.reindex(combined.index) + assert_frame_equal(combined, reordered_frame) + self.assertTrue(tm.equalContents(combined.columns, self.frame.columns)) + assert_series_equal(combined['A'], reordered_frame['A']) + + # same index + fcopy = self.frame.copy() + fcopy['A'] = 1 + del fcopy['C'] + + fcopy2 = self.frame.copy() + fcopy2['B'] = 0 + del fcopy2['D'] + + combined = fcopy.combine_first(fcopy2) + + self.assertTrue((combined['A'] == 1).all()) + assert_series_equal(combined['B'], fcopy['B']) + assert_series_equal(combined['C'], fcopy2['C']) + assert_series_equal(combined['D'], fcopy['D']) + + # overlap + head, tail = reordered_frame[:10].copy(), reordered_frame + head['A'] = 1 + + combined = head.combine_first(tail) + self.assertTrue((combined['A'][:10] == 1).all()) + + # reverse overlap + tail['A'][:10] = 0 + combined = tail.combine_first(head) + self.assertTrue((combined['A'][:10] == 0).all()) + + # no overlap + f = self.frame[:10] + g = self.frame[10:] + combined = f.combine_first(g) + assert_series_equal(combined['A'].reindex(f.index), f['A']) + assert_series_equal(combined['A'].reindex(g.index), g['A']) + + # corner cases + comb = self.frame.combine_first(self.empty) + assert_frame_equal(comb, self.frame) + + comb = self.empty.combine_first(self.frame) + assert_frame_equal(comb, self.frame) + + comb = self.frame.combine_first(DataFrame(index=["faz", "boo"])) + self.assertTrue("faz" in comb.index) + + # #2525 + df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)]) + df2 = DataFrame({}, columns=['b']) + result = df.combine_first(df2) + self.assertTrue('b' in result) + + def test_combine_first_mixed_bug(self): + idx = Index(['a', 'b', 'c', 'e']) + ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx) + ser2 = Series(['a', 'b', 'c', 'e'], index=idx) + ser3 = Series([12, 4, 5, 97], index=idx) + + frame1 = DataFrame({"col0": ser1, + "col2": ser2, + "col3": ser3}) + + idx = Index(['a', 'b', 'c', 'f']) + ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx) + ser2 = Series(['a', 'b', 'c', 'f'], index=idx) + ser3 = Series([12, 4, 5, 97], index=idx) + + frame2 = DataFrame({"col1": ser1, + "col2": ser2, + "col5": ser3}) + + combined = frame1.combine_first(frame2) + self.assertEqual(len(combined.columns), 5) + + # gh 3016 (same as in update) + df = DataFrame([[1., 2., False, True], [4., 5., True, False]], + columns=['A', 'B', 'bool1', 'bool2']) + + other = DataFrame([[45, 45]], index=[0], columns=['A', 'B']) + result = df.combine_first(other) + assert_frame_equal(result, df) + + df.ix[0, 'A'] = np.nan + result = df.combine_first(other) + df.ix[0, 'A'] = 45 + assert_frame_equal(result, df) + + # doc example + df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan], + 'B': [np.nan, 2., 3., np.nan, 6.]}) + + df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.], + 'B': [np.nan, np.nan, 3., 4., 6., 8.]}) + + result = df1.combine_first(df2) + expected = DataFrame( + {'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]}) + assert_frame_equal(result, expected) + + # GH3552, return object dtype with bools + df1 = DataFrame( + [[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]]) + df2 = DataFrame( + [[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2]) + + result = df1.combine_first(df2)[2] + expected = Series([True, True, False], name=2) + assert_series_equal(result, expected) + + # GH 3593, converting datetime64[ns] incorrecly + df0 = DataFrame({"a": [datetime(2000, 1, 1), + datetime(2000, 1, 2), + datetime(2000, 1, 3)]}) + df1 = DataFrame({"a": [None, None, None]}) + df2 = df1.combine_first(df0) + assert_frame_equal(df2, df0) + + df2 = df0.combine_first(df1) + assert_frame_equal(df2, df0) + + df0 = DataFrame({"a": [datetime(2000, 1, 1), + datetime(2000, 1, 2), + datetime(2000, 1, 3)]}) + df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]}) + df2 = df1.combine_first(df0) + result = df0.copy() + result.iloc[0, :] = df1.iloc[0, :] + assert_frame_equal(df2, result) + + df2 = df0.combine_first(df1) + assert_frame_equal(df2, df0) + + def test_combine_first_align_nan(self): + # GH 7509 (not fixed) + dfa = pd.DataFrame([[pd.Timestamp('2011-01-01'), 2]], + columns=['a', 'b']) + dfb = pd.DataFrame([[4], [5]], columns=['b']) + self.assertEqual(dfa['a'].dtype, 'datetime64[ns]') + self.assertEqual(dfa['b'].dtype, 'int64') + + res = dfa.combine_first(dfb) + exp = pd.DataFrame({'a': [pd.Timestamp('2011-01-01'), pd.NaT], + 'b': [2., 5.]}, columns=['a', 'b']) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['a'].dtype, 'datetime64[ns]') + # ToDo: this must be int64 + self.assertEqual(res['b'].dtype, 'float64') + + res = dfa.iloc[:0].combine_first(dfb) + exp = pd.DataFrame({'a': [np.nan, np.nan], + 'b': [4, 5]}, columns=['a', 'b']) + tm.assert_frame_equal(res, exp) + # ToDo: this must be datetime64 + self.assertEqual(res['a'].dtype, 'float64') + # ToDo: this must be int64 + self.assertEqual(res['b'].dtype, 'int64') + + def test_combine_first_timezone(self): + # GH 7630 + data1 = pd.to_datetime('20100101 01:01').tz_localize('UTC') + df1 = pd.DataFrame(columns=['UTCdatetime', 'abc'], + data=data1, + index=pd.date_range('20140627', periods=1)) + data2 = pd.to_datetime('20121212 12:12').tz_localize('UTC') + df2 = pd.DataFrame(columns=['UTCdatetime', 'xyz'], + data=data2, + index=pd.date_range('20140628', periods=1)) + res = df2[['UTCdatetime']].combine_first(df1) + exp = pd.DataFrame({'UTCdatetime': [pd.Timestamp('2010-01-01 01:01', + tz='UTC'), + pd.Timestamp('2012-12-12 12:12', + tz='UTC')], + 'abc': [pd.Timestamp('2010-01-01 01:01:00', + tz='UTC'), pd.NaT]}, + columns=['UTCdatetime', 'abc'], + index=pd.date_range('20140627', periods=2, + freq='D')) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['UTCdatetime'].dtype, 'datetime64[ns, UTC]') + self.assertEqual(res['abc'].dtype, 'datetime64[ns, UTC]') + + # GH 10567 + dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='UTC') + df1 = pd.DataFrame({'DATE': dts1}) + dts2 = pd.date_range('2015-01-03', '2015-01-05', tz='UTC') + df2 = pd.DataFrame({'DATE': dts2}) + + res = df1.combine_first(df2) + tm.assert_frame_equal(res, df1) + self.assertEqual(res['DATE'].dtype, 'datetime64[ns, UTC]') + + dts1 = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03', + '2011-01-04'], tz='US/Eastern') + df1 = pd.DataFrame({'DATE': dts1}, index=[1, 3, 5, 7]) + dts2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02', + '2012-01-03'], tz='US/Eastern') + df2 = pd.DataFrame({'DATE': dts2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.DatetimeIndex(['2011-01-01', '2012-01-01', 'NaT', + '2012-01-02', '2011-01-03', '2011-01-04'], + tz='US/Eastern') + exp = pd.DataFrame({'DATE': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + + # different tz + dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='US/Eastern') + df1 = pd.DataFrame({'DATE': dts1}) + dts2 = pd.date_range('2015-01-03', '2015-01-05') + df2 = pd.DataFrame({'DATE': dts2}) + + # if df1 doesn't have NaN, keep its dtype + res = df1.combine_first(df2) + tm.assert_frame_equal(res, df1) + self.assertEqual(res['DATE'].dtype, 'datetime64[ns, US/Eastern]') + + dts1 = pd.date_range('2015-01-01', '2015-01-02', tz='US/Eastern') + df1 = pd.DataFrame({'DATE': dts1}) + dts2 = pd.date_range('2015-01-01', '2015-01-03') + df2 = pd.DataFrame({'DATE': dts2}) + + res = df1.combine_first(df2) + exp_dts = [pd.Timestamp('2015-01-01', tz='US/Eastern'), + pd.Timestamp('2015-01-02', tz='US/Eastern'), + pd.Timestamp('2015-01-03')] + exp = pd.DataFrame({'DATE': exp_dts}) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['DATE'].dtype, 'object') + + def test_combine_first_timedelta(self): + data1 = pd.TimedeltaIndex(['1 day', 'NaT', '3 day', '4day']) + df1 = pd.DataFrame({'TD': data1}, index=[1, 3, 5, 7]) + data2 = pd.TimedeltaIndex(['10 day', '11 day', '12 day']) + df2 = pd.DataFrame({'TD': data2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.TimedeltaIndex(['1 day', '10 day', 'NaT', + '11 day', '3 day', '4 day']) + exp = pd.DataFrame({'TD': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['TD'].dtype, 'timedelta64[ns]') + + def test_combine_first_period(self): + data1 = pd.PeriodIndex(['2011-01', 'NaT', '2011-03', + '2011-04'], freq='M') + df1 = pd.DataFrame({'P': data1}, index=[1, 3, 5, 7]) + data2 = pd.PeriodIndex(['2012-01-01', '2012-02', + '2012-03'], freq='M') + df2 = pd.DataFrame({'P': data2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.PeriodIndex(['2011-01', '2012-01', 'NaT', + '2012-02', '2011-03', '2011-04'], + freq='M') + exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['P'].dtype, 'object') + + # different freq + dts2 = pd.PeriodIndex(['2012-01-01', '2012-01-02', + '2012-01-03'], freq='D') + df2 = pd.DataFrame({'P': dts2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = [pd.Period('2011-01', freq='M'), + pd.Period('2012-01-01', freq='D'), + pd.NaT, + pd.Period('2012-01-02', freq='D'), + pd.Period('2011-03', freq='M'), + pd.Period('2011-04', freq='M')] + exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['P'].dtype, 'object') diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index c91585a28d867..ce7af25eb0460 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -1013,44 +1013,52 @@ def test_combineAdd(self): with tm.assert_produces_warning(FutureWarning): # trivial comb = self.frame.combineAdd(self.frame) - assert_frame_equal(comb, self.frame * 2) + assert_frame_equal(comb, self.frame * 2) - # more rigorous - a = DataFrame([[1., nan, nan, 2., nan]], - columns=np.arange(5)) - b = DataFrame([[2., 3., nan, 2., 6., nan]], - columns=np.arange(6)) - expected = DataFrame([[3., 3., nan, 4., 6., nan]], - columns=np.arange(6)) + # more rigorous + a = DataFrame([[1., nan, nan, 2., nan]], + columns=np.arange(5)) + b = DataFrame([[2., 3., nan, 2., 6., nan]], + columns=np.arange(6)) + expected = DataFrame([[3., 3., nan, 4., 6., nan]], + columns=np.arange(6)) + with tm.assert_produces_warning(FutureWarning): result = a.combineAdd(b) - assert_frame_equal(result, expected) + assert_frame_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning): result2 = a.T.combineAdd(b.T) - assert_frame_equal(result2, expected.T) + assert_frame_equal(result2, expected.T) - expected2 = a.combine(b, operator.add, fill_value=0.) - assert_frame_equal(expected, expected2) + expected2 = a.combine(b, operator.add, fill_value=0.) + assert_frame_equal(expected, expected2) - # corner cases + # corner cases + with tm.assert_produces_warning(FutureWarning): comb = self.frame.combineAdd(self.empty) - assert_frame_equal(comb, self.frame) + assert_frame_equal(comb, self.frame) + with tm.assert_produces_warning(FutureWarning): comb = self.empty.combineAdd(self.frame) - assert_frame_equal(comb, self.frame) + assert_frame_equal(comb, self.frame) - # integer corner case - df1 = DataFrame({'x': [5]}) - df2 = DataFrame({'x': [1]}) - df3 = DataFrame({'x': [6]}) + # integer corner case + df1 = DataFrame({'x': [5]}) + df2 = DataFrame({'x': [1]}) + df3 = DataFrame({'x': [6]}) + + with tm.assert_produces_warning(FutureWarning): comb = df1.combineAdd(df2) - assert_frame_equal(comb, df3) + assert_frame_equal(comb, df3) - # mixed type GH2191 - df1 = DataFrame({'A': [1, 2], 'B': [3, 4]}) - df2 = DataFrame({'A': [1, 2], 'C': [5, 6]}) + # mixed type GH2191 + df1 = DataFrame({'A': [1, 2], 'B': [3, 4]}) + df2 = DataFrame({'A': [1, 2], 'C': [5, 6]}) + with tm.assert_produces_warning(FutureWarning): rs = df1.combineAdd(df2) - xp = DataFrame({'A': [2, 4], 'B': [3, 4.], 'C': [5, 6.]}) - assert_frame_equal(xp, rs) + xp = DataFrame({'A': [2, 4], 'B': [3, 4.], 'C': [5, 6.]}) + assert_frame_equal(xp, rs) # TODO: test integer fill corner? diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index 3394974d833fb..46f37bf0ef8c2 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -192,6 +192,7 @@ def test_possibly_convert_objects_copy(self): class TestCommonTypes(tm.TestCase): + def test_numpy_dtypes(self): # (source_types, destination_type) testcases = ( @@ -218,18 +219,43 @@ def test_numpy_dtypes(self): ((np.complex128, np.int32), np.complex128), ((np.object, np.float32), np.object), ((np.object, np.int16), np.object), + + ((np.dtype('datetime64[ns]'), np.dtype('datetime64[ns]')), + np.dtype('datetime64[ns]')), + ((np.dtype('timedelta64[ns]'), np.dtype('timedelta64[ns]')), + np.dtype('timedelta64[ns]')), + + ((np.dtype('datetime64[ns]'), np.dtype('datetime64[ms]')), + np.dtype('datetime64[ns]')), + ((np.dtype('timedelta64[ms]'), np.dtype('timedelta64[ns]')), + np.dtype('timedelta64[ns]')), + + ((np.dtype('datetime64[ns]'), np.dtype('timedelta64[ns]')), + np.object), + ((np.dtype('datetime64[ns]'), np.int64), np.object) ) for src, common in testcases: self.assertEqual(_find_common_type(src), common) + with tm.assertRaises(ValueError): + # empty + _find_common_type([]) + def test_pandas_dtypes(self): - # TODO: not implemented yet - with self.assertRaises(TypeError): - self.assertEqual(_find_common_type([CategoricalDtype()]), - CategoricalDtype) - with self.assertRaises(TypeError): - self.assertEqual(_find_common_type([DatetimeTZDtype()]), - DatetimeTZDtype) + dtype = CategoricalDtype() + self.assertEqual(_find_common_type([dtype]), 'category') + self.assertEqual(_find_common_type([dtype, dtype]), 'category') + self.assertEqual(_find_common_type([np.object, dtype]), np.object) + + dtype = DatetimeTZDtype(unit='ns', tz='US/Eastern') + self.assertEqual(_find_common_type([dtype, dtype]), + 'datetime64[ns, US/Eastern]') + + for dtype2 in [DatetimeTZDtype(unit='ns', tz='Asia/Tokyo'), + np.dtype('datetime64[ns]'), np.object, np.int64]: + self.assertEqual(_find_common_type([dtype, dtype2]), np.object) + self.assertEqual(_find_common_type([dtype2, dtype]), np.object) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/types/cast.py b/pandas/types/cast.py index 93be926fe1eeb..59c939126d2a4 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -866,8 +866,23 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'): def _find_common_type(types): """Find a common data type among the given dtypes.""" - # TODO: enable using pandas-specific types + + if len(types) == 0: + raise ValueError('no types given') + + first = types[0] + # workaround for find_common_type([np.dtype('datetime64[ns]')] * 2) + # => object + if all(is_dtype_equal(first, t) for t in types[1:]): + return first + if any(isinstance(t, ExtensionDtype) for t in types): - raise TypeError("Common type discovery is currently only " - "supported for pure numpy dtypes.") + return np.object + + # take lowest unit + if all(is_datetime64_dtype(t) for t in types): + return np.dtype('datetime64[ns]') + if all(is_timedelta64_dtype(t) for t in types): + return np.dtype('timedelta64[ns]') + return np.find_common_type(types, []) diff --git a/pandas/types/common.py b/pandas/types/common.py index bffff0357f329..39db0be3e416e 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -9,7 +9,7 @@ from .generic import (ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries, ABCSparseArray, ABCSparseSeries) -from .inference import is_integer, is_string_like +from .inference import is_string_like from .inference import * # noqa @@ -386,33 +386,6 @@ def _validate_date_like_dtype(dtype): (dtype.name, dtype.type.__name__)) -def _lcd_dtypes(a_dtype, b_dtype): - """ return the lcd dtype to hold these types """ - - if is_datetime64_dtype(a_dtype) or is_datetime64_dtype(b_dtype): - return _NS_DTYPE - elif is_timedelta64_dtype(a_dtype) or is_timedelta64_dtype(b_dtype): - return _TD_DTYPE - elif is_complex_dtype(a_dtype): - if is_complex_dtype(b_dtype): - return a_dtype - return np.float64 - elif is_integer_dtype(a_dtype): - if is_integer_dtype(b_dtype): - if a_dtype.itemsize == b_dtype.itemsize: - return a_dtype - return np.int64 - return np.float64 - elif is_float_dtype(a_dtype): - if is_float_dtype(b_dtype): - if a_dtype.itemsize == b_dtype.itemsize: - return a_dtype - else: - return np.float64 - elif is_integer(b_dtype): - return np.float64 - return np.object - _string_dtypes = frozenset(map(_get_dtype_from_object, (binary_type, text_type)))