diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index e664020946baf..1ac95c3ea6195 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -122,6 +122,7 @@ Bug Fixes - Bug in printing data which contains ``Period`` with different ``freq`` raises ``ValueError`` (:issue:`12615`) - Bug in numpy compatibility of ``np.round()`` on a ``Series`` (:issue:`12600`) - Bug in ``Series`` construction with ``Categorical`` and ``dtype='category'`` is specified (:issue:`12574`) +- Bug in concatenation where coercion of a coercible dtype was too aggressive (:issue:`12411`, :issue:`12045`, :issue:`11594`, :issue:`10571`) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ced663bd62197..df257fb5fd1d0 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -45,21 +45,19 @@ class IndexingError(Exception): class _NDFrameIndexer(object): _valid_types = None _exception = KeyError + axis = None def __init__(self, obj, name): self.obj = obj self.ndim = obj.ndim self.name = name - self.axis = None - def __call__(self, *args, **kwargs): + def __call__(self, axis=None): # we need to return a copy of ourselves - self = self.__class__(self.obj, self.name) + new_self = self.__class__(self.obj, self.name) - # set the passed in values - for k, v in compat.iteritems(kwargs): - setattr(self, k, v) - return self + new_self.axis = axis + return new_self def __iter__(self): raise NotImplementedError('ix is not iterable') diff --git a/pandas/core/internals.py b/pandas/core/internals.py index d99c4ef45dcd3..a31bd347e674a 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4820,21 +4820,23 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): else: fill_value = upcasted_na - if self.is_null and not getattr(self.block, 'is_categorical', - None): - missing_arr = np.empty(self.shape, dtype=empty_dtype) - if np.prod(self.shape): - # NumPy 1.6 workaround: this statement gets strange if all - # blocks 
are of same dtype and some of them are empty: - # empty one are considered "null" so they must be filled, - # but no dtype upcasting happens and the dtype may not - # allow NaNs. - # - # In general, no one should get hurt when one tries to put - # incorrect values into empty array, but numpy 1.6 is - # strict about that. + if self.is_null: + if getattr(self.block, 'is_object', False): + # we want to avoid filling with np.nan if we are + # using None; we already know that we are all + # nulls + values = self.block.values.ravel(order='K') + if len(values) and values[0] is None: + fill_value = None + + if getattr(self.block, 'is_datetimetz', False): + pass + elif getattr(self.block, 'is_categorical', False): + pass + else: + missing_arr = np.empty(self.shape, dtype=empty_dtype) missing_arr.fill(fill_value) - return missing_arr + return missing_arr if not self.indexers: if not self.block._can_consolidate: diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 89552ab776608..e5be2bb08f605 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -2481,7 +2481,7 @@ def f(): # setitem df.loc(axis=0)[:, :, ['C1', 'C3']] = -10 - def test_loc_arguments(self): + def test_loc_axis_arguments(self): index = MultiIndex.from_product([_mklbl('A', 4), _mklbl('B', 2), _mklbl('C', 4), _mklbl('D', 2)]) @@ -2532,6 +2532,41 @@ def f(): self.assertRaises(ValueError, f) + def test_loc_coerceion(self): + + # 12411 + df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'), + pd.NaT]}) + expected = df.dtypes + + result = df.iloc[[0]] + assert_series_equal(result.dtypes, expected) + + result = df.iloc[[1]] + assert_series_equal(result.dtypes, expected) + + # 12045 + import datetime + df = DataFrame({'date': [datetime.datetime(2012, 1, 1), + datetime.datetime(1012, 1, 2)]}) + expected = df.dtypes + + result = df.iloc[[0]] + assert_series_equal(result.dtypes, expected) + + result = df.iloc[[1]] + 
assert_series_equal(result.dtypes, expected) + + # 11594 + df = DataFrame({'text': ['some words'] + [None] * 9}) + expected = df.dtypes + + result = df.iloc[0:2] + assert_series_equal(result.dtypes, expected) + + result = df.iloc[3:] + assert_series_equal(result.dtypes, expected) + def test_per_axis_per_level_setitem(self): # test index maker diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index 339ab9e0da6a1..e8ad776fd5578 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -728,6 +728,37 @@ def test_to_string_truncate_multilevel(self): with option_context("display.max_rows", 7, "display.max_columns", 7): self.assertTrue(has_doubly_truncated_repr(df)) + def test_truncate_with_different_dtypes(self): + + # 11594, 12045, 12211 + # when truncated the dtypes of the splits can differ + + # 12211 + df = DataFrame({'date' : [pd.Timestamp('20130101').tz_localize('UTC')] + [pd.NaT]*5}) + + with option_context("display.max_rows", 5): + result = str(df) + self.assertTrue('2013-01-01 00:00:00+00:00' in result) + self.assertTrue('NaT' in result) + self.assertTrue('...' 
in result) + self.assertTrue('[6 rows x 1 columns]' in result) + + # 11594 + import datetime + s = Series([datetime.datetime(2012, 1, 1)]*10 + [datetime.datetime(1012,1,2)] + [datetime.datetime(2012, 1, 3)]*10) + + with pd.option_context('display.max_rows', 8): + result = str(s) + self.assertTrue('object' in result) + + # 12045 + df = DataFrame({'text': ['some words'] + [None]*9}) + + with pd.option_context('display.max_rows', 8, 'display.max_columns', 3): + result = str(df) + self.assertTrue('None' in result) + self.assertFalse('NaN' in result) + def test_to_html_with_col_space(self): def check_with_width(df, col_space): import re diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 82fdf0a3d3b46..016dd5ed4e56b 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -980,7 +980,9 @@ def get_result(self): if self.axis == 0: new_data = com._concat_compat([x._values for x in self.objs]) name = com._consensus_name_attr(self.objs) - return (Series(new_data, index=self.new_axes[0], name=name) + return (Series(new_data, index=self.new_axes[0], + name=name, + dtype=new_data.dtype) .__finalize__(self, method='concat')) # combine as columns in a frame diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index ddc4e7aaf1588..25e6466fac725 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -923,6 +923,41 @@ def _constructor(self): tm.assertIsInstance(result, NotADataFrame) + def test_empty_dtype_coerce(self): + + # xref to 12411 + # xref to #12045 + # xref to #11594 + # see below + + # 10571 + df1 = DataFrame(data=[[1, None], [2, None]], columns=['a', 'b']) + df2 = DataFrame(data=[[3, None], [4, None]], columns=['a', 'b']) + result = concat([df1, df2]) + expected = df1.dtypes + assert_series_equal(result.dtypes, expected) + + def test_dtype_coerceion(self): + + # 12411 + df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'), + pd.NaT]}) + + result = concat([df.iloc[[0]], 
df.iloc[[1]]]) + assert_series_equal(result.dtypes, df.dtypes) + + # 12045 + import datetime + df = DataFrame({'date': [datetime.datetime(2012, 1, 1), + datetime.datetime(1012, 1, 2)]}) + result = concat([df.iloc[[0]], df.iloc[[1]]]) + assert_series_equal(result.dtypes, df.dtypes) + + # 11594 + df = DataFrame({'text': ['some words'] + [None] * 9}) + result = concat([df.iloc[[0]], df.iloc[[1]]]) + assert_series_equal(result.dtypes, df.dtypes) + def test_append_dtype_coerce(self): # GH 4993