Skip to content

Commit aa9e2cc

Browse files
committed
BUG: concatenation with a coercable dtype was too aggressive
closes pandas-dev#12411 closes pandas-dev#12045 closes pandas-dev#11594 closes pandas-dev#10571
1 parent b56f17f commit aa9e2cc

File tree

7 files changed

+127
-23
lines changed

7 files changed

+127
-23
lines changed

doc/source/whatsnew/v0.18.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ Bug Fixes
122122
- Bug in printing data which contains ``Period`` with different ``freq`` raises ``ValueError`` (:issue:`12615`)
123123
- Bug in numpy compatibility of ``np.round()`` on a ``Series`` (:issue:`12600`)
124124
- Bug in ``Series`` construction with ``Categorical`` and ``dtype='category'`` is specified (:issue:`12574`)
125+
- Bug in concatenation where a coercible dtype was coerced too aggressively (:issue:`12411`, :issue:`12045`, :issue:`11594`, :issue:`10571`)
125126

126127

127128

pandas/core/indexing.py

+5-7
Original file line numberDiff line numberDiff line change
@@ -45,21 +45,19 @@ class IndexingError(Exception):
4545
class _NDFrameIndexer(object):
4646
_valid_types = None
4747
_exception = KeyError
48+
axis = None
4849

4950
def __init__(self, obj, name):
5051
self.obj = obj
5152
self.ndim = obj.ndim
5253
self.name = name
53-
self.axis = None
5454

55-
def __call__(self, *args, **kwargs):
55+
def __call__(self, axis=None):
5656
# we need to return a copy of ourselves
57-
self = self.__class__(self.obj, self.name)
57+
new_self = self.__class__(self.obj, self.name)
5858

59-
# set the passed in values
60-
for k, v in compat.iteritems(kwargs):
61-
setattr(self, k, v)
62-
return self
59+
new_self.axis = axis
60+
return new_self
6361

6462
def __iter__(self):
6563
raise NotImplementedError('ix is not iterable')

pandas/core/internals.py

+16-14
Original file line numberDiff line numberDiff line change
@@ -4820,21 +4820,23 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
48204820
else:
48214821
fill_value = upcasted_na
48224822

4823-
if self.is_null and not getattr(self.block, 'is_categorical',
4824-
None):
4825-
missing_arr = np.empty(self.shape, dtype=empty_dtype)
4826-
if np.prod(self.shape):
4827-
# NumPy 1.6 workaround: this statement gets strange if all
4828-
# blocks are of same dtype and some of them are empty:
4829-
# empty one are considered "null" so they must be filled,
4830-
# but no dtype upcasting happens and the dtype may not
4831-
# allow NaNs.
4832-
#
4833-
# In general, no one should get hurt when one tries to put
4834-
# incorrect values into empty array, but numpy 1.6 is
4835-
# strict about that.
4823+
if self.is_null:
4824+
if getattr(self.block, 'is_object', False):
4825+
# we want to avoid filling with np.nan if we are
4826+
# using None; we already know that we are all
4827+
# nulls
4828+
values = self.block.values.ravel(order='K')
4829+
if len(values) and values[0] is None:
4830+
fill_value = None
4831+
4832+
if getattr(self.block, 'is_datetimetz', False):
4833+
pass
4834+
elif getattr(self.block, 'is_categorical', False):
4835+
pass
4836+
else:
4837+
missing_arr = np.empty(self.shape, dtype=empty_dtype)
48364838
missing_arr.fill(fill_value)
4837-
return missing_arr
4839+
return missing_arr
48384840

48394841
if not self.indexers:
48404842
if not self.block._can_consolidate:

pandas/tests/indexing/test_indexing.py

+36-1
Original file line numberDiff line numberDiff line change
@@ -2481,7 +2481,7 @@ def f():
24812481
# setitem
24822482
df.loc(axis=0)[:, :, ['C1', 'C3']] = -10
24832483

2484-
def test_loc_arguments(self):
2484+
def test_loc_axis_arguments(self):
24852485

24862486
index = MultiIndex.from_product([_mklbl('A', 4), _mklbl('B', 2),
24872487
_mklbl('C', 4), _mklbl('D', 2)])
@@ -2532,6 +2532,41 @@ def f():
25322532

25332533
self.assertRaises(ValueError, f)
25342534

2535+
def test_loc_coerceion(self):
2536+
2537+
# 12411
2538+
df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'),
2539+
pd.NaT]})
2540+
expected = df.dtypes
2541+
2542+
result = df.iloc[[0]]
2543+
assert_series_equal(result.dtypes, expected)
2544+
2545+
result = df.iloc[[1]]
2546+
assert_series_equal(result.dtypes, expected)
2547+
2548+
# 12045
2549+
import datetime
2550+
df = DataFrame({'date': [datetime.datetime(2012, 1, 1),
2551+
datetime.datetime(1012, 1, 2)]})
2552+
expected = df.dtypes
2553+
2554+
result = df.iloc[[0]]
2555+
assert_series_equal(result.dtypes, expected)
2556+
2557+
result = df.iloc[[1]]
2558+
assert_series_equal(result.dtypes, expected)
2559+
2560+
# 11594
2561+
df = DataFrame({'text': ['some words'] + [None] * 9})
2562+
expected = df.dtypes
2563+
2564+
result = df.iloc[0:2]
2565+
assert_series_equal(result.dtypes, expected)
2566+
2567+
result = df.iloc[3:]
2568+
assert_series_equal(result.dtypes, expected)
2569+
25352570
def test_per_axis_per_level_setitem(self):
25362571

25372572
# test index maker

pandas/tests/test_format.py

+31
Original file line numberDiff line numberDiff line change
@@ -728,6 +728,37 @@ def test_to_string_truncate_multilevel(self):
728728
with option_context("display.max_rows", 7, "display.max_columns", 7):
729729
self.assertTrue(has_doubly_truncated_repr(df))
730730

731+
def test_truncate_with_different_dtypes(self):
732+
733+
# 11594, 12045, 12211
734+
# when truncated the dtypes of the splits can differ
735+
736+
# 12211
737+
df = DataFrame({'date' : [pd.Timestamp('20130101').tz_localize('UTC')] + [pd.NaT]*5})
738+
739+
with option_context("display.max_rows", 5):
740+
result = str(df)
741+
self.assertTrue('2013-01-01 00:00:00+00:00' in result)
742+
self.assertTrue('NaT' in result)
743+
self.assertTrue('...' in result)
744+
self.assertTrue('[6 rows x 1 columns]' in result)
745+
746+
# 11594
747+
import datetime
748+
s = Series([datetime.datetime(2012, 1, 1)]*10 + [datetime.datetime(1012,1,2)] + [datetime.datetime(2012, 1, 3)]*10)
749+
750+
with pd.option_context('display.max_rows', 8):
751+
result = str(s)
752+
self.assertTrue('object' in result)
753+
754+
# 12045
755+
df = DataFrame({'text': ['some words'] + [None]*9})
756+
757+
with pd.option_context('display.max_rows', 8, 'display.max_columns', 3):
758+
result = str(df)
759+
self.assertTrue('None' in result)
760+
self.assertFalse('NaN' in result)
761+
731762
def test_to_html_with_col_space(self):
732763
def check_with_width(df, col_space):
733764
import re

pandas/tools/merge.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -980,7 +980,9 @@ def get_result(self):
980980
if self.axis == 0:
981981
new_data = com._concat_compat([x._values for x in self.objs])
982982
name = com._consensus_name_attr(self.objs)
983-
return (Series(new_data, index=self.new_axes[0], name=name)
983+
return (Series(new_data, index=self.new_axes[0],
984+
name=name,
985+
dtype=new_data.dtype)
984986
.__finalize__(self, method='concat'))
985987

986988
# combine as columns in a frame

pandas/tools/tests/test_merge.py

+35
Original file line numberDiff line numberDiff line change
@@ -923,6 +923,41 @@ def _constructor(self):
923923

924924
tm.assertIsInstance(result, NotADataFrame)
925925

926+
def test_empty_dtype_coerce(self):
927+
928+
# xref to 12411
929+
# xref to #12045
930+
# xref to #11594
931+
# see below
932+
933+
# 10571
934+
df1 = DataFrame(data=[[1, None], [2, None]], columns=['a', 'b'])
935+
df2 = DataFrame(data=[[3, None], [4, None]], columns=['a', 'b'])
936+
result = concat([df1, df2])
937+
expected = df1.dtypes
938+
assert_series_equal(result.dtypes, expected)
939+
940+
def test_dtype_coerceion(self):
941+
942+
# 12411
943+
df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'),
944+
pd.NaT]})
945+
946+
result = concat([df.iloc[[0]], df.iloc[[1]]])
947+
assert_series_equal(result.dtypes, df.dtypes)
948+
949+
# 12045
950+
import datetime
951+
df = DataFrame({'date': [datetime.datetime(2012, 1, 1),
952+
datetime.datetime(1012, 1, 2)]})
953+
result = concat([df.iloc[[0]], df.iloc[[1]]])
954+
assert_series_equal(result.dtypes, df.dtypes)
955+
956+
# 11594
957+
df = DataFrame({'text': ['some words'] + [None] * 9})
958+
result = concat([df.iloc[[0]], df.iloc[[1]]])
959+
assert_series_equal(result.dtypes, df.dtypes)
960+
926961
def test_append_dtype_coerce(self):
927962

928963
# GH 4993

0 commit comments

Comments
 (0)