Skip to content

BUG: concatenation with a coercable dtype was too aggressive #12702

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ Bug Fixes
- Bug in printing data which contains ``Period`` with different ``freq`` raises ``ValueError`` (:issue:`12615`)
- Bug in numpy compatibility of ``np.round()`` on a ``Series`` (:issue:`12600`)
- Bug in ``Series`` construction with ``Categorical`` and ``dtype='category'`` is specified (:issue:`12574`)
- Bug in concatenation where coercion of a coercible dtype was too aggressive (:issue:`12411`, :issue:`12045`, :issue:`11594`, :issue:`10571`)



Expand Down
12 changes: 5 additions & 7 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,21 +45,19 @@ class IndexingError(Exception):
class _NDFrameIndexer(object):
_valid_types = None
_exception = KeyError
axis = None

def __init__(self, obj, name):
self.obj = obj
self.ndim = obj.ndim
self.name = name
self.axis = None

def __call__(self, *args, **kwargs):
def __call__(self, axis=None):
# we need to return a copy of ourselves
self = self.__class__(self.obj, self.name)
new_self = self.__class__(self.obj, self.name)

# set the passed in values
for k, v in compat.iteritems(kwargs):
setattr(self, k, v)
return self
new_self.axis = axis
return new_self

def __iter__(self):
raise NotImplementedError('ix is not iterable')
Expand Down
30 changes: 16 additions & 14 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -4820,21 +4820,23 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
else:
fill_value = upcasted_na

if self.is_null and not getattr(self.block, 'is_categorical',
None):
missing_arr = np.empty(self.shape, dtype=empty_dtype)
if np.prod(self.shape):
# NumPy 1.6 workaround: this statement gets strange if all
# blocks are of same dtype and some of them are empty:
# empty ones are considered "null" so they must be filled,
# but no dtype upcasting happens and the dtype may not
# allow NaNs.
#
# In general, no one should get hurt when one tries to put
# incorrect values into empty array, but numpy 1.6 is
# strict about that.
if self.is_null:
if getattr(self.block, 'is_object', False):
# we want to avoid filling with np.nan if we are
# using None; we already know that we are all
# nulls
values = self.block.values.ravel(order='K')
if len(values) and values[0] is None:
fill_value = None

if getattr(self.block, 'is_datetimetz', False):
pass
elif getattr(self.block, 'is_categorical', False):
pass
else:
missing_arr = np.empty(self.shape, dtype=empty_dtype)
missing_arr.fill(fill_value)
return missing_arr
return missing_arr

if not self.indexers:
if not self.block._can_consolidate:
Expand Down
37 changes: 36 additions & 1 deletion pandas/tests/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2481,7 +2481,7 @@ def f():
# setitem
df.loc(axis=0)[:, :, ['C1', 'C3']] = -10

def test_loc_arguments(self):
def test_loc_axis_arguments(self):

index = MultiIndex.from_product([_mklbl('A', 4), _mklbl('B', 2),
_mklbl('C', 4), _mklbl('D', 2)])
Expand Down Expand Up @@ -2532,6 +2532,41 @@ def f():

self.assertRaises(ValueError, f)

def test_loc_coerceion(self):

# 12411
df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'),
pd.NaT]})
expected = df.dtypes

result = df.iloc[[0]]
assert_series_equal(result.dtypes, expected)

result = df.iloc[[1]]
assert_series_equal(result.dtypes, expected)

# 12045
import datetime
df = DataFrame({'date': [datetime.datetime(2012, 1, 1),
datetime.datetime(1012, 1, 2)]})
expected = df.dtypes

result = df.iloc[[0]]
assert_series_equal(result.dtypes, expected)

result = df.iloc[[1]]
assert_series_equal(result.dtypes, expected)

# 11594
df = DataFrame({'text': ['some words'] + [None] * 9})
expected = df.dtypes

result = df.iloc[0:2]
assert_series_equal(result.dtypes, expected)

result = df.iloc[3:]
assert_series_equal(result.dtypes, expected)

def test_per_axis_per_level_setitem(self):

# test index maker
Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -728,6 +728,37 @@ def test_to_string_truncate_multilevel(self):
with option_context("display.max_rows", 7, "display.max_columns", 7):
self.assertTrue(has_doubly_truncated_repr(df))

def test_truncate_with_different_dtypes(self):

# 11594, 12045, 12211
# when truncated the dtypes of the splits can differ

# 12211
df = DataFrame({'date' : [pd.Timestamp('20130101').tz_localize('UTC')] + [pd.NaT]*5})

with option_context("display.max_rows", 5):
result = str(df)
self.assertTrue('2013-01-01 00:00:00+00:00' in result)
self.assertTrue('NaT' in result)
self.assertTrue('...' in result)
self.assertTrue('[6 rows x 1 columns]' in result)

# 11594
import datetime
s = Series([datetime.datetime(2012, 1, 1)]*10 + [datetime.datetime(1012,1,2)] + [datetime.datetime(2012, 1, 3)]*10)

with pd.option_context('display.max_rows', 8):
result = str(s)
self.assertTrue('object' in result)

# 12045
df = DataFrame({'text': ['some words'] + [None]*9})

with pd.option_context('display.max_rows', 8, 'display.max_columns', 3):
result = str(df)
self.assertTrue('None' in result)
self.assertFalse('NaN' in result)

def test_to_html_with_col_space(self):
def check_with_width(df, col_space):
import re
Expand Down
4 changes: 3 additions & 1 deletion pandas/tools/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -980,7 +980,9 @@ def get_result(self):
if self.axis == 0:
new_data = com._concat_compat([x._values for x in self.objs])
name = com._consensus_name_attr(self.objs)
return (Series(new_data, index=self.new_axes[0], name=name)
return (Series(new_data, index=self.new_axes[0],
name=name,
dtype=new_data.dtype)
.__finalize__(self, method='concat'))

# combine as columns in a frame
Expand Down
35 changes: 35 additions & 0 deletions pandas/tools/tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -923,6 +923,41 @@ def _constructor(self):

tm.assertIsInstance(result, NotADataFrame)

def test_empty_dtype_coerce(self):

# xref to 12411
# xref to #12045
# xref to #11594
# see below

# 10571
df1 = DataFrame(data=[[1, None], [2, None]], columns=['a', 'b'])
df2 = DataFrame(data=[[3, None], [4, None]], columns=['a', 'b'])
result = concat([df1, df2])
expected = df1.dtypes
assert_series_equal(result.dtypes, expected)

def test_dtype_coerceion(self):

# 12411
df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'),
pd.NaT]})

result = concat([df.iloc[[0]], df.iloc[[1]]])
assert_series_equal(result.dtypes, df.dtypes)

# 12045
import datetime
df = DataFrame({'date': [datetime.datetime(2012, 1, 1),
datetime.datetime(1012, 1, 2)]})
result = concat([df.iloc[[0]], df.iloc[[1]]])
assert_series_equal(result.dtypes, df.dtypes)

# 11594
df = DataFrame({'text': ['some words'] + [None] * 9})
result = concat([df.iloc[[0]], df.iloc[[1]]])
assert_series_equal(result.dtypes, df.dtypes)

def test_append_dtype_coerce(self):

# GH 4993
Expand Down