Skip to content

BUG: .loc with duplicated label may have incorrect index dtype #11497

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 29, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v0.18.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,7 @@ Bug Fixes
- Bug in ``.loc`` against ``CategoricalIndex`` may result in normal ``Index`` (:issue:`11586`)
- Bug in groupby on tz-aware data where selection does not return ``Timestamp`` (:issue:`11616`)
- Bug where timezone info is lost when broadcasting a scalar datetime to ``DataFrame`` (:issue:`11682`)


- Bug in ``.loc`` where the result for a duplicated key may have an ``Index`` with an incorrect dtype (:issue:`11497`)

110 changes: 76 additions & 34 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,10 @@ class Index(IndexOpsMixin, StringAccessorMixin, PandasObject):
_is_numeric_dtype = False
_can_hold_na = True

# prioritize current class for _shallow_copy_with_infer,
# used to infer integers as datetime-likes
_infer_as_myclass = False

_engine_type = _index.ObjectEngine

def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False,
Expand Down Expand Up @@ -209,6 +213,24 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False,
subarr = com._asarray_tuplesafe(data, dtype=object)
return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs)

"""
NOTE for new Index creation:

- _simple_new: Returns a new Index with the same type as the caller.
All metadata (such as name) must be provided by the caller.
Using _shallow_copy is recommended because it fills in this metadata unless otherwise specified.

- _shallow_copy: Returns a new Index with the same type (using _simple_new),
but fills in the caller's metadata unless otherwise specified. Passed kwargs will
overwrite the corresponding metadata.

- _shallow_copy_with_infer: Returns a new Index, inferring its type
from the passed values. It fills in the caller's metadata unless otherwise specified,
the same as _shallow_copy.

See each method's docstring.
"""

@classmethod
def _simple_new(cls, values, name=None, dtype=None, **kwargs):
"""
Expand All @@ -233,6 +255,48 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
result._reset_identity()
return result

def _shallow_copy(self, values=None, **kwargs):
    """
    Build a new Index of the caller's class around ``values`` without
    copying the data. The caller's own attributes are used as defaults and
    any attribute passed in ``kwargs`` takes precedence over them.

    *this is an internal non-public method*

    Parameters
    ----------
    values : the values to create the new Index, optional
        falls back to ``self.values`` when None
    kwargs : updates the default attributes for this Index
    """
    data = self.values if values is None else values
    # start from the caller's metadata, then let explicit kwargs win
    merged = self._get_attributes_dict()
    merged.update(kwargs)
    return self._simple_new(data, **merged)

def _shallow_copy_with_infer(self, values=None, **kwargs):
    """
    create a new Index inferring the class from the passed values, don't
    copy the data, use the same object attributes with passed in attributes
    taking precedence

    *this is an internal non-public method*

    Parameters
    ----------
    values : the values to create the new Index, optional
        falls back to ``self.values`` when None
    kwargs : updates the default attributes for this Index
        a passed ``copy`` is always overridden with False

    Returns
    -------
    new_index : Index
        built by ``self._constructor`` when ``_infer_as_myclass`` is set and
        that constructor accepts the values, otherwise built by ``Index``
    """
    if values is None:
        values = self.values
    attributes = self._get_attributes_dict()
    attributes.update(kwargs)
    # never copy the data, whatever the caller asked for
    attributes['copy'] = False
    if self._infer_as_myclass:
        # give the caller's own class the first chance to hold the values
        try:
            return self._constructor(values, **attributes)
        except (TypeError, ValueError):
            # deliberate best-effort: the values do not fit this class, so
            # fall through to the generic inference below
            pass
    return Index(values, **attributes)

def _update_inplace(self, result, **kwargs):
    """
    Always refuse an in-place update; this guards against calls arriving
    through IndexOpsMixin.
    """
    raise TypeError("Index can't be updated inplace")
Expand Down Expand Up @@ -372,31 +436,6 @@ def view(self, cls=None):
result._id = self._id
return result

def _shallow_copy(self, values=None, infer=False, **kwargs):
    """
    Build a new Index around ``values`` without copying the data. The
    caller's attributes are the defaults and passed in attributes take
    precedence over them.

    *this is an internal non-public method*

    Parameters
    ----------
    values : the values to create the new Index, optional
        falls back to ``self.values`` when None
    infer : boolean, default False
        if True, infer the new type of the passed values
    kwargs : updates the default attributes for this Index
    """
    data = self.values if values is None else values
    attrs = self._get_attributes_dict()
    attrs.update(kwargs)

    if not infer:
        # keep the caller's class
        return self.__class__._simple_new(data, **attrs)

    # let the Index constructor pick the class, still without copying
    attrs['copy'] = False
    return Index(data, **attrs)

def _coerce_scalar_to_index(self, item):
"""
we need to coerce a scalar to a compat for our index type
Expand Down Expand Up @@ -1206,7 +1245,7 @@ def append(self, other):
to_concat, name = self._ensure_compat_append(other)
attribs = self._get_attributes_dict()
attribs['name'] = name
return self._shallow_copy(np.concatenate(to_concat), infer=True, **attribs)
return self._shallow_copy_with_infer(np.concatenate(to_concat), **attribs)

@staticmethod
def _ensure_compat_concat(indexes):
Expand Down Expand Up @@ -1725,7 +1764,7 @@ def sym_diff(self, other, result_name=None):
attribs['name'] = result_name
if 'freq' in attribs:
attribs['freq'] = None
return self._shallow_copy(the_diff, infer=True, **attribs)
return self._shallow_copy_with_infer(the_diff, **attribs)

def get_loc(self, key, method=None, tolerance=None):
"""
Expand Down Expand Up @@ -2199,7 +2238,8 @@ def _reindex_non_unique(self, target):
new_indexer = np.arange(len(self.take(indexer)))
new_indexer[~check] = -1

return self._shallow_copy(new_labels), indexer, new_indexer
new_index = self._shallow_copy_with_infer(new_labels, freq=None)
return new_index, indexer, new_indexer

def join(self, other, how='left', level=None, return_indexers=False):
"""
Expand Down Expand Up @@ -2756,8 +2796,7 @@ def delete(self, loc):
-------
new_index : Index
"""
attribs = self._get_attributes_dict()
return self._shallow_copy(np.delete(self._data, loc), **attribs)
return self._shallow_copy(np.delete(self._data, loc))

def insert(self, loc, item):
"""
Expand All @@ -2778,8 +2817,7 @@ def insert(self, loc, item):

idx = np.concatenate(
(_self[:loc], item, _self[loc:]))
attribs = self._get_attributes_dict()
return self._shallow_copy(idx, infer=True, **attribs)
return self._shallow_copy_with_infer(idx)

def drop(self, labels, errors='raise'):
"""
Expand Down Expand Up @@ -2841,7 +2879,6 @@ def fillna(self, value=None, downcast=None):
# no need to care metadata other than name
# because it can't have freq if
return Index(result, name=self.name)

return self._shallow_copy()

def _evaluate_with_timedelta_like(self, other, op, opstr):
Expand Down Expand Up @@ -4316,10 +4353,15 @@ def view(self, cls=None):
result._id = self._id
return result

def _shallow_copy(self, values=None, infer=False, **kwargs):
def _shallow_copy_with_infer(self, values=None, **kwargs):
    """
    Delegate straight to ``_shallow_copy`` with the same values and kwargs.

    *this is an internal non-public method*
    """
    copied = self._shallow_copy(values, **kwargs)
    return copied

def _shallow_copy(self, values=None, **kwargs):
    """
    Return ``self.view()`` when no values are given; otherwise build a
    MultiIndex from the passed tuples.

    *this is an internal non-public method*

    Parameters
    ----------
    values : tuples to create the new MultiIndex from, optional
    kwargs : forwarded to ``MultiIndex.from_tuples``; a ``name`` entry is
        passed on as ``names`` and a ``freq`` entry is dropped
    """
    if values is None:
        return self.view()

    if 'name' in kwargs:
        # from_tuples takes ``names``, so move the single-Index spelling over
        kwargs['names'] = kwargs.pop('name')
    # discards freq
    kwargs.pop('freq', None)
    return MultiIndex.from_tuples(values, **kwargs)

Expand Down
139 changes: 129 additions & 10 deletions pandas/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3516,44 +3516,163 @@ def test_series_partial_set(self):
# Regression from GH4825
ser = Series([0.1, 0.2], index=[1, 2])

# ToDo: check_index_type can be True after GH 11497

# loc
expected = Series([np.nan, 0.2, np.nan], index=[3, 2, 3])
result = ser.loc[[3, 2, 3]]
assert_series_equal(result, expected, check_index_type=False)
assert_series_equal(result, expected, check_index_type=True)

expected = Series([np.nan, 0.2, np.nan, np.nan], index=[3, 2, 3, 'x'])
result = ser.loc[[3, 2, 3, 'x']]
assert_series_equal(result, expected, check_index_type=True)

expected = Series([0.2, 0.2, 0.1], index=[2, 2, 1])
result = ser.loc[[2, 2, 1]]
assert_series_equal(result, expected, check_index_type=True)

expected = Series([0.2, 0.2, np.nan, 0.1], index=[2, 2, 'x', 1])
result = ser.loc[[2, 2, 'x', 1]]
assert_series_equal(result, expected, check_index_type=True)

# raises as nothing is in the index
self.assertRaises(KeyError, lambda : ser.loc[[3, 3, 3]])

expected = Series([0.2, 0.2, np.nan], index=[2, 2, 3])
result = ser.loc[[2, 2, 3]]
assert_series_equal(result, expected, check_index_type=False)
assert_series_equal(result, expected, check_index_type=True)

expected = Series([0.3, np.nan, np.nan], index=[3, 4, 4])
result = Series([0.1, 0.2, 0.3], index=[1, 2, 3]).loc[[3, 4, 4]]
assert_series_equal(result, expected, check_index_type=False)
assert_series_equal(result, expected, check_index_type=True)

expected = Series([np.nan, 0.3, 0.3], index=[5, 3, 3])
result = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]).loc[[5, 3, 3]]
assert_series_equal(result, expected, check_index_type=False)
assert_series_equal(result, expected, check_index_type=True)

expected = Series([np.nan, 0.4, 0.4], index=[5, 4, 4])
result = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]).loc[[5, 4, 4]]
assert_series_equal(result, expected, check_index_type=False)
assert_series_equal(result, expected, check_index_type=True)

expected = Series([0.4, np.nan, np.nan], index=[7, 2, 2])
result = Series([0.1, 0.2, 0.3, 0.4], index=[4, 5, 6, 7]).loc[[7, 2, 2]]
assert_series_equal(result, expected, check_index_type=False)
assert_series_equal(result, expected, check_index_type=True)

expected = Series([0.4, np.nan, np.nan], index=[4, 5, 5])
result = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]).loc[[4, 5, 5]]
assert_series_equal(result, expected, check_index_type=False)
assert_series_equal(result, expected, check_index_type=True)

# iloc
expected = Series([0.2, 0.2, 0.1, 0.1], index=[2, 2, 1, 1])
result = ser.iloc[[1, 1, 0, 0]]
assert_series_equal(result, expected, check_index_type=False)
assert_series_equal(result, expected, check_index_type=True)

def test_series_partial_set_with_name(self):
    # GH 11497
    # list-like selection with duplicated and/or missing labels must keep
    # the index name and produce an index of the expected dtype

    def make_series(data, labels, dtype='int64'):
        # every Series here is named 's' and indexed by an Index named 'idx'
        labelled = Index(labels, dtype=dtype, name='idx')
        return Series(data, index=labelled, name='s')

    ser = make_series([0.1, 0.2], [1, 2])

    # loc on the two-element series: (keys, expected values, index dtype)
    small_cases = [
        ([3, 2, 3], [np.nan, 0.2, np.nan], 'int64'),
        ([3, 2, 3, 'x'], [np.nan, 0.2, np.nan, np.nan], 'object'),
        ([2, 2, 1], [0.2, 0.2, 0.1], 'int64'),
        ([2, 2, 'x', 1], [0.2, 0.2, np.nan, 0.1], 'object'),
    ]
    for keys, values, dtype in small_cases:
        expected = make_series(values, keys, dtype)
        result = ser.loc[keys]
        assert_series_equal(result, expected, check_index_type=True)

    # raises as nothing is in the index
    self.assertRaises(KeyError, lambda : ser.loc[[3, 3, 3]])

    expected = make_series([0.2, 0.2, np.nan], [2, 2, 3])
    result = ser.loc[[2, 2, 3]]
    assert_series_equal(result, expected, check_index_type=True)

    # loc on longer series:
    # (source data, source labels, keys, expected values)
    larger_cases = [
        ([0.1, 0.2, 0.3], [1, 2, 3], [3, 4, 4], [0.3, np.nan, np.nan]),
        ([0.1, 0.2, 0.3, 0.4], [1, 2, 3, 4], [5, 3, 3], [np.nan, 0.3, 0.3]),
        ([0.1, 0.2, 0.3, 0.4], [1, 2, 3, 4], [5, 4, 4], [np.nan, 0.4, 0.4]),
        ([0.1, 0.2, 0.3, 0.4], [4, 5, 6, 7], [7, 2, 2], [0.4, np.nan, np.nan]),
        ([0.1, 0.2, 0.3, 0.4], [1, 2, 3, 4], [4, 5, 5], [0.4, np.nan, np.nan]),
    ]
    for data, labels, keys, values in larger_cases:
        expected = make_series(values, keys)
        result = make_series(data, labels).loc[keys]
        assert_series_equal(result, expected, check_index_type=True)

    # iloc
    expected = make_series([0.2, 0.2, 0.1, 0.1], [2, 2, 1, 1])
    result = ser.iloc[[1,1,0,0]]
    assert_series_equal(result, expected, check_index_type=True)

def test_series_partial_set_datetime(self):
    # GH 11497
    idx = date_range('2011-01-01', '2011-01-02', freq='D', name='idx')
    ser = Series([0.1, 0.2], index=idx, name='s')

    day1 = Timestamp('2011-01-01')
    day2 = Timestamp('2011-01-02')
    day3 = Timestamp('2011-01-03')

    # selecting every label once, in order, gives back an equal Series
    exp = Series([0.1, 0.2], index=idx, name='s')
    result = ser.loc[[day1, day2]]
    assert_series_equal(result, exp, check_index_type=True)

    # duplicated keys first, then a duplicated key absent from the index
    cases = [
        ([day2, day2, day1], [0.2, 0.2, 0.1]),
        ([day3, day2, day3], [np.nan, 0.2, np.nan]),
    ]
    for keys, values in cases:
        exp_idx = pd.DatetimeIndex(keys, name='idx')
        exp = Series(values, index=exp_idx, name='s')
        assert_series_equal(ser.loc[keys], exp, check_index_type=True)

def test_series_partial_set_period(self):
    # GH 11497
    idx = pd.period_range('2011-01-01', '2011-01-02', freq='D', name='idx')
    ser = Series([0.1, 0.2], index=idx, name='s')

    day1 = pd.Period('2011-01-01', freq='D')
    day2 = pd.Period('2011-01-02', freq='D')
    day3 = pd.Period('2011-01-03', freq='D')

    # selecting every label once, in order, gives back an equal Series
    exp = Series([0.1, 0.2], index=idx, name='s')
    result = ser.loc[[day1, day2]]
    assert_series_equal(result, exp, check_index_type=True)

    # duplicated keys first, then a duplicated key absent from the index
    cases = [
        ([day2, day2, day1], [0.2, 0.2, 0.1]),
        ([day3, day2, day3], [np.nan, 0.2, np.nan]),
    ]
    for keys, values in cases:
        exp_idx = pd.PeriodIndex(keys, name='idx')
        exp = Series(values, index=exp_idx, name='s')
        assert_series_equal(ser.loc[keys], exp, check_index_type=True)

def test_partial_set_invalid(self):

Expand Down
4 changes: 2 additions & 2 deletions pandas/tseries/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ def _join_i8_wrapper(joinf, **kwargs):
'is_quarter_start','is_quarter_end','is_year_start','is_year_end',
'tz','freq']
_is_numeric_dtype = False

_infer_as_myclass = True

@deprecate_kwarg(old_arg_name='infer_dst', new_arg_name='ambiguous',
mapping={True: 'infer', False: 'raise'})
Expand Down Expand Up @@ -778,7 +778,7 @@ def astype(self, dtype):
elif dtype == _NS_DTYPE and self.tz is not None:
return self.tz_convert('UTC').tz_localize(None)
elif dtype == str:
return self._shallow_copy(values=self.format(), infer=True)
return Index(self.format(), name=self.name, dtype=object)
else: # pragma: no cover
raise ValueError('Cannot cast DatetimeIndex to dtype %s' % dtype)

Expand Down
Loading