Skip to content

Commit 431e224

Browse files
committed
Merge pull request #11497 from sinhrks/loc_dtype
BUG: .loc with duplicated label may have incorrect index dtype
2 parents b18b0ab + 5a4ba71 commit 431e224

File tree

6 files changed

+224
-50
lines changed

6 files changed

+224
-50
lines changed

doc/source/whatsnew/v0.18.0.txt

+4
Original file line numberDiff line numberDiff line change
@@ -113,3 +113,7 @@ Bug Fixes
113113
- Bug in ``.loc`` against ``CategoricalIndex`` may result in normal ``Index`` (:issue:`11586`)
114114
- Bug groupby on tz-aware data where selection not returning ``Timestamp`` (:issue:`11616`)
115115
- Bug in timezone info lost when broadcasting scalar datetime to ``DataFrame`` (:issue:`11682`)
116+
117+
118+
- Bug in ``.loc`` where the result for a duplicated key may have an ``Index`` with incorrect dtype (:issue:`11497`)
119+

pandas/core/index.py

+76-34
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,10 @@ class Index(IndexOpsMixin, StringAccessorMixin, PandasObject):
111111
_is_numeric_dtype = False
112112
_can_hold_na = True
113113

114+
# prioritize current class for _shallow_copy_with_infer,
115+
# used to infer integers as datetime-likes
116+
_infer_as_myclass = False
117+
114118
_engine_type = _index.ObjectEngine
115119

116120
def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False,
@@ -209,6 +213,24 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False,
209213
subarr = com._asarray_tuplesafe(data, dtype=object)
210214
return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs)
211215

216+
"""
217+
NOTE for new Index creation:
218+
219+
- _simple_new: It returns a new Index with the same type as the caller.
220+
All metadata (such as name) must be provided by the caller.
221+
Using _shallow_copy is recommended because it fills in this metadata unless otherwise specified.
222+
223+
- _shallow_copy: It returns a new Index with the same type (using _simple_new),
224+
but fills in the caller's metadata unless otherwise specified. Passed kwargs will
225+
overwrite corresponding metadata.
226+
227+
- _shallow_copy_with_infer: It returns a new Index, inferring its type
228+
from the passed values. It fills in the caller's metadata unless otherwise specified, in the
229+
same way as _shallow_copy.
230+
231+
See each method's docstring.
232+
"""
233+
212234
@classmethod
213235
def _simple_new(cls, values, name=None, dtype=None, **kwargs):
214236
"""
@@ -233,6 +255,48 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
233255
result._reset_identity()
234256
return result
235257

258+
def _shallow_copy(self, values=None, **kwargs):
259+
"""
260+
create a new Index with the same class as the caller, don't copy the data,
261+
use the same object attributes with passed in attributes taking precedence
262+
263+
*this is an internal non-public method*
264+
265+
Parameters
266+
----------
267+
values : the values to create the new Index, optional
268+
kwargs : updates the default attributes for this Index
269+
"""
270+
if values is None:
271+
values = self.values
272+
attributes = self._get_attributes_dict()
273+
attributes.update(kwargs)
274+
return self._simple_new(values, **attributes)
275+
276+
def _shallow_copy_with_infer(self, values=None, **kwargs):
277+
"""
278+
create a new Index inferring the class with passed value, don't copy the data,
279+
use the same object attributes with passed in attributes taking precedence
280+
281+
*this is an internal non-public method*
282+
283+
Parameters
284+
----------
285+
values : the values to create the new Index, optional
286+
kwargs : updates the default attributes for this Index
287+
"""
288+
if values is None:
289+
values = self.values
290+
attributes = self._get_attributes_dict()
291+
attributes.update(kwargs)
292+
attributes['copy'] = False
293+
if self._infer_as_myclass:
294+
try:
295+
return self._constructor(values, **attributes)
296+
except (TypeError, ValueError) as e:
297+
pass
298+
return Index(values, **attributes)
299+
236300
def _update_inplace(self, result, **kwargs):
237301
# guard when called from IndexOpsMixin
238302
raise TypeError("Index can't be updated inplace")
@@ -372,31 +436,6 @@ def view(self, cls=None):
372436
result._id = self._id
373437
return result
374438

375-
def _shallow_copy(self, values=None, infer=False, **kwargs):
376-
"""
377-
create a new Index, don't copy the data, use the same object attributes
378-
with passed in attributes taking precedence
379-
380-
*this is an internal non-public method*
381-
382-
Parameters
383-
----------
384-
values : the values to create the new Index, optional
385-
infer : boolean, default False
386-
if True, infer the new type of the passed values
387-
kwargs : updates the default attributes for this Index
388-
"""
389-
if values is None:
390-
values = self.values
391-
attributes = self._get_attributes_dict()
392-
attributes.update(kwargs)
393-
394-
if infer:
395-
attributes['copy'] = False
396-
return Index(values, **attributes)
397-
398-
return self.__class__._simple_new(values,**attributes)
399-
400439
def _coerce_scalar_to_index(self, item):
401440
"""
402441
we need to coerce a scalar to a compat for our index type
@@ -1206,7 +1245,7 @@ def append(self, other):
12061245
to_concat, name = self._ensure_compat_append(other)
12071246
attribs = self._get_attributes_dict()
12081247
attribs['name'] = name
1209-
return self._shallow_copy(np.concatenate(to_concat), infer=True, **attribs)
1248+
return self._shallow_copy_with_infer(np.concatenate(to_concat), **attribs)
12101249

12111250
@staticmethod
12121251
def _ensure_compat_concat(indexes):
@@ -1725,7 +1764,7 @@ def sym_diff(self, other, result_name=None):
17251764
attribs['name'] = result_name
17261765
if 'freq' in attribs:
17271766
attribs['freq'] = None
1728-
return self._shallow_copy(the_diff, infer=True, **attribs)
1767+
return self._shallow_copy_with_infer(the_diff, **attribs)
17291768

17301769
def get_loc(self, key, method=None, tolerance=None):
17311770
"""
@@ -2199,7 +2238,8 @@ def _reindex_non_unique(self, target):
21992238
new_indexer = np.arange(len(self.take(indexer)))
22002239
new_indexer[~check] = -1
22012240

2202-
return self._shallow_copy(new_labels), indexer, new_indexer
2241+
new_index = self._shallow_copy_with_infer(new_labels, freq=None)
2242+
return new_index, indexer, new_indexer
22032243

22042244
def join(self, other, how='left', level=None, return_indexers=False):
22052245
"""
@@ -2756,8 +2796,7 @@ def delete(self, loc):
27562796
-------
27572797
new_index : Index
27582798
"""
2759-
attribs = self._get_attributes_dict()
2760-
return self._shallow_copy(np.delete(self._data, loc), **attribs)
2799+
return self._shallow_copy(np.delete(self._data, loc))
27612800

27622801
def insert(self, loc, item):
27632802
"""
@@ -2778,8 +2817,7 @@ def insert(self, loc, item):
27782817

27792818
idx = np.concatenate(
27802819
(_self[:loc], item, _self[loc:]))
2781-
attribs = self._get_attributes_dict()
2782-
return self._shallow_copy(idx, infer=True, **attribs)
2820+
return self._shallow_copy_with_infer(idx)
27832821

27842822
def drop(self, labels, errors='raise'):
27852823
"""
@@ -2841,7 +2879,6 @@ def fillna(self, value=None, downcast=None):
28412879
# no need to care about metadata other than name
28422880
# because it can't have freq if
28432881
return Index(result, name=self.name)
2844-
28452882
return self._shallow_copy()
28462883

28472884
def _evaluate_with_timedelta_like(self, other, op, opstr):
@@ -4316,10 +4353,15 @@ def view(self, cls=None):
43164353
result._id = self._id
43174354
return result
43184355

4319-
def _shallow_copy(self, values=None, infer=False, **kwargs):
4356+
def _shallow_copy_with_infer(self, values=None, **kwargs):
4357+
return self._shallow_copy(values, **kwargs)
4358+
4359+
def _shallow_copy(self, values=None, **kwargs):
43204360
if values is not None:
43214361
if 'name' in kwargs:
43224362
kwargs['names'] = kwargs.pop('name',None)
4363+
# discards freq
4364+
kwargs.pop('freq', None)
43234365
return MultiIndex.from_tuples(values, **kwargs)
43244366
return self.view()
43254367

pandas/tests/test_indexing.py

+129-10
Original file line numberDiff line numberDiff line change
@@ -3516,44 +3516,163 @@ def test_series_partial_set(self):
35163516
# Regression from GH4825
35173517
ser = Series([0.1, 0.2], index=[1, 2])
35183518

3519-
# ToDo: check_index_type can be True after GH 11497
3520-
35213519
# loc
35223520
expected = Series([np.nan, 0.2, np.nan], index=[3, 2, 3])
35233521
result = ser.loc[[3, 2, 3]]
3524-
assert_series_equal(result, expected, check_index_type=False)
3522+
assert_series_equal(result, expected, check_index_type=True)
3523+
3524+
expected = Series([np.nan, 0.2, np.nan, np.nan], index=[3, 2, 3, 'x'])
3525+
result = ser.loc[[3, 2, 3, 'x']]
3526+
assert_series_equal(result, expected, check_index_type=True)
3527+
3528+
expected = Series([0.2, 0.2, 0.1], index=[2, 2, 1])
3529+
result = ser.loc[[2, 2, 1]]
3530+
assert_series_equal(result, expected, check_index_type=True)
3531+
3532+
expected = Series([0.2, 0.2, np.nan, 0.1], index=[2, 2, 'x', 1])
3533+
result = ser.loc[[2, 2, 'x', 1]]
3534+
assert_series_equal(result, expected, check_index_type=True)
35253535

35263536
# raises as nothing is in the index
35273537
self.assertRaises(KeyError, lambda : ser.loc[[3, 3, 3]])
35283538

35293539
expected = Series([0.2, 0.2, np.nan], index=[2, 2, 3])
35303540
result = ser.loc[[2, 2, 3]]
3531-
assert_series_equal(result, expected, check_index_type=False)
3541+
assert_series_equal(result, expected, check_index_type=True)
35323542

35333543
expected = Series([0.3, np.nan, np.nan], index=[3, 4, 4])
35343544
result = Series([0.1, 0.2, 0.3], index=[1, 2, 3]).loc[[3, 4, 4]]
3535-
assert_series_equal(result, expected, check_index_type=False)
3545+
assert_series_equal(result, expected, check_index_type=True)
35363546

35373547
expected = Series([np.nan, 0.3, 0.3], index=[5, 3, 3])
35383548
result = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]).loc[[5, 3, 3]]
3539-
assert_series_equal(result, expected, check_index_type=False)
3549+
assert_series_equal(result, expected, check_index_type=True)
35403550

35413551
expected = Series([np.nan, 0.4, 0.4], index=[5, 4, 4])
35423552
result = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]).loc[[5, 4, 4]]
3543-
assert_series_equal(result, expected, check_index_type=False)
3553+
assert_series_equal(result, expected, check_index_type=True)
35443554

35453555
expected = Series([0.4, np.nan, np.nan], index=[7, 2, 2])
35463556
result = Series([0.1, 0.2, 0.3, 0.4], index=[4, 5, 6, 7]).loc[[7, 2, 2]]
3547-
assert_series_equal(result, expected, check_index_type=False)
3557+
assert_series_equal(result, expected, check_index_type=True)
35483558

35493559
expected = Series([0.4, np.nan, np.nan], index=[4, 5, 5])
35503560
result = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]).loc[[4, 5, 5]]
3551-
assert_series_equal(result, expected, check_index_type=False)
3561+
assert_series_equal(result, expected, check_index_type=True)
35523562

35533563
# iloc
35543564
expected = Series([0.2, 0.2, 0.1, 0.1], index=[2, 2, 1, 1])
35553565
result = ser.iloc[[1, 1, 0, 0]]
3556-
assert_series_equal(result, expected, check_index_type=False)
3566+
assert_series_equal(result, expected, check_index_type=True)
3567+
3568+
def test_series_partial_set_with_name(self):
3569+
# GH 11497
3570+
3571+
idx = Index([1, 2], dtype='int64', name='idx')
3572+
ser = Series([0.1, 0.2], index=idx, name='s')
3573+
3574+
# loc
3575+
exp_idx = Index([3, 2, 3], dtype='int64', name='idx')
3576+
expected = Series([np.nan, 0.2, np.nan], index=exp_idx, name='s')
3577+
result = ser.loc[[3, 2, 3]]
3578+
assert_series_equal(result, expected, check_index_type=True)
3579+
3580+
exp_idx = Index([3, 2, 3, 'x'], dtype='object', name='idx')
3581+
expected = Series([np.nan, 0.2, np.nan, np.nan], index=exp_idx, name='s')
3582+
result = ser.loc[[3, 2, 3, 'x']]
3583+
assert_series_equal(result, expected, check_index_type=True)
3584+
3585+
exp_idx = Index([2, 2, 1], dtype='int64', name='idx')
3586+
expected = Series([0.2, 0.2, 0.1], index=exp_idx, name='s')
3587+
result = ser.loc[[2, 2, 1]]
3588+
assert_series_equal(result, expected, check_index_type=True)
3589+
3590+
exp_idx = Index([2, 2, 'x', 1], dtype='object', name='idx')
3591+
expected = Series([0.2, 0.2, np.nan, 0.1], index=exp_idx, name='s')
3592+
result = ser.loc[[2, 2, 'x', 1]]
3593+
assert_series_equal(result, expected, check_index_type=True)
3594+
3595+
# raises as nothing is in the index
3596+
self.assertRaises(KeyError, lambda : ser.loc[[3, 3, 3]])
3597+
3598+
exp_idx = Index([2, 2, 3], dtype='int64', name='idx')
3599+
expected = Series([0.2, 0.2, np.nan], index=exp_idx, name='s')
3600+
result = ser.loc[[2, 2, 3]]
3601+
assert_series_equal(result, expected, check_index_type=True)
3602+
3603+
exp_idx = Index([3, 4, 4], dtype='int64', name='idx')
3604+
expected = Series([0.3, np.nan, np.nan], index=exp_idx, name='s')
3605+
idx = Index([1, 2, 3], dtype='int64', name='idx')
3606+
result = Series([0.1, 0.2, 0.3], index=idx, name='s').loc[[3, 4, 4]]
3607+
assert_series_equal(result, expected, check_index_type=True)
3608+
3609+
exp_idx = Index([5, 3, 3], dtype='int64', name='idx')
3610+
expected = Series([np.nan, 0.3, 0.3], index=exp_idx, name='s')
3611+
idx = Index([1, 2, 3, 4], dtype='int64', name='idx')
3612+
result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name='s').loc[[5, 3, 3]]
3613+
assert_series_equal(result, expected, check_index_type=True)
3614+
3615+
exp_idx = Index([5, 4, 4], dtype='int64', name='idx')
3616+
expected = Series([np.nan, 0.4, 0.4], index=exp_idx, name='s')
3617+
idx = Index([1, 2, 3, 4], dtype='int64', name='idx')
3618+
result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name='s').loc[[5, 4, 4]]
3619+
assert_series_equal(result, expected, check_index_type=True)
3620+
3621+
exp_idx = Index([7, 2, 2], dtype='int64', name='idx')
3622+
expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s')
3623+
idx = Index([4, 5, 6, 7], dtype='int64', name='idx')
3624+
result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name='s').loc[[7, 2, 2]]
3625+
assert_series_equal(result, expected, check_index_type=True)
3626+
3627+
exp_idx = Index([4, 5, 5], dtype='int64', name='idx')
3628+
expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s')
3629+
idx = Index([1, 2, 3, 4], dtype='int64', name='idx')
3630+
result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name='s').loc[[4, 5, 5]]
3631+
assert_series_equal(result, expected, check_index_type=True)
3632+
3633+
# iloc
3634+
exp_idx = Index([2, 2, 1, 1], dtype='int64', name='idx')
3635+
expected = Series([0.2, 0.2, 0.1, 0.1], index=exp_idx, name='s')
3636+
result = ser.iloc[[1,1,0,0]]
3637+
assert_series_equal(result, expected, check_index_type=True)
3638+
3639+
def test_series_partial_set_datetime(self):
3640+
# GH 11497
3641+
3642+
idx = date_range('2011-01-01', '2011-01-02', freq='D', name='idx')
3643+
ser = Series([0.1, 0.2], index=idx, name='s')
3644+
3645+
result = ser.loc[[Timestamp('2011-01-01'), Timestamp('2011-01-02')]]
3646+
exp = Series([0.1, 0.2], index=idx, name='s')
3647+
assert_series_equal(result, exp, check_index_type=True)
3648+
3649+
keys = [Timestamp('2011-01-02'), Timestamp('2011-01-02'), Timestamp('2011-01-01')]
3650+
exp = Series([0.2, 0.2, 0.1], index=pd.DatetimeIndex(keys, name='idx'), name='s')
3651+
assert_series_equal(ser.loc[keys], exp, check_index_type=True)
3652+
3653+
keys = [Timestamp('2011-01-03'), Timestamp('2011-01-02'), Timestamp('2011-01-03')]
3654+
exp = Series([np.nan, 0.2, np.nan], index=pd.DatetimeIndex(keys, name='idx'), name='s')
3655+
assert_series_equal(ser.loc[keys], exp, check_index_type=True)
3656+
3657+
def test_series_partial_set_period(self):
3658+
# GH 11497
3659+
3660+
idx = pd.period_range('2011-01-01', '2011-01-02', freq='D', name='idx')
3661+
ser = Series([0.1, 0.2], index=idx, name='s')
3662+
3663+
result = ser.loc[[pd.Period('2011-01-01', freq='D'), pd.Period('2011-01-02', freq='D')]]
3664+
exp = Series([0.1, 0.2], index=idx, name='s')
3665+
assert_series_equal(result, exp, check_index_type=True)
3666+
3667+
keys = [pd.Period('2011-01-02', freq='D'), pd.Period('2011-01-02', freq='D'),
3668+
pd.Period('2011-01-01', freq='D')]
3669+
exp = Series([0.2, 0.2, 0.1], index=pd.PeriodIndex(keys, name='idx'), name='s')
3670+
assert_series_equal(ser.loc[keys], exp, check_index_type=True)
3671+
3672+
keys = [pd.Period('2011-01-03', freq='D'), pd.Period('2011-01-02', freq='D'),
3673+
pd.Period('2011-01-03', freq='D')]
3674+
exp = Series([np.nan, 0.2, np.nan], index=pd.PeriodIndex(keys, name='idx'), name='s')
3675+
assert_series_equal(ser.loc[keys], exp, check_index_type=True)
35573676

35583677
def test_partial_set_invalid(self):
35593678

pandas/tseries/index.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ def _join_i8_wrapper(joinf, **kwargs):
197197
'is_quarter_start','is_quarter_end','is_year_start','is_year_end',
198198
'tz','freq']
199199
_is_numeric_dtype = False
200-
200+
_infer_as_myclass = True
201201

202202
@deprecate_kwarg(old_arg_name='infer_dst', new_arg_name='ambiguous',
203203
mapping={True: 'infer', False: 'raise'})
@@ -778,7 +778,7 @@ def astype(self, dtype):
778778
elif dtype == _NS_DTYPE and self.tz is not None:
779779
return self.tz_convert('UTC').tz_localize(None)
780780
elif dtype == str:
781-
return self._shallow_copy(values=self.format(), infer=True)
781+
return Index(self.format(), name=self.name, dtype=object)
782782
else: # pragma: no cover
783783
raise ValueError('Cannot cast DatetimeIndex to dtype %s' % dtype)
784784

0 commit comments

Comments
 (0)