diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index d4ed68b9f4343..42444e05783c2 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -112,3 +112,7 @@ Bug Fixes - Bug in ``.loc`` against ``CategoricalIndex`` may result in normal ``Index`` (:issue:`11586`) - Bug groupby on tz-aware data where selection not returning ``Timestamp`` (:issue:`11616`) - Bug in timezone info lost when broadcasting scalar datetime to ``DataFrame`` (:issue:`11682`) + + +- Bug in ``.loc`` where a result with duplicated keys may have an ``Index`` with an incorrect dtype (:issue:`11497`) + diff --git a/pandas/core/index.py b/pandas/core/index.py index 2099c1996b66b..fa23f2e1efe3f 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -111,6 +111,10 @@ class Index(IndexOpsMixin, StringAccessorMixin, PandasObject): _is_numeric_dtype = False _can_hold_na = True + # prioritize current class for _shallow_copy_with_infer, + # used to infer integers as datetime-likes + _infer_as_myclass = False + _engine_type = _index.ObjectEngine def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, @@ -209,6 +213,24 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, subarr = com._asarray_tuplesafe(data, dtype=object) return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs) + """ + NOTE for new Index creation: + + - _simple_new: It returns a new Index with the same type as the caller. + All metadata (such as name) must be provided by the caller. + Using _shallow_copy is recommended because it fills in this metadata unless otherwise specified. + + - _shallow_copy: It returns a new Index with the same type (using _simple_new), + but fills in the caller's metadata unless otherwise specified. Passed kwargs will + overwrite the corresponding metadata. + + - _shallow_copy_with_infer: It returns a new Index, inferring its type + from the passed values. 
It fills in the caller's metadata unless otherwise specified, the same as _shallow_copy does. + + See each method's docstring. + """ + @classmethod def _simple_new(cls, values, name=None, dtype=None, **kwargs): """ @@ -233,6 +255,48 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): result._reset_identity() return result + def _shallow_copy(self, values=None, **kwargs): + """ + create a new Index with the same class as the caller, don't copy the data, + use the same object attributes with passed in attributes taking precedence + + *this is an internal non-public method* + + Parameters + ---------- + values : the values to create the new Index, optional + kwargs : updates the default attributes for this Index + """ + if values is None: + values = self.values + attributes = self._get_attributes_dict() + attributes.update(kwargs) + return self._simple_new(values, **attributes) + + def _shallow_copy_with_infer(self, values=None, **kwargs): + """ + create a new Index inferring the class with passed value, don't copy the data, + use the same object attributes with passed in attributes taking precedence + + *this is an internal non-public method* + + Parameters + ---------- + values : the values to create the new Index, optional + kwargs : updates the default attributes for this Index + """ + if values is None: + values = self.values + attributes = self._get_attributes_dict() + attributes.update(kwargs) + attributes['copy'] = False + if self._infer_as_myclass: + try: + return self._constructor(values, **attributes) + except (TypeError, ValueError) as e: + pass + return Index(values, **attributes) + def _update_inplace(self, result, **kwargs): # guard when called from IndexOpsMixin raise TypeError("Index can't be updated inplace") @@ -372,31 +436,6 @@ def view(self, cls=None): result._id = self._id return result - def _shallow_copy(self, values=None, infer=False, **kwargs): - """ - create a new Index, don't copy the data, use the same object attributes - with passed in 
attributes taking precedence - - *this is an internal non-public method* - - Parameters - ---------- - values : the values to create the new Index, optional - infer : boolean, default False - if True, infer the new type of the passed values - kwargs : updates the default attributes for this Index - """ - if values is None: - values = self.values - attributes = self._get_attributes_dict() - attributes.update(kwargs) - - if infer: - attributes['copy'] = False - return Index(values, **attributes) - - return self.__class__._simple_new(values,**attributes) - def _coerce_scalar_to_index(self, item): """ we need to coerce a scalar to a compat for our index type @@ -1206,7 +1245,7 @@ def append(self, other): to_concat, name = self._ensure_compat_append(other) attribs = self._get_attributes_dict() attribs['name'] = name - return self._shallow_copy(np.concatenate(to_concat), infer=True, **attribs) + return self._shallow_copy_with_infer(np.concatenate(to_concat), **attribs) @staticmethod def _ensure_compat_concat(indexes): @@ -1725,7 +1764,7 @@ def sym_diff(self, other, result_name=None): attribs['name'] = result_name if 'freq' in attribs: attribs['freq'] = None - return self._shallow_copy(the_diff, infer=True, **attribs) + return self._shallow_copy_with_infer(the_diff, **attribs) def get_loc(self, key, method=None, tolerance=None): """ @@ -2199,7 +2238,8 @@ def _reindex_non_unique(self, target): new_indexer = np.arange(len(self.take(indexer))) new_indexer[~check] = -1 - return self._shallow_copy(new_labels), indexer, new_indexer + new_index = self._shallow_copy_with_infer(new_labels, freq=None) + return new_index, indexer, new_indexer def join(self, other, how='left', level=None, return_indexers=False): """ @@ -2756,8 +2796,7 @@ def delete(self, loc): ------- new_index : Index """ - attribs = self._get_attributes_dict() - return self._shallow_copy(np.delete(self._data, loc), **attribs) + return self._shallow_copy(np.delete(self._data, loc)) def insert(self, loc, item): """ 
@@ -2778,8 +2817,7 @@ def insert(self, loc, item): idx = np.concatenate( (_self[:loc], item, _self[loc:])) - attribs = self._get_attributes_dict() - return self._shallow_copy(idx, infer=True, **attribs) + return self._shallow_copy_with_infer(idx) def drop(self, labels, errors='raise'): """ @@ -2841,7 +2879,6 @@ def fillna(self, value=None, downcast=None): # no need to care metadata other than name # because it can't have freq if return Index(result, name=self.name) - return self._shallow_copy() def _evaluate_with_timedelta_like(self, other, op, opstr): @@ -4316,10 +4353,15 @@ def view(self, cls=None): result._id = self._id return result - def _shallow_copy(self, values=None, infer=False, **kwargs): + def _shallow_copy_with_infer(self, values=None, **kwargs): + return self._shallow_copy(values, **kwargs) + + def _shallow_copy(self, values=None, **kwargs): if values is not None: if 'name' in kwargs: kwargs['names'] = kwargs.pop('name',None) + # discards freq + kwargs.pop('freq', None) return MultiIndex.from_tuples(values, **kwargs) return self.view() diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 66850ab29af39..c6d80a08ad61a 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -3516,44 +3516,163 @@ def test_series_partial_set(self): # Regression from GH4825 ser = Series([0.1, 0.2], index=[1, 2]) - # ToDo: check_index_type can be True after GH 11497 - # loc expected = Series([np.nan, 0.2, np.nan], index=[3, 2, 3]) result = ser.loc[[3, 2, 3]] - assert_series_equal(result, expected, check_index_type=False) + assert_series_equal(result, expected, check_index_type=True) + + expected = Series([np.nan, 0.2, np.nan, np.nan], index=[3, 2, 3, 'x']) + result = ser.loc[[3, 2, 3, 'x']] + assert_series_equal(result, expected, check_index_type=True) + + expected = Series([0.2, 0.2, 0.1], index=[2, 2, 1]) + result = ser.loc[[2, 2, 1]] + assert_series_equal(result, expected, check_index_type=True) + + expected = 
Series([0.2, 0.2, np.nan, 0.1], index=[2, 2, 'x', 1]) + result = ser.loc[[2, 2, 'x', 1]] + assert_series_equal(result, expected, check_index_type=True) # raises as nothing in in the index self.assertRaises(KeyError, lambda : ser.loc[[3, 3, 3]]) expected = Series([0.2, 0.2, np.nan], index=[2, 2, 3]) result = ser.loc[[2, 2, 3]] - assert_series_equal(result, expected, check_index_type=False) + assert_series_equal(result, expected, check_index_type=True) expected = Series([0.3, np.nan, np.nan], index=[3, 4, 4]) result = Series([0.1, 0.2, 0.3], index=[1, 2, 3]).loc[[3, 4, 4]] - assert_series_equal(result, expected, check_index_type=False) + assert_series_equal(result, expected, check_index_type=True) expected = Series([np.nan, 0.3, 0.3], index=[5, 3, 3]) result = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]).loc[[5, 3, 3]] - assert_series_equal(result, expected, check_index_type=False) + assert_series_equal(result, expected, check_index_type=True) expected = Series([np.nan, 0.4, 0.4], index=[5, 4, 4]) result = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]).loc[[5, 4, 4]] - assert_series_equal(result, expected, check_index_type=False) + assert_series_equal(result, expected, check_index_type=True) expected = Series([0.4, np.nan, np.nan], index=[7, 2, 2]) result = Series([0.1, 0.2, 0.3, 0.4], index=[4, 5, 6, 7]).loc[[7, 2, 2]] - assert_series_equal(result, expected, check_index_type=False) + assert_series_equal(result, expected, check_index_type=True) expected = Series([0.4, np.nan, np.nan], index=[4, 5, 5]) result = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]).loc[[4, 5, 5]] - assert_series_equal(result, expected, check_index_type=False) + assert_series_equal(result, expected, check_index_type=True) # iloc expected = Series([0.2, 0.2, 0.1, 0.1], index=[2, 2, 1, 1]) result = ser.iloc[[1, 1, 0, 0]] - assert_series_equal(result, expected, check_index_type=False) + assert_series_equal(result, expected, check_index_type=True) + + def 
test_series_partial_set_with_name(self): + # GH 11497 + + idx = Index([1, 2], dtype='int64', name='idx') + ser = Series([0.1, 0.2], index=idx, name='s') + + # loc + exp_idx = Index([3, 2, 3], dtype='int64', name='idx') + expected = Series([np.nan, 0.2, np.nan], index=exp_idx, name='s') + result = ser.loc[[3, 2, 3]] + assert_series_equal(result, expected, check_index_type=True) + + exp_idx = Index([3, 2, 3, 'x'], dtype='object', name='idx') + expected = Series([np.nan, 0.2, np.nan, np.nan], index=exp_idx, name='s') + result = ser.loc[[3, 2, 3, 'x']] + assert_series_equal(result, expected, check_index_type=True) + + exp_idx = Index([2, 2, 1], dtype='int64', name='idx') + expected = Series([0.2, 0.2, 0.1], index=exp_idx, name='s') + result = ser.loc[[2, 2, 1]] + assert_series_equal(result, expected, check_index_type=True) + + exp_idx = Index([2, 2, 'x', 1], dtype='object', name='idx') + expected = Series([0.2, 0.2, np.nan, 0.1], index=exp_idx, name='s') + result = ser.loc[[2, 2, 'x', 1]] + assert_series_equal(result, expected, check_index_type=True) + + # raises as nothing in in the index + self.assertRaises(KeyError, lambda : ser.loc[[3, 3, 3]]) + + exp_idx = Index([2, 2, 3], dtype='int64', name='idx') + expected = Series([0.2, 0.2, np.nan], index=exp_idx, name='s') + result = ser.loc[[2, 2, 3]] + assert_series_equal(result, expected, check_index_type=True) + + exp_idx = Index([3, 4, 4], dtype='int64', name='idx') + expected = Series([0.3, np.nan, np.nan], index=exp_idx, name='s') + idx = Index([1, 2, 3], dtype='int64', name='idx') + result = Series([0.1, 0.2, 0.3], index=idx, name='s').loc[[3, 4, 4]] + assert_series_equal(result, expected, check_index_type=True) + + exp_idx = Index([5, 3, 3], dtype='int64', name='idx') + expected = Series([np.nan, 0.3, 0.3], index=exp_idx, name='s') + idx = Index([1, 2, 3, 4], dtype='int64', name='idx') + result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name='s').loc[[5, 3, 3]] + assert_series_equal(result, expected, 
check_index_type=True) + + exp_idx = Index([5, 4, 4], dtype='int64', name='idx') + expected = Series([np.nan, 0.4, 0.4], index=exp_idx, name='s') + idx = Index([1, 2, 3, 4], dtype='int64', name='idx') + result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name='s').loc[[5, 4, 4]] + assert_series_equal(result, expected, check_index_type=True) + + exp_idx = Index([7, 2, 2], dtype='int64', name='idx') + expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s') + idx = Index([4, 5, 6, 7], dtype='int64', name='idx') + result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name='s').loc[[7, 2, 2]] + assert_series_equal(result, expected, check_index_type=True) + + exp_idx = Index([4, 5, 5], dtype='int64', name='idx') + expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s') + idx = Index([1, 2, 3, 4], dtype='int64', name='idx') + result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name='s').loc[[4, 5, 5]] + assert_series_equal(result, expected, check_index_type=True) + + # iloc + exp_idx = Index([2, 2, 1, 1], dtype='int64', name='idx') + expected = Series([0.2, 0.2, 0.1, 0.1], index=exp_idx, name='s') + result = ser.iloc[[1,1,0,0]] + assert_series_equal(result, expected, check_index_type=True) + + def test_series_partial_set_datetime(self): + # GH 11497 + + idx = date_range('2011-01-01', '2011-01-02', freq='D', name='idx') + ser = Series([0.1, 0.2], index=idx, name='s') + + result = ser.loc[[Timestamp('2011-01-01'), Timestamp('2011-01-02')]] + exp = Series([0.1, 0.2], index=idx, name='s') + assert_series_equal(result, exp, check_index_type=True) + + keys = [Timestamp('2011-01-02'), Timestamp('2011-01-02'), Timestamp('2011-01-01')] + exp = Series([0.2, 0.2, 0.1], index=pd.DatetimeIndex(keys, name='idx'), name='s') + assert_series_equal(ser.loc[keys], exp, check_index_type=True) + + keys = [Timestamp('2011-01-03'), Timestamp('2011-01-02'), Timestamp('2011-01-03')] + exp = Series([np.nan, 0.2, np.nan], index=pd.DatetimeIndex(keys, name='idx'), name='s') + 
assert_series_equal(ser.loc[keys], exp, check_index_type=True) + + def test_series_partial_set_period(self): + # GH 11497 + + idx = pd.period_range('2011-01-01', '2011-01-02', freq='D', name='idx') + ser = Series([0.1, 0.2], index=idx, name='s') + + result = ser.loc[[pd.Period('2011-01-01', freq='D'), pd.Period('2011-01-02', freq='D')]] + exp = Series([0.1, 0.2], index=idx, name='s') + assert_series_equal(result, exp, check_index_type=True) + + keys = [pd.Period('2011-01-02', freq='D'), pd.Period('2011-01-02', freq='D'), + pd.Period('2011-01-01', freq='D')] + exp = Series([0.2, 0.2, 0.1], index=pd.PeriodIndex(keys, name='idx'), name='s') + assert_series_equal(ser.loc[keys], exp, check_index_type=True) + + keys = [pd.Period('2011-01-03', freq='D'), pd.Period('2011-01-02', freq='D'), + pd.Period('2011-01-03', freq='D')] + exp = Series([np.nan, 0.2, np.nan], index=pd.PeriodIndex(keys, name='idx'), name='s') + assert_series_equal(ser.loc[keys], exp, check_index_type=True) def test_partial_set_invalid(self): diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 4fd61e28233a6..0799c839a024d 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -197,7 +197,7 @@ def _join_i8_wrapper(joinf, **kwargs): 'is_quarter_start','is_quarter_end','is_year_start','is_year_end', 'tz','freq'] _is_numeric_dtype = False - + _infer_as_myclass = True @deprecate_kwarg(old_arg_name='infer_dst', new_arg_name='ambiguous', mapping={True: 'infer', False: 'raise'}) @@ -778,7 +778,7 @@ def astype(self, dtype): elif dtype == _NS_DTYPE and self.tz is not None: return self.tz_convert('UTC').tz_localize(None) elif dtype == str: - return self._shallow_copy(values=self.format(), infer=True) + return Index(self.format(), name=self.name, dtype=object) else: # pragma: no cover raise ValueError('Cannot cast DatetimeIndex to dtype %s' % dtype) diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 578727f515fe4..3f4bba0344ca0 100644 --- a/pandas/tseries/period.py 
+++ b/pandas/tseries/period.py @@ -156,6 +156,8 @@ class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index): _datetimelike_ops = ['year','month','day','hour','minute','second', 'weekofyear','week','dayofweek','weekday','dayofyear','quarter', 'qyear', 'freq', 'days_in_month', 'daysinmonth'] _is_numeric_dtype = False + _infer_as_myclass = True + freq = None __eq__ = _period_index_cmp('__eq__') @@ -279,9 +281,15 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): result._reset_identity() return result - def _shallow_copy(self, values=None, infer=False, **kwargs): + def _shallow_copy_with_infer(self, values=None, **kwargs): """ we always want to return a PeriodIndex """ - return super(PeriodIndex, self)._shallow_copy(values=values, infer=False, **kwargs) + return self._shallow_copy(values=values, **kwargs) + + def _shallow_copy(self, values=None, **kwargs): + if kwargs.get('freq') is None: + # freq must be provided + kwargs['freq'] = self.freq + return super(PeriodIndex, self)._shallow_copy(values=values, **kwargs) def _coerce_scalar_to_index(self, item): """ diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 89229fc48bcb2..5691e29cb0e96 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -129,6 +129,8 @@ def _join_i8_wrapper(joinf, **kwargs): _comparables = ['name', 'freq'] _attributes = ['name', 'freq'] _is_numeric_dtype = True + _infer_as_myclass = True + freq = None def __new__(cls, data=None, unit=None, @@ -514,8 +516,7 @@ def _wrap_joined_index(self, joined, other): name = self.name if self.name == other.name else None if (isinstance(other, TimedeltaIndex) and self.freq == other.freq and self._can_fast_union(other)): - joined = self._shallow_copy(joined) - joined.name = name + joined = self._shallow_copy(joined, name=name) return joined else: return self._simple_new(joined, name)