From 5029592fdb53da07e6646fa629311ce10af4fbf8 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Thu, 10 Aug 2017 14:59:15 +0200 Subject: [PATCH 1/5] REF: use inheritance to concatenate categorical Indexes --- pandas/core/indexes/base.py | 7 +++---- pandas/core/indexes/category.py | 4 ++++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4aecc75d95971..4a23389fba8d4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1741,10 +1741,9 @@ def append(self, other): names = set([obj.name for obj in to_concat]) name = None if len(names) > 1 else self.name - if self.is_categorical(): - # if calling index is category, don't check dtype of others - from pandas.core.indexes.category import CategoricalIndex - return CategoricalIndex._append_same_dtype(self, to_concat, name) + return self._concat(to_concat, name) + + def _concat(self, to_concat, name): typs = _concat.get_dtype_kinds(to_concat) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index ac4698b570d17..c0d23bcd4f32b 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -633,6 +633,10 @@ def insert(self, loc, item): codes = np.concatenate((codes[:loc], code, codes[loc:])) return self._create_from_codes(codes) + def _concat(self, to_concat, name): + # if calling index is category, don't check dtype of others + return CategoricalIndex._append_same_dtype(self, to_concat, name) + def _append_same_dtype(self, to_concat, name): """ Concatenate to_concat which has the same class From d5c7d77c5cc4914270b6cf9f979fdc23dd2a5073 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Thu, 10 Aug 2017 09:57:56 +0200 Subject: [PATCH 2/5] REF: Avoid code duplication in RangeIndex.append --- pandas/core/dtypes/common.py | 6 +++- pandas/core/dtypes/concat.py | 38 +++++++++++++++++++++++ pandas/core/indexes/range.py | 60 +++--------------------------------- 3 files changed, 47 insertions(+), 57 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 37f99bd344e6c..4085acce10c61 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -12,7 +12,7 @@ from .generic import (ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries, ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex, - ABCIndexClass) + ABCIndexClass, ABCRangeIndex) from .inference import is_string_like from .inference import * # noqa @@ -220,6 +220,10 @@ def is_categorical(arr): return isinstance(arr, ABCCategorical) or is_categorical_dtype(arr) +def is_range(arr): + return isinstance(arr, ABCRangeIndex) + + def is_datetimetz(arr): """ Check whether an array-like is a datetime array-like with a timezone diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 292d5f608d4cb..0bb8d0021c6cf 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -15,6 +15,7 @@ is_object_dtype, is_bool_dtype, is_dtype_equal, + is_range, _NS_DTYPE, _TD_DTYPE) from pandas.core.dtypes.generic import ( @@ -45,6 +46,8 @@ def get_dtype_kinds(l): # if to_concat contains different tz, # the result must be object dtype typ = str(arr.dtype) + elif is_range(arr): + typ = 'range' elif is_datetime64_dtype(dtype): typ = 'datetime' elif is_timedelta64_dtype(dtype): @@ -559,3 +562,38 @@ def convert_sparse(x, axis): # coerce to object if needed result = result.astype('object') return result + + +def _concat_indexes_same_dtype_rangeindex(indexes): + + start = step = next = None + + for obj in indexes: + if not len(obj): + continue + + if start is None: + # This is set by the first non-empty index + start = obj._start + if step is None and len(obj) > 1: + step = obj._step + elif step is None: + # First non-empty index had only one element + if obj._start == start: + return _concat_index_asobject(indexes) + step = obj._start - start + + non_consecutive = ((step != obj._step and len(obj) > 1) or + (next is not None and obj._start != next)) + if non_consecutive: + # Not nice... but currently what happens in NumericIndex: + return _concat_index_asobject(indexes) + + if step is not None: + next = obj[-1] + step + + if start is None: + start = obj._start + step = obj._step + stop = obj._stop if next is None else next + return indexes[0].__class__(start, stop, step) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 5071b50bbebdf..0eee31e5e47a6 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -14,6 +14,7 @@ from pandas.compat.numpy import function as nv from pandas.core.indexes.base import Index, _index_shared_docs from pandas.util._decorators import Appender, cache_readonly +import pandas.core.dtypes.concat as _concat import pandas.core.indexes.base as ibase from pandas.core.indexes.numeric import Int64Index @@ -443,62 +444,9 @@ def join(self, other, how='left', level=None, return_indexers=False, return super(RangeIndex, self).join(other, how, level, return_indexers, sort) - def append(self, other): - """ - Append a collection of Index options together - - Parameters - ---------- - other : Index or list/tuple of indices - - Returns - ------- - appended : RangeIndex if all indexes are consecutive RangeIndexes, - otherwise Int64Index or Index - """ - - to_concat = [self] - - if isinstance(other, (list, tuple)): - to_concat = to_concat + list(other) - else: - to_concat.append(other) - - if not all([isinstance(i, RangeIndex) for i in to_concat]): - return super(RangeIndex, self).append(other) - - start = step = next = None - - for obj in to_concat: - if not len(obj): - continue - - if start is None: - # This is set by the first non-empty index - start = obj._start - if step is None and len(obj) > 1: - step = obj._step - elif step is None: - # First non-empty index had only one element - if obj._start == start: - return super(RangeIndex, self).append(other) - step = obj._start - start - - non_consecutive = ((step != obj._step and len(obj) > 1) or - (next is not None and obj._start != next)) - if non_consecutive: - return super(RangeIndex, self).append(other) - - if step is not None: - next = obj[-1] + step - - if start is None: - start = obj._start - step = obj._step - stop = obj._stop if next is None else next - names = set([obj.name for obj in to_concat]) - name = None if len(names) > 1 else self.name - return RangeIndex(start, stop, step, name=name) + def _append_same_dtype(self, indexes, name): + return _concat._concat_indexes_same_dtype_rangeindex(indexes + ).rename(name) def __len__(self): """ From 44800a4d908ed9ba6cdba24ab4dca7f3e33722a1 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Sun, 20 Aug 2017 19:38:52 +0200 Subject: [PATCH 3/5] Comment --- pandas/core/dtypes/concat.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 0bb8d0021c6cf..4fe402d675e53 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -565,6 +565,11 @@ def convert_sparse(x, axis): def _concat_indexes_same_dtype_rangeindex(indexes): + # Concatenates multiple RangeIndex instances. All members of "indexes" must + # be of type RangeIndex; result will be RangeIndex if possible, Int64Index + # otherwise. E.g.: + # indexes = [RangeIndex(3), RangeIndex(3, 6)] -> RangeIndex(6) + # indexes = [RangeIndex(3), RangeIndex(4, 6)] -> Int64Index([0,1,2,4,5]) start = step = next = None @@ -586,7 +591,9 @@ def _concat_indexes_same_dtype_rangeindex(indexes): non_consecutive = ((step != obj._step and len(obj) > 1) or (next is not None and obj._start != next)) if non_consecutive: - # Not nice... but currently what happens in NumericIndex: + # Int64Index._append_same_dtype([ix.astype(int) for ix in indexes]) + # would be preferred... but it currently resorts to + # _concat_index_asobject anyway. return _concat_index_asobject(indexes) if step is not None: From 528295d6813d956007d328ebdee37d2d0871e1ab Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Mon, 21 Aug 2017 14:38:28 +0200 Subject: [PATCH 4/5] Removed is_range, renamed _concat_rangeindex_same_dtype --- pandas/core/dtypes/common.py | 6 +----- pandas/core/dtypes/concat.py | 21 +++++++++++---------- pandas/core/indexes/range.py | 3 +-- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 4085acce10c61..37f99bd344e6c 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -12,7 +12,7 @@ from .generic import (ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries, ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex, - ABCIndexClass, ABCRangeIndex) + ABCIndexClass) from .inference import is_string_like from .inference import * # noqa @@ -220,10 +220,6 @@ def is_categorical(arr): return isinstance(arr, ABCCategorical) or is_categorical_dtype(arr) -def is_range(arr): - return isinstance(arr, ABCRangeIndex) - - def is_datetimetz(arr): """ Check whether an array-like is a datetime array-like with a timezone diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 4fe402d675e53..0ce45eea119ed 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -15,12 +15,11 @@ is_object_dtype, is_bool_dtype, is_dtype_equal, - is_range, _NS_DTYPE, _TD_DTYPE) from pandas.core.dtypes.generic import ( ABCDatetimeIndex, ABCTimedeltaIndex, - ABCPeriodIndex) + ABCPeriodIndex, ABCRangeIndex) def get_dtype_kinds(l): @@ -42,12 +41,12 @@ def get_dtype_kinds(l): typ = 'category' elif is_sparse(arr): typ = 'sparse' + elif isinstance(arr, ABCRangeIndex): + typ = 'range' elif is_datetimetz(arr): # if to_concat contains different tz, # the result must be object dtype typ = str(arr.dtype) - elif is_range(arr): - typ = 'range' elif is_datetime64_dtype(dtype): typ = 'datetime' elif is_timedelta64_dtype(dtype): @@ -564,12 +563,14 @@ def convert_sparse(x, axis): return result -def _concat_indexes_same_dtype_rangeindex(indexes): - # Concatenates multiple RangeIndex instances. All members of "indexes" must - # be of type RangeIndex; result will be RangeIndex if possible, Int64Index - # otherwise. E.g.: - # indexes = [RangeIndex(3), RangeIndex(3, 6)] -> RangeIndex(6) - # indexes = [RangeIndex(3), RangeIndex(4, 6)] -> Int64Index([0,1,2,4,5]) +def _concat_rangeindex_same_dtype(indexes): + """ + Concatenates multiple RangeIndex instances. All members of "indexes" must + be of type RangeIndex; result will be RangeIndex if possible, Int64Index + otherwise. E.g.: + indexes = [RangeIndex(3), RangeIndex(3, 6)] -> RangeIndex(6) + indexes = [RangeIndex(3), RangeIndex(4, 6)] -> Int64Index([0,1,2,4,5]) + """ start = step = next = None diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 0eee31e5e47a6..f7da97f4611f6 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -445,8 +445,7 @@ def join(self, other, how='left', level=None, return_indexers=False, sort) def _append_same_dtype(self, indexes, name): - return _concat._concat_indexes_same_dtype_rangeindex(indexes - ).rename(name) + return _concat._concat_rangeindex_same_dtype(indexes).rename(name) def __len__(self): """ From 554ee79709cbb5fdd3e8ef8e3e6553e6fa499cf9 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Mon, 21 Aug 2017 15:05:59 +0200 Subject: [PATCH 5/5] _append_same_dtype -> _concat_same_dtype --- pandas/core/indexes/base.py | 4 ++-- pandas/core/indexes/category.py | 4 ++-- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/interval.py | 4 ++-- pandas/core/indexes/range.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4a23389fba8d4..e3c32629f54a0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1748,10 +1748,10 @@ def _concat(self, to_concat, name): typs = _concat.get_dtype_kinds(to_concat) if len(typs) == 1: - return self._append_same_dtype(to_concat, name=name) + return self._concat_same_dtype(to_concat, name=name) return _concat._concat_index_asobject(to_concat, name=name) - def _append_same_dtype(self, to_concat, name): + def _concat_same_dtype(self, to_concat, name): """ Concatenate to_concat which has the same class """ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index c0d23bcd4f32b..f22407308e094 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -635,9 +635,9 @@ def insert(self, loc, item): def _concat(self, to_concat, name): # if calling index is category, don't check dtype of others - return CategoricalIndex._append_same_dtype(self, to_concat, name) + return CategoricalIndex._concat_same_dtype(self, to_concat, name) - def _append_same_dtype(self, to_concat, name): + def _concat_same_dtype(self, to_concat, name): """ Concatenate to_concat which has the same class ValueError if other is not in the categories diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 845c71b6c41d8..c3232627fce74 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -837,7 +837,7 @@ def summary(self, name=None): result = result.replace("'", "") return result - def _append_same_dtype(self, to_concat, name): + def _concat_same_dtype(self, to_concat, name): """ Concatenate to_concat which has the same class """ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index aa2ad21ae37fd..c855dbf82c2af 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -867,7 +867,7 @@ def _as_like_interval_index(self, other, error_msg): raise ValueError(error_msg) return other - def _append_same_dtype(self, to_concat, name): + def _concat_same_dtype(self, to_concat, name): """ assert that we all have the same .closed we allow a 0-len index here as well @@ -876,7 +876,7 @@ def _append_same_dtype(self, to_concat, name): msg = ('can only append two IntervalIndex objects ' 'that are closed on the same side') raise ValueError(msg) - return super(IntervalIndex, self)._append_same_dtype(to_concat, name) + return super(IntervalIndex, self)._concat_same_dtype(to_concat, name) @Appender(_index_shared_docs['take'] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index f7da97f4611f6..3f24bdeac0420 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -444,7 +444,7 @@ def join(self, other, how='left', level=None, return_indexers=False, return super(RangeIndex, self).join(other, how, level, return_indexers, sort) - def _append_same_dtype(self, indexes, name): + def _concat_same_dtype(self, indexes, name): return _concat._concat_rangeindex_same_dtype(indexes).rename(name) def __len__(self):