From 491e3c3dffad6016ad79f36776ea3efcefa89d66 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 19 Aug 2013 10:00:36 -0400 Subject: [PATCH 1/2] TST: GH4604, reindexing with a method of 'ffill' gives incorrect results BUG/CLN: (GH4604) Refactor Series.reindex to core/generic.py allow method= in reindexing on a Series to work API/CLN: GH4604 Infer and downcast dtype if appropriate on ffill/bfill this is for consistency when doing: df.reindex().ffill() and df.reindex(method='ffill') CLN: allow backfill/pad/interpolate to operate on integers (by float conversion) provide downcasting back to original dtype where needed core.internals.interpolate ENH: provide core.index.identical method to compare values and attributes similar to .equals API: changed back to pre-GH3482 where a reindex with no args will by default copy --- doc/source/release.rst | 9 ++- doc/source/v0.13.0.txt | 5 ++ pandas/core/common.py | 56 +++++++++++-- pandas/core/generic.py | 40 +++++++--- pandas/core/index.py | 8 ++ pandas/core/internals.py | 109 ++++++++++++++++++-------- pandas/core/series.py | 81 ++----------------- pandas/sparse/frame.py | 6 +- pandas/sparse/series.py | 13 --- pandas/tests/test_panel.py | 2 +- pandas/tests/test_panel4d.py | 2 +- pandas/tests/test_series.py | 26 +++++- pandas/tools/tests/test_merge.py | 1 + pandas/tseries/index.py | 1 + pandas/tseries/period.py | 1 + pandas/tseries/tests/test_resample.py | 10 +-- 16 files changed, 216 insertions(+), 154 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 390c6e857ba32..ec21172230da9 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -144,8 +144,6 @@ See :ref:`Internal Refactoring` - support attribute access for setting - filter supports same api as original ``DataFrame`` filter -- Reindex called with no arguments will now return a copy of the input object - - Series now inherits from ``NDFrame`` rather than directly from ``ndarray``. There are several minor changes that affect the API. 
@@ -185,6 +183,11 @@ See :ref:`Internal Refactoring` - Indexing with dtype conversions fixed (:issue:`4463`, :issue:`4204`) +- Refactor Series.reindex to core/generic.py (:issue:`4604`), allow ``method=`` in reindexing + on a Series to work + +- Infer and downcast dtype if appropriate on ``ffill/bfill`` (:issue:`4604`) + **Experimental Features** **Bug Fixes** @@ -210,7 +213,7 @@ See :ref:`Internal Refactoring` - In ``to_json``, raise if a passed ``orient`` would cause loss of data because of a duplicate index (:issue:`4359`) - In ``to_json``, fix date handling so milliseconds are the default timestamp - as the docstring says (:issue:`4362`). + as the docstring says (:issue:`4362`). - JSON NaT handling fixed, NaTs are now serialised to `null` (:issue:`4498`) - Fixed passing ``keep_default_na=False`` when ``na_values=None`` (:issue:`4318`) - Fixed bug with ``values`` raising an error on a DataFrame with duplicate columns and mixed diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index 9776c3e4662ec..34733e03d897c 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -237,6 +237,11 @@ and behaviors. Series formerly subclassed directly from ``ndarray``. (:issue:`40 - Indexing with dtype conversions fixed (:issue:`4463`, :issue:`4204`) +- Refactor Series.reindex to core/generic.py (:issue:`4604`), allow ``method=`` in reindexing + on a Series to work + +- Infer and downcast dtype if appropriate on ``ffill/bfill`` (:issue:`4604`) + Bug Fixes ~~~~~~~~~ diff --git a/pandas/core/common.py b/pandas/core/common.py index 5765340f2906a..2bfdaccb32df7 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -961,16 +961,43 @@ def _possibly_downcast_to_dtype(result, dtype): """ try to cast to the specified dtype (e.g. 
convert back to bool/int or could be an astype of float64->float32 """ - if np.isscalar(result): + if np.isscalar(result) or not len(result): return result + if isinstance(dtype, compat.string_types): + if dtype == 'infer': + inferred_type = lib.infer_dtype(_ensure_object(result.ravel())) + if inferred_type == 'boolean': + dtype = 'bool' + elif inferred_type == 'integer': + dtype = 'int64' + elif inferred_type == 'datetime64': + dtype = 'datetime64[ns]' + elif inferred_type == 'timedelta64': + dtype = 'timedelta64[ns]' + + # try to upcast here + elif inferred_type == 'floating': + dtype = 'int64' + + else: + dtype = 'object' + + if isinstance(dtype, compat.string_types): + dtype = np.dtype(dtype) + try: if issubclass(dtype.type, np.floating): return result.astype(dtype) elif dtype == np.bool_ or issubclass(dtype.type, np.integer): - if issubclass(result.dtype.type, np.number) and notnull(result).all(): + if issubclass(result.dtype.type, (np.object_,np.number)) and notnull(result).all(): new_result = result.astype(dtype) if (new_result == result).all(): + + # a comparable, e.g. 
a Decimal may slip in here + if not isinstance(result.ravel()[0], (np.integer,np.floating,np.bool,int,float,bool)): + return result + return new_result except: pass @@ -1052,6 +1079,9 @@ def pad_1d(values, limit=None, mask=None): _method = getattr(algos, 'pad_inplace_%s' % dtype, None) elif is_datetime64_dtype(values): _method = _pad_1d_datetime + elif is_integer_dtype(values): + values = _ensure_float64(values) + _method = algos.pad_inplace_float64 elif values.dtype == np.object_: _method = algos.pad_inplace_object @@ -1062,7 +1092,7 @@ def pad_1d(values, limit=None, mask=None): mask = isnull(values) mask = mask.view(np.uint8) _method(values, mask, limit=limit) - + return values def backfill_1d(values, limit=None, mask=None): @@ -1072,6 +1102,9 @@ def backfill_1d(values, limit=None, mask=None): _method = getattr(algos, 'backfill_inplace_%s' % dtype, None) elif is_datetime64_dtype(values): _method = _backfill_1d_datetime + elif is_integer_dtype(values): + values = _ensure_float64(values) + _method = algos.backfill_inplace_float64 elif values.dtype == np.object_: _method = algos.backfill_inplace_object @@ -1083,7 +1116,7 @@ def backfill_1d(values, limit=None, mask=None): mask = mask.view(np.uint8) _method(values, mask, limit=limit) - + return values def pad_2d(values, limit=None, mask=None): @@ -1093,6 +1126,9 @@ def pad_2d(values, limit=None, mask=None): _method = getattr(algos, 'pad_2d_inplace_%s' % dtype, None) elif is_datetime64_dtype(values): _method = _pad_2d_datetime + elif is_integer_dtype(values): + values = _ensure_float64(values) + _method = algos.pad_2d_inplace_float64 elif values.dtype == np.object_: _method = algos.pad_2d_inplace_object @@ -1108,7 +1144,7 @@ def pad_2d(values, limit=None, mask=None): else: # for test coverage pass - + return values def backfill_2d(values, limit=None, mask=None): @@ -1118,6 +1154,9 @@ def backfill_2d(values, limit=None, mask=None): _method = getattr(algos, 'backfill_2d_inplace_%s' % dtype, None) elif 
is_datetime64_dtype(values): _method = _backfill_2d_datetime + elif is_integer_dtype(values): + values = _ensure_float64(values) + _method = algos.backfill_2d_inplace_float64 elif values.dtype == np.object_: _method = algos.backfill_2d_inplace_object @@ -1133,7 +1172,7 @@ def backfill_2d(values, limit=None, mask=None): else: # for test coverage pass - + return values def interpolate_2d(values, method='pad', axis=0, limit=None, missing=None): """ perform an actual interpolation of values, values will be make 2-d if needed @@ -1153,10 +1192,11 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, missing=None): else: # todo create faster fill func without masking mask = mask_missing(transf(values), missing) + method = _clean_fill_method(method) if method == 'pad': - pad_2d(transf(values), limit=limit, mask=mask) + values = transf(pad_2d(transf(values), limit=limit, mask=mask)) else: - backfill_2d(transf(values), limit=limit, mask=mask) + values = transf(backfill_2d(transf(values), limit=limit, mask=mask)) # reshape back if ndim == 1: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 91c5804d48a78..f845824572751 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1003,11 +1003,15 @@ def reindex(self, *args, **kwargs): except: pass - # perform the reindex on the axes - if copy and not com._count_not_none(*axes.values()): - return self.copy() + # if all axes that are requested to reindex are equal, then only copy if indicated + # must have index names equal here as well as values + if all([ self._get_axis(axis).identical(ax) for axis, ax in axes.items() if ax is not None ]): + if copy: + return self.copy() + return self - return self._reindex_axes(axes, level, limit, method, fill_value, copy, takeable=takeable) + # perform the reindex on the axes + return self._reindex_axes(axes, level, limit, method, fill_value, copy, takeable=takeable)._propogate_attributes(self) def _reindex_axes(self, axes, level, limit, method, 
fill_value, copy, takeable=False): """ perform the reinxed for all the axes """ @@ -1025,7 +1029,8 @@ def _reindex_axes(self, axes, level, limit, method, fill_value, copy, takeable=F new_index, indexer = self._get_axis(a).reindex( labels, level=level, limit=limit, takeable=takeable) obj = obj._reindex_with_indexers( - {axis: [labels, indexer]}, method, fill_value, copy) + {axis: [new_index, indexer]}, method=method, fill_value=fill_value, + limit=limit, copy=copy) return obj @@ -1079,9 +1084,10 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, axis_values = self._get_axis(axis_name) new_index, indexer = axis_values.reindex(labels, method, level, limit=limit, copy_if_needed=True) - return self._reindex_with_indexers({axis: [new_index, indexer]}, method, fill_value, copy) + return self._reindex_with_indexers({axis: [new_index, indexer]}, method=method, fill_value=fill_value, + limit=limit, copy=copy)._propogate_attributes(self) - def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, copy=False): + def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, limit=None, copy=False): # reindex doing multiple operations on different axes if indiciated new_data = self._data @@ -1089,11 +1095,15 @@ def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, cop index, indexer = reindexers[axis] baxis = self._get_block_manager_axis(axis) + if index is None: + continue + index = _ensure_index(index) + # reindex the axis if method is not None: new_data = new_data.reindex_axis( index, method=method, axis=baxis, - fill_value=fill_value, copy=copy) + fill_value=fill_value, limit=limit, copy=copy) elif indexer is not None: # TODO: speed up on homogeneous DataFrame objects @@ -1435,14 +1445,20 @@ def fillna(self, value=None, method=None, axis=0, inplace=False, if self._is_mixed_type and axis == 1: if inplace: raise NotImplementedError() - return self.T.fillna(method=method, limit=limit).T + result 
= self.T.fillna(method=method, limit=limit).T + + # need to downcast here because of all of the transposes + result._data = result._data.downcast() + + return result method = com._clean_fill_method(method) new_data = self._data.interpolate(method=method, axis=axis, limit=limit, inplace=inplace, - coerce=True) + coerce=True, + downcast=downcast) else: if method is not None: raise ValueError('cannot specify both a fill method and value') @@ -1474,11 +1490,11 @@ def fillna(self, value=None, method=None, axis=0, inplace=False, def ffill(self, axis=0, inplace=False, limit=None): return self.fillna(method='ffill', axis=axis, inplace=inplace, - limit=limit) + limit=limit, downcast='infer') def bfill(self, axis=0, inplace=False, limit=None): return self.fillna(method='bfill', axis=axis, inplace=inplace, - limit=limit) + limit=limit, downcast='infer') def replace(self, to_replace=None, value=None, inplace=False, limit=None, regex=False, method=None, axis=None): diff --git a/pandas/core/index.py b/pandas/core/index.py index 73aff7bcab953..aea168654599a 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -83,6 +83,7 @@ class Index(FrozenNDArray): name = None asi8 = None + _comparables = ['name'] _engine_type = _index.ObjectEngine @@ -545,6 +546,13 @@ def equals(self, other): return np.array_equal(self, other) + def identical(self, other): + """ + Similar to equals, but check that other comparable attributes are also equal + """ + return self.equals(other) and all( + [ getattr(self,c,None) == getattr(other,c,None) for c in self._comparables ]) + def asof(self, label): """ For a sorted index, return the most recent label up to and including diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f1578303e6db0..fb277695815d0 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -203,7 +203,7 @@ def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, limit=None new_values, self.items, self.ref_items, ndim=self.ndim, 
fastpath=True, placement=self._ref_locs) - def reindex_items_from(self, new_ref_items, copy=True): + def reindex_items_from(self, new_ref_items, indexer=None, method=None, fill_value=None, limit=None, copy=True): """ Reindex to only those items contained in the input set of items @@ -214,16 +214,26 @@ def reindex_items_from(self, new_ref_items, copy=True): ------- reindexed : Block """ - new_ref_items, indexer = self.items.reindex(new_ref_items) + if indexer is None: + new_ref_items, indexer = self.items.reindex(new_ref_items, limit=limit) + new_items = new_ref_items if indexer is None: - new_items = new_ref_items new_values = self.values.copy() if copy else self.values else: - masked_idx = indexer[indexer != -1] - new_values = com.take_nd(self.values, masked_idx, axis=0, - allow_fill=False) - new_items = self.items.take(masked_idx) + if fill_value is None: + fill_value = self.fill_value + + # single block reindex + if self.ndim == 1: + new_values = com.take_nd(self.values, indexer, axis=0, + fill_value=fill_value) + else: + masked_idx = indexer[indexer != -1] + new_values = com.take_nd(self.values, masked_idx, axis=0, + allow_fill=False) + new_items = self.items.take(masked_idx) + return make_block(new_values, new_items, new_ref_items, ndim=self.ndim, fastpath=True) def get(self, item): @@ -305,22 +315,37 @@ def fillna(self, value, inplace=False, downcast=None): def downcast(self, dtypes=None): """ try to downcast each item to the dict of dtypes if present """ + values = self.values + + # single block handling + if self._is_single_block: + if dtypes is None: + return [ self ] + + nv = _possibly_downcast_to_dtype(values, dtypes) + return [ make_block(nv, self.items, self.ref_items, ndim=self.ndim, fastpath=True) ] + + # ndim > 1 if dtypes is None: dtypes = dict() + elif dtypes == 'infer': + pass - values = self.values blocks = [] for i, item in enumerate(self.items): - dtype = dtypes.get(item, self._downcast_dtype) + if dtypes == 'infer': + dtype = 'infer' + else: 
+ dtype = dtypes.get(item, self._downcast_dtype) + if dtype is None: nv = _block_shape(values[i]) - blocks.append(make_block(nv, [item], self.ref_items)) - continue + else: + nv = _possibly_downcast_to_dtype(values[i], dtype) + nv = _block_shape(nv) - nv = _possibly_downcast_to_dtype(values[i], np.dtype(dtype)) - nv = _block_shape(nv) - blocks.append(make_block(nv, [item], self.ref_items)) + blocks.append(make_block(nv, Index([item]), self.ref_items, ndim=self.ndim, fastpath=True)) return blocks @@ -578,7 +603,8 @@ def create_block(v, m, n, item, reshape=True): return [make_block(new_values, self.items, self.ref_items, fastpath=True)] def interpolate(self, method='pad', axis=0, inplace=False, - limit=None, missing=None, coerce=False): + limit=None, fill_value=None, coerce=False, + downcast=None): # if we are coercing, then don't force the conversion # if the block can't hold the type @@ -590,8 +616,15 @@ def interpolate(self, method='pad', axis=0, inplace=False, return [self.copy()] values = self.values if inplace else self.values.copy() - values = com.interpolate_2d(values, method, axis, limit, missing) - return [make_block(values, self.items, self.ref_items, ndim=self.ndim, klass=self.__class__, fastpath=True)] + values = com.interpolate_2d(values, method, axis, limit, fill_value) + + block = make_block(values, self.items, self.ref_items, ndim=self.ndim, klass=self.__class__, fastpath=True) + + # try to downcast back to original dtype if we can + # as we could have reindexed then down a ffill + if downcast is None: + downcast = 'infer' + return block.downcast(downcast) def take(self, indexer, ref_items, axis=1): if axis < 1: @@ -1290,10 +1323,10 @@ def make_block( return make_block(new_values, items, ref_items, ndim=self.ndim, fastpath=fastpath) def interpolate(self, method='pad', axis=0, inplace=False, - limit=None, missing=None, **kwargs): + limit=None, fill_value=None, **kwargs): values = com.interpolate_2d( - self.values.to_dense(), method, axis, limit, 
missing) + self.values.to_dense(), method, axis, limit, fill_value) return self.make_block(values, self.items, self.ref_items) def fillna(self, value, inplace=False, downcast=None): @@ -1336,7 +1369,7 @@ def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, limit=None fill_value = self.fill_value return self.make_block(self.values.take(indexer), items=self.items, fill_value=fill_value) - def reindex_items_from(self, new_ref_items, copy=True): + def reindex_items_from(self, new_ref_items, indexer=None, method=None, fill_value=None, limit=None, copy=True): """ Reindex to only those items contained in the input set of items @@ -1348,17 +1381,13 @@ def reindex_items_from(self, new_ref_items, copy=True): reindexed : Block """ - # 2-d - if self.ndim >= 2: - if self.items[0] not in self.ref_items: - return None - return self.make_block(self.values, ref_items=new_ref_items, copy=copy) - - # 1-d - new_ref_items, indexer = self.items.reindex(new_ref_items) + # 1-d always + if indexer is None: + new_ref_items, indexer = self.items.reindex(new_ref_items, limit=limit) if indexer is None: indexer = np.arange(len(self.items)) + # note that we DO NOT FILL HERE return self.make_block(com.take_1d(self.values.values, indexer), items=new_ref_items, ref_items=new_ref_items, copy=copy) def sparse_reindex(self, new_index): @@ -2522,15 +2551,18 @@ def reindex_axis(self, new_axis, method=None, axis=0, fill_value=None, limit=Non return self if axis == 0: - if method is not None: - raise AssertionError('method argument not supported for ' - 'axis == 0') + if method is not None or limit is not None: + return self.reindex_axis0_with_method(new_axis, method=method, fill_value=fill_value, limit=limit, copy=copy) return self.reindex_items(new_axis, copy=copy, fill_value=fill_value) new_axis, indexer = cur_axis.reindex( new_axis, method, copy_if_needed=True) return self.reindex_indexer(new_axis, indexer, axis=axis, fill_value=fill_value) + def reindex_axis0_with_method(self, 
new_axis, method=None, fill_value=None, limit=None, copy=True): + raise AssertionError('method argument not supported for ' + 'axis == 0') + def reindex_indexer(self, new_axis, indexer, axis=1, fill_value=None): """ pandas-indexer with -1's only. @@ -2843,19 +2875,28 @@ def shape(self): self._shape = tuple([len(self.axes[0])]) return self._shape - def reindex(self, new_axis, method=None, limit=None, copy=True): + def reindex(self, new_axis, indexer=None, method=None, fill_value=None, limit=None, copy=True): # if we are the same and don't copy, just return if not copy and self.index.equals(new_axis): return self - block = self._block.reindex_items_from(new_axis, copy=copy) + block = self._block.reindex_items_from(new_axis, indexer=indexer, method=method, + fill_value=fill_value, limit=limit, copy=copy) if method is not None or limit is not None: - block = block.interpolate(method=method, limit=limit) + block = block.interpolate(method=method, fill_value=fill_value, limit=limit, downcast=self.dtype) + mgr = SingleBlockManager(block, new_axis) mgr._consolidate_inplace() return mgr + def _reindex_indexer_items(self, new_items, indexer, fill_value): + # equiv to a reindex + return self.reindex(new_items, indexer=indexer, fill_value=fill_value, copy=False) + + def reindex_axis0_with_method(self, new_axis, method=None, fill_value=None, limit=None, copy=True): + return self.reindex(new_axis, method=method, fill_value=fill_value, limit=limit, copy=copy) + def get_slice(self, slobj, raise_on_error=False): if raise_on_error: _check_slice_bounds(slobj, self.index) diff --git a/pandas/core/series.py b/pandas/core/series.py index 2e2026dfdb8ae..051b445638f5b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2737,77 +2737,18 @@ def align(self, other, join='outer', level=None, copy=True, return left, right def _reindex_indexer(self, new_index, indexer, copy): - if indexer is not None: - new_values = com.take_1d(self.values, indexer) - else: + if indexer is None: 
if copy: - result = self.copy() - else: - result = self - return result + return self.copy() + return self # be subclass-friendly + new_values = com.take_1d(self.get_values(), indexer) return self._constructor(new_values, new_index, name=self.name) - def reindex(self, index=None, method=None, level=None, fill_value=pa.NA, - limit=None, copy=True, takeable=False): - """Conform Series to new index with optional filling logic, placing - NA/NaN in locations having no value in the previous index. A new object - is produced unless the new index is equivalent to the current one and - copy=False - - Parameters - ---------- - index : array-like or Index - New labels / index to conform to. Preferably an Index object to - avoid duplicating data - method : {'backfill', 'bfill', 'pad', 'ffill', None} - Method to use for filling holes in reindexed Series - pad / ffill: propagate LAST valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap - copy : boolean, default True - Return a new object, even if the passed indexes are the same - level : int or name - Broadcast across a level, matching Index values on the - passed MultiIndex level - fill_value : scalar, default NaN - Value to use for missing values. 
Defaults to NaN, but can be any - "compatible" value - limit : int, default None - Maximum size gap to forward or backward fill - takeable : the labels are locations (and not labels) - - Returns - ------- - reindexed : Series - """ - if index is None: - raise ValueError('Must pass Index or sequence, not None') - - index = _ensure_index(index) - if self.index.equals(index): - if copy: - result = self.copy() - result.index = index - return result - else: - return self - - if len(self.index) == 0: - return self._constructor(nan, index=index, name=self.name) - - new_index, indexer = self.index.reindex(index, method=method, - level=level, limit=limit, - takeable=takeable) - - # GH4246 (dispatch to a common method with frame to handle possibly - # duplicate index) - return self._reindex_with_indexers({ 0 : [new_index, indexer] }, copy=copy, fill_value=fill_value) - - def _reindex_with_indexers(self, reindexers, copy, fill_value=None): - index, indexer = reindexers[0] - new_values = com.take_1d(self.values, indexer, fill_value=fill_value) - return self._constructor(new_values, index=index, name=self.name) + def _needs_reindex_multi(self, axes, method, level): + """ check if we do need a multi reindex; this is for compat with higher dims """ + return False def reindex_axis(self, labels, axis=0, **kwargs): """ for compatibility with higher dims """ @@ -3472,14 +3413,6 @@ def _resolve_offset(freq, kwds): return offset -def _get_fill_func(method): - method = com._clean_fill_method(method) - if method == 'pad': - fill_f = com.pad_1d - elif method == 'backfill': - fill_f = com.backfill_1d - return fill_f - # backwards compatiblity TimeSeries = Series diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 00a9d41112154..ff6334b9f4295 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -471,6 +471,7 @@ def _combine_match_index(self, other, func, fill_value=None): for col, series in compat.iteritems(this): new_data[col] = func(series.values, 
other.values) + #new_data[col] = func(series.as_sparse_array(fill_value=np.nan), other.as_sparse_array(fill_value=np.nan)) # fill_value is a function of our operator if isnull(other.fill_value) or isnull(self.default_fill_value): @@ -573,7 +574,10 @@ def _reindex_columns(self, columns, copy, level, fill_value, limit=None, return SparseDataFrame(sdict, index=self.index, columns=columns, default_fill_value=self._default_fill_value) - def _reindex_with_indexers(self, reindexers, method=None, copy=False, fill_value=np.nan): + def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, limit=None, copy=False): + + if limit is not None: + raise NotImplementedError("cannot take limit with a sparse tyep") index, row_indexer = reindexers.get(0, (None, None)) columns, col_indexer = reindexers.get(1, (None, None)) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 6d7e4994f3694..21a054e6fe1a3 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -565,19 +565,6 @@ def sparse_reindex(self, new_index): sparse_index=new_index, fill_value=self.fill_value) - def _reindex_indexer(self, new_index, indexer, copy): - if indexer is not None: - new_values = com.take_1d(self.values.values, indexer) - else: - if copy: - result = self.copy() - else: - result = self - return result - - # be subclass-friendly - return self._constructor(new_values, new_index, name=self.name) - def take(self, indices, axis=0, convert=True): """ Sparse-compatible version of ndarray.take diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 430e5df839e18..8ad88374f40f6 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1051,7 +1051,7 @@ def test_reindex(self): # don't necessarily copy result = self.panel.reindex(major=self.panel.major_axis, copy=False) assert_panel_equal(result,self.panel) - self.assert_((result is self.panel) == False) + self.assert_((result is self.panel) == True) def test_reindex_like(self): # 
reindex_like diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index add8ebf73f85f..9b34631ecc894 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -786,7 +786,7 @@ def test_reindex(self): result = self.panel4d.reindex( major=self.panel4d.major_axis, copy=False) assert_panel4d_equal(result,self.panel4d) - self.assert_((result is self.panel4d) == False) + self.assert_((result is self.panel4d) == True) def test_not_hashable(self): p4D_empty = Panel4D() diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 094a68a4c658d..a015eeb1c26c6 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -4034,7 +4034,9 @@ def test_reindex(self): for idx, val in compat.iteritems(subNonContig): self.assertEqual(val, self.ts[idx]) - self.assertRaises(ValueError, self.ts.reindex) + # return a copy the same index here + result = self.ts.reindex() + self.assert_((result is self.ts) == False) def test_reindex_corner(self): # (don't forget to fix this) I think it's fixed @@ -4052,8 +4054,8 @@ def test_reindex_corner(self): self.assertRaises(Exception, ts.reindex, self.ts.index, method='foo') def test_reindex_pad(self): - s = Series(np.arange(10), np.arange(10)) + s = Series(np.arange(10), np.arange(10)) s2 = s[::2] reindexed = s2.reindex(s.index, method='pad') @@ -4065,6 +4067,26 @@ def test_reindex_pad(self): index=np.arange(10)) assert_series_equal(reindexed, expected) + # GH4604 + s = Series([1,2,3,4,5], index=['a', 'b', 'c', 'd', 'e']) + new_index = ['a','g','c','f'] + expected = Series([1,1,3,3],index=new_index) + + # this changes dtype because the ffill happens after + result = s.reindex(new_index).ffill() + assert_series_equal(result, expected) + + # this preserves dtype + result = s.reindex(new_index, method='ffill') + assert_series_equal(result, expected) + + # inferrence of new dtype + s = Series([True,False,False,True],index=list('abcd')) + new_index='agc' + result = 
s.reindex(list(new_index)).ffill() + expected = Series([True,True,False],index=list(new_index)) + assert_series_equal(result, expected) + def test_reindex_backfill(self): pass diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 67adc6bf8e7f2..5cfe22781f362 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -316,6 +316,7 @@ def test_join_index_mixed(self): df2['float'] = 1. for kind in JOIN_TYPES: + joined = df1.join(df2, how=kind) expected = _join_by_hand(df1, df2, how=kind) assert_frame_equal(joined, expected) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 7af1dd657267a..7e54d6ebcfbdc 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -139,6 +139,7 @@ class DatetimeIndex(Int64Index): _engine_type = _index.DatetimeEngine offset = None + _comparables = ['name','freqstr','tz'] def __new__(cls, data=None, freq=None, start=None, end=None, periods=None, diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index bf9d7b2cf0b24..b28da7c9d7e0b 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -61,6 +61,7 @@ class Period(PandasObject): second : int, default 0 """ __slots__ = ['freq', 'ordinal'] + _comparables = ['name','freqstr'] def __init__(self, value=None, freq=None, ordinal=None, year=None, month=1, quarter=None, day=1, diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 357c64407dc49..3fdeacad5ffcd 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -566,19 +566,19 @@ def test_resample_median_bug_1688(self): def test_how_lambda_functions(self): ts = _simple_ts('1/1/2000', '4/1/2000') - + result = ts.resample('M', how=lambda x: x.mean()) exp = ts.resample('M', how='mean') tm.assert_series_equal(result, exp) - + self.assertRaises(Exception, ts.resample, 'M', how=[lambda x: x.mean(), lambda x: x.std(ddof=1)]) - + result = ts.resample('M', 
how={'foo': lambda x: x.mean(), 'bar': lambda x: x.std(ddof=1)}) foo_exp = ts.resample('M', how='mean') bar_exp = ts.resample('M', how='std') - + tm.assert_series_equal(result['foo'], foo_exp) tm.assert_series_equal(result['bar'], bar_exp) @@ -771,7 +771,7 @@ def test_resample_to_quarterly(self): ts.index[-1].asfreq('D', 'end'), freq='Q-%s' % month) - expected = stamps.reindex(qdates.to_timestamp('D', 'e'), + expected = stamps.reindex(qdates.to_timestamp('D', 's'), method='ffill') expected.index = qdates From dc73315d557874a7f162dcf0bc3ee106f9338ec5 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 19 Aug 2013 19:42:48 -0400 Subject: [PATCH 2/2] BUG: GH3317 Fix frequency inference to be automatic in a DatetimeIndex if freq is not specified during _ensure_index PERF: remove automatic downcasting, instead do on-demand via 'downcast=infer' TST: fix up failing tests CLN: reindex/fill in internals moved to core/internals/Block/reindex_items_from PERF: don't automatically downcast with a float block BUG: GH3317 reverse prior fix in tseries/offsets, to change slightly the multi reindex TST: identical index tests BUG: GH4618 provide automatic downcasting on a reindexed with method Series (in this case a shifted boolean then filled series) --- doc/source/release.rst | 8 ++- doc/source/v0.13.0.txt | 6 +- pandas/core/common.py | 23 +++--- pandas/core/frame.py | 7 +- pandas/core/generic.py | 18 +++-- pandas/core/index.py | 3 +- pandas/core/internals.py | 97 ++++++++++++++++++-------- pandas/io/tests/test_parsers.py | 2 +- pandas/sparse/frame.py | 5 +- pandas/tests/test_frame.py | 25 ++++++- pandas/tests/test_index.py | 41 +++++++++++ pandas/tests/test_series.py | 9 +++ pandas/tools/pivot.py | 2 +- pandas/tseries/index.py | 2 + pandas/tseries/tests/test_daterange.py | 17 +++++ vb_suite/reindex.py | 2 +- 16 files changed, 202 insertions(+), 65 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index ec21172230da9..932b93a634051 100644 --- 
a/doc/source/release.rst +++ b/doc/source/release.rst @@ -115,6 +115,8 @@ pandas 0.13 - ``MultiIndex.astype()`` now only allows ``np.object_``-like dtypes and now returns a ``MultiIndex`` rather than an ``Index``. (:issue:`4039`) + - Infer and downcast dtype if ``downcast='infer'`` is passed to ``fillna/ffill/bfill`` (:issue:`4604`) + **Internal Refactoring** In 0.13.0 there is a major refactor primarily to subclass ``Series`` from ``NDFrame``, @@ -183,11 +185,9 @@ See :ref:`Internal Refactoring` - Indexing with dtype conversions fixed (:issue:`4463`, :issue:`4204`) -- Refactor Series.reindex to core/generic.py (:issue:`4604`), allow ``method=`` in reindexing +- Refactor Series.reindex to core/generic.py (:issue:`4604`, :issue:`4618`), allow ``method=`` in reindexing on a Series to work -- Infer and downcast dtype if appropriate on ``ffill/bfill`` (:issue:`4604`) - **Experimental Features** **Bug Fixes** @@ -259,6 +259,8 @@ See :ref:`Internal Refactoring` - Fix bug in ``pd.read_clipboard`` on windows with PY3 (:issue:`4561`); not decoding properly - ``tslib.get_period_field()`` and ``tslib.get_period_field_arr()`` now raise if code argument out of range (:issue:`4519`, :issue:`4520`) + - Fix reindexing with multiple axes; if an axes match was not replacing the current axes, leading + to a possible lazy frequency inference issue (:issue:`3317`) pandas 0.12 =========== diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index 34733e03d897c..ffa71cbe97ce0 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -96,6 +96,8 @@ API changes # and all methods take an inplace kwarg index.set_names(["bob", "cranberry"], inplace=True) + - Infer and downcast dtype if ``downcast='infer'`` is passed to ``fillna/ffill/bfill`` (:issue:`4604`) + Enhancements ~~~~~~~~~~~~ @@ -237,11 +239,9 @@ and behaviors. Series formerly subclassed directly from ``ndarray``. 
(:issue:`40 - Indexing with dtype conversions fixed (:issue:`4463`, :issue:`4204`) -- Refactor Series.reindex to core/generic.py (:issue:`4604`), allow ``method=`` in reindexing +- Refactor Series.reindex to core/generic.py (:issue:`4604`, :issue:`4618`), allow ``method=`` in reindexing on a Series to work -- Infer and downcast dtype if appropriate on ``ffill/bfill`` (:issue:`4604`) - Bug Fixes ~~~~~~~~~ diff --git a/pandas/core/common.py b/pandas/core/common.py index 2bfdaccb32df7..e46abb4aa83a6 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -990,14 +990,20 @@ def _possibly_downcast_to_dtype(result, dtype): if issubclass(dtype.type, np.floating): return result.astype(dtype) elif dtype == np.bool_ or issubclass(dtype.type, np.integer): + + # do a test on the first element, if it fails then we are done + r = result.ravel() + arr = np.array([ r[0] ]) + if (arr != arr.astype(dtype)).item(): + return result + + # a comparable, e.g. a Decimal may slip in here + elif not isinstance(r[0], (np.integer,np.floating,np.bool,int,float,bool)): + return result + if issubclass(result.dtype.type, (np.object_,np.number)) and notnull(result).all(): new_result = result.astype(dtype) if (new_result == result).all(): - - # a comparable, e.g. 
a Decimal may slip in here - if not isinstance(result.ravel()[0], (np.integer,np.floating,np.bool,int,float,bool)): - return result - return new_result except: pass @@ -1174,7 +1180,7 @@ def backfill_2d(values, limit=None, mask=None): pass return values -def interpolate_2d(values, method='pad', axis=0, limit=None, missing=None): +def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None): """ perform an actual interpolation of values, values will be make 2-d if needed fills inplace, returns the result """ @@ -1187,10 +1193,10 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, missing=None): raise Exception("cannot interpolate on a ndim == 1 with axis != 0") values = values.reshape(tuple((1,) + values.shape)) - if missing is None: + if fill_value is None: mask = None else: # todo create faster fill func without masking - mask = mask_missing(transf(values), missing) + mask = mask_missing(transf(values), fill_value) method = _clean_fill_method(method) if method == 'pad': @@ -1870,6 +1876,7 @@ def _astype_nansafe(arr, dtype, copy=True): def _clean_fill_method(method): + if method is None: return None method = method.lower() if method == 'ffill': method = 'pad' diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 200e4ce9322fd..d032bbf66f95e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2280,12 +2280,9 @@ def _reindex_multi(self, axes, copy, fill_value): fill_value=fill_value) return self._constructor(new_values, index=new_index, columns=new_columns) - elif row_indexer is not None: - return self._reindex_with_indexers({0: [new_index, row_indexer]}, copy=copy, fill_value=fill_value) - elif col_indexer is not None: - return self._reindex_with_indexers({1: [new_columns, col_indexer]}, copy=copy, fill_value=fill_value) else: - return self.copy() if copy else self + return self._reindex_with_indexers({0: [new_index, row_indexer], + 1: [new_columns, col_indexer]}, copy=copy, fill_value=fill_value) def 
reindex_like(self, other, method=None, copy=True, limit=None, fill_value=NA): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f845824572751..dccf3c9b8d36a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -987,7 +987,7 @@ def reindex(self, *args, **kwargs): # construct the args axes, kwargs = self._construct_axes_from_arguments(args, kwargs) - method = kwargs.get('method') + method = com._clean_fill_method(kwargs.get('method')) level = kwargs.get('level') copy = kwargs.get('copy', True) limit = kwargs.get('limit') @@ -1082,6 +1082,7 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, axis_name = self._get_axis_name(axis) axis_values = self._get_axis(axis_name) + method = com._clean_fill_method(method) new_index, indexer = axis_values.reindex(labels, method, level, limit=limit, copy_if_needed=True) return self._reindex_with_indexers({axis: [new_index, indexer]}, method=method, fill_value=fill_value, @@ -1102,7 +1103,7 @@ def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, lim # reindex the axis if method is not None: new_data = new_data.reindex_axis( - index, method=method, axis=baxis, + index, indexer=indexer, method=method, axis=baxis, fill_value=fill_value, limit=limit, copy=copy) elif indexer is not None: @@ -1419,7 +1420,8 @@ def fillna(self, value=None, method=None, axis=0, inplace=False, limit : int, default None Maximum size gap to forward or backward fill downcast : dict, default is None, a dict of item->dtype of what to - downcast if possible + downcast if possible, or the string 'infer' which will try to + downcast to an appropriate equal type (e.g. 
float64 to int64 if possible) See also -------- @@ -1438,6 +1440,7 @@ def fillna(self, value=None, method=None, axis=0, inplace=False, if axis + 1 > self._AXIS_LEN: raise ValueError( "invalid axis passed for object type {0}".format(type(self))) + method = com._clean_fill_method(method) if value is None: if method is None: @@ -1488,13 +1491,13 @@ def fillna(self, value=None, method=None, axis=0, inplace=False, else: return self._constructor(new_data) - def ffill(self, axis=0, inplace=False, limit=None): + def ffill(self, axis=0, inplace=False, limit=None, downcast=None): return self.fillna(method='ffill', axis=axis, inplace=inplace, - limit=limit, downcast='infer') + limit=limit, downcast=downcast) - def bfill(self, axis=0, inplace=False, limit=None): + def bfill(self, axis=0, inplace=False, limit=None, downcast=None): return self.fillna(method='bfill', axis=axis, inplace=inplace, - limit=limit, downcast='infer') + limit=limit, downcast=downcast) def replace(self, to_replace=None, value=None, inplace=False, limit=None, regex=False, method=None, axis=None): @@ -2046,6 +2049,7 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, Aligned objects """ from pandas import DataFrame, Series + method = com._clean_fill_method(method) if isinstance(other, DataFrame): return self._align_frame(other, join=join, axis=axis, level=level, diff --git a/pandas/core/index.py b/pandas/core/index.py index aea168654599a..22bd7f318a237 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -551,7 +551,7 @@ def identical(self, other): Similar to equals, but check that other comparable attributes are also equal """ return self.equals(other) and all( - [ getattr(self,c,None) == getattr(other,c,None) for c in self._comparables ]) + ( getattr(self,c,None) == getattr(other,c,None) for c in self._comparables )) def asof(self, label): """ @@ -1555,6 +1555,7 @@ class MultiIndex(Index): _names = FrozenList() _levels = FrozenList() _labels = FrozenList() + _comparables = 
['names'] def __new__(cls, levels=None, labels=None, sortorder=None, names=None, copy=False): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index fb277695815d0..ecce508284fc1 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -7,8 +7,8 @@ import numpy as np from pandas.core.base import PandasObject -from pandas.core.common import (_possibly_downcast_to_dtype, isnull, _NS_DTYPE, - _TD_DTYPE, ABCSeries, ABCSparseSeries, +from pandas.core.common import (_possibly_downcast_to_dtype, isnull, notnull, + _NS_DTYPE, _TD_DTYPE, ABCSeries, ABCSparseSeries, is_list_like) from pandas.core.index import (Index, MultiIndex, _ensure_index, _handle_legacy_indexes) @@ -36,6 +36,9 @@ class Block(PandasObject): """ __slots__ = ['items', 'ref_items', '_ref_locs', 'values', 'ndim'] is_numeric = False + is_float = False + is_integer = False + is_complex = False is_bool = False is_object = False is_sparse = False @@ -217,24 +220,38 @@ def reindex_items_from(self, new_ref_items, indexer=None, method=None, fill_valu if indexer is None: new_ref_items, indexer = self.items.reindex(new_ref_items, limit=limit) + if fill_value is None: + fill_value = self.fill_value + new_items = new_ref_items if indexer is None: new_values = self.values.copy() if copy else self.values + else: - if fill_value is None: - fill_value = self.fill_value # single block reindex if self.ndim == 1: - new_values = com.take_nd(self.values, indexer, axis=0, + new_values = com.take_1d(self.values, indexer, fill_value=fill_value) else: + masked_idx = indexer[indexer != -1] new_values = com.take_nd(self.values, masked_idx, axis=0, allow_fill=False) new_items = self.items.take(masked_idx) - return make_block(new_values, new_items, new_ref_items, ndim=self.ndim, fastpath=True) + # fill if needed + fill_method = method is not None or limit is not None + if fill_method: + new_values = com.interpolate_2d(new_values, method=method, limit=limit, fill_value=fill_value) + + block = 
make_block(new_values, new_items, new_ref_items, ndim=self.ndim, fastpath=True) + + # down cast if needed + if not self.is_float and (fill_method or notnull(fill_value)): + block = block.downcast() + + return block def get(self, item): loc = self.items.get_loc(item) @@ -301,36 +318,49 @@ def fillna(self, value, inplace=False, downcast=None): mask = com.isnull(self.values) value = self._try_fill(value) blocks = self.putmask(mask, value, inplace=inplace) + return self._maybe_downcast(blocks, downcast) - # possibily downcast the blocks - if not downcast: + def _maybe_downcast(self, blocks, downcast=None): + + # no need to downcast our float + # unless indicated + if downcast is None and self.is_float: return blocks result_blocks = [] for b in blocks: - result_blocks.extend(b.downcast()) + result_blocks.extend(b.downcast(downcast)) return result_blocks def downcast(self, dtypes=None): """ try to downcast each item to the dict of dtypes if present """ + # turn it off completely + if dtypes is False: + return [ self ] + values = self.values # single block handling if self._is_single_block: + + # try to cast all non-floats here if dtypes is None: - return [ self ] + dtypes = 'infer' nv = _possibly_downcast_to_dtype(values, dtypes) return [ make_block(nv, self.items, self.ref_items, ndim=self.ndim, fastpath=True) ] # ndim > 1 if dtypes is None: - dtypes = dict() - elif dtypes == 'infer': - pass + return [ self ] + + if not (dtypes == 'infer' or isinstance(dtypes, dict)): + raise ValueError("downcast must have a dictionary or 'infer' as its argument") + # item-by-item + # this is expensive as it splits the blocks items-by-item blocks = [] for i, item in enumerate(self.items): @@ -618,13 +648,8 @@ def interpolate(self, method='pad', axis=0, inplace=False, values = self.values if inplace else self.values.copy() values = com.interpolate_2d(values, method, axis, limit, fill_value) - block = make_block(values, self.items, self.ref_items, ndim=self.ndim, klass=self.__class__, 
fastpath=True) - - # try to downcast back to original dtype if we can - # as we could have reindexed then down a ffill - if downcast is None: - downcast = 'infer' - return block.downcast(downcast) + blocks = [ make_block(values, self.items, self.ref_items, ndim=self.ndim, klass=self.__class__, fastpath=True) ] + return self._maybe_downcast(blocks, downcast) def take(self, indexer, ref_items, axis=1): if axis < 1: @@ -822,6 +847,7 @@ def _try_cast_result(self, result): class FloatBlock(NumericBlock): + is_float = True _downcast_dtype = 'int64' def _can_hold_element(self, element): @@ -858,6 +884,7 @@ def should_store(self, value): class ComplexBlock(NumericBlock): + is_complex = True def _can_hold_element(self, element): return isinstance(element, complex) @@ -873,6 +900,7 @@ def should_store(self, value): class IntBlock(NumericBlock): + is_integer = True _can_hold_na = False def _can_hold_element(self, element): @@ -1387,8 +1415,15 @@ def reindex_items_from(self, new_ref_items, indexer=None, method=None, fill_valu if indexer is None: indexer = np.arange(len(self.items)) - # note that we DO NOT FILL HERE - return self.make_block(com.take_1d(self.values.values, indexer), items=new_ref_items, ref_items=new_ref_items, copy=copy) + new_values = com.take_1d(self.values.values, indexer) + + # fill if needed + if method is not None or limit is not None: + if fill_value is None: + fill_value = self.fill_value + new_values = com.interpolate_2d(new_values, method=method, limit=limit, fill_value=fill_value) + + return self.make_block(new_values, items=new_ref_items, ref_items=new_ref_items, copy=copy) def sparse_reindex(self, new_index): """ sparse reindex and return a new block @@ -2531,7 +2566,7 @@ def _check_have(self, item): if item not in self.items: raise KeyError('no item named %s' % com.pprint_thing(item)) - def reindex_axis(self, new_axis, method=None, axis=0, fill_value=None, limit=None, copy=True): + def reindex_axis(self, new_axis, indexer=None, method=None, 
axis=0, fill_value=None, limit=None, copy=True): new_axis = _ensure_index(new_axis) cur_axis = self.axes[axis] @@ -2552,14 +2587,15 @@ def reindex_axis(self, new_axis, method=None, axis=0, fill_value=None, limit=Non if axis == 0: if method is not None or limit is not None: - return self.reindex_axis0_with_method(new_axis, method=method, fill_value=fill_value, limit=limit, copy=copy) + return self.reindex_axis0_with_method(new_axis, indexer=indexer, + method=method, fill_value=fill_value, limit=limit, copy=copy) return self.reindex_items(new_axis, copy=copy, fill_value=fill_value) new_axis, indexer = cur_axis.reindex( new_axis, method, copy_if_needed=True) return self.reindex_indexer(new_axis, indexer, axis=axis, fill_value=fill_value) - def reindex_axis0_with_method(self, new_axis, method=None, fill_value=None, limit=None, copy=True): + def reindex_axis0_with_method(self, new_axis, indexer=None, method=None, fill_value=None, limit=None, copy=True): raise AssertionError('method argument not supported for ' 'axis == 0') @@ -2880,12 +2916,9 @@ def reindex(self, new_axis, indexer=None, method=None, fill_value=None, limit=No # if we are the same and don't copy, just return if not copy and self.index.equals(new_axis): return self + block = self._block.reindex_items_from(new_axis, indexer=indexer, method=method, fill_value=fill_value, limit=limit, copy=copy) - - if method is not None or limit is not None: - block = block.interpolate(method=method, fill_value=fill_value, limit=limit, downcast=self.dtype) - mgr = SingleBlockManager(block, new_axis) mgr._consolidate_inplace() return mgr @@ -2894,8 +2927,10 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value): # equiv to a reindex return self.reindex(new_items, indexer=indexer, fill_value=fill_value, copy=False) - def reindex_axis0_with_method(self, new_axis, method=None, fill_value=None, limit=None, copy=True): - return self.reindex(new_axis, method=method, fill_value=fill_value, limit=limit, copy=copy) + def 
reindex_axis0_with_method(self, new_axis, indexer=None, method=None, fill_value=None, limit=None, copy=True): + if method is None: + indexer = None + return self.reindex(new_axis, indexer=indexer, method=method, fill_value=fill_value, limit=limit, copy=copy) def get_slice(self, slobj, raise_on_error=False): if raise_on_error: diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 41345352b5ec5..787682f340250 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -774,7 +774,7 @@ def test_parse_dates_string(self): """ rs = self.read_csv( StringIO(data), index_col='date', parse_dates='date') - idx = date_range('1/1/2009', periods=3).asobject + idx = date_range('1/1/2009', periods=3) idx.name = 'date' xp = DataFrame({'A': ['a', 'b', 'c'], 'B': [1, 3, 4], diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index ff6334b9f4295..dd7bd52076e06 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -471,7 +471,6 @@ def _combine_match_index(self, other, func, fill_value=None): for col, series in compat.iteritems(this): new_data[col] = func(series.values, other.values) - #new_data[col] = func(series.as_sparse_array(fill_value=np.nan), other.as_sparse_array(fill_value=np.nan)) # fill_value is a function of our operator if isnull(other.fill_value) or isnull(self.default_fill_value): @@ -576,8 +575,8 @@ def _reindex_columns(self, columns, copy, level, fill_value, limit=None, def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, limit=None, copy=False): - if limit is not None: - raise NotImplementedError("cannot take limit with a sparse tyep") + if method is not None or limit is not None: + raise NotImplementedError("cannot reindex with a method or limit with sparse") index, row_indexer = reindexers.get(0, (None, None)) columns, col_indexer = reindexers.get(1, (None, None)) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 
b84115bd3e6b4..1a65eec8557c0 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4218,8 +4218,17 @@ def test_logical_with_nas(self): expected = Series([np.nan, True]) assert_series_equal(result, expected) + # GH4604, automatic casting here result = d['a'].fillna(False) | d['b'] - expected = Series([True, True], dtype=object) + expected = Series([True, True]) + assert_series_equal(result, expected) + + result = d['a'].fillna(False,downcast=False) | d['b'] + expected = Series([True, True],dtype=object) + assert_series_equal(result, expected) + + result = (d['a'].fillna(False,downcast=False) | d['b']).convert_objects() + expected = Series([True, True]) assert_series_equal(result, expected) def test_neg(self): @@ -7411,6 +7420,20 @@ def test_reindex_columns(self): newFrame = self.frame.reindex(columns=[]) self.assert_(newFrame.empty) + def test_reindex_axes(self): + + # GH 3317, reindexing by both axes loses freq of the index + from datetime import datetime + df = DataFrame(np.ones((3, 3)), index=[datetime(2012, 1, 1), datetime(2012, 1, 2), datetime(2012, 1, 3)], columns=['a', 'b', 'c']) + time_freq = date_range('2012-01-01', '2012-01-03', freq='d') + some_cols = ['a', 'b'] + + index_freq = df.reindex(index=time_freq).index.freq + both_freq = df.reindex(index=time_freq, columns=some_cols).index.freq + seq_freq = df.reindex(index=time_freq).reindex(columns=some_cols).index.freq + self.assert_(index_freq == both_freq) + self.assert_(index_freq == seq_freq) + def test_reindex_fill_value(self): df = DataFrame(np.random.randn(10, 4)) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index ae81752c11b29..16f3026896d4f 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -151,6 +151,21 @@ def test_equals(self): # Must also be an Index self.assertFalse(Index(['a', 'b', 'c']).equals(['a', 'b', 'c'])) + def test_identical(self): + + # index + i1 = Index(['a', 'b', 'c']) + i2 = Index(['a', 'b', 'c']) + + 
self.assert_(i1.identical(i2)) + + i1 = i1.rename('foo') + self.assert_(i1.equals(i2)) + self.assert_(not i1.identical(i2)) + + i2 = i2.rename('foo') + self.assert_(i1.identical(i2)) + def test_asof(self): d = self.dateIndex[0] self.assert_(self.dateIndex.asof(d) is d) @@ -660,6 +675,20 @@ def test_equals(self): self.assert_(self.index.equals(same_values)) self.assert_(same_values.equals(self.index)) + def test_identical(self): + + i = self.index.copy() + same_values = Index(i, dtype=object) + self.assert_(i.identical(same_values)) + + i = self.index.copy() + i = i.rename('foo') + same_values = Index(i, dtype=object) + self.assert_(same_values.identical(self.index)) + + self.assertFalse(i.identical(self.index)) + self.assert_(Index(same_values, name='foo').identical(i)) + def test_get_indexer(self): target = Int64Index(np.arange(10)) indexer = self.index.get_indexer(target) @@ -1604,6 +1633,18 @@ def test_equals(self): labels=[major_labels, minor_labels]) self.assert_(not self.index.equals(index)) + def test_identical(self): + mi = self.index.copy() + mi2 = self.index.copy() + self.assert_(mi.identical(mi2)) + + mi = mi.set_names(['new1','new2']) + self.assert_(mi.equals(mi2)) + self.assert_(not mi.identical(mi2)) + + mi2 = mi2.set_names(['new1','new2']) + self.assert_(mi.identical(mi2)) + def test_union(self): piece1 = self.index[:5][::-1] piece2 = self.index[3:] diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index a015eeb1c26c6..4100185752b8f 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -4074,6 +4074,9 @@ def test_reindex_pad(self): # this changes dtype because the ffill happens after result = s.reindex(new_index).ffill() + assert_series_equal(result, expected.astype('float64')) + + result = s.reindex(new_index).ffill(downcast='infer') assert_series_equal(result, expected) # this preserves dtype @@ -4087,6 +4090,12 @@ def test_reindex_pad(self): expected = Series([True,True,False],index=list(new_index)) 
assert_series_equal(result, expected) + # GH4618 shifted series downcasting + s = Series(False,index=lrange(0,5)) + result = s.shift(1).fillna(method='bfill') + expected = Series(False,index=lrange(0,5)) + assert_series_equal(result, expected) + def test_reindex_backfill(self): pass diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 9bca698cd4304..c11ec9f338f88 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -129,7 +129,7 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean', table = table.sort_index(axis=1) if fill_value is not None: - table = table.fillna(value=fill_value, downcast=True) + table = table.fillna(value=fill_value, downcast='infer') if margins: table = _add_margins(table, data, values, rows=rows, diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 7e54d6ebcfbdc..1b5c9a8b5ebd3 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -155,6 +155,8 @@ def __new__(cls, data=None, freq_infer = False if not isinstance(freq, DateOffset): + + # if a passed freq is None, don't infer automatically if freq != 'infer': freq = to_offset(freq) else: diff --git a/pandas/tseries/tests/test_daterange.py b/pandas/tseries/tests/test_daterange.py index 536d718d72eba..d17b42c1e23c9 100644 --- a/pandas/tseries/tests/test_daterange.py +++ b/pandas/tseries/tests/test_daterange.py @@ -316,6 +316,23 @@ def test_date_parse_failure(self): def test_equals(self): self.assertFalse(self.rng.equals(list(self.rng))) + def test_identical(self): + t1 = self.rng.copy() + t2 = self.rng.copy() + self.assert_(t1.identical(t2)) + + # name + t1 = t1.rename('foo') + self.assert_(t1.equals(t2)) + self.assert_(not t1.identical(t2)) + t2 = t2.rename('foo') + self.assert_(t1.identical(t2)) + + # freq + t2v = Index(t2.values) + self.assert_(t1.equals(t2v)) + self.assert_(not t1.identical(t2v)) + def test_daterange_bug_456(self): # GH #456 rng1 = bdate_range('12/5/2011', '12/5/2011') diff --git a/vb_suite/reindex.py 
b/vb_suite/reindex.py index acf8f6f043bad..de0f397334e94 100644 --- a/vb_suite/reindex.py +++ b/vb_suite/reindex.py @@ -51,7 +51,7 @@ # Pad / backfill setup = common_setup + """ -rng = DateRange('1/1/2000', periods=10000, offset=datetools.Minute()) +rng = DateRange('1/1/2000', periods=100000, offset=datetools.Minute()) ts = Series(np.random.randn(len(rng)), index=rng) ts2 = ts[::2]