Skip to content

Commit 491e3c3

Browse files
committed
TST: GH4604, reindexing with a method of 'ffill' gives incorrect results
BUG/CLN: (GH4604) Refactor Series.reindex to core/generic.py; allow method= in reindexing on a Series to work
API/CLN: (GH4604) Infer and downcast dtype if appropriate on ffill/bfill; this is for consistency when doing df.reindex().ffill() and df.reindex(method='ffill')
CLN: allow backfill/pad/interpolate to operate on integers (by float conversion); provide downcasting back to original dtype where needed (core.internals.interpolate)
ENH: provide core.index.identical method to compare values and attributes, similar to .equals
API: changed back to pre-GH3482 behavior, where a reindex with no args will by default copy
1 parent 4c8ad82 commit 491e3c3

16 files changed

+216
-154
lines changed

doc/source/release.rst

+6-3
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,6 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
144144
- support attribute access for setting
145145
- filter supports same api as original ``DataFrame`` filter
146146

147-
- Reindex called with no arguments will now return a copy of the input object
148-
149147
- Series now inherits from ``NDFrame`` rather than directly from ``ndarray``.
150148
There are several minor changes that affect the API.
151149

@@ -185,6 +183,11 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
185183

186184
- Indexing with dtype conversions fixed (:issue:`4463`, :issue:`4204`)
187185

186+
- Refactor Series.reindex to core/generic.py (:issue:`4604`), allow ``method=`` in reindexing
187+
on a Series to work
188+
189+
- Infer and downcast dtype if appropriate on ``ffill/bfill`` (:issue:`4604`)
190+
188191
**Experimental Features**
189192

190193
**Bug Fixes**
@@ -210,7 +213,7 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
210213
- In ``to_json``, raise if a passed ``orient`` would cause loss of data because
211214
of a duplicate index (:issue:`4359`)
212215
- In ``to_json``, fix date handling so milliseconds are the default timestamp
213-
as the docstring says (:issue:`4362`).
216+
as the docstring says (:issue:`4362`).
214217
- JSON NaT handling fixed, NaTs are now serialised to `null` (:issue:`4498`)
215218
- Fixed passing ``keep_default_na=False`` when ``na_values=None`` (:issue:`4318`)
216219
- Fixed bug with ``values`` raising an error on a DataFrame with duplicate columns and mixed

doc/source/v0.13.0.txt

+5
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,11 @@ and behaviors. Series formerly subclassed directly from ``ndarray``. (:issue:`40
237237

238238
- Indexing with dtype conversions fixed (:issue:`4463`, :issue:`4204`)
239239

240+
- Refactor Series.reindex to core/generic.py (:issue:`4604`), allow ``method=`` in reindexing
241+
on a Series to work
242+
243+
- Infer and downcast dtype if appropriate on ``ffill/bfill`` (:issue:`4604`)
244+
240245
Bug Fixes
241246
~~~~~~~~~
242247

pandas/core/common.py

+48-8
Original file line numberDiff line numberDiff line change
@@ -961,16 +961,43 @@ def _possibly_downcast_to_dtype(result, dtype):
961961
""" try to cast to the specified dtype (e.g. convert back to bool/int
962962
or could be an astype of float64->float32) """
963963

964-
if np.isscalar(result):
964+
if np.isscalar(result) or not len(result):
965965
return result
966966

967+
if isinstance(dtype, compat.string_types):
968+
if dtype == 'infer':
969+
inferred_type = lib.infer_dtype(_ensure_object(result.ravel()))
970+
if inferred_type == 'boolean':
971+
dtype = 'bool'
972+
elif inferred_type == 'integer':
973+
dtype = 'int64'
974+
elif inferred_type == 'datetime64':
975+
dtype = 'datetime64[ns]'
976+
elif inferred_type == 'timedelta64':
977+
dtype = 'timedelta64[ns]'
978+
979+
# try to downcast floating values to integers here
980+
elif inferred_type == 'floating':
981+
dtype = 'int64'
982+
983+
else:
984+
dtype = 'object'
985+
986+
if isinstance(dtype, compat.string_types):
987+
dtype = np.dtype(dtype)
988+
967989
try:
968990
if issubclass(dtype.type, np.floating):
969991
return result.astype(dtype)
970992
elif dtype == np.bool_ or issubclass(dtype.type, np.integer):
971-
if issubclass(result.dtype.type, np.number) and notnull(result).all():
993+
if issubclass(result.dtype.type, (np.object_,np.number)) and notnull(result).all():
972994
new_result = result.astype(dtype)
973995
if (new_result == result).all():
996+
997+
# a comparable, e.g. a Decimal may slip in here
998+
if not isinstance(result.ravel()[0], (np.integer,np.floating,np.bool,int,float,bool)):
999+
return result
1000+
9741001
return new_result
9751002
except:
9761003
pass
@@ -1052,6 +1079,9 @@ def pad_1d(values, limit=None, mask=None):
10521079
_method = getattr(algos, 'pad_inplace_%s' % dtype, None)
10531080
elif is_datetime64_dtype(values):
10541081
_method = _pad_1d_datetime
1082+
elif is_integer_dtype(values):
1083+
values = _ensure_float64(values)
1084+
_method = algos.pad_inplace_float64
10551085
elif values.dtype == np.object_:
10561086
_method = algos.pad_inplace_object
10571087

@@ -1062,7 +1092,7 @@ def pad_1d(values, limit=None, mask=None):
10621092
mask = isnull(values)
10631093
mask = mask.view(np.uint8)
10641094
_method(values, mask, limit=limit)
1065-
1095+
return values
10661096

10671097
def backfill_1d(values, limit=None, mask=None):
10681098

@@ -1072,6 +1102,9 @@ def backfill_1d(values, limit=None, mask=None):
10721102
_method = getattr(algos, 'backfill_inplace_%s' % dtype, None)
10731103
elif is_datetime64_dtype(values):
10741104
_method = _backfill_1d_datetime
1105+
elif is_integer_dtype(values):
1106+
values = _ensure_float64(values)
1107+
_method = algos.backfill_inplace_float64
10751108
elif values.dtype == np.object_:
10761109
_method = algos.backfill_inplace_object
10771110

@@ -1083,7 +1116,7 @@ def backfill_1d(values, limit=None, mask=None):
10831116
mask = mask.view(np.uint8)
10841117

10851118
_method(values, mask, limit=limit)
1086-
1119+
return values
10871120

10881121
def pad_2d(values, limit=None, mask=None):
10891122

@@ -1093,6 +1126,9 @@ def pad_2d(values, limit=None, mask=None):
10931126
_method = getattr(algos, 'pad_2d_inplace_%s' % dtype, None)
10941127
elif is_datetime64_dtype(values):
10951128
_method = _pad_2d_datetime
1129+
elif is_integer_dtype(values):
1130+
values = _ensure_float64(values)
1131+
_method = algos.pad_2d_inplace_float64
10961132
elif values.dtype == np.object_:
10971133
_method = algos.pad_2d_inplace_object
10981134

@@ -1108,7 +1144,7 @@ def pad_2d(values, limit=None, mask=None):
11081144
else:
11091145
# for test coverage
11101146
pass
1111-
1147+
return values
11121148

11131149
def backfill_2d(values, limit=None, mask=None):
11141150

@@ -1118,6 +1154,9 @@ def backfill_2d(values, limit=None, mask=None):
11181154
_method = getattr(algos, 'backfill_2d_inplace_%s' % dtype, None)
11191155
elif is_datetime64_dtype(values):
11201156
_method = _backfill_2d_datetime
1157+
elif is_integer_dtype(values):
1158+
values = _ensure_float64(values)
1159+
_method = algos.backfill_2d_inplace_float64
11211160
elif values.dtype == np.object_:
11221161
_method = algos.backfill_2d_inplace_object
11231162

@@ -1133,7 +1172,7 @@ def backfill_2d(values, limit=None, mask=None):
11331172
else:
11341173
# for test coverage
11351174
pass
1136-
1175+
return values
11371176

11381177
def interpolate_2d(values, method='pad', axis=0, limit=None, missing=None):
11391178
""" perform an actual interpolation of values; values will be made 2-d if needed
@@ -1153,10 +1192,11 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, missing=None):
11531192
else: # todo create faster fill func without masking
11541193
mask = mask_missing(transf(values), missing)
11551194

1195+
method = _clean_fill_method(method)
11561196
if method == 'pad':
1157-
pad_2d(transf(values), limit=limit, mask=mask)
1197+
values = transf(pad_2d(transf(values), limit=limit, mask=mask))
11581198
else:
1159-
backfill_2d(transf(values), limit=limit, mask=mask)
1199+
values = transf(backfill_2d(transf(values), limit=limit, mask=mask))
11601200

11611201
# reshape back
11621202
if ndim == 1:

pandas/core/generic.py

+28-12
Original file line numberDiff line numberDiff line change
@@ -1003,11 +1003,15 @@ def reindex(self, *args, **kwargs):
10031003
except:
10041004
pass
10051005

1006-
# perform the reindex on the axes
1007-
if copy and not com._count_not_none(*axes.values()):
1008-
return self.copy()
1006+
# if all axes that are requested to reindex are equal, then only copy if indicated
1007+
# must have index names equal here as well as values
1008+
if all([ self._get_axis(axis).identical(ax) for axis, ax in axes.items() if ax is not None ]):
1009+
if copy:
1010+
return self.copy()
1011+
return self
10091012

1010-
return self._reindex_axes(axes, level, limit, method, fill_value, copy, takeable=takeable)
1013+
# perform the reindex on the axes
1014+
return self._reindex_axes(axes, level, limit, method, fill_value, copy, takeable=takeable)._propogate_attributes(self)
10111015

10121016
def _reindex_axes(self, axes, level, limit, method, fill_value, copy, takeable=False):
10131017
""" perform the reindex for all the axes """
@@ -1025,7 +1029,8 @@ def _reindex_axes(self, axes, level, limit, method, fill_value, copy, takeable=F
10251029
new_index, indexer = self._get_axis(a).reindex(
10261030
labels, level=level, limit=limit, takeable=takeable)
10271031
obj = obj._reindex_with_indexers(
1028-
{axis: [labels, indexer]}, method, fill_value, copy)
1032+
{axis: [new_index, indexer]}, method=method, fill_value=fill_value,
1033+
limit=limit, copy=copy)
10291034

10301035
return obj
10311036

@@ -1079,21 +1084,26 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
10791084
axis_values = self._get_axis(axis_name)
10801085
new_index, indexer = axis_values.reindex(labels, method, level,
10811086
limit=limit, copy_if_needed=True)
1082-
return self._reindex_with_indexers({axis: [new_index, indexer]}, method, fill_value, copy)
1087+
return self._reindex_with_indexers({axis: [new_index, indexer]}, method=method, fill_value=fill_value,
1088+
limit=limit, copy=copy)._propogate_attributes(self)
10831089

1084-
def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, copy=False):
1090+
def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, limit=None, copy=False):
10851091

10861092
# reindex doing multiple operations on different axes if indicated
10871093
new_data = self._data
10881094
for axis in sorted(reindexers.keys()):
10891095
index, indexer = reindexers[axis]
10901096
baxis = self._get_block_manager_axis(axis)
10911097

1098+
if index is None:
1099+
continue
1100+
index = _ensure_index(index)
1101+
10921102
# reindex the axis
10931103
if method is not None:
10941104
new_data = new_data.reindex_axis(
10951105
index, method=method, axis=baxis,
1096-
fill_value=fill_value, copy=copy)
1106+
fill_value=fill_value, limit=limit, copy=copy)
10971107

10981108
elif indexer is not None:
10991109
# TODO: speed up on homogeneous DataFrame objects
@@ -1435,14 +1445,20 @@ def fillna(self, value=None, method=None, axis=0, inplace=False,
14351445
if self._is_mixed_type and axis == 1:
14361446
if inplace:
14371447
raise NotImplementedError()
1438-
return self.T.fillna(method=method, limit=limit).T
1448+
result = self.T.fillna(method=method, limit=limit).T
1449+
1450+
# need to downcast here because of all of the transposes
1451+
result._data = result._data.downcast()
1452+
1453+
return result
14391454

14401455
method = com._clean_fill_method(method)
14411456
new_data = self._data.interpolate(method=method,
14421457
axis=axis,
14431458
limit=limit,
14441459
inplace=inplace,
1445-
coerce=True)
1460+
coerce=True,
1461+
downcast=downcast)
14461462
else:
14471463
if method is not None:
14481464
raise ValueError('cannot specify both a fill method and value')
@@ -1474,11 +1490,11 @@ def fillna(self, value=None, method=None, axis=0, inplace=False,
14741490

14751491
def ffill(self, axis=0, inplace=False, limit=None):
14761492
return self.fillna(method='ffill', axis=axis, inplace=inplace,
1477-
limit=limit)
1493+
limit=limit, downcast='infer')
14781494

14791495
def bfill(self, axis=0, inplace=False, limit=None):
14801496
return self.fillna(method='bfill', axis=axis, inplace=inplace,
1481-
limit=limit)
1497+
limit=limit, downcast='infer')
14821498

14831499
def replace(self, to_replace=None, value=None, inplace=False, limit=None,
14841500
regex=False, method=None, axis=None):

pandas/core/index.py

+8
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ class Index(FrozenNDArray):
8383

8484
name = None
8585
asi8 = None
86+
_comparables = ['name']
8687

8788
_engine_type = _index.ObjectEngine
8889

@@ -545,6 +546,13 @@ def equals(self, other):
545546

546547
return np.array_equal(self, other)
547548

549+
def identical(self, other):
550+
"""
551+
Similar to equals, but check that other comparable attributes are also equal
552+
"""
553+
return self.equals(other) and all(
554+
[ getattr(self,c,None) == getattr(other,c,None) for c in self._comparables ])
555+
548556
def asof(self, label):
549557
"""
550558
For a sorted index, return the most recent label up to and including

0 commit comments

Comments
 (0)