Skip to content

Commit 49260b9

Browse files
committed
ENH: fill_value arg in DataFrame.reindex/reindex_axis, add fillna to sparse objects, GH #784
1 parent 8786bf9 commit 49260b9

File tree

10 files changed

+474
-199
lines changed

10 files changed

+474
-199
lines changed

pandas/core/common.py

+21-16
Original file line number | Diff line number | Diff line change
@@ -108,15 +108,15 @@ def _take_1d_bool(arr, indexer, out, fill_value=np.nan):
108108
outview = out.view(np.uint8)
109109
lib.take_1d_bool(view, indexer, outview, fill_value=fill_value)
110110

111-
def _take_2d_axis0_bool(arr, indexer, out):
111+
def _take_2d_axis0_bool(arr, indexer, out, fill_value=np.nan):
112112
view = arr.view(np.uint8)
113113
outview = out.view(np.uint8)
114-
lib.take_2d_axis0_bool(view, indexer, outview)
114+
lib.take_2d_axis0_bool(view, indexer, outview, fill_value=fill_value)
115115

116-
def _take_2d_axis1_bool(arr, indexer, out):
116+
def _take_2d_axis1_bool(arr, indexer, out, fill_value=np.nan):
117117
view = arr.view(np.uint8)
118118
outview = out.view(np.uint8)
119-
lib.take_2d_axis1_bool(view, indexer, outview)
119+
lib.take_2d_axis1_bool(view, indexer, outview, fill_value=fill_value)
120120

121121
_take1d_dict = {
122122
'float64' : lib.take_1d_float64,
@@ -198,7 +198,8 @@ def take_1d(arr, indexer, out=None, fill_value=np.nan):
198198

199199
return out
200200

201-
def take_2d(arr, indexer, out=None, mask=None, needs_masking=None, axis=0):
201+
def take_2d(arr, indexer, out=None, mask=None, needs_masking=None, axis=0,
202+
fill_value=np.nan):
202203
"""
203204
Specialized Cython take which sets NaN values in one pass
204205
"""
@@ -221,19 +222,20 @@ def take_2d(arr, indexer, out=None, mask=None, needs_masking=None, axis=0):
221222
# upcasting may be required
222223
result = arr.take(indexer, axis=axis, out=out)
223224
result = _maybe_mask(result, mask, needs_masking, axis=axis,
224-
out_passed=out is not None)
225+
out_passed=out is not None,
226+
fill_value=fill_value)
225227
return result
226228
else:
227229
if out is None:
228230
out = np.empty(out_shape, dtype=arr.dtype)
229231
take_f = _get_take2d_function(dtype_str, axis=axis)
230-
take_f(arr, indexer, out=out)
232+
take_f(arr, indexer, out=out, fill_value=fill_value)
231233
return out
232234
elif dtype_str in ('float64', 'object'):
233235
if out is None:
234236
out = np.empty(out_shape, dtype=arr.dtype)
235237
take_f = _get_take2d_function(dtype_str, axis=axis)
236-
take_f(arr, indexer, out=out)
238+
take_f(arr, indexer, out=out, fill_value=fill_value)
237239
return out
238240
else:
239241
if mask is None:
@@ -246,34 +248,37 @@ def take_2d(arr, indexer, out=None, mask=None, needs_masking=None, axis=0):
246248

247249
result = arr.take(indexer, axis=axis, out=out)
248250
result = _maybe_mask(result, mask, needs_masking, axis=axis,
249-
out_passed=out is not None)
251+
out_passed=out is not None,
252+
fill_value=fill_value)
250253
return result
251254

252-
def null_out_axis(arr, mask, axis):
255+
def mask_out_axis(arr, mask, axis, fill_value=np.nan):
253256
indexer = [slice(None)] * arr.ndim
254257
indexer[axis] = mask
255258

256-
arr[tuple(indexer)] = np.NaN
259+
arr[tuple(indexer)] = fill_value
257260

258-
def take_fast(arr, indexer, mask, needs_masking, axis=0, out=None):
261+
def take_fast(arr, indexer, mask, needs_masking, axis=0, out=None,
262+
fill_value=np.nan):
259263
if arr.ndim == 2:
260264
return take_2d(arr, indexer, out=out, mask=mask,
261265
needs_masking=needs_masking,
262-
axis=axis)
266+
axis=axis, fill_value=fill_value)
263267

264268
result = arr.take(indexer, axis=axis, out=out)
265269
result = _maybe_mask(result, mask, needs_masking, axis=axis,
266-
out_passed=out is not None)
270+
out_passed=out is not None, fill_value=fill_value)
267271
return result
268272

269-
def _maybe_mask(result, mask, needs_masking, axis=0, out_passed=False):
273+
def _maybe_mask(result, mask, needs_masking, axis=0, out_passed=False,
274+
fill_value=np.nan):
270275
if needs_masking:
271276
if out_passed and _need_upcast(result):
272277
raise Exception('incompatible type for NAs')
273278
else:
274279
# a bit spaghettified
275280
result = _maybe_upcast(result)
276-
null_out_axis(result, mask, axis)
281+
mask_out_axis(result, mask, axis, fill_value)
277282
return result
278283

279284
def _maybe_upcast(values):

pandas/core/frame.py

+39-35
Original file line number | Diff line number | Diff line change
@@ -1693,7 +1693,7 @@ def lookup(self, row_labels, col_labels):
16931693
# Reindexing and alignment
16941694

16951695
def align(self, other, join='outer', axis=None, level=None, copy=True,
1696-
fill_value=None, method=None):
1696+
fill_value=np.nan, method=None):
16971697
"""
16981698
Align two DataFrame objects on their index and columns with the
16991699
specified join method for each axis Index
@@ -1710,7 +1710,9 @@ def align(self, other, join='outer', axis=None, level=None, copy=True,
17101710
copy : boolean, default True
17111711
Always returns new objects. If copy=False and no reindexing is
17121712
required then original objects are returned.
1713-
fill_value : object, default None
1713+
fill_value : scalar, default np.NaN
1714+
Value to use for missing values. Defaults to NaN, but can be any
1715+
"compatible" value
17141716
method : str, default None
17151717
17161718
Returns
@@ -1730,7 +1732,7 @@ def align(self, other, join='outer', axis=None, level=None, copy=True,
17301732
raise TypeError('unsupported type: %s' % type(other))
17311733

17321734
def _align_frame(self, other, join='outer', axis=None, level=None,
1733-
copy=True, fill_value=None, method=None):
1735+
copy=True, fill_value=np.nan, method=None):
17341736
# defaults
17351737
join_index, join_columns = None, None
17361738
ilidx, iridx = None, None
@@ -1749,15 +1751,17 @@ def _align_frame(self, other, join='outer', axis=None, level=None,
17491751
return_indexers=True)
17501752

17511753
left = self._reindex_with_indexers(join_index, ilidx,
1752-
join_columns, clidx, copy)
1754+
join_columns, clidx, copy,
1755+
fill_value=fill_value)
17531756
right = other._reindex_with_indexers(join_index, iridx,
1754-
join_columns, cridx, copy)
1755-
fill_na = (fill_value is not None) or (method is not None)
1756-
if fill_na:
1757-
return (left.fillna(fill_value, method=method),
1758-
right.fillna(fill_value, method=method))
1759-
else:
1760-
return left, right
1757+
join_columns, cridx, copy,
1758+
fill_value=fill_value)
1759+
1760+
if method is not None:
1761+
left = left.fillna(method=method)
1762+
right = right.fillna(method=method)
1763+
1764+
return left, right
17611765

17621766
def _align_series(self, other, join='outer', axis=None, level=None,
17631767
copy=True, fill_value=None, method=None):
@@ -1798,7 +1802,7 @@ def _align_series(self, other, join='outer', axis=None, level=None,
17981802
return left_result, right_result
17991803

18001804
def reindex(self, index=None, columns=None, method=None, level=None,
1801-
copy=True):
1805+
fill_value=np.nan, copy=True):
18021806
"""Conform DataFrame to new index with optional filling logic, placing
18031807
NA/NaN in locations having no value in the previous index. A new object
18041808
is produced unless the new index is equivalent to the current one and
@@ -1820,6 +1824,9 @@ def reindex(self, index=None, columns=None, method=None, level=None,
18201824
level : int or name
18211825
Broadcast across a level, matching Index values on the
18221826
passed MultiIndex level
1827+
fill_value : scalar, default np.NaN
1828+
Value to use for missing values. Defaults to NaN, but can be any
1829+
"compatible" value
18231830
18241831
Examples
18251832
--------
@@ -1833,14 +1840,15 @@ def reindex(self, index=None, columns=None, method=None, level=None,
18331840
frame = self
18341841

18351842
if index is not None:
1836-
frame = frame._reindex_index(index, method, copy, level)
1843+
frame = frame._reindex_index(index, method, copy, level, fill_value)
18371844

18381845
if columns is not None:
1839-
frame = frame._reindex_columns(columns, copy, level)
1846+
frame = frame._reindex_columns(columns, copy, level, fill_value)
18401847

18411848
return frame
18421849

1843-
def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True):
1850+
def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
1851+
fill_value=np.nan):
18441852
"""Conform DataFrame to new index with optional filling logic, placing
18451853
NA/NaN in locations having no value in the previous index. A new object
18461854
is produced unless the new index is equivalent to the current one and
@@ -1878,42 +1886,45 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True):
18781886
"""
18791887
self._consolidate_inplace()
18801888
if axis == 0:
1881-
df = self._reindex_index(labels, method, copy, level)
1882-
return df
1889+
return self._reindex_index(labels, method, copy, level,
1890+
fill_value=fill_value)
18831891
elif axis == 1:
1884-
df = self._reindex_columns(labels, copy, level)
1885-
return df
1892+
return self._reindex_columns(labels, copy, level,
1893+
fill_value=fill_value)
18861894
else: # pragma: no cover
18871895
raise ValueError('Must specify axis=0 or 1')
18881896

1889-
def _reindex_index(self, new_index, method, copy, level):
1897+
def _reindex_index(self, new_index, method, copy, level, fill_value=np.nan):
18901898
if level is not None:
18911899
assert(isinstance(new_index, MultiIndex))
18921900
new_index, indexer = self.index.reindex(new_index, method, level)
18931901
return self._reindex_with_indexers(new_index, indexer, None, None,
1894-
copy)
1902+
copy, fill_value)
18951903

1896-
def _reindex_columns(self, new_columns, copy, level):
1904+
def _reindex_columns(self, new_columns, copy, level, fill_value=np.nan):
18971905
if level is not None:
18981906
assert(isinstance(new_columns, MultiIndex))
18991907
new_columns, indexer = self.columns.reindex(new_columns, level=level)
19001908
return self._reindex_with_indexers(None, None, new_columns, indexer,
1901-
copy)
1909+
copy, fill_value)
19021910

19031911
def _reindex_with_indexers(self, index, row_indexer, columns, col_indexer,
1904-
copy):
1912+
copy, fill_value):
19051913
new_data = self._data
19061914
if row_indexer is not None:
1907-
new_data = new_data.reindex_indexer(index, row_indexer, axis=1)
1915+
new_data = new_data.reindex_indexer(index, row_indexer, axis=1,
1916+
fill_value=fill_value)
19081917
elif index is not None and index is not new_data.axes[1]:
19091918
new_data = new_data.copy(deep=copy)
19101919
new_data.axes[1] = index
19111920

19121921
if col_indexer is not None:
19131922
# TODO: speed up on homogeneous DataFrame objects
1914-
new_data = new_data.reindex_indexer(columns, col_indexer, axis=0)
1923+
new_data = new_data.reindex_indexer(columns, col_indexer, axis=0,
1924+
fill_value=fill_value)
19151925
elif columns is not None and columns is not new_data.axes[0]:
1916-
new_data = new_data.reindex_items(columns, copy=copy)
1926+
new_data = new_data.reindex_items(columns, copy=copy,
1927+
fill_value=fill_value)
19171928

19181929
if copy and new_data is self._data:
19191930
new_data = new_data.copy()
@@ -2361,8 +2372,7 @@ def reorder_levels(self, order, axis=0):
23612372

23622373
def fillna(self, value=None, method='pad', inplace=False):
23632374
"""
2364-
Fill NA/NaN values using the specified method. Member Series /
2365-
TimeSeries are filled separately
2375+
Fill NA/NaN values using the specified method
23662376
23672377
Parameters
23682378
----------
@@ -2402,12 +2412,6 @@ def fillna(self, value=None, method='pad', inplace=False):
24022412
new_blocks.append(newb)
24032413

24042414
new_data = BlockManager(new_blocks, self._data.axes)
2405-
2406-
# series = self._series
2407-
# for col, s in series.iteritems():
2408-
# result[col] = s.fillna(method=method, value=value)
2409-
# return self._constructor(result, index=self.index,
2410-
# columns=self.columns)
24112415
else:
24122416
# Float type values
24132417
if len(self.columns) == 0:

pandas/core/internals.py

+21-13
Original file line number | Diff line number | Diff line change
@@ -101,18 +101,20 @@ def merge(self, other):
101101
# union_ref = self.ref_items + other.ref_items
102102
return _merge_blocks([self, other], self.ref_items)
103103

104-
def reindex_axis(self, indexer, mask, needs_masking, axis=0):
104+
def reindex_axis(self, indexer, mask, needs_masking, axis=0,
105+
fill_value=np.nan):
105106
"""
106107
Reindex using pre-computed indexer information
107108
"""
108109
if self.values.size > 0:
109110
new_values = com.take_fast(self.values, indexer, mask,
110-
needs_masking, axis=axis)
111+
needs_masking, axis=axis,
112+
fill_value=fill_value)
111113
else:
112114
shape = list(self.shape)
113115
shape[axis] = len(indexer)
114116
new_values = np.empty(shape)
115-
new_values.fill(np.nan)
117+
new_values.fill(fill_value)
116118
return make_block(new_values, self.items, self.ref_items)
117119

118120
def reindex_items_from(self, new_ref_items, copy=True):
@@ -730,12 +732,12 @@ def reindex_axis(self, new_axis, method=None, axis=0, copy=True):
730732
new_axis, indexer = cur_axis.reindex(new_axis, method)
731733
return self.reindex_indexer(new_axis, indexer, axis=axis)
732734

733-
def reindex_indexer(self, new_axis, indexer, axis=1):
735+
def reindex_indexer(self, new_axis, indexer, axis=1, fill_value=np.nan):
734736
"""
735737
pandas-indexer with -1's only.
736738
"""
737739
if axis == 0:
738-
return self._reindex_indexer_items(new_axis, indexer)
740+
return self._reindex_indexer_items(new_axis, indexer, fill_value)
739741

740742
mask = indexer == -1
741743

@@ -745,14 +747,14 @@ def reindex_indexer(self, new_axis, indexer, axis=1):
745747
new_blocks = []
746748
for block in self.blocks:
747749
newb = block.reindex_axis(indexer, mask, needs_masking,
748-
axis=axis)
750+
axis=axis, fill_value=fill_value)
749751
new_blocks.append(newb)
750752

751753
new_axes = list(self.axes)
752754
new_axes[axis] = new_axis
753755
return BlockManager(new_blocks, new_axes)
754756

755-
def _reindex_indexer_items(self, new_items, indexer):
757+
def _reindex_indexer_items(self, new_items, indexer, fill_value):
756758
# TODO: less efficient than I'd like
757759

758760
item_order = com.take_1d(self.items.values, indexer)
@@ -778,13 +780,14 @@ def _reindex_indexer_items(self, new_items, indexer):
778780

779781
if not mask.all():
780782
na_items = new_items[-mask]
781-
na_block = self._make_na_block(na_items, new_items)
783+
na_block = self._make_na_block(na_items, new_items,
784+
fill_value=fill_value)
782785
new_blocks.append(na_block)
783786
new_blocks = _consolidate(new_blocks, new_items)
784787

785788
return BlockManager(new_blocks, [new_items] + self.axes[1:])
786789

787-
def reindex_items(self, new_items, copy=True):
790+
def reindex_items(self, new_items, copy=True, fill_value=np.nan):
788791
"""
789792
790793
"""
@@ -814,17 +817,22 @@ def reindex_items(self, new_items, copy=True):
814817
mask = indexer == -1
815818
if mask.any():
816819
extra_items = new_items[mask]
817-
na_block = self._make_na_block(extra_items, new_items)
820+
na_block = self._make_na_block(extra_items, new_items,
821+
fill_value=fill_value)
818822
new_blocks.append(na_block)
819823
new_blocks = _consolidate(new_blocks, new_items)
820824

821825
return BlockManager(new_blocks, [new_items] + self.axes[1:])
822826

823-
def _make_na_block(self, items, ref_items):
827+
def _make_na_block(self, items, ref_items, fill_value=np.nan):
828+
# TODO: infer dtypes other than float64 from fill_value
829+
824830
block_shape = list(self.shape)
825831
block_shape[0] = len(items)
826-
block_values = np.empty(block_shape, dtype=np.float64)
827-
block_values.fill(nan)
832+
833+
dtype = com._infer_dtype(fill_value)
834+
block_values = np.empty(block_shape, dtype=dtype)
835+
block_values.fill(fill_value)
828836
na_block = make_block(block_values, items, ref_items,
829837
do_integrity_check=True)
830838
return na_block

0 commit comments

Comments
 (0)