Skip to content

Commit 039d2a7

Browse files
committed
ENH: optimize fillna for 2D blocks, add inplace option, GH #788, #786
1 parent 5481c4a commit 039d2a7

File tree

10 files changed

+417
-21
lines changed

10 files changed

+417
-21
lines changed

RELEASE.rst

+10
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,16 @@ Where to get it
2222
* Binary installers on PyPI: http://pypi.python.org/pypi/pandas
2323
* Documentation: http://pandas.pydata.org
2424

25+
pandas 0.7.1
26+
============
27+
28+
**Release date:** NOT YET RELEASED
29+
30+
**Bug fixes**
31+
32+
- Fix memory leak when inserting a large number of columns into a single
33+
DataFrame (#790)
34+
2535
pandas 0.7.0
2636
============
2737

pandas/core/common.py

+7
Original file line numberDiff line numberDiff line change
@@ -562,6 +562,13 @@ def _ensure_object(arr):
562562
arr = arr.astype('O')
563563
return arr
564564

565+
def _clean_fill_method(method):
566+
method = method.lower()
567+
if method == 'ffill':
568+
method = 'pad'
569+
if method == 'bfill':
570+
method = 'backfill'
571+
return method
565572

566573
def save(obj, path):
567574
"""

pandas/core/frame.py

+32-7
Original file line numberDiff line numberDiff line change
@@ -2307,7 +2307,7 @@ def reorder_levels(self, order, axis=0):
23072307
#----------------------------------------------------------------------
23082308
# Filling NA's
23092309

2310-
def fillna(self, value=None, method='pad'):
2310+
def fillna(self, value=None, method='pad', inplace=False):
23112311
"""
23122312
Fill NA/NaN values using the specified method. Member Series /
23132313
TimeSeries are filled separately
@@ -2320,6 +2320,11 @@ def fillna(self, value=None, method='pad'):
23202320
backfill / bfill: use NEXT valid observation to fill gap
23212321
value : any kind (should be same type as array)
23222322
Value to use to fill holes (e.g. 0)
2323+
inplace : boolean, default False
2324+
If True, fill the DataFrame in place. Note: this will modify any
2325+
other views on this DataFrame, like if you took a no-copy slice of
2326+
an existing DataFrame, for example a column in a DataFrame. Returns
2327+
a reference to the filled object, which is self if inplace=True
23232328
23242329
See also
23252330
--------
@@ -2329,18 +2334,38 @@ def fillna(self, value=None, method='pad'):
23292334
-------
23302335
filled : DataFrame
23312336
"""
2337+
from pandas.core.internals import FloatBlock, ObjectBlock
2338+
2339+
self._consolidate_inplace()
2340+
23322341
if value is None:
2333-
result = {}
2334-
series = self._series
2335-
for col, s in series.iteritems():
2336-
result[col] = s.fillna(method=method, value=value)
2337-
return self._constructor(result, index=self.index,
2338-
columns=self.columns)
2342+
new_blocks = []
2343+
2344+
method = com._clean_fill_method(method)
2345+
for block in self._data.blocks:
2346+
if isinstance(block, (FloatBlock, ObjectBlock)):
2347+
newb = block.interpolate(method, inplace=inplace)
2348+
else:
2349+
newb = block if inplace else block.copy()
2350+
new_blocks.append(newb)
2351+
2352+
new_data = BlockManager(new_blocks, self._data.axes)
2353+
2354+
# series = self._series
2355+
# for col, s in series.iteritems():
2356+
# result[col] = s.fillna(method=method, value=value)
2357+
# return self._constructor(result, index=self.index,
2358+
# columns=self.columns)
23392359
else:
23402360
# Float type values
23412361
if len(self.columns) == 0:
23422362
return self
23432363
new_data = self._data.fillna(value)
2364+
2365+
if inplace:
2366+
self._data = new_data
2367+
return self
2368+
else:
23442369
return self._constructor(new_data)
23452370

23462371
#----------------------------------------------------------------------

pandas/core/internals.py

+33
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,39 @@ def fillna(self, value):
209209
new_values.flat[mask] = value
210210
return make_block(new_values, self.items, self.ref_items)
211211

212+
def interpolate(self, method='pad', inplace=False):
    """
    Fill the NA entries of this block's 2D values by padding or backfilling.

    Parameters
    ----------
    method : {'pad', 'ffill', 'backfill', 'bfill'}, default 'pad'
        'pad' / 'ffill' dispatches to _pad, 'backfill' / 'bfill' to _backfill
    inplace : boolean, default False
        If True, operate directly on self.values; otherwise on a copy

    Returns
    -------
    block : result of make_block(values, self.items, self.ref_items)

    Raises
    ------
    NotImplementedError
        If self.values is not 2-dimensional
    ValueError
        If method is not a recognized fill method
    """
    # Validate before copying so a rejected call does no throwaway work.
    if self.values.ndim != 2:
        raise NotImplementedError

    if method in ('pad', 'ffill'):
        fill = _pad
    elif method in ('backfill', 'bfill'):
        fill = _backfill
    else:
        # Previously any unrecognized name silently meant "backfill".
        raise ValueError('Invalid fill method. Expecting pad (ffill) or '
                         'backfill (bfill). Got %s' % (method,))

    values = self.values if inplace else self.values.copy()
    fill(values)

    return make_block(values, self.items, self.ref_items)
224+
225+
def _pad(values):
    """
    Pad the NA entries of a 2D array in place.

    Picks the float64 or object variant of lib.pad_2d_inplace_* based on the
    dtype of `values` and calls it with the null mask viewed as uint8.

    Raises
    ------
    ValueError
        If values is neither a float nor an object array
    """
    filler = None
    if com.is_float_dtype(values):
        filler = lib.pad_2d_inplace_float64
    elif values.dtype == np.object_:
        filler = lib.pad_2d_inplace_object

    if filler is None:
        raise ValueError('Invalid dtype for padding')

    # the fill routines take the boolean null mask reinterpreted as uint8
    na_mask = com.isnull(values).view(np.uint8)
    filler(values, na_mask)
234+
235+
def _backfill(values):
    """
    Backfill the NA entries of a 2D array in place.

    Picks the float64 or object variant of lib.backfill_2d_inplace_* based on
    the dtype of `values` and calls it with the null mask viewed as uint8.

    Raises
    ------
    ValueError
        If values is neither a float nor an object array
    """
    if com.is_float_dtype(values):
        _method = lib.backfill_2d_inplace_float64
    elif values.dtype == np.object_:
        _method = lib.backfill_2d_inplace_object
    else:
        # message previously said "padding", copied over from _pad
        raise ValueError('Invalid dtype for backfilling')

    # the fill routines take the boolean null mask reinterpreted as uint8
    _method(values, com.isnull(values).view(np.uint8))
244+
212245
#-------------------------------------------------------------------------------
213246
# Is this even possible?
214247

pandas/core/series.py

+18-14
Original file line numberDiff line numberDiff line change
@@ -1832,7 +1832,7 @@ def take(self, indices, axis=0):
18321832

18331833
truncate = generic.truncate
18341834

1835-
def fillna(self, value=None, method='pad'):
1835+
def fillna(self, value=None, method='pad', inplace=False):
18361836
"""
18371837
Fill NA/NaN values using the specified method
18381838
@@ -1844,6 +1844,10 @@ def fillna(self, value=None, method='pad'):
18441844
Method to use for filling holes in reindexed Series
18451845
pad / ffill: propagate last valid observation forward to next valid
18461846
backfill / bfill: use NEXT valid observation to fill gap
1847+
inplace : boolean, default False
1848+
If True, fill the Series in place. Note: this will modify any other
1849+
views on this Series, for example a column in a DataFrame. Returns
1850+
a reference to the filled object, which is self if inplace=True
18471851
18481852
See also
18491853
--------
@@ -1853,22 +1857,16 @@ def fillna(self, value=None, method='pad'):
18531857
-------
18541858
filled : Series
18551859
"""
1860+
mask = isnull(self.values)
1861+
18561862
if value is not None:
1857-
newSeries = self.copy()
1858-
newSeries[isnull(newSeries)] = value
1859-
return newSeries
1863+
result = self.copy() if not inplace else self
1864+
np.putmask(result, mask, value)
18601865
else:
18611866
if method is None: # pragma: no cover
18621867
raise ValueError('must specify a fill method')
18631868

1864-
method = method.lower()
1865-
1866-
if method == 'ffill':
1867-
method = 'pad'
1868-
if method == 'bfill':
1869-
method = 'backfill'
1870-
1871-
mask = isnull(self.values)
1869+
method = com._clean_fill_method(method)
18721870

18731871
# sadness. for Python 2.5 compatibility
18741872
mask = mask.astype(np.uint8)
@@ -1878,8 +1876,14 @@ def fillna(self, value=None, method='pad'):
18781876
elif method == 'backfill':
18791877
indexer = lib.get_backfill_indexer(mask)
18801878

1881-
new_values = self.values.take(indexer)
1882-
return Series(new_values, index=self.index, name=self.name)
1879+
if inplace:
1880+
self.values[:] = self.values.take(indexer)
1881+
result = self
1882+
else:
1883+
new_values = self.values.take(indexer)
1884+
result = Series(new_values, index=self.index, name=self.name)
1885+
1886+
return result
18831887

18841888
def isin(self, values):
18851889
"""

pandas/src/generate_code.py

+101
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,105 @@ def pad_%(name)s(ndarray[%(c_type)s] oldIndex,
268268
269269
"""
270270

271+
pad_template = """@cython.boundscheck(False)
272+
@cython.wraparound(False)
273+
def pad_%(name)s(ndarray[%(c_type)s] oldIndex,
274+
ndarray[%(c_type)s] newIndex,
275+
dict oldMap, dict newMap):
276+
cdef Py_ssize_t i, j, oldLength, newLength, curLoc
277+
cdef ndarray[int32_t, ndim=1] fill_vec
278+
cdef Py_ssize_t newPos, oldPos
279+
cdef %(c_type)s prevOld, curOld
280+
281+
oldLength = len(oldIndex)
282+
newLength = len(newIndex)
283+
284+
fill_vec = np.empty(len(newIndex), dtype = np.int32)
285+
fill_vec.fill(-1)
286+
287+
if oldLength == 0 or newLength == 0:
288+
return fill_vec
289+
290+
oldPos = 0
291+
newPos = 0
292+
293+
if newIndex[newLength - 1] < oldIndex[0]:
294+
return fill_vec
295+
296+
while newPos < newLength:
297+
curOld = oldIndex[oldPos]
298+
299+
while newIndex[newPos] < curOld:
300+
newPos += 1
301+
if newPos > newLength - 1:
302+
break
303+
304+
curLoc = oldMap[curOld]
305+
306+
if oldPos == oldLength - 1:
307+
if newIndex[newPos] >= curOld:
308+
fill_vec[newPos:] = curLoc
309+
break
310+
else:
311+
nextOld = oldIndex[oldPos + 1]
312+
done = 0
313+
314+
while newIndex[newPos] < nextOld:
315+
fill_vec[newPos] = curLoc
316+
newPos += 1
317+
318+
if newPos > newLength - 1:
319+
done = 1
320+
break
321+
322+
if done:
323+
break
324+
325+
oldPos += 1
326+
327+
return fill_vec
328+
329+
"""
330+
331+
# Cython source template for the generated ``pad_2d_inplace_<name>`` functions.
# It is %-formatted with ``name`` (function-name suffix) and ``c_type`` (element
# type of ``values``).
#
# The generated function walks each row j of the (K, N) ``values`` array from
# i = 0 upward. Wherever ``mask[j, i]`` is set it overwrites ``values[j, i]``
# with the last unmasked value seen in that row (seeded from ``values[j, 0]``).
# It modifies ``values`` in place and returns nothing.
#
# Bounds checking is disabled, so the N == 0 case is rejected explicitly before
# the ``values[j, 0]`` seed read. The former ``val = np.nan`` initialisation was
# a dead store (``val`` is reassigned at the top of every row) and is dropped.
pad_2d_template = """@cython.boundscheck(False)
@cython.wraparound(False)
def pad_2d_inplace_%(name)s(ndarray[%(c_type)s, ndim=2] values,
                            ndarray[uint8_t, ndim=2] mask):
    cdef Py_ssize_t i, j, N, K
    cdef %(c_type)s val

    K, N = (<object> values).shape

    # nothing to fill, and values[j, 0] would be an out-of-bounds read
    if N == 0:
        return

    for j in range(K):
        val = values[j, 0]
        for i in range(N):
            if mask[j, i]:
                values[j, i] = val
            else:
                val = values[j, i]
"""
350+
351+
# Cython source template for the generated ``backfill_2d_inplace_<name>``
# functions. It is %-formatted with ``name`` (function-name suffix) and
# ``c_type`` (element type of ``values``).
#
# The generated function walks each row j of the (K, N) ``values`` array from
# i = N - 1 down to 0. Wherever ``mask[j, i]`` is set it overwrites
# ``values[j, i]`` with the most recently visited unmasked value in that row
# (seeded from ``values[j, N - 1]``). It modifies ``values`` in place and
# returns nothing.
#
# Bounds checking and negative-index wraparound are disabled, so the N == 0
# case is rejected explicitly before the ``values[j, N - 1]`` seed read.
backfill_2d_template = """@cython.boundscheck(False)
@cython.wraparound(False)
def backfill_2d_inplace_%(name)s(ndarray[%(c_type)s, ndim=2] values,
                                 ndarray[uint8_t, ndim=2] mask):
    cdef Py_ssize_t i, j, N, K
    cdef %(c_type)s val

    K, N = (<object> values).shape

    # nothing to fill, and values[j, N - 1] would index -1 without wraparound
    if N == 0:
        return

    for j in range(K):
        val = values[j, N - 1]
        for i in range(N - 1, -1 , -1):
            if mask[j, i]:
                values[j, i] = val
            else:
                val = values[j, i]
"""
368+
369+
271370
is_monotonic_template = """@cython.boundscheck(False)
272371
@cython.wraparound(False)
273372
def is_monotonic_%(name)s(ndarray[%(c_type)s] arr):
@@ -638,6 +737,8 @@ def generate_from_template(template, ndim=1, exclude=None):
638737
merge_indexer_template,
639738
pad_template,
640739
backfill_template,
740+
pad_2d_template,
741+
backfill_2d_template,
641742
take_1d_template,
642743
is_monotonic_template,
643744
groupby_template,

0 commit comments

Comments
 (0)