Skip to content

Commit c871050

Browse files
committed
ENH: Optimize take_*; improve non-NA fill_value support
1 parent 119c2e1 commit c871050

File tree

11 files changed

+4575
-4292
lines changed

11 files changed

+4575
-4292
lines changed

pandas/algos.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -2018,7 +2018,7 @@ def group_median(ndarray[float64_t, ndim=2] out,
20182018
data = np.empty((K, N), dtype=np.float64)
20192019
ptr = <float64_t*> data.data
20202020

2021-
take_2d_axis1_float64(values.T, indexer, out=data)
2021+
take_2d_axis1_float64_float64(values.T, indexer, out=data)
20222022

20232023
for i in range(K):
20242024
# exclude NA group

pandas/core/common.py

+360-211
Large diffs are not rendered by default.

pandas/core/frame.py

+15-18
Original file line numberDiff line numberDiff line change
@@ -2107,10 +2107,6 @@ def __setitem__(self, key, value):
21072107
def _boolean_set(self, key, value):
21082108
if key.values.dtype != np.bool_:
21092109
raise ValueError('Must pass DataFrame with boolean values only')
2110-
2111-
if self._is_mixed_type:
2112-
raise ValueError('Cannot do boolean setting on mixed-type frame')
2113-
21142110
self.where(-key, value, inplace=True)
21152111

21162112
def _set_item_multiple(self, keys, value):
@@ -2928,7 +2924,7 @@ def take(self, indices, axis=0):
29282924
new_columns = self.columns.take(indices)
29292925
return self.reindex(columns=new_columns)
29302926
else:
2931-
new_values = com.take_2d(self.values,
2927+
new_values = com.take_nd(self.values,
29322928
com._ensure_int64(indices),
29332929
axis=axis)
29342930
if axis == 0:
@@ -5229,16 +5225,19 @@ def where(self, cond, other=NA, inplace=False, try_cast=False, raise_on_error=Tr
52295225
52305226
Parameters
52315227
----------
5232-
cond: boolean DataFrame or array
5233-
other: scalar or DataFrame
5234-
inplace: perform the operation in place on the data
5235-
try_cast: try to cast the result back to the input type (if possible), defaults to False
5236-
raise_on_error: should I raise on invalid data types (e.g. trying to where on strings),
5237-
defaults to True
5228+
cond : boolean DataFrame or array
5229+
other : scalar or DataFrame
5230+
inplace : boolean, default False
5231+
Whether to perform the operation in place on the data
5232+
try_cast : boolean, default False
5233+
Try to cast the result back to the input type (if possible)
5234+
raise_on_error : boolean, default True
5235+
Whether to raise on invalid data types (e.g. trying to where on
5236+
strings)
52385237
52395238
Returns
52405239
-------
5241-
wh: DataFrame
5240+
wh : DataFrame
52425241
"""
52435242
if not hasattr(cond, 'shape'):
52445243
raise ValueError('where requires an ndarray like object for its '
@@ -5263,18 +5262,16 @@ def where(self, cond, other=NA, inplace=False, try_cast=False, raise_on_error=Tr
52635262
if isinstance(other, DataFrame):
52645263
_, other = self.align(other, join='left', fill_value=NA)
52655264
elif isinstance(other,np.ndarray):
5266-
5267-
if other.shape[0] != len(self.index) or other.shape[1] != len(self.columns):
5268-
raise ValueError('other must be the same shape as self when an ndarray')
5269-
other = DataFrame(other,self.index,self.columns)
5265+
if other.shape != self.shape:
5266+
raise ValueError('other must be the same shape as self '
5267+
'when an ndarray')
5268+
other = DataFrame(other, self.index, self.columns)
52705269

52715270
if inplace:
5272-
52735271
# we may have different type blocks come out of putmask, so reconstruct the block manager
52745272
self._data = self._data.putmask(cond,other,inplace=True)
52755273

52765274
else:
5277-
52785275
func = lambda values, others, conds: np.where(conds, values, others)
52795276
new_data = self._data.where(func, other, cond, raise_on_error=raise_on_error, try_cast=try_cast)
52805277

pandas/core/internals.py

+36-45
Original file line numberDiff line numberDiff line change
@@ -125,15 +125,9 @@ def reindex_axis(self, indexer, mask, needs_masking, axis=0,
125125
"""
126126
Reindex using pre-computed indexer information
127127
"""
128-
if self.values.size > 0:
129-
new_values = com.take_fast(self.values, indexer, mask,
130-
needs_masking, axis=axis,
131-
fill_value=fill_value)
132-
else:
133-
shape = list(self.shape)
134-
shape[axis] = len(indexer)
135-
new_values = np.empty(shape)
136-
new_values.fill(fill_value)
128+
new_values = com.take_fast(self.values, indexer,
129+
mask, needs_masking, axis=axis,
130+
fill_value=fill_value)
137131
return make_block(new_values, self.items, self.ref_items)
138132

139133
def reindex_items_from(self, new_ref_items, copy=True):
@@ -155,12 +149,9 @@ def reindex_items_from(self, new_ref_items, copy=True):
155149
mask = indexer != -1
156150
masked_idx = indexer[mask]
157151

158-
if self.values.ndim == 2:
159-
new_values = com.take_2d(self.values, masked_idx, axis=0,
160-
needs_masking=False)
161-
else:
162-
new_values = self.values.take(masked_idx, axis=0)
163-
152+
new_values = com.take_fast(self.values, masked_idx,
153+
mask=None, needs_masking=False,
154+
axis=0)
164155
new_items = self.items.take(masked_idx)
165156
return make_block(new_values, new_items, new_ref_items)
166157

@@ -301,24 +292,23 @@ def putmask(self, mask, new, inplace=False):
301292
new_values = self.values if inplace else self.values.copy()
302293

303294
# may need to align the new
304-
if hasattr(new,'reindex_axis'):
305-
axis = getattr(new,'_het_axis',0)
295+
if hasattr(new, 'reindex_axis'):
296+
axis = getattr(new, '_het_axis', 0)
306297
new = new.reindex_axis(self.items, axis=axis, copy=False).values.T
307298

308299
# may need to align the mask
309-
if hasattr(mask,'reindex_axis'):
310-
axis = getattr(mask,'_het_axis',0)
300+
if hasattr(mask, 'reindex_axis'):
301+
axis = getattr(mask, '_het_axis', 0)
311302
mask = mask.reindex_axis(self.items, axis=axis, copy=False).values.T
312303

313304
if self._can_hold_element(new):
314305
new = self._try_cast(new)
315306
np.putmask(new_values, mask, new)
316-
317307
# upcast me
318308
else:
319-
320309
# type of the new block
321-
if isinstance(new,np.ndarray) and issubclass(new.dtype,np.number) or issubclass(type(new),float):
310+
if ((isinstance(new, np.ndarray) and issubclass(new.dtype, np.number)) or
311+
isinstance(new, float)):
322312
typ = float
323313
else:
324314
typ = object
@@ -369,9 +359,8 @@ def interpolate(self, method='pad', axis=0, inplace=False,
369359
def take(self, indexer, axis=1, fill_value=np.nan):
370360
if axis < 1:
371361
raise AssertionError('axis must be at least 1, got %d' % axis)
372-
new_values = com.take_fast(self.values, indexer, None,
373-
None, axis=axis,
374-
fill_value=fill_value)
362+
new_values = com.take_fast(self.values, indexer, None, False,
363+
axis=axis, fill_value=fill_value)
375364
return make_block(new_values, self.items, self.ref_items)
376365

377366
def get_values(self, dtype):
@@ -401,22 +390,21 @@ def where(self, func, other, cond = None, raise_on_error = True, try_cast = Fals
401390
402391
Parameters
403392
----------
404-
func : how to combine self,other
393+
func : how to combine self, other
405394
other : a ndarray/object
406395
cond : the condition to respect, optional
407-
raise_on_error : if True, raise when I can't perform the function, False by default (and just return
408-
the data that we had coming in)
396+
raise_on_error : if True, raise when I can't perform the function,
397+
True by default (if False, just return the data that we had coming in)
409398
410399
Returns
411400
-------
412401
a new block, the result of the func
413402
"""
414-
415403
values = self.values
416404

417405
# see if we can align other
418-
if hasattr(other,'reindex_axis'):
419-
axis = getattr(other,'_het_axis',0)
406+
if hasattr(other, 'reindex_axis'):
407+
axis = getattr(other, '_het_axis', 0)
420408
other = other.reindex_axis(self.items, axis=axis, copy=True).values
421409

422410
# make sure that we can broadcast
@@ -428,17 +416,20 @@ def where(self, func, other, cond = None, raise_on_error = True, try_cast = Fals
428416

429417
# see if we can align cond
430418
if cond is not None:
431-
if not hasattr(cond,'shape'):
432-
raise ValueError("where must have a condition that is ndarray like")
433-
if hasattr(cond,'reindex_axis'):
434-
axis = getattr(cond,'_het_axis',0)
435-
cond = cond.reindex_axis(self.items, axis=axis, copy=True).values
419+
if not hasattr(cond, 'shape'):
420+
raise ValueError('where must have a condition that is ndarray'
421+
' like')
422+
if hasattr(cond, 'reindex_axis'):
423+
axis = getattr(cond, '_het_axis', 0)
424+
cond = cond.reindex_axis(self.items, axis=axis,
425+
copy=True).values
436426
else:
437427
cond = cond.values
438428

439429
# may need to undo transpose of values
440430
if hasattr(values, 'ndim'):
441-
if values.ndim != cond.ndim or values.shape == cond.shape[::-1]:
431+
if (values.ndim != cond.ndim or
432+
values.shape == cond.shape[::-1]):
442433
values = values.T
443434
is_transposed = not is_transposed
444435

@@ -494,7 +485,7 @@ class FloatBlock(NumericBlock):
494485

495486
def _can_hold_element(self, element):
496487
if isinstance(element, np.ndarray):
497-
return issubclass(element.dtype.type, (np.floating,np.integer))
488+
return issubclass(element.dtype.type, (np.floating, np.integer))
498489
return isinstance(element, (float, int))
499490

500491
def _try_cast(self, element):
@@ -541,7 +532,8 @@ def _try_cast(self, element):
541532
def _try_cast_result(self, result):
542533
# this is quite restrictive to convert
543534
try:
544-
if isinstance(result, np.ndarray) and issubclass(result.dtype.type, np.floating):
535+
if (isinstance(result, np.ndarray) and
536+
issubclass(result.dtype.type, np.floating)):
545537
if com.notnull(result).all():
546538
new_result = result.astype(self.dtype)
547539
if (new_result == result).all():
@@ -958,7 +950,8 @@ def _get_clean_block_types(self, type_list):
958950
return type_list
959951

960952
def get_bool_data(self, copy=False, as_blocks=False):
961-
return self.get_numeric_data(copy=copy, type_list=(BoolBlock,), as_blocks=as_blocks)
953+
return self.get_numeric_data(copy=copy, type_list=(BoolBlock,),
954+
as_blocks=as_blocks)
962955

963956
def get_slice(self, slobj, axis=0):
964957
new_axes = list(self.axes)
@@ -1429,7 +1422,7 @@ def take(self, indexer, axis=1):
14291422
if axis == 0:
14301423
raise NotImplementedError
14311424

1432-
indexer = np.asarray(indexer, dtype='i4')
1425+
indexer = com._ensure_platform_int(indexer)
14331426

14341427
n = len(self.axes[axis])
14351428
if ((indexer == -1) | (indexer >= n)).any():
@@ -1440,8 +1433,8 @@ def take(self, indexer, axis=1):
14401433
new_axes[axis] = self.axes[axis].take(indexer)
14411434
new_blocks = []
14421435
for blk in self.blocks:
1443-
new_values = com.take_fast(blk.values, indexer,
1444-
None, False, axis=axis)
1436+
new_values = com.take_fast(blk.values, indexer, None, False,
1437+
axis=axis)
14451438
newb = make_block(new_values, blk.items, self.items)
14461439
new_blocks.append(newb)
14471440

@@ -1765,8 +1758,6 @@ def _consolidate(blocks, items):
17651758
return new_blocks
17661759

17671760

1768-
# TODO: this could be much optimized
1769-
17701761
def _merge_blocks(blocks, items):
17711762
if len(blocks) == 1:
17721763
return blocks[0]

pandas/core/reshape.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def _make_sorted_values_labels(self):
9393
indexer = algos.groupsort_indexer(comp_index, ngroups)[0]
9494
indexer = _ensure_platform_int(indexer)
9595

96-
self.sorted_values = com.take_2d(self.values, indexer, axis=0)
96+
self.sorted_values = com.take_nd(self.values, indexer, axis=0)
9797
self.sorted_labels = [l.take(indexer) for l in to_sort]
9898

9999
def _make_selectors(self):
@@ -136,7 +136,7 @@ def get_result(self):
136136
# rare case, level values not observed
137137
if len(obs_ids) < self.full_shape[1]:
138138
inds = (value_mask.sum(0) > 0).nonzero()[0]
139-
values = com.take_2d(values, inds, axis=1)
139+
values = com.take_nd(values, inds, axis=1)
140140
columns = columns[inds]
141141

142142
return DataFrame(values, index=index, columns=columns)

pandas/core/series.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -794,7 +794,9 @@ def convert_objects(self, convert_dates=True, convert_numeric=True):
794794
converted : Series
795795
"""
796796
if self.dtype == np.object_:
797-
return Series(com._possibly_convert_objects(self.values,convert_dates=convert_dates,convert_numeric=convert_numeric), index=self.index, name=self.name)
797+
return Series(com._possibly_convert_objects(self.values,
798+
convert_dates=convert_dates, convert_numeric=convert_numeric),
799+
index=self.index, name=self.name)
798800
return self.copy()
799801

800802
def repeat(self, reps):

0 commit comments

Comments
 (0)