Skip to content

Commit c1307b6

Browse files
committed
BUG/ENH: idxmin/idxmax NA behavior should be same as other reductions, refactoring, bugfix in Cython object conversion function
1 parent 2bf7613 commit c1307b6

File tree

9 files changed

+108
-90
lines changed

9 files changed

+108
-90
lines changed

pandas/core/frame.py

+25-19
Original file line numberDiff line numberDiff line change
@@ -2186,7 +2186,8 @@ def _shift_indexer(self, periods):
21862186
#----------------------------------------------------------------------
21872187
# Function application
21882188

2189-
def apply(self, func, axis=0, broadcast=False, raw=False):
2189+
def apply(self, func, axis=0, broadcast=False, raw=False,
2190+
args=(), **kwds):
21902191
"""
21912192
Applies function along input axis of DataFrame. Objects passed to
21922193
functions are Series objects having index either the DataFrame's index
@@ -2207,6 +2208,10 @@ def apply(self, func, axis=0, broadcast=False, raw=False):
22072208
passed function will receive ndarray objects instead. If you are
22082209
just applying a NumPy reduction function this will achieve much
22092210
better performance
2211+
args : tuple
2212+
Positional arguments to pass to function in addition to the
2213+
array/series
2214+
Additional keyword arguments will be passed as keywords to the function
22102215
22112216
Examples
22122217
--------
@@ -2226,26 +2231,31 @@ def apply(self, func, axis=0, broadcast=False, raw=False):
22262231
if len(self.columns) == 0 and len(self.index) == 0:
22272232
return self
22282233

2229-
if isinstance(func, np.ufunc):
2230-
results = func(self.values)
2234+
if kwds or args and not isinstance(func, np.ufunc):
2235+
f = lambda x: func(x, *args, **kwds)
2236+
else:
2237+
f = func
2238+
2239+
if isinstance(f, np.ufunc):
2240+
results = f(self.values)
22312241
return self._constructor(data=results, index=self.index,
22322242
columns=self.columns, copy=False)
22332243
else:
22342244
if not broadcast:
22352245
if not all(self.shape):
2236-
is_reduction = not isinstance(func(_EMPTY_SERIES),
2246+
is_reduction = not isinstance(f(_EMPTY_SERIES),
22372247
np.ndarray)
22382248
if is_reduction:
22392249
return Series(np.nan, index=self._get_agg_axis(axis))
22402250
else:
22412251
return self.copy()
22422252

22432253
if raw and not self._is_mixed_type:
2244-
return self._apply_raw(func, axis)
2254+
return self._apply_raw(f, axis)
22452255
else:
2246-
return self._apply_standard(func, axis)
2256+
return self._apply_standard(f, axis)
22472257
else:
2248-
return self._apply_broadcast(func, axis)
2258+
return self._apply_broadcast(f, axis)
22492259

22502260
def _apply_raw(self, func, axis):
22512261
try:
@@ -2857,12 +2867,10 @@ def idxmin(self, axis=0, skipna=True):
28572867
-------
28582868
idxmin : Series
28592869
"""
2860-
values = self.values.copy()
2861-
if skipna and not issubclass(values.dtype.type, np.integer):
2862-
np.putmask(values, -np.isfinite(values), np.inf)
2863-
argmin_index = self._get_axis(axis)
2864-
return Series([argmin_index[i] for i in values.argmin(axis)],
2865-
index=self._get_agg_axis(axis))
2870+
indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna)
2871+
index = self._get_axis(axis)
2872+
result = [index[i] if i >= 0 else np.nan for i in indices]
2873+
return Series(result, index=self._get_agg_axis(axis))
28662874

28672875
def idxmax(self, axis=0, skipna=True):
28682876
"""
@@ -2881,12 +2889,10 @@ def idxmax(self, axis=0, skipna=True):
28812889
-------
28822890
idxmax : Series
28832891
"""
2884-
values = self.values.copy()
2885-
if skipna and not issubclass(values.dtype.type, np.integer):
2886-
np.putmask(values, -np.isfinite(values), -np.inf)
2887-
argmax_index = self._get_axis(axis)
2888-
return Series([argmax_index[i] for i in values.argmax(axis)],
2889-
index=self._get_agg_axis(axis))
2892+
indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna)
2893+
index = self._get_axis(axis)
2894+
result = [index[i] if i >= 0 else np.nan for i in indices]
2895+
return Series(result, index=self._get_agg_axis(axis))
28902896

28912897
def _agg_by_level(self, name, axis=0, level=0, skipna=True):
28922898
method = getattr(type(self), name)

pandas/core/nanops.py

+42
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,48 @@ def nanprod(values, axis=None, skipna=True, copy=True):
139139
result = values.prod(axis)
140140
return _maybe_null_out(result, axis, mask)
141141

142+
def nanargmax(values, axis=None, skipna=True):
    """
    Position of the maximum of ``values``, with -1 marking the NA case.

    Parameters
    ----------
    values : ndarray
    axis : int or None
        Passed through to ``ndarray.argmax``
    skipna : bool
        Forwarded to ``_maybe_arg_null_out``, which decides which
        positions are reported as -1

    Returns
    -------
    Integer position(s) as produced by ``argmax``, with -1 substituted in
    the NA case
    """
    # ``~`` rather than unary minus: NumPy no longer supports negating a
    # boolean array with ``-``
    mask = ~np.isfinite(values)
    if not issubclass(values.dtype.type, np.integer):
        # operate on a copy so the caller's array is not modified; every
        # non-finite entry becomes -inf so that it loses to any finite value
        values = values.copy()
        np.putmask(values, mask, -np.inf)
    result = values.argmax(axis)
    return _maybe_arg_null_out(result, axis, mask, skipna)
153+
154+
def nanargmin(values, axis=None, skipna=True):
    """
    Position of the minimum of ``values``, with -1 marking the NA case.

    Parameters
    ----------
    values : ndarray
    axis : int or None
        Passed through to ``ndarray.argmin``
    skipna : bool
        Forwarded to ``_maybe_arg_null_out``, which decides which
        positions are reported as -1

    Returns
    -------
    Integer position(s) as produced by ``argmin``, with -1 substituted in
    the NA case
    """
    # ``~`` rather than unary minus: NumPy no longer supports negating a
    # boolean array with ``-``
    mask = ~np.isfinite(values)
    if not issubclass(values.dtype.type, np.integer):
        # operate on a copy so the caller's array is not modified; every
        # non-finite entry becomes +inf so that it loses to any finite value
        values = values.copy()
        np.putmask(values, mask, np.inf)
    result = values.argmin(axis)
    return _maybe_arg_null_out(result, axis, mask, skipna)
165+
166+
def _maybe_arg_null_out(result, axis, mask, skipna):
    """
    Helper for nanargmin/nanargmax: replace positions with -1 in the NA case.

    Parameters
    ----------
    result : integer scalar or ndarray
        Output of ``argmin``/``argmax``
    axis : int or None
        Axis the reduction was taken over
    mask : boolean ndarray
        True where the reduced values were flagged as NA
    skipna : bool
        If True, a position is NA only when every value reduced into it is
        masked; if False, a single masked value is enough

    Returns
    -------
    ``result`` with -1 in the NA positions. An array ``result`` is modified
    in place; a scalar ``result`` is returned unchanged or replaced by -1.
    """
    na_mask = mask.all(axis) if skipna else mask.any(axis)
    if np.ndim(result) == 0:
        # scalar result: either axis is None, or a 1-D input was reduced
        # along axis=0; item assignment is not possible on a scalar
        return -1 if na_mask else result
    if na_mask.any():
        result[na_mask] = -1
    return result
183+
142184
def _get_counts(mask, axis):
143185
if axis is not None:
144186
count = (mask.shape[axis] - mask.sum(axis)).astype(float)

pandas/core/series.py

+8-10
Original file line numberDiff line numberDiff line change
@@ -717,11 +717,10 @@ def idxmin(self, axis=None, out=None, skipna=True):
717717
-------
718718
idxmin : Index of minimum of values
719719
"""
720-
arr = self.values.copy()
721-
if skipna:
722-
if not issubclass(arr.dtype.type, np.integer):
723-
np.putmask(arr, isnull(arr), np.inf)
724-
return self.index[arr.argmin()]
720+
i = nanops.nanargmin(self.values, skipna=skipna)
721+
if i == -1:
722+
return np.nan
723+
return self.index[i]
725724

726725
def idxmax(self, axis=None, out=None, skipna=True):
727726
"""
@@ -736,11 +735,10 @@ def idxmax(self, axis=None, out=None, skipna=True):
736735
-------
737736
idxmax : Index of maximum of values
738737
"""
739-
arr = self.values.copy()
740-
if skipna:
741-
if not issubclass(arr.dtype.type, np.integer):
742-
np.putmask(arr, isnull(arr), -np.inf)
743-
return self.index[arr.argmax()]
738+
i = nanops.nanargmax(self.values, skipna=skipna)
739+
if i == -1:
740+
return np.nan
741+
return self.index[i]
744742

745743
def _agg_by_level(self, name, level=0, skipna=True):
746744
method = getattr(type(self), name)

pandas/src/parsing.pyx

+7-2
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ def maybe_convert_objects(ndarray[object] objects):
126126
bint seen_float = 0
127127
bint seen_int = 0
128128
bint seen_bool = 0
129+
bint seen_object = 0
129130
bint seen_null = 0
130131
object val, onan
131132
float64_t fval, fnan
@@ -164,14 +165,18 @@ def maybe_convert_objects(ndarray[object] objects):
164165
seen_float = 1
165166
except Exception:
166167
pass
168+
else:
169+
seen_object = 1
167170

168171
if seen_null:
169-
if seen_float or seen_int:
172+
if (seen_float or seen_int) and not seen_object:
170173
return floats
171174
else:
172175
return objects
173176
else:
174-
if seen_int:
177+
if seen_object:
178+
return objects
179+
elif seen_int:
175180
return ints
176181
elif seen_float:
177182
return floats

pandas/src/reduce.pyx

+6-7
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ cdef class Reducer:
5959
chunk.data = arr.data
6060
try:
6161
for i in range(self.nresults):
62-
res = self.f(self.dummy)
62+
res = self.f(chunk)
6363
if i == 0:
6464
result = self._get_result_array(res)
6565
it = <flatiter> PyArray_IterNew(result)
@@ -70,19 +70,18 @@ cdef class Reducer:
7070
finally:
7171
# so we don't free the wrong memory
7272
chunk.data = dummy_buf
73-
7473
if result.dtype == np.object_:
7574
result = maybe_convert_objects(result)
76-
7775
return result
7876

7977
def _get_result_array(self, object res):
8078
try:
8179
assert(not isinstance(res, np.ndarray))
82-
if hasattr(res, 'dtype'):
83-
result = np.empty(self.nresults, dtype=res.dtype)
84-
else:
85-
result = np.empty(self.nresults, dtype='O')
80+
result = np.empty(self.nresults, dtype='O')
81+
# if hasattr(res, 'dtype'):
82+
# result = np.empty(self.nresults, dtype=res.dtype)
83+
# else:
84+
# result = np.empty(self.nresults, dtype='O')
8685
result[0] = res
8786
except Exception:
8887
raise ValueError('function does not reduce')

pandas/tests/test_frame.py

+9-47
Original file line numberDiff line numberDiff line change
@@ -3628,66 +3628,28 @@ def test_dot(self):
36283628
assert_frame_equal(result, expected)
36293629

36303630
def test_idxmin(self):
    """DataFrame.idxmin must agree with applying Series.idxmin per axis."""
    frame = self.frame
    # introduce missing data: one slice blanked entirely, and another
    # blanked only in the last two columns
    frame.ix[5:10] = np.nan
    frame.ix[15:20, -2:] = np.nan
    for skipna in [True, False]:
        for axis in [0, 1]:
            for df in [frame, self.intframe]:
                # exercise idxmin here (not idxmax, which test_idxmax covers)
                result = df.idxmin(axis=axis, skipna=skipna)
                expected = df.apply(Series.idxmin, axis=axis, skipna=skipna)
                assert_series_equal(result, expected)

    # an axis number outside 0/1 is expected to raise
    self.assertRaises(Exception, frame.idxmin, axis=2)
36613642

36623643
def test_idxmax(self):
3663-
def validate(f, s, axis, skipna):
3664-
def get_result(f, i, v, axis, skipna):
3665-
if axis == 0:
3666-
return (f[i][v], f[i].max(skipna=skipna))
3667-
else:
3668-
return (f[v][i], f.ix[i].max(skipna=skipna))
3669-
for i, v in s.iteritems():
3670-
(r1, r2) = get_result(f, i, v, axis, skipna)
3671-
if np.isnan(r1) or np.isinf(r1):
3672-
self.assert_(np.isnan(r2) or np.isinf(r2))
3673-
elif np.isnan(r2) or np.isinf(r2):
3674-
self.assert_(np.isnan(r1) or np.isinf(r1))
3675-
else:
3676-
self.assertEqual(r1, r2)
3677-
36783644
frame = self.frame
36793645
frame.ix[5:10] = np.nan
36803646
frame.ix[15:20, -2:] = np.nan
36813647
for skipna in [True, False]:
36823648
for axis in [0, 1]:
3683-
validate(frame,
3684-
frame.idxmax(axis=axis, skipna=skipna),
3685-
axis,
3686-
skipna)
3687-
validate(self.intframe,
3688-
self.intframe.idxmax(axis=axis, skipna=skipna),
3689-
axis,
3690-
skipna)
3649+
for df in [frame, self.intframe]:
3650+
result = df.idxmax(axis=axis, skipna=skipna)
3651+
expected = df.apply(Series.idxmax, axis=axis, skipna=skipna)
3652+
assert_series_equal(result, expected)
36913653

36923654
self.assertRaises(Exception, frame.idxmax, axis=2)
36933655

pandas/tests/test_multilevel.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,8 @@ def _check_counts(frame, axis=0):
294294
for i in range(index.nlevels):
295295
result = frame.count(axis=axis, level=i)
296296
expected = frame.groupby(axis=axis, level=i).count(axis=axis)
297-
assert_frame_equal(result, expected.reindex_like(result))
297+
expected = expected.reindex_like(result).astype('i8')
298+
assert_frame_equal(result, expected)
298299

299300
self.frame.ix[1, [1, 2]] = np.nan
300301
self.frame.ix[7, [0, 1]] = np.nan

pandas/tests/test_series.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -695,7 +695,7 @@ def test_idxmin(self):
695695

696696
# skipna or no
697697
self.assertEqual(self.series[self.series.idxmin()], self.series.min())
698-
self.assert_(isnull(self.series[self.series.idxmin(skipna=False)]))
698+
self.assert_(isnull(self.series.idxmin(skipna=False)))
699699

700700
# no NaNs
701701
nona = self.series.dropna()
@@ -705,7 +705,7 @@ def test_idxmin(self):
705705

706706
# all NaNs
707707
allna = self.series * nan
708-
self.assertEqual(allna.idxmin(), allna.index[0])
708+
self.assert_(isnull(allna.idxmin()))
709709

710710
def test_idxmax(self):
711711
# test idxmax
@@ -716,7 +716,7 @@ def test_idxmax(self):
716716

717717
# skipna or no
718718
self.assertEqual(self.series[self.series.idxmax()], self.series.max())
719-
self.assert_(isnull(self.series[self.series.idxmax(skipna=False)]))
719+
self.assert_(isnull(self.series.idxmax(skipna=False)))
720720

721721
# no NaNs
722722
nona = self.series.dropna()
@@ -726,7 +726,7 @@ def test_idxmax(self):
726726

727727
# all NaNs
728728
allna = self.series * nan
729-
self.assertEqual(allna.idxmax(), allna.index[0])
729+
self.assert_(isnull(allna.idxmax()))
730730

731731
def test_operators_date(self):
732732
result = self.objSeries + timedelta(1)

pandas/tests/test_tseries.py

+5
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,11 @@ def test_duplicated_with_nas():
171171
expected = trues + falses
172172
assert(np.array_equal(result, expected))
173173

174+
def test_convert_objects():
    """An object array mixing strings and NaN must keep object dtype."""
    values = ['a', 'b', np.nan, np.nan, 'd', 'e', 'f']
    converted = lib.maybe_convert_objects(np.array(values, dtype='O'))
    assert converted.dtype == np.object_
178+
174179
class TestMoments(unittest.TestCase):
175180
pass
176181

0 commit comments

Comments
 (0)