Skip to content

Commit b3f2444

Browse files
committed
API: provide automatic dtype conversions on _reduce operations
API: exclude non-numerics if mixed types in _reduce operations BUG: timedelta fixes CLN: small cleaning in nanops.py BUG: allow _reduce to call .apply for certain operations when the whole block fails via a reduce exception
1 parent df791a7 commit b3f2444

File tree

9 files changed

+122
-26
lines changed

9 files changed

+122
-26
lines changed

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,8 @@ pandas 0.13
157157
- Remove undocumented/unused ``kind`` keyword argument from ``read_excel``, and ``ExcelFile``. (:issue:`4713`, :issue:`4712`)
158158
- The ``method`` argument of ``NDFrame.replace()`` is valid again, so that a
159159
list can be passed to ``to_replace`` (:issue:`4743`).
160+
- provide automatic dtype conversions on _reduce operations (:issue:`3371`)
161+
- exclude non-numerics if mixed types with datelike in _reduce operations (:issue:`3371`)
160162

161163
**Internal Refactoring**
162164

pandas/core/common.py

+50-1
Original file line numberDiff line numberDiff line change
@@ -705,6 +705,54 @@ def diff(arr, n, axis=0):
705705
return out_arr
706706

707707

708+
def _coerce_scalar_to_timedelta_type(r):
    """Coerce a scalar to a ``timedelta64[ns]`` value.

    Kludgy until there is a proper timedelta scalar type.

    Parameters
    ----------
    r : integer, datetime.timedelta or np.timedelta64
        Integers are treated as a nanosecond count (they are divided by
        1000 to build a microsecond-resolution ``timedelta``).

    Returns
    -------
    The value as ``np.timedelta64`` cast to ``timedelta64[ns]``.  When
    running on numpy < 1.7 and ``compat.PY3`` is false, the
    ``datetime.timedelta`` is returned as-is instead.

    Raises
    ------
    AssertionError
        If ``r`` is not one of the accepted types.
    """
    if is_integer(r):
        r = timedelta(microseconds=r/1000)

    # handle the numpy < 1.7 case
    if _np_version_under1p7:
        if not isinstance(r, timedelta):
            raise AssertionError("Invalid type for timedelta scalar: %s" % type(r))
        if not compat.PY3:
            return r

        # Build the nanosecond count from the timedelta's integer fields.
        # total_seconds() already contains the microsecond fraction, so
        # adding r.microseconds on top of it would count it twice, and the
        # float multiply could lose precision for large values.
        nanos = ((r.days * 86400 + r.seconds) * 1000000000
                 + r.microseconds * 1000)
        r = np.timedelta64(nanos)

    if isinstance(r, timedelta):
        r = np.timedelta64(r)
    elif not isinstance(r, np.timedelta64):
        raise AssertionError("Invalid type for timedelta scalar: %s" % type(r))
    return r.astype('timedelta64[ns]')
729+
730+
def _coerce_to_dtypes(result, dtypes):
    """ given a dtypes and a result set, coerce the result elements to the dtypes

    Parameters
    ----------
    result : sequence of scalars
    dtypes : sequence of dtypes, paired positionally with ``result``

    Returns
    -------
    np.ndarray built from the converted elements.  Conversion is
    best-effort: null values and values whose conversion raises are
    passed through unchanged.

    Raises
    ------
    AssertionError
        If ``result`` and ``dtypes`` differ in length.
    """
    if len(result) != len(dtypes):
        raise AssertionError("_coerce_to_dtypes requires equal len arrays")

    def conv(r, dtype):
        # convert a single element according to its target dtype
        try:
            if isnull(r):
                pass
            elif dtype == _NS_DTYPE:
                r = Timestamp(r)
            elif dtype == _TD_DTYPE:
                r = _coerce_scalar_to_timedelta_type(r)
            elif dtype == np.bool_:
                r = bool(r)
            elif dtype.kind == 'f':
                r = float(r)
            elif dtype.kind == 'i':
                r = int(r)
        except Exception:
            # conversion failures are deliberately ignored (the element is
            # kept as-is); catching Exception rather than using a bare
            # except lets KeyboardInterrupt/SystemExit propagate
            pass

        return r

    return np.array([conv(r, dtype) for r, dtype in zip(result, dtypes)])
755+
708756
def _infer_dtype_from_scalar(val):
709757
""" interpret the dtype from a scalar, upcast floats and ints
710758
return the new value and the dtype """
@@ -1288,7 +1336,7 @@ def _possibly_cast_to_timedelta(value, coerce=True):
12881336
# coercion compatibility
12891337
if coerce == 'compat' and _np_version_under1p7:
12901338

1291-
def convert(td, type):
1339+
def convert(td, dtype):
12921340

12931341
# we have an array with a non-object dtype
12941342
if hasattr(td,'item'):
@@ -1317,6 +1365,7 @@ def convert(td, type):
13171365
# < 1.7 coercion
13181366
if not is_list_like(value):
13191367
value = np.array([ value ])
1368+
13201369
dtype = value.dtype
13211370
return np.array([ convert(v,dtype) for v in value ], dtype='m8[ns]')
13221371

pandas/core/frame.py

+19-3
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@
2323

2424
from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
2525
_default_index, _maybe_upcast, _is_sequence,
26-
_infer_dtype_from_scalar, _values_from_object)
26+
_infer_dtype_from_scalar, _values_from_object,
27+
_coerce_to_dtypes, _DATELIKE_DTYPES)
2728
from pandas.core.generic import NDFrame
2829
from pandas.core.index import Index, MultiIndex, _ensure_index
2930
from pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels,
@@ -4235,11 +4236,24 @@ def _reduce(self, op, axis=0, skipna=True, numeric_only=None,
42354236
axis = self._get_axis_number(axis)
42364237
f = lambda x: op(x, axis=axis, skipna=skipna, **kwds)
42374238
labels = self._get_agg_axis(axis)
4239+
4240+
# exclude timedelta/datetime unless we are uniform types
4241+
if axis == 1 and self._is_mixed_type and len(set(self.dtypes) & _DATELIKE_DTYPES):
4242+
numeric_only = True
4243+
42384244
if numeric_only is None:
42394245
try:
42404246
values = self.values
42414247
result = f(values)
42424248
except Exception as e:
4249+
4250+
# try by-column first
4251+
if filter_type is None and axis == 0:
4252+
try:
4253+
return self.apply(f).iloc[0]
4254+
except:
4255+
pass
4256+
42434257
if filter_type is None or filter_type == 'numeric':
42444258
data = self._get_numeric_data()
42454259
elif filter_type == 'bool':
@@ -4273,9 +4287,11 @@ def _reduce(self, op, axis=0, skipna=True, numeric_only=None,
42734287
result = result.astype(np.float64)
42744288
elif filter_type == 'bool' and notnull(result).all():
42754289
result = result.astype(np.bool_)
4276-
# otherwise, accept it
42774290
except (ValueError, TypeError):
4278-
pass
4291+
4292+
# try to coerce to the original dtypes item by item if we can
4293+
if axis == 0:
4294+
result = com._coerce_to_dtypes(result, self.dtypes)
42794295

42804296
return Series(result, index=labels)
42814297

pandas/core/generic.py

+15-4
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,9 @@
2020
_infer_dtype_from_scalar, _maybe_promote,
2121
ABCSeries)
2222

23-
24-
2523
def is_dictlike(x):
2624
return isinstance(x, (dict, com.ABCSeries))
2725

28-
2926
def _single_replace(self, to_replace, method, inplace, limit):
3027
orig_dtype = self.dtype
3128
result = self if inplace else self.copy()
@@ -1906,7 +1903,21 @@ def abs(self):
19061903
abs: type of caller
19071904
"""
19081905
obj = np.abs(self)
1909-
obj = com._possibly_cast_to_timedelta(obj, coerce=False)
1906+
1907+
# numpy < 1.7 hack: coerce an m8[us] result back to m8[ns]
1908+
if com._np_version_under1p7:
1909+
if self.ndim == 1:
1910+
if obj.dtype == 'm8[us]':
1911+
obj = obj.astype('m8[ns]')
1912+
elif self.ndim == 2:
1913+
def f(x):
1914+
if x.dtype == 'm8[us]':
1915+
x = x.astype('m8[ns]')
1916+
return x
1917+
1918+
if 'm8[us]' in obj.dtypes.values:
1919+
obj = obj.apply(f)
1920+
19101921
return obj
19111922

19121923
def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,

pandas/core/internals.py

+1
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,7 @@ def should_store(self, value):
10211021
class TimeDeltaBlock(IntBlock):
10221022
is_timedelta = True
10231023
_can_hold_na = True
1024+
is_numeric = False
10241025

10251026
def _try_fill(self, value):
10261027
""" if we are a NaT, return the actual fill value """

pandas/core/nanops.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -287,8 +287,7 @@ def nanmin(values, axis=None, skipna=True):
287287
values, mask, dtype = _get_values(values, skipna, fill_value_typ = '+inf')
288288

289289
# numpy 1.6.1 workaround in Python 3.x
290-
if (values.dtype == np.object_
291-
and sys.version_info[0] >= 3): # pragma: no cover
290+
if (values.dtype == np.object_ and compat.PY3):
292291
if values.ndim > 1:
293292
apply_ax = axis if axis is not None else 0
294293
result = np.apply_along_axis(builtins.min, apply_ax, values)
@@ -311,8 +310,7 @@ def nanmax(values, axis=None, skipna=True):
311310
values, mask, dtype = _get_values(values, skipna, fill_value_typ ='-inf')
312311

313312
# numpy 1.6.1 workaround in Python 3.x
314-
if (values.dtype == np.object_
315-
and sys.version_info[0] >= 3): # pragma: no cover
313+
if (values.dtype == np.object_ and compat.PY3):
316314

317315
if values.ndim > 1:
318316
apply_ax = axis if axis is not None else 0

pandas/src/inference.pyx

+3
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,9 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
480480
seen_object = 1
481481
# objects[i] = val.astype('O')
482482
break
483+
elif util.is_timedelta64_object(val):
484+
seen_object = 1
485+
break
483486
elif util.is_integer_object(val):
484487
seen_int = 1
485488
floats[i] = <float64_t> val

pandas/tests/test_frame.py

+27-9
Original file line numberDiff line numberDiff line change
@@ -3232,25 +3232,43 @@ def test_operators_timedelta64(self):
32323232
result = diffs.max(axis=1)
32333233
self.assert_((result == diffs['A']).all() == True)
32343234

3235-
# abs ###### THIS IS BROKEN NOW ###### (results are dtype=timedelta64[us]
3236-
# even though fixed in series
3237-
#result = np.abs(df['A']-df['B'])
3238-
#result = diffs.abs()
3239-
#expected = DataFrame(dict(A = df['A']-df['C'],
3240-
# B = df['B']-df['A']))
3241-
#assert_frame_equal(result,expected)
3235+
# abs
3236+
result = diffs.abs()
3237+
expected = DataFrame(dict(A = df['A']-df['C'],
3238+
B = df['B']-df['A']))
3239+
assert_frame_equal(result,expected)
32423240

32433241
# mixed frame
32443242
mixed = diffs.copy()
32453243
mixed['C'] = 'foo'
32463244
mixed['D'] = 1
32473245
mixed['E'] = 1.
3246+
mixed['F'] = Timestamp('20130101')
32483247

3249-
# this is ok
3248+
# results in an object array
32503249
result = mixed.min()
3250+
expected = Series([com._coerce_scalar_to_timedelta_type(timedelta(seconds=5*60+5)),
3251+
com._coerce_scalar_to_timedelta_type(timedelta(days=-1)),
3252+
'foo',
3253+
1,
3254+
1.0,
3255+
Timestamp('20130101')],
3256+
index=mixed.columns)
3257+
assert_series_equal(result,expected)
32513258

3252-
# this is not
3259+
# excludes non-numeric
32533260
result = mixed.min(axis=1)
3261+
expected = Series([1, 1, 1.],index=[0, 1, 2])
3262+
assert_series_equal(result,expected)
3263+
3264+
# works when only those columns are selected
3265+
result = mixed[['A','B']].min(1)
3266+
expected = Series([ timedelta(days=-1) ] * 3)
3267+
assert_series_equal(result,expected)
3268+
3269+
result = mixed[['A','B']].min()
3270+
expected = Series([ timedelta(seconds=5*60+5), timedelta(days=-1) ],index=['A','B'])
3271+
assert_series_equal(result,expected)
32543272

32553273
# GH 3106
32563274
df = DataFrame({'time' : date_range('20130102',periods=5),

pandas/tests/test_series.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -4277,16 +4277,14 @@ def test_reindex_corner(self):
42774277

42784278
def test_reindex_pad(self):
42794279

4280-
s = Series(np.arange(10), np.arange(10))
4280+
s = Series(np.arange(10))
42814281
s2 = s[::2]
42824282

42834283
reindexed = s2.reindex(s.index, method='pad')
42844284
reindexed2 = s2.reindex(s.index, method='ffill')
42854285
assert_series_equal(reindexed, reindexed2)
42864286

4287-
# used platform int above, need to pass int explicitly here per #1219
4288-
expected = Series([0, 0, 2, 2, 4, 4, 6, 6, 8, 8], dtype=int,
4289-
index=np.arange(10))
4287+
expected = Series([0, 0, 2, 2, 4, 4, 6, 6, 8, 8], index=np.arange(10))
42904288
assert_series_equal(reindexed, expected)
42914289

42924290
# GH4604
@@ -4696,7 +4694,7 @@ def test_replace_with_single_list(self):
46964694
assert_series_equal(s, ser)
46974695

46984696
def test_replace_mixed_types(self):
4699-
s = Series(np.arange(5))
4697+
s = Series(np.arange(5),dtype='int64')
47004698

47014699
def check_replace(to_rep, val, expected):
47024700
sc = s.copy()

0 commit comments

Comments
 (0)