Skip to content

Fix #12373: rolling functions raise ValueError on float32 data #12376

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.18.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1200,3 +1200,5 @@ Bug Fixes
- Bug when initializing categorical series with a scalar value. (:issue:`12336`)
- Bug when specifying a UTC ``DatetimeIndex`` by setting ``utc=True`` in ``.to_datetime`` (:issue:`11934`)
- Bug when increasing the buffer size of CSV reader in ``read_csv`` (:issue:`12494`)

- Bug in ``.rolling`` in which applying a function to float32 data would raise a ``ValueError`` (:issue:`12373`)
20 changes: 14 additions & 6 deletions pandas/core/window.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,16 +149,17 @@ def _prep_values(self, values=None, kill_inf=True, how=None):
if values is None:
values = getattr(self._selected_obj, 'values', self._selected_obj)

# coerce dtypes as appropriate
# GH #12373 : rolling functions error on float32 data
# make sure the data is coerced to float64
if com.is_float_dtype(values.dtype):
pass
values = com._ensure_float64(values)
elif com.is_integer_dtype(values.dtype):
values = values.astype(float)
values = com._ensure_float64(values)
elif com.is_timedelta64_dtype(values.dtype):
values = values.view('i8').astype(float)
values = com._ensure_float64(values.view('i8'))
else:
try:
values = values.astype(float)
values = com._ensure_float64(values)
except (ValueError, TypeError):
raise TypeError("cannot handle this type -> {0}"
"".format(values.dtype))
Expand Down Expand Up @@ -457,7 +458,9 @@ def _apply(self, func, window=None, center=None, check_minp=None, how=None,

def func(arg, window, min_periods=None):
minp = check_minp(min_periods, window)
return cfunc(arg, window, minp, **kwargs)
# GH #12373: rolling functions error on float32 data
return cfunc(com._ensure_float64(arg),
window, minp, **kwargs)

# calculation function
if center:
Expand Down Expand Up @@ -494,6 +497,7 @@ def count(self):
obj = self._convert_freq()
window = self._get_window()
window = min(window, len(obj)) if not self.center else window

try:
converted = np.isfinite(obj).astype(float)
except TypeError:
Expand Down Expand Up @@ -657,6 +661,10 @@ def cov(self, other=None, pairwise=None, ddof=1, **kwargs):
window = self._get_window(other)

def _get_cov(X, Y):
# GH #12373 : rolling functions error on float32 data
# to avoid potential overflow, cast the data to float64
X = X.astype('float64')
Y = Y.astype('float64')
mean = lambda x: x.rolling(window, self.min_periods,
center=self.center).mean(**kwargs)
count = (X + Y).rolling(window=window,
Expand Down
204 changes: 204 additions & 0 deletions pandas/tests/test_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
assert_frame_equal, assert_panel_equal,
assert_index_equal)
import pandas.core.datetools as datetools
import pandas.core.common as com
import pandas.stats.moments as mom
import pandas.core.window as rwindow
from pandas.core.base import SpecificationError
Expand Down Expand Up @@ -289,6 +290,209 @@ def test_deprecations(self):
mom.rolling_mean(Series(np.ones(10)), 3, center=True, axis=0)


# GH #12373 : rolling functions error on float32 data
# make sure rolling functions work for different dtypes
class TestDtype(Base):
dtype = None
window = 2
# the nan value, timedelta uses tslib.iNaT
naval = np.nan

# Function Name : (function, result_dtype, expectation_dtype)
funcs = {
'count': (lambda v: v.count(), 'float64', 'float64'),
'max': (lambda v: v.max(), 'float64', 'float64'),
'min': (lambda v: v.min(), 'float64', 'float64'),
'sum': (lambda v: v.sum(), 'float64', 'float64'),
'mean': (lambda v: v.mean(), 'float64', 'float64'),
'std': (lambda v: v.std(), 'float64', 'float64'),
'var': (lambda v: v.var(), 'float64', 'float64'),
'median': (lambda v: v.median(), 'float64', 'float64')
}

def get_expects(self):
expects = {
'sr1': {
'count': Series([1, 2, 2, 2, 2]),
'max': Series([self.naval, 1, 2, 3, 4]),
'min': Series([self.naval, 0, 1, 2, 3]),
'sum': Series([self.naval, 1, 3, 5, 7]),
'mean': Series([self.naval, .5, 1.5, 2.5, 3.5]),
'std': Series([self.naval] + [np.sqrt(.5)] * 4),
'var': Series([self.naval, .5, .5, .5, .5]),
'median': Series([self.naval, .5, 1.5, 2.5, 3.5])
},
'sr2': {
'count': Series([1, 2, 2, 2, 2]),
'max': Series([self.naval, 10, 8, 6, 4]),
'min': Series([self.naval, 8, 6, 4, 2]),
'sum': Series([self.naval, 18, 14, 10, 6]),
'mean': Series([self.naval, 9, 7, 5, 3]),
'std': Series([self.naval] + [np.sqrt(2)] * 4),
'var': Series([self.naval, 2, 2, 2, 2]),
'median': Series([self.naval, 9, 7, 5, 3])
},
'df': {
'count': DataFrame({0: Series([1, 2, 2, 2, 2]),
1: Series([1, 2, 2, 2, 2])}),
'max': DataFrame({0: Series([self.naval, 2, 4, 6, 8]),
1: Series([self.naval, 3, 5, 7, 9])}),
'min': DataFrame({0: Series([self.naval, 0, 2, 4, 6]),
1: Series([self.naval, 1, 3, 5, 7])}),
'sum': DataFrame({0: Series([self.naval, 2, 6, 10, 14]),
1: Series([self.naval, 4, 8, 12, 16])}),
'mean': DataFrame({0: Series([self.naval, 1, 3, 5, 7]),
1: Series([self.naval, 2, 4, 6, 8])}),
'std': DataFrame({0: Series([self.naval] + [np.sqrt(2)] * 4),
1: Series([self.naval] + [np.sqrt(2)] * 4)}),
'var': DataFrame({0: Series([self.naval, 2, 2, 2, 2]),
1: Series([self.naval, 2, 2, 2, 2])}),
'median': DataFrame({0: Series([self.naval, 1, 3, 5, 7]),
1: Series([self.naval, 2, 4, 6, 8])}),
}
}
return expects

def _create_dtype_data(self, dtype):
sr1 = Series(range(5), dtype=dtype)
sr2 = Series(range(10, 0, -2), dtype=dtype)
df = DataFrame(np.arange(10).reshape((5, 2)), dtype=dtype)

data = {
'sr1': sr1,
'sr2': sr2,
'df': df
}

return data

def _create_data(self):
super(TestDtype, self)._create_data()
self.data = self._create_dtype_data(self.dtype)
self.expects = self.get_expects()

def setUp(self):
self._create_data()

def _cast_result(self, result, from_dtype, to_dtype):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You shouldn't need this, as you are overriding it in TestDatetimelikes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did this because assertRaises doesn't capture errors raised in setUp(), where we construct the data; if the dtype is specified as 'M8[ns, UTC]', the error is raised there.

What's your suggestion to capture this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Override create_data_types in Testdatetimelike. setUp should NEVER raise; the point is that it's a known starting point. Really try to make this as simple as possible. I am happy for you to skip ALL Datetimelike tests ATM.

if com.needs_i8_conversion(from_dtype):
if isinstance(result, Series):
result = result.view('i8')
elif isinstance(result, DataFrame):
final = []
for idx in result:
final.append(Series(result[idx].view('i8')))
result = pd.concat(final, axis=1).reindex(
columns=result.columns)
return result.astype(to_dtype)

def test_dtypes(self):
for f_name, d_name in product(self.funcs.keys(), self.data.keys()):
# Specify if the results and expectations
# need to be coerced to a given dtype
# once we changed the return value for roll_<function>,
# we should change coerce behavior here accordingly
f, res_dtype, exp_dtype = self.funcs[f_name]
d = self.data[d_name]
assert_equal = assert_series_equal if isinstance(
d, Series) else assert_frame_equal
exp = self.expects[d_name][f_name]
if exp_dtype:
exp = exp.astype(com.pandas_dtype(exp_dtype))

roll = d.rolling(window=self.window)
result = f(roll)
if res_dtype:
result = self._cast_result(result,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please take all casting out. The expected values should already have the expected casts built in. If something is later changed, it will be obvious, as the expectations are listed. They may be wrong, but that's OK for now (by wrong I mean they are not the 'correct' dtype, as everything is getting cast ATM). This should be a dead simple test. We don't want to keep expanding this; really make this much, much simpler.

self.dtype,
com.pandas_dtype(res_dtype))
assert_equal(result, exp)


class TestDtype_object(TestDtype):
dtype = object


class TestDtype_int8(TestDtype):
dtype = np.int8


class TestDtype_int16(TestDtype):
dtype = np.int16


class TestDtype_int32(TestDtype):
dtype = np.int32


class TestDtype_int64(TestDtype):
dtype = np.int64


class TestDtype_uint8(TestDtype):
dtype = np.uint8


class TestDtype_uint16(TestDtype):
dtype = np.uint16


class TestDtype_uint32(TestDtype):
dtype = np.uint32


class TestDtype_uint64(TestDtype):
dtype = np.uint64


class TestDtype_float16(TestDtype):
dtype = np.float16


class TestDtype_float32(TestDtype):
dtype = np.float32


class TestDtype_float64(TestDtype):
dtype = np.float64


class TestDtype_category(TestDtype):
dtype = 'category'
include_df = False

def _create_dtype_data(self, dtype):
sr1 = Series(range(5), dtype=dtype)
sr2 = Series(range(10, 0, -2), dtype=dtype)

data = {
'sr1': sr1,
'sr2': sr2
}

return data


class TestDatetimeLikeDtype(TestDtype):
dtype = np.dtype('M8[ns]')

# GH #12373: rolling functions raise ValueError on float32 data
def setUp(self):
raise nose.SkipTest("Skip rolling on DatetimeLike dtypes.")

def test_dtypes(self):
with tm.assertRaises(TypeError):
super(TestDatetimeLikeDtype, self).test_dtypes()


class TestDtype_timedelta(TestDatetimeLikeDtype):
dtype = np.dtype('m8[ns]')


class TestDtype_datetime64UTC(TestDatetimeLikeDtype):
dtype = 'datetime64[ns, UTC]'


class TestMoments(Base):

def setUp(self):
Expand Down