Skip to content

API: Return sparse objects always for cumsum #14771

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ Backwards incompatible API changes


- ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`)
- ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`)



Expand Down
23 changes: 17 additions & 6 deletions pandas/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,19 +620,30 @@ def sum(self, axis=0, *args, **kwargs):

def cumsum(self, axis=0, *args, **kwargs):
"""
Cumulative sum of values. Preserves locations of NaN values
Cumulative sum of non-NA/null values.

When performing the cumulative summation, any NA/null values will
be skipped. The resulting SparseArray will preserve the locations of
NaN values, but the fill value will be `np.nan` regardless.

Parameters
----------
axis : int or None
Axis over which to perform the cumulative summation. If None,
perform cumulative summation over flattened array.

Returns
-------
cumsum : Series
cumsum : SparseArray
"""
nv.validate_cumsum(args, kwargs)

# TODO: gh-12855 - return a SparseArray here
if notnull(self.fill_value):
return self.to_dense().cumsum()
if axis is not None and axis >= self.ndim: # Mimic ndarray behaviour.
raise ValueError("axis(={axis}) out of bounds".format(axis=axis))

if not self._null_fill_value:
return SparseArray(self.to_dense()).cumsum()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Side note: we should use `_constructor` here (and define it for SparseArray).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can make a separate issue / do later

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good as a follow-up.


# TODO: what if sp_values contains NaN??
return SparseArray(self.sp_values.cumsum(), sparse_index=self.sp_index,
fill_value=self.fill_value)

Expand Down
28 changes: 18 additions & 10 deletions pandas/sparse/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -630,21 +630,29 @@ def take(self, indices, axis=0, convert=True, *args, **kwargs):

def cumsum(self, axis=0, *args, **kwargs):
"""
Cumulative sum of values. Preserves locations of NaN values
Cumulative sum of non-NA/null values.

When performing the cumulative summation, any NA/null values will
be skipped. The resulting SparseSeries will preserve the locations of
NaN values, but the fill value will be `np.nan` regardless.

Parameters
----------
axis : {0}

Returns
-------
cumsum : SparseSeries if `self` has a null `fill_value` and a
generic Series otherwise
cumsum : SparseSeries
"""
nv.validate_cumsum(args, kwargs)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

e.g. I would do axis = self._get_axis_number(axis) (which validates).

and something similar to SparseArray.

We normally don't list these in the doc-string Parameters (even though they are in the signature)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair enough, though it was misleading because I had initially assumed the axis parameter did something, even though it doesn't do anything in this implementation.

new_array = SparseArray.cumsum(self.values)
if isinstance(new_array, SparseArray):
return self._constructor(
new_array, index=self.index,
sparse_index=new_array.sp_index).__finalize__(self)
# TODO: gh-12855 - return a SparseSeries here
return Series(new_array, index=self.index).__finalize__(self)
if axis is not None:
axis = self._get_axis_number(axis)

new_array = self.values.cumsum()

return self._constructor(
new_array, index=self.index,
sparse_index=new_array.sp_index).__finalize__(self)

@Appender(generic._shared_docs['isnull'])
def isnull(self):
Expand Down
75 changes: 43 additions & 32 deletions pandas/sparse/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,46 +688,57 @@ def test_numpy_sum(self):
SparseArray(data), out=out)

def test_cumsum(self):
data = np.arange(10).astype(float)
out = SparseArray(data).cumsum()
expected = SparseArray(data.cumsum())
tm.assert_sp_array_equal(out, expected)
non_null_data = np.array([1, 2, 3, 4, 5], dtype=float)
non_null_expected = SparseArray(non_null_data.cumsum())

# TODO: gh-12855 - return a SparseArray here
data[5] = np.nan
out = SparseArray(data, fill_value=2).cumsum()
self.assertNotIsInstance(out, SparseArray)
tm.assert_numpy_array_equal(out, data.cumsum())
null_data = np.array([1, 2, np.nan, 4, 5], dtype=float)
null_expected = SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0]))

for data, expected in [
(null_data, null_expected),
(non_null_data, non_null_expected)
]:
out = SparseArray(data).cumsum()
tm.assert_sp_array_equal(out, expected)

out = SparseArray(data, fill_value=np.nan).cumsum()
tm.assert_sp_array_equal(out, expected)

out = SparseArray(data, fill_value=np.nan).cumsum()
expected = SparseArray(np.array([
0, 1, 3, 6, 10, np.nan, 16, 23, 31, 40]))
tm.assert_sp_array_equal(out, expected)
out = SparseArray(data, fill_value=2).cumsum()
tm.assert_sp_array_equal(out, expected)

axis = 1 # SparseArray currently 1-D, so only axis = 0 is valid.
msg = "axis\(={axis}\) out of bounds".format(axis=axis)
with tm.assertRaisesRegexp(ValueError, msg):
SparseArray(data).cumsum(axis=axis)

def test_numpy_cumsum(self):
data = np.arange(10).astype(float)
out = np.cumsum(SparseArray(data))
expected = SparseArray(data.cumsum())
tm.assert_sp_array_equal(out, expected)
non_null_data = np.array([1, 2, 3, 4, 5], dtype=float)
non_null_expected = SparseArray(non_null_data.cumsum())

# TODO: gh-12855 - return a SparseArray here
data[5] = np.nan
out = np.cumsum(SparseArray(data, fill_value=2))
self.assertNotIsInstance(out, SparseArray)
tm.assert_numpy_array_equal(out, data.cumsum())
null_data = np.array([1, 2, np.nan, 4, 5], dtype=float)
null_expected = SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0]))

out = np.cumsum(SparseArray(data, fill_value=np.nan))
expected = SparseArray(np.array([
0, 1, 3, 6, 10, np.nan, 16, 23, 31, 40]))
tm.assert_sp_array_equal(out, expected)
for data, expected in [
(null_data, null_expected),
(non_null_data, non_null_expected)
]:
out = np.cumsum(SparseArray(data))
tm.assert_sp_array_equal(out, expected)

msg = "the 'dtype' parameter is not supported"
tm.assertRaisesRegexp(ValueError, msg, np.cumsum,
SparseArray(data), dtype=np.int64)
out = np.cumsum(SparseArray(data, fill_value=np.nan))
tm.assert_sp_array_equal(out, expected)

msg = "the 'out' parameter is not supported"
tm.assertRaisesRegexp(ValueError, msg, np.cumsum,
SparseArray(data), out=out)
out = np.cumsum(SparseArray(data, fill_value=2))
tm.assert_sp_array_equal(out, expected)

msg = "the 'dtype' parameter is not supported"
tm.assertRaisesRegexp(ValueError, msg, np.cumsum,
SparseArray(data), dtype=np.int64)

msg = "the 'out' parameter is not supported"
tm.assertRaisesRegexp(ValueError, msg, np.cumsum,
SparseArray(data), out=out)

def test_mean(self):
data = np.arange(10).astype(float)
Expand Down
9 changes: 5 additions & 4 deletions pandas/sparse/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1331,21 +1331,22 @@ def test_cumsum(self):
expected = SparseSeries(self.bseries.to_dense().cumsum())
tm.assert_sp_series_equal(result, expected)

# TODO: gh-12855 - return a SparseSeries here
result = self.zbseries.cumsum()
expected = self.zbseries.to_dense().cumsum()
self.assertNotIsInstance(result, SparseSeries)
tm.assert_series_equal(result, expected)

axis = 1 # Series is 1-D, so only axis = 0 is valid.
msg = "No axis named {axis}".format(axis=axis)
with tm.assertRaisesRegexp(ValueError, msg):
self.bseries.cumsum(axis=axis)

def test_numpy_cumsum(self):
result = np.cumsum(self.bseries)
expected = SparseSeries(self.bseries.to_dense().cumsum())
tm.assert_sp_series_equal(result, expected)

# TODO: gh-12855 - return a SparseSeries here
result = np.cumsum(self.zbseries)
expected = self.zbseries.to_dense().cumsum()
self.assertNotIsInstance(result, SparseSeries)
tm.assert_series_equal(result, expected)

msg = "the 'dtype' parameter is not supported"
Expand Down