BUG: don't assume series is length > 0 #19438

Closed
wants to merge 13 commits
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
@@ -589,3 +589,4 @@ Other
 ^^^^^

 - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`)
+- Bug in ``Series.memory_usage`` which assumed a series always has at least one element (:issue:`19368`)
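
For context, the failing case described by the inline comments in pandas/_libs/lib.pyx below can be reproduced directly. A minimal sketch against the 0.23-era sparse API (SparseSeries has since been removed from pandas):

import pandas as pd

# A length-1 sparse series whose fill_value equals its only value
# materializes no values at all, i.e. sp_values has length 0.
s = pd.SparseSeries([1.0], fill_value=1.0)
print(len(s.values.sp_values))  # 0
s.memory_usage(deep=True)       # the call this PR fixes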
26 changes: 16 additions & 10 deletions pandas/_libs/lib.pyx
@@ -59,8 +59,14 @@ def memory_usage_of_objects(ndarray[object, ndim=1] arr):
     cdef Py_ssize_t i, n
     cdef int64_t s = 0

+    # The problem here is that...

Contributor: comments are not necessary

+    # A SparseArray of size 1 that has fill_value = the only value
+    # will cause this
+
+    # n = 1
+    #
     n = len(arr)
-    for i from 0 <= i < n:
+    for i in range(n):
         s += arr[i].__sizeof__()
     return s
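
A note on the loop rewrite repeated throughout this file: for i from 0 <= i < n is legacy Cython syntax, and with a typed index variable for i in range(n) compiles to the same C loop. A minimal illustration (hypothetical example.pyx, not part of this PR):

def sum_first_n(Py_ssize_t n):
    # with a typed loop variable, Cython lowers range() to a plain
    # C for-loop: for (i = 0; i < n; i++)
    cdef Py_ssize_t i, total = 0
    for i in range(n):
        total += i
    return total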

@@ -131,10 +137,10 @@ def fast_unique_multiple(list arrays):
         dict table = {}
         object val, stub = 0

-    for i from 0 <= i < k:
+    for i in range(k):
         buf = arrays[i]
         n = len(buf)
-        for j from 0 <= j < n:
+        for j in range(n):
             val = buf[j]
             if val not in table:
                 table[val] = stub
@@ -158,10 +164,10 @@ def fast_unique_multiple_list(list lists):
         dict table = {}
         object val, stub = 0

-    for i from 0 <= i < k:
+    for i in range(k):
         buf = lists[i]
         n = len(buf)
-        for j from 0 <= j < n:
+        for j in range(n):
             val = buf[j]
             if val not in table:
                 table[val] = stub
@@ -200,7 +206,7 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True):

     for buf in gen:
         n = len(buf)
-        for j from 0 <= j < n:
+        for j in range(n):
             val = buf[j]
             if val not in table:
                 table[val] = stub
@@ -830,15 +836,15 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
     if axis == 0:
         counts = np.zeros((max_bin, k), dtype='i8')
         with nogil:
-            for i from 0 <= i < n:
-                for j from 0 <= j < k:
+            for i in range(n):
+                for j in range(k):
                     counts[labels[i], j] += mask[i, j]

     else:  # axis == 1
         counts = np.zeros((n, max_bin), dtype='i8')
         with nogil:
-            for i from 0 <= i < n:
-                for j from 0 <= j < k:
+            for i in range(n):
+                for j in range(k):
                     counts[i, labels[j]] += mask[i, j]

     return counts
14 changes: 10 additions & 4 deletions pandas/core/base.py
@@ -1069,12 +1069,18 @@ def memory_usage(self, deep=False):
         --------
         numpy.ndarray.nbytes
         """
-        if hasattr(self.values, 'memory_usage'):
-            return self.values.memory_usage(deep=deep)
+        # Use sparse values if they exist for memory consumption

Contributor: I would rather override this in SparseSeries/SparseArray

+        if hasattr(self.values, 'sp_values'):
+            values = self.values.sp_values
+        else:
+            values = self.values
+
+        if hasattr(values, 'memory_usage'):
+            return values.memory_usage(deep=deep)

-        v = self.values.nbytes
+        v = values.nbytes
         if deep and is_object_dtype(self) and not PYPY:
-            v += lib.memory_usage_of_objects(self.values)
+            v += lib.memory_usage_of_objects(values)
         return v

     def factorize(self, sort=False, na_sentinel=-1):
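To illustrate the reviewer's suggestion above with a runnable toy: if SparseArray carried its own memory_usage, the hasattr(values, 'memory_usage') dispatch already present in base.py would pick it up with no sp_values special case. A hypothetical sketch, not the merged implementation (ToySparseArray is a stand-in for the real class):

import numpy as np

class ToySparseArray(object):
    # stand-in for SparseArray, holding only materialized values
    def __init__(self, sp_values):
        self.sp_values = np.asarray(sp_values)

    def memory_usage(self, deep=False):
        # count only the materialized (non-fill) values
        v = self.sp_values.nbytes
        if deep and self.sp_values.dtype == object:
            v += sum(x.__sizeof__() for x in self.sp_values)
        return v

arr = ToySparseArray([])            # the empty-sp_values case from this PR
print(arr.memory_usage(deep=True))  # 0, with no assumption of length > 0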
13 changes: 13 additions & 0 deletions pandas/tests/sparse/series/test_series.py
@@ -23,6 +23,8 @@
 from pandas.core.sparse.api import SparseSeries
 from pandas.tests.series.test_api import SharedWithSparse

+from itertools import product
+

 def _test_data1():
     # nan-based
@@ -971,6 +973,17 @@ def test_combine_first(self):
         tm.assert_sp_series_equal(result, result2)
         tm.assert_sp_series_equal(result, expected)

+    @pytest.mark.parametrize('deep,fill_values', [([True, False],
+                                                   [0, 1, np.nan, None])])
+    def test_memory_usage_deep(self, deep, fill_values):
+        for deep, fill_value in product(deep, fill_values):
+            sparse_series = SparseSeries(fill_values, fill_value=fill_value)
+            dense_series = Series(fill_values)
+            sparse_usage = sparse_series.memory_usage(deep=deep)
+            dense_usage = dense_series.memory_usage(deep=deep)
+
+            assert sparse_usage < dense_usage
+

 class TestSparseHandlingMultiIndexes(object):

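One observation on the new test: it parametrizes a single case with whole lists and then iterates product manually, so all eight combinations run as one test. A sketch of an equivalent, more conventional parametrization, one case per (deep, fill_value) pair, against the same 0.23-era API (illustrative only, not part of this PR):

import numpy as np
import pytest
from itertools import product

from pandas import Series
from pandas.core.sparse.api import SparseSeries


@pytest.mark.parametrize('deep,fill_value',
                         list(product([True, False], [0, 1, np.nan, None])))
def test_memory_usage_deep(deep, fill_value):
    # each (deep, fill_value) pair reports pass/fail independently
    data = [0, 1, np.nan, None]
    sparse_usage = SparseSeries(data, fill_value=fill_value).memory_usage(deep=deep)
    dense_usage = Series(data).memory_usage(deep=deep)
    assert sparse_usage < dense_usage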