Skip to content

Commit f101e66

Browse files
committed
ENH: add sparse op for other dtypes
1 parent 97de42a commit f101e66

13 files changed

+6336
-580
lines changed

doc/source/whatsnew/v0.19.0.txt

+23-5
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,29 @@ Google BigQuery Enhancements
307307
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
308308
- The :func:`pandas.io.gbq.read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs <io.bigquery_reader>` for more details (:issue:`13615`).
309309

310+
.. _whatsnew_0190.sparse:
311+
312+
Sparse changes
313+
~~~~~~~~~~~~~~
314+
315+
These changes allow pandas to handle sparse data with more dtypes, and make for a smoother experience with data handling.
316+
317+
- Sparse data structures can now preserve ``dtype`` after arithmetic ops (:issue:`13848`)
318+
319+
.. ipython:: python
320+
321+
s = pd.SparseSeries([0, 2, 0, 1], fill_value=0, dtype=np.int64)
322+
s.dtype
323+
324+
s + 1
325+
326+
327+
- Bug in ``SparseSeries`` with ``MultiIndex`` where ``[]`` indexing may raise ``IndexError`` (:issue:`13144`)
328+
- Bug in ``SparseSeries`` with ``MultiIndex`` where the ``[]`` indexing result may have a normal ``Index`` (:issue:`13144`)
329+
- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`)
330+
- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`)
331+
- Bug in ``SparseDataFrame`` where the dtype and ``fill_value`` of a passed ``SparseArray`` or ``SparseSeries`` were not respected (:issue:`13866`)
332+
310333
.. _whatsnew_0190.enhancements.other:
311334

312335
Other enhancements
@@ -754,11 +777,6 @@ Bug Fixes
754777
- Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`)
755778
- Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`)
756779
- Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`)
757-
- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`)
758-
- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`)
759-
- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`)
760-
- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`)
761-
- Bug in ``SparseDataFrame`` doesn't respect passed ``SparseArray`` or ``SparseSeries`` 's dtype and ``fill_value`` (:issue:`13866`)
762780
- Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`)
763781
- Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`)
764782
- Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`)

pandas/sparse/array.py

+72-28
Original file line numberDiff line numberDiff line change
@@ -48,16 +48,14 @@ def wrapper(self, other):
4848
raise AssertionError("length mismatch: %d vs. %d" %
4949
(len(self), len(other)))
5050
if not isinstance(other, ABCSparseArray):
51-
other = SparseArray(other, fill_value=self.fill_value)
52-
if name[0] == 'r':
53-
return _sparse_array_op(other, self, op, name[1:])
54-
else:
55-
return _sparse_array_op(self, other, op, name)
51+
dtype = getattr(other, 'dtype', None)
52+
other = SparseArray(other, fill_value=self.fill_value,
53+
dtype=dtype)
54+
return _sparse_array_op(self, other, op, name)
5655
elif is_scalar(other):
57-
new_fill_value = op(np.float64(self.fill_value), np.float64(other))
58-
56+
fill = op(_get_fill(self), np.asarray(other))
5957
return _wrap_result(name, op(self.sp_values, other),
60-
self.sp_index, new_fill_value)
58+
self.sp_index, fill)
6159
else: # pragma: no cover
6260
raise TypeError('operation with %s not supported' % type(other))
6361

@@ -67,33 +65,74 @@ def wrapper(self, other):
6765
return wrapper
6866

6967

70-
def _sparse_array_op(left, right, op, name):
71-
if left.sp_index.equals(right.sp_index):
72-
result = op(left.sp_values, right.sp_values)
73-
result_index = left.sp_index
68+
def _maybe_match_dtype(left, right):
69+
if not hasattr(right, 'dtype'):
70+
return left.dtype
71+
elif left.dtype == right.dtype:
72+
return getattr(left.dtype, '__name__', left.dtype)
7473
else:
75-
sparse_op = getattr(splib, 'sparse_%s' % name)
76-
result, result_index = sparse_op(left.sp_values, left.sp_index,
77-
left.fill_value, right.sp_values,
78-
right.sp_index, right.fill_value)
74+
# ToDo: to be supported after GH 667
75+
raise NotImplementedError('dtypes must be identical')
76+
77+
78+
def _get_fill(arr):
79+
# coerce fill_value to arr dtype if possible
80+
# int64 SparseArray can have NaN as fill_value if there are no missing values
7981
try:
80-
fill_value = op(left.fill_value, right.fill_value)
81-
except:
82-
fill_value = nan
83-
return _wrap_result(name, result, result_index, fill_value)
82+
return np.asarray(arr.fill_value, dtype=arr.dtype)
83+
except ValueError:
84+
return np.asarray(arr.fill_value)
8485

8586

86-
def _wrap_result(name, data, sparse_index, fill_value):
87+
def _sparse_array_op(left, right, op, name, series=False):
88+
89+
if series and is_integer_dtype(left) and is_integer_dtype(right):
90+
# series coerces to float64 if result should have NaN/inf
91+
if name in ('floordiv', 'mod') and (right.values == 0).any():
92+
left = left.astype(np.float64)
93+
right = right.astype(np.float64)
94+
elif name in ('rfloordiv', 'rmod') and (left.values == 0).any():
95+
left = left.astype(np.float64)
96+
right = right.astype(np.float64)
97+
98+
dtype = _maybe_match_dtype(left, right)
99+
100+
if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
101+
result = op(left.get_values(), right.get_values())
102+
103+
if left.sp_index.ngaps == 0:
104+
index = left.sp_index
105+
else:
106+
index = right.sp_index
107+
fill = op(_get_fill(left), _get_fill(right))
108+
elif left.sp_index.equals(right.sp_index):
109+
result = op(left.sp_values, right.sp_values)
110+
index = left.sp_index
111+
fill = op(_get_fill(left), _get_fill(right))
112+
else:
113+
if name[0] == 'r':
114+
left, right = right, left
115+
name = name[1:]
116+
117+
opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype)
118+
sparse_op = getattr(splib, opname)
119+
120+
result, index, fill = sparse_op(left.sp_values, left.sp_index,
121+
left.fill_value, right.sp_values,
122+
right.sp_index, right.fill_value)
123+
return _wrap_result(name, result, index, fill, dtype=result.dtype)
124+
125+
126+
def _wrap_result(name, data, sparse_index, fill_value, dtype=None):
87127
""" wrap op result to have correct dtype """
88128
if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
89129
# ToDo: We can remove this condition when removing
90130
# SparseArray's dtype default when closing GH 667
91-
return SparseArray(data, sparse_index=sparse_index,
92-
fill_value=fill_value,
93-
dtype=np.bool)
94-
else:
95-
return SparseArray(data, sparse_index=sparse_index,
96-
fill_value=fill_value)
131+
dtype = np.bool
132+
elif name == 'truediv':
133+
dtype = np.float64
134+
return SparseArray(data, sparse_index=sparse_index,
135+
fill_value=fill_value, dtype=dtype)
97136

98137

99138
class SparseArray(PandasObject, np.ndarray):
@@ -419,7 +458,12 @@ def astype(self, dtype=None):
419458
dtype = np.dtype(dtype)
420459
if dtype is not None and dtype not in (np.float_, float):
421460
raise TypeError('Can only support floating point data for now')
422-
return self.copy()
461+
462+
if self.dtype == dtype:
463+
return self.copy()
464+
else:
465+
return self._simple_new(self.sp_values.astype(dtype),
466+
self.sp_index, float(self.fill_value))
423467

424468
def copy(self, deep=True):
425469
"""

pandas/sparse/series.py

+4-10
Original file line numberDiff line numberDiff line change
@@ -57,16 +57,9 @@ def wrapper(self, other):
5757
elif isinstance(other, DataFrame):
5858
return NotImplemented
5959
elif is_scalar(other):
60-
if isnull(other) or isnull(self.fill_value):
61-
new_fill_value = np.nan
62-
else:
63-
new_fill_value = op(np.float64(self.fill_value),
64-
np.float64(other))
65-
66-
return self._constructor(op(self.sp_values, other),
60+
new_values = op(self.values, other)
61+
return self._constructor(new_values,
6762
index=self.index,
68-
sparse_index=self.sp_index,
69-
fill_value=new_fill_value,
7063
name=self.name)
7164
else: # pragma: no cover
7265
raise TypeError('operation with %s not supported' % type(other))
@@ -84,7 +77,8 @@ def _sparse_series_op(left, right, op, name):
8477
new_index = left.index
8578
new_name = _maybe_match_name(left, right)
8679

87-
result = _sparse_array_op(left, right, op, name)
80+
result = _sparse_array_op(left.values, right.values, op, name,
81+
series=True)
8882
return left._constructor(result, index=new_index, name=new_name)
8983

9084

0 commit comments

Comments
 (0)