Skip to content

Commit 913f71f

Browse files
JustinZhengBC authored and TomAugspurger committed
CLN: Move SparseArray to arrays (#23147)
1 parent fd9f8c7 commit 913f71f

27 files changed

+285
-285
lines changed

pandas/api/extensions/__init__.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
register_index_accessor,
44
register_series_accessor)
55
from pandas.core.algorithms import take # noqa
6-
from pandas.core.arrays.base import (ExtensionArray, # noqa
7-
ExtensionScalarOpsMixin)
6+
from pandas.core.arrays import (ExtensionArray, # noqa
7+
ExtensionScalarOpsMixin)
88
from pandas.core.dtypes.dtypes import ( # noqa
99
ExtensionDtype, register_extension_dtype
1010
)

pandas/compat/pickle_compat.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ def load_reduce(self):
5656

5757
# If classes are moved, provide compat here.
5858
_class_locations_map = {
59+
('pandas.core.sparse.array', 'SparseArray'):
60+
('pandas.core.arrays', 'SparseArray'),
5961

6062
# 15477
6163
('pandas.core.base', 'FrozenNDArray'):
@@ -88,7 +90,7 @@ def load_reduce(self):
8890

8991
# 15998 top-level dirs moving
9092
('pandas.sparse.array', 'SparseArray'):
91-
('pandas.core.sparse.array', 'SparseArray'),
93+
('pandas.core.arrays.sparse', 'SparseArray'),
9294
('pandas.sparse.series', 'SparseSeries'):
9395
('pandas.core.sparse.series', 'SparseSeries'),
9496
('pandas.sparse.frame', 'SparseDataFrame'):

pandas/core/arrays/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@
88
from .timedeltas import TimedeltaArrayMixin # noqa
99
from .integer import ( # noqa
1010
IntegerArray, integer_array)
11+
from .sparse import SparseArray # noqa

pandas/core/sparse/array.py renamed to pandas/core/arrays/sparse.py

+250-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from __future__ import division
55
# pylint: disable=E1101,E1103,W0231
66

7+
import re
78
import operator
89
import numbers
910
import numpy as np
@@ -16,8 +17,10 @@
1617
from pandas.errors import PerformanceWarning
1718
from pandas.compat.numpy import function as nv
1819

19-
from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin
20+
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
2021
import pandas.core.common as com
22+
from pandas.core.dtypes.base import ExtensionDtype
23+
from pandas.core.dtypes.dtypes import register_extension_dtype
2124
from pandas.core.dtypes.generic import (
2225
ABCSparseSeries, ABCSeries, ABCIndexClass
2326
)
@@ -45,7 +48,252 @@
4548
import pandas.core.algorithms as algos
4649
import pandas.io.formats.printing as printing
4750

48-
from pandas.core.sparse.dtype import SparseDtype
51+
52+
# ----------------------------------------------------------------------------
53+
# Dtype
54+
55+
@register_extension_dtype
56+
class SparseDtype(ExtensionDtype):
57+
"""
58+
Dtype for data stored in :class:`SparseArray`.
59+
60+
This dtype implements the pandas ExtensionDtype interface.
61+
62+
.. versionadded:: 0.24.0
63+
64+
Parameters
65+
----------
66+
dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
67+
The dtype of the underlying array storing the non-fill value values.
68+
fill_value : scalar, optional.
69+
The scalar value not stored in the SparseArray. By default, this
70+
depends on `dtype`.
71+
72+
========== ==========
73+
dtype na_value
74+
========== ==========
75+
float ``np.nan``
76+
int ``0``
77+
bool ``False``
78+
datetime64 ``pd.NaT``
79+
timedelta64 ``pd.NaT``
80+
========== ==========
81+
82+
The default value may be overridden by specifying a `fill_value`.
83+
"""
84+
# We include `_is_na_fill_value` in the metadata to avoid hash collisions
85+
# between SparseDtype(float, 0.0) and SparseDtype(float, nan).
86+
# Without is_na_fill_value in the comparison, those would be equal since
87+
# hash(nan) is (sometimes?) 0.
88+
_metadata = ('_dtype', '_fill_value', '_is_na_fill_value')
89+
90+
def __init__(self, dtype=np.float64, fill_value=None):
91+
# type: (Union[str, np.dtype, 'ExtensionDtype', type], Any) -> None
92+
from pandas.core.dtypes.missing import na_value_for_dtype
93+
from pandas.core.dtypes.common import (
94+
pandas_dtype, is_string_dtype, is_scalar
95+
)
96+
97+
if isinstance(dtype, type(self)):
98+
if fill_value is None:
99+
fill_value = dtype.fill_value
100+
dtype = dtype.subtype
101+
102+
dtype = pandas_dtype(dtype)
103+
if is_string_dtype(dtype):
104+
dtype = np.dtype('object')
105+
106+
if fill_value is None:
107+
fill_value = na_value_for_dtype(dtype)
108+
109+
if not is_scalar(fill_value):
110+
raise ValueError("fill_value must be a scalar. Got {} "
111+
"instead".format(fill_value))
112+
self._dtype = dtype
113+
self._fill_value = fill_value
114+
115+
def __hash__(self):
116+
# Python3 doesn't inherit __hash__ when a base class overrides
117+
# __eq__, so we explicitly do it here.
118+
return super(SparseDtype, self).__hash__()
119+
120+
def __eq__(self, other):
121+
# We have to override __eq__ to handle NA values in _metadata.
122+
# The base class does simple == checks, which fail for NA.
123+
if isinstance(other, compat.string_types):
124+
try:
125+
other = self.construct_from_string(other)
126+
except TypeError:
127+
return False
128+
129+
if isinstance(other, type(self)):
130+
subtype = self.subtype == other.subtype
131+
if self._is_na_fill_value:
132+
# this case is complicated by two things:
133+
# SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
134+
# SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT)
135+
# i.e. we want to treat any floating-point NaN as equal, but
136+
# not a floating-point NaN and a datetime NaT.
137+
fill_value = (
138+
other._is_na_fill_value and
139+
isinstance(self.fill_value, type(other.fill_value)) or
140+
isinstance(other.fill_value, type(self.fill_value))
141+
)
142+
else:
143+
fill_value = self.fill_value == other.fill_value
144+
145+
return subtype and fill_value
146+
return False
147+
148+
@property
149+
def fill_value(self):
150+
"""
151+
The fill value of the array.
152+
153+
Converting the SparseArray to a dense ndarray will fill the
154+
array with this value.
155+
156+
.. warning::
157+
158+
It's possible to end up with a SparseArray that has ``fill_value``
159+
values in ``sp_values``. This can occur, for example, when setting
160+
``SparseArray.fill_value`` directly.
161+
"""
162+
return self._fill_value
163+
164+
@property
165+
def _is_na_fill_value(self):
166+
from pandas.core.dtypes.missing import isna
167+
return isna(self.fill_value)
168+
169+
@property
170+
def _is_numeric(self):
171+
from pandas.core.dtypes.common import is_object_dtype
172+
return not is_object_dtype(self.subtype)
173+
174+
@property
175+
def _is_boolean(self):
176+
from pandas.core.dtypes.common import is_bool_dtype
177+
return is_bool_dtype(self.subtype)
178+
179+
@property
180+
def kind(self):
181+
return self.subtype.kind
182+
183+
@property
184+
def type(self):
185+
return self.subtype.type
186+
187+
@property
188+
def subtype(self):
189+
return self._dtype
190+
191+
@property
192+
def name(self):
193+
return 'Sparse[{}, {}]'.format(self.subtype.name, self.fill_value)
194+
195+
def __repr__(self):
196+
return self.name
197+
198+
@classmethod
199+
def construct_array_type(cls):
200+
return SparseArray
201+
202+
@classmethod
203+
def construct_from_string(cls, string):
204+
"""
205+
Construct a SparseDtype from a string form.
206+
207+
Parameters
208+
----------
209+
string : str
210+
Can take the following forms.
211+
212+
string dtype
213+
================ ============================
214+
'int' SparseDtype[np.int64, 0]
215+
'Sparse' SparseDtype[np.float64, nan]
216+
'Sparse[int]' SparseDtype[np.int64, 0]
217+
'Sparse[int, 0]' SparseDtype[np.int64, 0]
218+
================ ============================
219+
220+
It is not possible to specify non-default fill values
221+
with a string. An argument like ``'Sparse[int, 1]'``
222+
will raise a ``TypeError`` because the default fill value
223+
for integers is 0.
224+
225+
Returns
226+
-------
227+
SparseDtype
228+
"""
229+
msg = "Could not construct SparseDtype from '{}'".format(string)
230+
if string.startswith("Sparse"):
231+
try:
232+
sub_type, has_fill_value = cls._parse_subtype(string)
233+
result = SparseDtype(sub_type)
234+
except Exception:
235+
raise TypeError(msg)
236+
else:
237+
msg = ("Could not construct SparseDtype from '{}'.\n\nIt "
238+
"looks like the fill_value in the string is not "
239+
"the default for the dtype. Non-default fill_values "
240+
"are not supported. Use the 'SparseDtype()' "
241+
"constructor instead.")
242+
if has_fill_value and str(result) != string:
243+
raise TypeError(msg.format(string))
244+
return result
245+
else:
246+
raise TypeError(msg)
247+
248+
@staticmethod
249+
def _parse_subtype(dtype):
250+
"""
251+
Parse a string to get the subtype
252+
253+
Parameters
254+
----------
255+
dtype : str
256+
A string like
257+
258+
* Sparse[subtype]
259+
* Sparse[subtype, fill_value]
260+
261+
Returns
262+
-------
263+
subtype : str
264+
265+
Raises
266+
------
267+
ValueError
268+
When the subtype cannot be extracted.
269+
"""
270+
xpr = re.compile(
271+
r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$"
272+
)
273+
m = xpr.match(dtype)
274+
has_fill_value = False
275+
if m:
276+
subtype = m.groupdict()['subtype']
277+
has_fill_value = m.groupdict()['fill_value'] or has_fill_value
278+
elif dtype == "Sparse":
279+
subtype = 'float64'
280+
else:
281+
raise ValueError("Cannot parse {}".format(dtype))
282+
return subtype, has_fill_value
283+
284+
@classmethod
285+
def is_dtype(cls, dtype):
286+
dtype = getattr(dtype, 'dtype', dtype)
287+
if (isinstance(dtype, compat.string_types) and
288+
dtype.startswith("Sparse")):
289+
sub_type, _ = cls._parse_subtype(dtype)
290+
dtype = np.dtype(sub_type)
291+
elif isinstance(dtype, cls):
292+
return True
293+
return isinstance(dtype, np.dtype) or dtype == 'Sparse'
294+
295+
# ----------------------------------------------------------------------------
296+
# Array
49297

50298

51299
_sparray_doc_kwargs = dict(klass='SparseArray')

pandas/core/dtypes/common.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
""" common type operations """
2-
32
import numpy as np
43
from pandas.compat import (string_types, text_type, binary_type,
54
PY3, PY36)
@@ -12,7 +11,6 @@
1211
PeriodDtype, IntervalDtype,
1312
PandasExtensionDtype, ExtensionDtype,
1413
_pandas_registry)
15-
from pandas.core.sparse.dtype import SparseDtype
1614
from pandas.core.dtypes.generic import (
1715
ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries,
1816
ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex, ABCIndexClass,
@@ -23,7 +21,6 @@
2321
is_file_like, is_re, is_re_compilable, is_sequence, is_nested_list_like,
2422
is_named_tuple, is_array_like, is_decimal, is_complex, is_interval)
2523

26-
2724
_POSSIBLY_CAST_DTYPES = {np.dtype(t).name
2825
for t in ['O', 'int8', 'uint8', 'int16', 'uint16',
2926
'int32', 'uint32', 'int64', 'uint64']}
@@ -181,7 +178,7 @@ def is_sparse(arr):
181178
>>> is_sparse(bsr_matrix([1, 2, 3]))
182179
False
183180
"""
184-
from pandas.core.sparse.dtype import SparseDtype
181+
from pandas.core.arrays.sparse import SparseDtype
185182

186183
dtype = getattr(arr, 'dtype', arr)
187184
return isinstance(dtype, SparseDtype)
@@ -1928,10 +1925,13 @@ def _get_dtype_type(arr_or_dtype):
19281925
elif is_interval_dtype(arr_or_dtype):
19291926
return Interval
19301927
return _get_dtype_type(np.dtype(arr_or_dtype))
1931-
elif isinstance(arr_or_dtype, (ABCSparseSeries, ABCSparseArray,
1932-
SparseDtype)):
1933-
dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype)
1934-
return dtype.type
1928+
else:
1929+
from pandas.core.arrays.sparse import SparseDtype
1930+
if isinstance(arr_or_dtype, (ABCSparseSeries,
1931+
ABCSparseArray,
1932+
SparseDtype)):
1933+
dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype)
1934+
return dtype.type
19351935
try:
19361936
return arr_or_dtype.dtype.type
19371937
except AttributeError:

pandas/core/dtypes/concat.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -556,7 +556,7 @@ def _concat_sparse(to_concat, axis=0, typs=None):
556556
a single array, preserving the combined dtypes
557557
"""
558558

559-
from pandas.core.sparse.array import SparseArray
559+
from pandas.core.arrays import SparseArray
560560

561561
fill_values = [x.fill_value for x in to_concat
562562
if isinstance(x, SparseArray)]

pandas/core/frame.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1765,7 +1765,7 @@ def to_sparse(self, fill_value=None, kind='block'):
17651765
>>> type(sdf)
17661766
<class 'pandas.core.sparse.frame.SparseDataFrame'>
17671767
"""
1768-
from pandas.core.sparse.frame import SparseDataFrame
1768+
from pandas.core.sparse.api import SparseDataFrame
17691769
return SparseDataFrame(self._series, index=self.index,
17701770
columns=self.columns, default_kind=kind,
17711771
default_fill_value=fill_value)

pandas/core/internals/managers.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929

3030
from pandas.core.base import PandasObject
3131
import pandas.core.algorithms as algos
32-
from pandas.core.sparse.array import _maybe_to_sparse
32+
from pandas.core.arrays.sparse import _maybe_to_sparse
3333

3434
from pandas.core.index import Index, MultiIndex, ensure_index
3535
from pandas.core.indexing import maybe_convert_indices

0 commit comments

Comments
 (0)