|
4 | 4 | from __future__ import division
|
5 | 5 | # pylint: disable=E1101,E1103,W0231
|
6 | 6 |
|
| 7 | +import re |
7 | 8 | import operator
|
8 | 9 | import numbers
|
9 | 10 | import numpy as np
|
|
16 | 17 | from pandas.errors import PerformanceWarning
|
17 | 18 | from pandas.compat.numpy import function as nv
|
18 | 19 |
|
19 |
| -from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin |
| 20 | +from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin |
20 | 21 | import pandas.core.common as com
|
| 22 | +from pandas.core.dtypes.base import ExtensionDtype |
| 23 | +from pandas.core.dtypes.dtypes import register_extension_dtype |
21 | 24 | from pandas.core.dtypes.generic import (
|
22 | 25 | ABCSparseSeries, ABCSeries, ABCIndexClass
|
23 | 26 | )
|
|
45 | 48 | import pandas.core.algorithms as algos
|
46 | 49 | import pandas.io.formats.printing as printing
|
47 | 50 |
|
48 |
| -from pandas.core.sparse.dtype import SparseDtype |
| 51 | + |
| 52 | +# ---------------------------------------------------------------------------- |
| 53 | +# Dtype |
| 54 | + |
@register_extension_dtype
class SparseDtype(ExtensionDtype):
    """
    Dtype for data stored in :class:`SparseArray`.

    This dtype implements the pandas ExtensionDtype interface.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
        The dtype of the underlying array storing the non-fill value values.
    fill_value : scalar, optional
        The scalar value not stored in the SparseArray. By default, this
        depends on `dtype`.

        =========== ==========
        dtype       na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        ``False``
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The default value may be overridden by specifying a `fill_value`.

    Raises
    ------
    ValueError
        If `fill_value` is not a scalar.
    """
    # We include `_is_na_fill_value` in the metadata to avoid hash collisions
    # between SparseDtype(float, 0.0) and SparseDtype(float, nan).
    # Without is_na_fill_value in the comparison, those would be equal since
    # hash(nan) is (sometimes?) 0.
    _metadata = ('_dtype', '_fill_value', '_is_na_fill_value')

    def __init__(self, dtype=np.float64, fill_value=None):
        # type: (Union[str, np.dtype, 'ExtensionDtype', type], Any) -> None
        from pandas.core.dtypes.missing import na_value_for_dtype
        from pandas.core.dtypes.common import (
            pandas_dtype, is_string_dtype, is_scalar
        )

        if isinstance(dtype, type(self)):
            # Unwrap an existing SparseDtype; its fill value is inherited
            # unless the caller passed one explicitly.
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        dtype = pandas_dtype(dtype)
        if is_string_dtype(dtype):
            # String-like subtypes are stored as object.
            dtype = np.dtype('object')

        if fill_value is None:
            fill_value = na_value_for_dtype(dtype)

        if not is_scalar(fill_value):
            raise ValueError("fill_value must be a scalar. Got {} "
                             "instead".format(fill_value))
        self._dtype = dtype
        self._fill_value = fill_value

    def __hash__(self):
        # Python3 doesn't inherit __hash__ when a base class overrides
        # __eq__, so we explicitly do it here.
        return super(SparseDtype, self).__hash__()

    def __eq__(self, other):
        """
        Compare against another SparseDtype or its string form.

        Two dtypes are equal when their subtypes are equal and their fill
        values match. Strings are first converted with
        :meth:`construct_from_string`; a string that cannot be converted,
        or any other kind of object, compares unequal.
        """
        # We have to override __eq__ to handle NA values in _metadata.
        # The base class does simple == checks, which fail for NA.
        if isinstance(other, compat.string_types):
            try:
                other = self.construct_from_string(other)
            except TypeError:
                return False

        if isinstance(other, type(self)):
            subtype = self.subtype == other.subtype
            if self._is_na_fill_value:
                # this case is complicated by two things:
                # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
                # SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT)
                # i.e. we want to treat any floating-point NaN as equal, but
                # not a floating-point NaN and a datetime NaT.
                #
                # The parentheses matter: ``and`` binds tighter than ``or``,
                # so without them a non-NA fill value of a compatible type
                # (e.g. 0.0 against nan) would compare equal.
                fill_value = (
                    other._is_na_fill_value and (
                        isinstance(self.fill_value,
                                   type(other.fill_value)) or
                        isinstance(other.fill_value,
                                   type(self.fill_value))
                    )
                )
            else:
                fill_value = self.fill_value == other.fill_value

            return subtype and fill_value
        return False

    @property
    def fill_value(self):
        """
        The fill value of the array.

        Converting the SparseArray to a dense ndarray will fill the
        array with this value.

        .. warning::

           It's possible to end up with a SparseArray that has ``fill_value``
           values in ``sp_values``. This can occur, for example, when setting
           ``SparseArray.fill_value`` directly.
        """
        return self._fill_value

    @property
    def _is_na_fill_value(self):
        """Whether ``isna`` reports the fill value as missing."""
        from pandas.core.dtypes.missing import isna
        return isna(self.fill_value)

    @property
    def _is_numeric(self):
        """True unless the subtype is an object dtype."""
        from pandas.core.dtypes.common import is_object_dtype
        return not is_object_dtype(self.subtype)

    @property
    def _is_boolean(self):
        """True when the subtype is a boolean dtype."""
        from pandas.core.dtypes.common import is_bool_dtype
        return is_bool_dtype(self.subtype)

    @property
    def kind(self):
        """The ``kind`` of the subtype."""
        return self.subtype.kind

    @property
    def type(self):
        """The ``type`` of the subtype."""
        return self.subtype.type

    @property
    def subtype(self):
        """The dtype of the stored (non-fill) values."""
        return self._dtype

    @property
    def name(self):
        """String form, ``'Sparse[<subtype name>, <fill_value>]'``."""
        return 'Sparse[{}, {}]'.format(self.subtype.name, self.fill_value)

    def __repr__(self):
        return self.name

    @classmethod
    def construct_array_type(cls):
        """Return the array class associated with this dtype."""
        return SparseArray

    @classmethod
    def construct_from_string(cls, string):
        """
        Construct a SparseDtype from a string form.

        Parameters
        ----------
        string : str
            Can take the following forms.

            ================ ============================
            string           dtype
            ================ ============================
            'Sparse'         SparseDtype[np.float64, nan]
            'Sparse[int]'    SparseDtype[np.int64, 0]
            'Sparse[int, 0]' SparseDtype[np.int64, 0]
            ================ ============================

            It is not possible to specify non-default fill values
            with a string. An argument like ``'Sparse[int, 1]'``
            will raise a ``TypeError`` because the default fill value
            for integers is 0.

        Returns
        -------
        SparseDtype

        Raises
        ------
        TypeError
            If `string` does not start with ``'Sparse'``, cannot be parsed
            into a valid subtype, or names a non-default fill value.
        """
        msg = "Could not construct SparseDtype from '{}'".format(string)
        if string.startswith("Sparse"):
            try:
                sub_type, has_fill_value = cls._parse_subtype(string)
                result = SparseDtype(sub_type)
            except Exception:
                raise TypeError(msg)
            else:
                msg = ("Could not construct SparseDtype from '{}'.\n\nIt "
                       "looks like the fill_value in the string is not "
                       "the default for the dtype. Non-default fill_values "
                       "are not supported. Use the 'SparseDtype()' "
                       "constructor instead.")
                # A fill value in the string is accepted only when it
                # round-trips to the default-constructed dtype's string form.
                if has_fill_value and str(result) != string:
                    raise TypeError(msg.format(string))
                return result
        else:
            raise TypeError(msg)

    @staticmethod
    def _parse_subtype(dtype):
        """
        Parse a string to get the subtype

        Parameters
        ----------
        dtype : str
            A string like

            * Sparse
            * Sparse[subtype]
            * Sparse[subtype, fill_value]

        Returns
        -------
        subtype : str
            The text of the subtype; ``'float64'`` for a bare ``'Sparse'``.
        has_fill_value : bool
            Whether a non-empty fill value was present in the string.

        Raises
        ------
        ValueError
            When the subtype cannot be extracted.
        """
        xpr = re.compile(
            r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$"
        )
        m = xpr.match(dtype)
        has_fill_value = False
        if m:
            groups = m.groupdict()
            subtype = groups['subtype']
            # Report a real bool rather than the matched text.
            has_fill_value = bool(groups['fill_value'])
        elif dtype == "Sparse":
            subtype = 'float64'
        else:
            raise ValueError("Cannot parse {}".format(dtype))
        return subtype, has_fill_value

    @classmethod
    def is_dtype(cls, dtype):
        """
        Check whether `dtype` is recognised by this dtype class.

        Parameters
        ----------
        dtype : object
            A dtype, a string, or an object exposing a ``dtype`` attribute
            (in which case that attribute is checked).

        Returns
        -------
        bool
            False for a string starting with ``'Sparse'`` whose subtype
            cannot be parsed or is not understood by numpy.
        """
        dtype = getattr(dtype, 'dtype', dtype)
        if (isinstance(dtype, compat.string_types) and
                dtype.startswith("Sparse")):
            try:
                sub_type, _ = cls._parse_subtype(dtype)
                dtype = np.dtype(sub_type)
            except (ValueError, TypeError):
                # Unparseable string or unknown subtype: a predicate should
                # answer False instead of raising.
                return False
        elif isinstance(dtype, cls):
            return True
        # NOTE(review): any plain numpy dtype is accepted here, not only
        # ones derived from a 'Sparse[...]' string -- confirm this is
        # intended before relying on it.
        return isinstance(dtype, np.dtype) or dtype == 'Sparse'
| 294 | + |
| 295 | +# ---------------------------------------------------------------------------- |
| 296 | +# Array |
49 | 297 |
|
50 | 298 |
|
# Maps ``klass`` to the name 'SparseArray'.
# NOTE(review): presumably substituted into shared docstring templates
# elsewhere in this module -- the consumers are not visible here; confirm.
51 | 299 | _sparray_doc_kwargs = dict(klass='SparseArray')
|
|
0 commit comments