Skip to content

Commit a508c2e

Browse files
committed
Merge remote-tracking branch 'upstream/master' into fix-windows-ci
2 parents 0c1e34e + 913f71f commit a508c2e

28 files changed

+318
-314
lines changed

doc/source/cookbook.rst

+33-29
Original file line numberDiff line numberDiff line change
@@ -1228,36 +1228,40 @@ Correlation
12281228

12291229
The `method` argument within `DataFrame.corr` can accept a callable in addition to the named correlation types. Here we compute the `distance correlation <https://en.wikipedia.org/wiki/Distance_correlation>`__ matrix for a `DataFrame` object.
12301230

1231-
.. ipython:: python
1232-
1233-
def distcorr(x, y):
1234-
n = len(x)
1235-
a = np.zeros(shape=(n, n))
1236-
b = np.zeros(shape=(n, n))
1237-
1238-
for i in range(n):
1239-
for j in range(i + 1, n):
1240-
a[i, j] = abs(x[i] - x[j])
1241-
b[i, j] = abs(y[i] - y[j])
1242-
1243-
a += a.T
1244-
b += b.T
1245-
1246-
a_bar = np.vstack([np.nanmean(a, axis=0)] * n)
1247-
b_bar = np.vstack([np.nanmean(b, axis=0)] * n)
1248-
1249-
A = a - a_bar - a_bar.T + np.full(shape=(n, n), fill_value=a_bar.mean())
1250-
B = b - b_bar - b_bar.T + np.full(shape=(n, n), fill_value=b_bar.mean())
1251-
1252-
cov_ab = np.sqrt(np.nansum(A * B)) / n
1253-
std_a = np.sqrt(np.sqrt(np.nansum(A**2)) / n)
1254-
std_b = np.sqrt(np.sqrt(np.nansum(B**2)) / n)
1255-
1256-
return cov_ab / std_a / std_b
1257-
1258-
df = pd.DataFrame(np.random.normal(size=(100, 3)))
1231+
.. code-block:: python
12591232
1260-
df.corr(method=distcorr)
1233+
>>> def distcorr(x, y):
1234+
... n = len(x)
1235+
... a = np.zeros(shape=(n, n))
1236+
... b = np.zeros(shape=(n, n))
1237+
...
1238+
... for i in range(n):
1239+
... for j in range(i + 1, n):
1240+
... a[i, j] = abs(x[i] - x[j])
1241+
... b[i, j] = abs(y[i] - y[j])
1242+
...
1243+
... a += a.T
1244+
... b += b.T
1245+
...
1246+
... a_bar = np.vstack([np.nanmean(a, axis=0)] * n)
1247+
... b_bar = np.vstack([np.nanmean(b, axis=0)] * n)
1248+
...
1249+
... A = a - a_bar - a_bar.T + np.full(shape=(n, n), fill_value=a_bar.mean())
1250+
... B = b - b_bar - b_bar.T + np.full(shape=(n, n), fill_value=b_bar.mean())
1251+
...
1252+
... cov_ab = np.sqrt(np.nansum(A * B)) / n
1253+
... std_a = np.sqrt(np.sqrt(np.nansum(A**2)) / n)
1254+
... std_b = np.sqrt(np.sqrt(np.nansum(B**2)) / n)
1255+
...
1256+
... return cov_ab / std_a / std_b
1257+
...
1258+
>>> df = pd.DataFrame(np.random.normal(size=(100, 3)))
1259+
...
1260+
>>> df.corr(method=distcorr)
1261+
0 1 2
1262+
0 1.000000 0.171368 0.145302
1263+
1 0.171368 1.000000 0.189919
1264+
2 0.145302 0.189919 1.000000
12611265
12621266
Timedeltas
12631267
----------

pandas/api/extensions/__init__.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
register_index_accessor,
44
register_series_accessor)
55
from pandas.core.algorithms import take # noqa
6-
from pandas.core.arrays.base import (ExtensionArray, # noqa
7-
ExtensionScalarOpsMixin)
6+
from pandas.core.arrays import (ExtensionArray, # noqa
7+
ExtensionScalarOpsMixin)
88
from pandas.core.dtypes.dtypes import ( # noqa
99
ExtensionDtype, register_extension_dtype
1010
)

pandas/compat/pickle_compat.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ def load_reduce(self):
5656

5757
# If classes are moved, provide compat here.
5858
_class_locations_map = {
59+
('pandas.core.sparse.array', 'SparseArray'):
60+
('pandas.core.arrays', 'SparseArray'),
5961

6062
# 15477
6163
('pandas.core.base', 'FrozenNDArray'):
@@ -88,7 +90,7 @@ def load_reduce(self):
8890

8991
# 15998 top-level dirs moving
9092
('pandas.sparse.array', 'SparseArray'):
91-
('pandas.core.sparse.array', 'SparseArray'),
93+
('pandas.core.arrays.sparse', 'SparseArray'),
9294
('pandas.sparse.series', 'SparseSeries'):
9395
('pandas.core.sparse.series', 'SparseSeries'),
9496
('pandas.sparse.frame', 'SparseDataFrame'):

pandas/core/arrays/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@
88
from .timedeltas import TimedeltaArrayMixin # noqa
99
from .integer import ( # noqa
1010
IntegerArray, integer_array)
11+
from .sparse import SparseArray # noqa

pandas/core/sparse/array.py renamed to pandas/core/arrays/sparse.py

+250-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from __future__ import division
55
# pylint: disable=E1101,E1103,W0231
66

7+
import re
78
import operator
89
import numbers
910
import numpy as np
@@ -16,8 +17,10 @@
1617
from pandas.errors import PerformanceWarning
1718
from pandas.compat.numpy import function as nv
1819

19-
from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin
20+
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
2021
import pandas.core.common as com
22+
from pandas.core.dtypes.base import ExtensionDtype
23+
from pandas.core.dtypes.dtypes import register_extension_dtype
2124
from pandas.core.dtypes.generic import (
2225
ABCSparseSeries, ABCSeries, ABCIndexClass
2326
)
@@ -45,7 +48,252 @@
4548
import pandas.core.algorithms as algos
4649
import pandas.io.formats.printing as printing
4750

48-
from pandas.core.sparse.dtype import SparseDtype
51+
52+
# ----------------------------------------------------------------------------
53+
# Dtype
54+
55+
@register_extension_dtype
class SparseDtype(ExtensionDtype):
    """
    Dtype for data stored in :class:`SparseArray`.

    This dtype implements the pandas ExtensionDtype interface.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
        The dtype of the underlying array storing the non-fill values.
    fill_value : scalar, optional
        The scalar value not stored in the SparseArray. By default, this
        depends on `dtype`.

        =========== ==========
        dtype       na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        ``False``
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The default value may be overridden by specifying a `fill_value`.

    Raises
    ------
    ValueError
        If `fill_value` is not a scalar.
    """
    # We include `_is_na_fill_value` in the metadata to avoid hash collisions
    # between SparseDtype(float, 0.0) and SparseDtype(float, nan).
    # Without is_na_fill_value in the comparison, those would be equal since
    # hash(nan) is (sometimes?) 0.
    _metadata = ('_dtype', '_fill_value', '_is_na_fill_value')

    def __init__(self, dtype=np.float64, fill_value=None):
        # type: (Union[str, np.dtype, 'ExtensionDtype', type], Any) -> None
        # Imported here rather than at module level, as in the rest of
        # this class.
        from pandas.core.dtypes.missing import na_value_for_dtype
        from pandas.core.dtypes.common import (
            pandas_dtype, is_string_dtype, is_scalar
        )

        if isinstance(dtype, type(self)):
            # Unwrap an existing SparseDtype: reuse its fill value unless
            # the caller supplied one, then continue with its subtype.
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        dtype = pandas_dtype(dtype)
        if is_string_dtype(dtype):
            # String-like subtypes are stored as object.
            dtype = np.dtype('object')

        if fill_value is None:
            fill_value = na_value_for_dtype(dtype)

        if not is_scalar(fill_value):
            raise ValueError("fill_value must be a scalar. Got {} "
                             "instead".format(fill_value))
        self._dtype = dtype
        self._fill_value = fill_value

    def __hash__(self):
        # Python3 doesn't inherit __hash__ when a base class overrides
        # __eq__, so we explicitly do it here.
        return super(SparseDtype, self).__hash__()

    def __eq__(self, other):
        """
        Compare with another SparseDtype or with a string describing one.

        Two dtypes are equal when their subtypes are equal and their fill
        values match. A string is first converted with
        :meth:`construct_from_string`; a string that cannot be converted
        compares unequal.
        """
        # We have to override __eq__ to handle NA values in _metadata.
        # The base class does simple == checks, which fail for NA.
        if not isinstance(other, type(self)):
            if not isinstance(other, compat.string_types):
                return False
            try:
                other = self.construct_from_string(other)
            except TypeError:
                return False
            if not isinstance(other, type(self)):
                return False

        subtype = self.subtype == other.subtype
        if self._is_na_fill_value:
            # this case is complicated by two things:
            # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
            # SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT)
            # i.e. we want to treat any floating-point NaN as equal, but
            # not a floating-point NaN and a datetime NaT.
            #
            # The parentheses matter: `and` binds tighter than `or`, so
            # without them an NA fill value would compare equal to any
            # non-NA fill value of a compatible type (e.g. nan vs 0.0),
            # and the comparison would not be symmetric.
            fill_value = (
                other._is_na_fill_value and (
                    isinstance(self.fill_value, type(other.fill_value)) or
                    isinstance(other.fill_value, type(self.fill_value))
                )
            )
        else:
            fill_value = self.fill_value == other.fill_value

        return subtype and fill_value

    @property
    def fill_value(self):
        """
        The fill value of the array.

        Converting the SparseArray to a dense ndarray will fill the
        array with this value.

        .. warning::

           It's possible to end up with a SparseArray that has ``fill_value``
           values in ``sp_values``. This can occur, for example, when setting
           ``SparseArray.fill_value`` directly.
        """
        return self._fill_value

    @property
    def _is_na_fill_value(self):
        """Whether ``isna`` reports the fill value as missing."""
        from pandas.core.dtypes.missing import isna
        return isna(self.fill_value)

    @property
    def _is_numeric(self):
        """True unless the subtype is an object dtype."""
        from pandas.core.dtypes.common import is_object_dtype
        return not is_object_dtype(self.subtype)

    @property
    def _is_boolean(self):
        """True when the subtype is a boolean dtype."""
        from pandas.core.dtypes.common import is_bool_dtype
        return is_bool_dtype(self.subtype)

    @property
    def kind(self):
        """The ``kind`` of the subtype."""
        return self.subtype.kind

    @property
    def type(self):
        """The ``type`` of the subtype."""
        return self.subtype.type

    @property
    def subtype(self):
        """The dtype of the stored (non-fill) values."""
        return self._dtype

    @property
    def name(self):
        """String form, ``'Sparse[<subtype name>, <fill_value>]'``."""
        return 'Sparse[{}, {}]'.format(self.subtype.name, self.fill_value)

    def __repr__(self):
        return self.name

    @classmethod
    def construct_array_type(cls):
        """Return the array class associated with this dtype."""
        return SparseArray

    @classmethod
    def construct_from_string(cls, string):
        """
        Construct a SparseDtype from a string form.

        Parameters
        ----------
        string : str
            Must start with ``'Sparse'``. Can take the following forms.

            ================ ============================
            string           dtype
            ================ ============================
            'Sparse'         SparseDtype[np.float64, nan]
            'Sparse[int]'    SparseDtype[np.int64, 0]
            ================ ============================

            It is not possible to specify non-default fill values
            with a string. When the string contains a fill value, it is
            compared with ``str()`` of the dtype built from the subtype
            alone, and a mismatch is an error. An argument like
            ``'Sparse[int, 1]'`` will raise a ``TypeError`` because the
            default fill value for integers is 0.

        Returns
        -------
        SparseDtype

        Raises
        ------
        TypeError
            If `string` does not start with ``'Sparse'``, cannot be parsed
            or converted to a dtype, or carries a fill value that does not
            match the string form of the resulting dtype.
        """
        msg = "Could not construct SparseDtype from '{}'".format(string)
        if string.startswith("Sparse"):
            try:
                sub_type, has_fill_value = cls._parse_subtype(string)
                result = SparseDtype(sub_type)
            except Exception:
                # Any parsing or dtype-construction failure is reported
                # uniformly as a TypeError.
                raise TypeError(msg)
            else:
                msg = ("Could not construct SparseDtype from '{}'.\n\nIt "
                       "looks like the fill_value in the string is not "
                       "the default for the dtype. Non-default fill_values "
                       "are not supported. Use the 'SparseDtype()' "
                       "constructor instead.")
                if has_fill_value and str(result) != string:
                    raise TypeError(msg.format(string))
                return result
        else:
            raise TypeError(msg)

    @staticmethod
    def _parse_subtype(dtype):
        """
        Parse a string to get the subtype

        Parameters
        ----------
        dtype : str
            A string like

            * Sparse[subtype]
            * Sparse[subtype, fill_value]

            The bare string ``'Sparse'`` is also accepted and yields the
            subtype ``'float64'``.

        Returns
        -------
        subtype : str
        has_fill_value : bool
            Whether a non-empty fill value was present in the string.

        Raises
        ------
        ValueError
            When the subtype cannot be extracted.
        """
        xpr = re.compile(
            r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$"
        )
        m = xpr.match(dtype)
        has_fill_value = False
        if m:
            subtype = m.group('subtype')
            # An empty capture means no fill value was given.
            has_fill_value = bool(m.group('fill_value'))
        elif dtype == "Sparse":
            subtype = 'float64'
        else:
            raise ValueError("Cannot parse {}".format(dtype))
        return subtype, has_fill_value

    @classmethod
    def is_dtype(cls, dtype):
        """
        Check whether `dtype` is, or describes, a sparse dtype.

        Parameters
        ----------
        dtype : object
            A dtype, a string, or an object with a ``dtype`` attribute
            (in which case that attribute is checked).

        Returns
        -------
        bool
        """
        dtype = getattr(dtype, 'dtype', dtype)
        if (isinstance(dtype, compat.string_types) and
                dtype.startswith("Sparse")):
            # The string is accepted if its subtype is a valid numpy dtype;
            # a string that cannot be parsed raises from _parse_subtype.
            sub_type, _ = cls._parse_subtype(dtype)
            dtype = np.dtype(sub_type)
        elif isinstance(dtype, cls):
            return True
        # NOTE(review): a plain numpy.dtype passed in directly also
        # satisfies this check and is therefore reported as sparse --
        # confirm whether that is intended.
        return isinstance(dtype, np.dtype) or dtype == 'Sparse'
294+
295+
# ----------------------------------------------------------------------------
296+
# Array
49297

50298

51299
_sparray_doc_kwargs = dict(klass='SparseArray')

0 commit comments

Comments
 (0)