Skip to content

Commit 7cce2c7

Browse files
committed
Merge remote-tracking branch 'upstream/master' into sparsedepr
2 parents d78ecea + 6437f5e commit 7cce2c7

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+1273
-583
lines changed

ci/deps/azure-36-locale.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ dependencies:
2727
- openpyxl
2828
# lowest supported version of pyarrow (putting it here instead of in
2929
# azure-36-minimum_versions because it needs numpy >= 1.14)
30-
- pyarrow=0.12
30+
- pyarrow=0.13
3131
- pytables
3232
- python-dateutil
3333
- pytz

ci/deps/azure-macos-36.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ dependencies:
2222
- numexpr
2323
- numpy=1.14
2424
- openpyxl
25-
- pyarrow>=0.12.0
25+
- pyarrow>=0.13.0
2626
- pytables
2727
- python-dateutil==2.6.1
2828
- pytz

ci/deps/azure-windows-36.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ dependencies:
2222
- numpy=1.15.*
2323
- openpyxl
2424
- jinja2
25-
- pyarrow>=0.12.0
25+
- pyarrow>=0.13.0
2626
- pytables
2727
- python-dateutil
2828
- pytz

ci/deps/azure-windows-37.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ dependencies:
2424
- numexpr
2525
- numpy=1.14.*
2626
- openpyxl
27+
- pyarrow=0.14
2728
- pytables
2829
- python-dateutil
2930
- pytz

ci/deps/travis-36-cov.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ dependencies:
3131
# https://github.com/pandas-dev/pandas/pull/30009 openpyxl 3.0.2 broke
3232
- pandas-gbq
3333
- psycopg2
34-
- pyarrow>=0.12.0
34+
- pyarrow>=0.13.0
3535
- pymysql
3636
- pytables
3737
- python-snappy

doc/source/user_guide/io.rst

+7-3
Original file line numberDiff line numberDiff line change
@@ -2066,6 +2066,8 @@ The Numpy parameter
20662066
+++++++++++++++++++
20672067

20682068
.. note::
2069+
This parameter has been deprecated as of version 1.0.0 and will raise a ``FutureWarning``.
2070+
20692071
This supports numeric data only. Index and columns labels may be non-numeric, e.g. strings, dates etc.
20702072

20712073
If ``numpy=True`` is passed to ``read_json`` an attempt will be made to sniff
@@ -2088,6 +2090,7 @@ data:
20882090
%timeit pd.read_json(jsonfloats)
20892091
20902092
.. ipython:: python
2093+
:okwarning:
20912094
20922095
%timeit pd.read_json(jsonfloats, numpy=True)
20932096
@@ -2102,6 +2105,7 @@ The speedup is less noticeable for smaller datasets:
21022105
%timeit pd.read_json(jsonfloats)
21032106
21042107
.. ipython:: python
2108+
:okwarning:
21052109
21062110
%timeit pd.read_json(jsonfloats, numpy=True)
21072111
@@ -4648,10 +4652,10 @@ Several caveats.
46484652
* Index level names, if specified, must be strings.
46494653
* In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype.
46504654
* The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag.
4651-
* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message
4652-
on an attempt at serialization.
4655+
* Unsupported types include ``Interval`` and actual Python object types. These will raise a helpful error message
4656+
on an attempt at serialization. The ``Period`` type is supported with pyarrow >= 0.16.0.
46534657
* The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data
4654-
type (requiring pyarrow >= 1.0.0, and requiring the extension type to implement the needed protocols,
4658+
type (requiring pyarrow >= 0.16.0, and requiring the extension type to implement the needed protocols,
46554659
see the :ref:`extension types documentation <extending.extension.arrow>`).
46564660

46574661
You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``.

doc/source/whatsnew/v1.0.0.rst

+78-77
Large diffs are not rendered by default.

pandas/api/extensions/__init__.py

+21-9
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,27 @@
1-
"""Public API for extending pandas objects."""
2-
from pandas._libs.lib import no_default # noqa: F401
1+
"""
2+
Public API for extending pandas objects.
3+
"""
34

4-
from pandas.core.dtypes.dtypes import ( # noqa: F401
5-
ExtensionDtype,
6-
register_extension_dtype,
7-
)
5+
from pandas._libs.lib import no_default
6+
7+
from pandas.core.dtypes.dtypes import ExtensionDtype, register_extension_dtype
88

9-
from pandas.core.accessor import ( # noqa: F401
9+
from pandas.core.accessor import (
1010
register_dataframe_accessor,
1111
register_index_accessor,
1212
register_series_accessor,
1313
)
14-
from pandas.core.algorithms import take # noqa: F401
15-
from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin # noqa: F401
14+
from pandas.core.algorithms import take
15+
from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin
16+
17+
__all__ = [
18+
"no_default",
19+
"ExtensionDtype",
20+
"register_extension_dtype",
21+
"register_dataframe_accessor",
22+
"register_index_accessor",
23+
"register_series_accessor",
24+
"take",
25+
"ExtensionArray",
26+
"ExtensionScalarOpsMixin",
27+
]

pandas/api/indexers/__init__.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1-
"""Public API for Rolling Window Indexers"""
2-
from pandas.core.indexers import check_bool_array_indexer # noqa: F401
3-
from pandas.core.window.indexers import BaseIndexer # noqa: F401
1+
"""
2+
Public API for Rolling Window Indexers.
3+
"""
4+
5+
from pandas.core.indexers import check_bool_array_indexer
6+
from pandas.core.window.indexers import BaseIndexer
7+
8+
__all__ = ["check_bool_array_indexer", "BaseIndexer"]

pandas/api/types/__init__.py

+15-4
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,23 @@
1-
""" public toolkit API """
1+
"""
2+
Public toolkit API.
3+
"""
24

3-
from pandas._libs.lib import infer_dtype # noqa: F401
5+
from pandas._libs.lib import infer_dtype
46

57
from pandas.core.dtypes.api import * # noqa: F403, F401
6-
from pandas.core.dtypes.concat import union_categoricals # noqa: F401
7-
from pandas.core.dtypes.dtypes import ( # noqa: F401
8+
from pandas.core.dtypes.concat import union_categoricals
9+
from pandas.core.dtypes.dtypes import (
810
CategoricalDtype,
911
DatetimeTZDtype,
1012
IntervalDtype,
1113
PeriodDtype,
1214
)
15+
16+
__all__ = [
17+
"infer_dtype",
18+
"union_categoricals",
19+
"CategoricalDtype",
20+
"DatetimeTZDtype",
21+
"IntervalDtype",
22+
"PeriodDtype",
23+
]

pandas/compat/_optional.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
"odfpy": "1.3.0",
1717
"openpyxl": "2.5.7",
1818
"pandas_gbq": "0.8.0",
19-
"pyarrow": "0.12.0",
19+
"pyarrow": "0.13.0",
2020
"pytables": "3.4.2",
2121
"pytest": "5.0.1",
2222
"s3fs": "0.3.0",

pandas/core/arrays/__init__.py

+31-11
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,36 @@
1-
from pandas.core.arrays.base import ( # noqa: F401
1+
from pandas.core.arrays.base import (
22
ExtensionArray,
33
ExtensionOpsMixin,
44
ExtensionScalarOpsMixin,
55
try_cast_to_ea,
66
)
7-
from pandas.core.arrays.boolean import BooleanArray # noqa: F401
8-
from pandas.core.arrays.categorical import Categorical # noqa: F401
9-
from pandas.core.arrays.datetimes import DatetimeArray # noqa: F401
10-
from pandas.core.arrays.integer import IntegerArray, integer_array # noqa: F401
11-
from pandas.core.arrays.interval import IntervalArray # noqa: F401
12-
from pandas.core.arrays.numpy_ import PandasArray, PandasDtype # noqa: F401
13-
from pandas.core.arrays.period import PeriodArray, period_array # noqa: F401
14-
from pandas.core.arrays.sparse import SparseArray # noqa: F401
15-
from pandas.core.arrays.string_ import StringArray # noqa: F401
16-
from pandas.core.arrays.timedeltas import TimedeltaArray # noqa: F401
7+
from pandas.core.arrays.boolean import BooleanArray
8+
from pandas.core.arrays.categorical import Categorical
9+
from pandas.core.arrays.datetimes import DatetimeArray
10+
from pandas.core.arrays.integer import IntegerArray, integer_array
11+
from pandas.core.arrays.interval import IntervalArray
12+
from pandas.core.arrays.numpy_ import PandasArray, PandasDtype
13+
from pandas.core.arrays.period import PeriodArray, period_array
14+
from pandas.core.arrays.sparse import SparseArray
15+
from pandas.core.arrays.string_ import StringArray
16+
from pandas.core.arrays.timedeltas import TimedeltaArray
17+
18+
__all__ = [
19+
"ExtensionArray",
20+
"ExtensionOpsMixin",
21+
"ExtensionScalarOpsMixin",
22+
"try_cast_to_ea",
23+
"BooleanArray",
24+
"Categorical",
25+
"DatetimeArray",
26+
"IntegerArray",
27+
"integer_array",
28+
"IntervalArray",
29+
"PandasArray",
30+
"PandasDtype",
31+
"PeriodArray",
32+
"period_array",
33+
"SparseArray",
34+
"StringArray",
35+
"TimedeltaArray",
36+
]

pandas/core/arrays/_arrow_utils.py

+124
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
from distutils.version import LooseVersion
2+
import json
3+
4+
import numpy as np
5+
import pyarrow
6+
7+
from pandas.core.arrays.interval import _VALID_CLOSED
8+
9+
_pyarrow_version_ge_015 = LooseVersion(pyarrow.__version__) >= LooseVersion("0.15")
10+
11+
12+
def pyarrow_array_to_numpy_and_mask(arr, dtype):
    """
    Convert a primitive pyarrow.Array to a numpy array and boolean mask based
    on the buffers of the Array.

    Both the data and the validity mask honour ``arr.offset``, so the two
    returned arrays stay aligned for sliced (non-zero offset) arrays.

    Parameters
    ----------
    arr : pyarrow.Array
    dtype : numpy.dtype

    Returns
    -------
    (data, mask)
        Tuple of two numpy arrays with the raw data (with specified dtype) and
        a boolean mask (validity mask, so False means missing)
    """
    buflist = arr.buffers()
    n_values = len(arr)
    start = arr.offset

    # buflist[1] is reinterpreted with the requested dtype, then the logical
    # window [offset, offset + len) is selected from it.
    data = np.frombuffer(buflist[1], dtype=dtype)[start : start + n_values]

    bitmask = buflist[0]
    if bitmask is None:
        # No validity buffer: every value is treated as valid.
        mask = np.ones(n_values, dtype=bool)
    else:
        # The validity buffer is shared with the parent array, so the same
        # offset used for the data must be applied here; otherwise the mask of
        # a sliced array would be read from bit 0 and be misaligned.
        mask = pyarrow.BooleanArray.from_buffers(
            pyarrow.bool_(), n_values, [None, bitmask], offset=start
        )
        mask = np.asarray(mask)
    return data, mask
39+
40+
41+
if _pyarrow_version_ge_015:
    # the pyarrow extension types are only available for pyarrow 0.15+

    class ArrowPeriodType(pyarrow.ExtensionType):
        """
        Arrow extension type named "pandas.period", stored as int64 and
        parametrized by a ``freq`` value.
        """

        def __init__(self, freq):
            # attributes need to be set first before calling
            # super init (as that calls serialize)
            self._freq = freq
            pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period")

        @property
        def freq(self):
            """The ``freq`` value this type was created with."""
            return self._freq

        def __arrow_ext_serialize__(self):
            """Encode ``freq`` as a JSON document in bytes."""
            return json.dumps({"freq": self.freq}).encode()

        @classmethod
        def __arrow_ext_deserialize__(cls, storage_type, serialized):
            """Rebuild the type from the JSON bytes written by serialize."""
            payload = json.loads(serialized.decode())
            return ArrowPeriodType(payload["freq"])

        def __eq__(self, other):
            # Only other Arrow extension types are comparable.
            if not isinstance(other, pyarrow.BaseExtensionType):
                return NotImplemented
            return type(self) == type(other) and self.freq == other.freq

        def __hash__(self):
            return hash((str(self), self.freq))

    # register the type with a dummy instance
    _period_type = ArrowPeriodType("D")
    pyarrow.register_extension_type(_period_type)

    class ArrowIntervalType(pyarrow.ExtensionType):
        """
        Arrow extension type named "pandas.interval", stored as a struct with
        ``left`` and ``right`` fields of a common subtype, plus a ``closed``
        value.
        """

        def __init__(self, subtype, closed):
            # attributes need to be set first before calling
            # super init (as that calls serialize)
            assert closed in _VALID_CLOSED
            self._closed = closed
            # Anything that is not already an Arrow DataType is resolved
            # through its string alias.
            self._subtype = (
                subtype
                if isinstance(subtype, pyarrow.DataType)
                else pyarrow.type_for_alias(str(subtype))
            )

            pyarrow.ExtensionType.__init__(
                self,
                pyarrow.struct([("left", self._subtype), ("right", self._subtype)]),
                "pandas.interval",
            )

        @property
        def subtype(self):
            """Arrow data type shared by the ``left`` and ``right`` fields."""
            return self._subtype

        @property
        def closed(self):
            """The ``closed`` value this type was created with."""
            return self._closed

        def __arrow_ext_serialize__(self):
            """Encode ``subtype`` (as a string) and ``closed`` as JSON bytes."""
            return json.dumps(
                {"subtype": str(self.subtype), "closed": self.closed}
            ).encode()

        @classmethod
        def __arrow_ext_deserialize__(cls, storage_type, serialized):
            """Rebuild the type from the JSON bytes written by serialize."""
            payload = json.loads(serialized.decode())
            return ArrowIntervalType(
                pyarrow.type_for_alias(payload["subtype"]), payload["closed"]
            )

        def __eq__(self, other):
            # Only other Arrow extension types are comparable.
            if not isinstance(other, pyarrow.BaseExtensionType):
                return NotImplemented
            return (
                type(self) == type(other)
                and self.subtype == other.subtype
                and self.closed == other.closed
            )

        def __hash__(self):
            return hash((str(self), str(self.subtype), self.closed))

    # register the type with a dummy instance
    _interval_type = ArrowIntervalType(pyarrow.int64(), "left")
    pyarrow.register_extension_type(_interval_type)

0 commit comments

Comments
 (0)