Skip to content

Commit f457dff

Browse files
Merge remote-tracking branch 'upstream/master' into fix-40014
2 parents 0c16e6c + 4aeb8f2 commit f457dff

24 files changed

+1209
-921
lines changed

doc/source/whatsnew/v1.2.4.rst

+3-22
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
.. _whatsnew_124:
22

3-
What's new in 1.2.4 (April ??, 2021)
4-
---------------------------------------
3+
What's new in 1.2.4 (April 12, 2021)
4+
------------------------------------
55

66
These are the changes in pandas 1.2.4. See :ref:`release` for a full changelog
77
including other versions of pandas.
@@ -21,26 +21,7 @@ Fixed regressions
2121
- Fixed regression in :meth:`DataFrame.where` not returning a copy in the case of an all True condition (:issue:`39595`)
2222
- Fixed regression in :meth:`DataFrame.replace` raising ``IndexError`` when ``regex`` was a multi-key dictionary (:issue:`39338`)
2323
- Fixed regression in repr of floats in an ``object`` column not respecting ``float_format`` when printed in the console or outputted through :meth:`DataFrame.to_string`, :meth:`DataFrame.to_html`, and :meth:`DataFrame.to_latex` (:issue:`40024`)
24-
25-
.. ---------------------------------------------------------------------------
26-
27-
.. _whatsnew_124.bug_fixes:
28-
29-
Bug fixes
30-
~~~~~~~~~
31-
32-
-
33-
-
34-
35-
.. ---------------------------------------------------------------------------
36-
37-
.. _whatsnew_124.other:
38-
39-
Other
40-
~~~~~
41-
42-
-
43-
-
24+
- Fixed regression in NumPy ufuncs such as ``np.add`` not passing through all arguments for :class:`DataFrame` (:issue:`40662`)
4425

4526
.. ---------------------------------------------------------------------------
4627

doc/source/whatsnew/v1.3.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -584,6 +584,7 @@ Performance improvements
584584
- Performance improvement in :class:`Styler` where render times are more than 50% reduced (:issue:`39972` :issue:`39952`)
585585
- Performance improvement in :meth:`core.window.ewm.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`)
586586
- Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`)
587+
- Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`)
587588

588589
.. ---------------------------------------------------------------------------
589590
@@ -787,6 +788,7 @@ Reshaping
787788
^^^^^^^^^
788789
- Bug in :func:`merge` raising error when performing an inner join with partial index and ``right_index`` when no overlap between indices (:issue:`33814`)
789790
- Bug in :meth:`DataFrame.unstack` with missing levels led to incorrect index names (:issue:`37510`)
791+
- Bug in :func:`merge_asof` propagating the right Index with ``left_index=True`` and ``right_on`` specification instead of left Index (:issue:`33463`)
790792
- Bug in :func:`join` over :class:`MultiIndex` returned wrong result when one of the two indexes had only one level (:issue:`36909`)
791793
- :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`)
792794
- Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`)
@@ -814,6 +816,7 @@ ExtensionArray
814816

815817
- Bug in :meth:`DataFrame.where` when ``other`` is a :class:`Series` with :class:`ExtensionArray` dtype (:issue:`38729`)
816818
- Fixed bug where :meth:`Series.idxmax`, :meth:`Series.idxmin` and ``argmax/min`` fail when the underlying data is :class:`ExtensionArray` (:issue:`32749`, :issue:`33719`, :issue:`36566`)
819+
- Fixed a bug where some properties of subclasses of :class:`PandasExtensionDtype` were improperly cached (:issue:`40329`)
817820
-
818821

819822
Other

pandas/core/arraylike.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -357,15 +357,17 @@ def reconstruct(result):
357357
# * len(inputs) > 1 is doable when we know that we have
358358
# aligned blocks / dtypes.
359359
inputs = tuple(np.asarray(x) for x in inputs)
360-
result = getattr(ufunc, method)(*inputs)
360+
result = getattr(ufunc, method)(*inputs, **kwargs)
361361
elif self.ndim == 1:
362362
# ufunc(series, ...)
363363
inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
364364
result = getattr(ufunc, method)(*inputs, **kwargs)
365365
else:
366366
# ufunc(dataframe)
367-
if method == "__call__":
367+
if method == "__call__" and not kwargs:
368368
# for np.<ufunc>(..) calls
369+
# kwargs cannot necessarily be handled block-by-block, so only
370+
# take this path if there are no kwargs
369371
mgr = inputs[0]._mgr
370372
result = mgr.apply(getattr(ufunc, method))
371373
else:

pandas/core/dtypes/concat.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,13 @@
3030
)
3131

3232

33-
def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
33+
def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
3434
"""
3535
Helper function for `arr.astype(common_dtype)` but handling all special
3636
cases.
3737
"""
38+
if is_dtype_equal(arr.dtype, dtype):
39+
return arr
3840
if (
3941
is_categorical_dtype(arr.dtype)
4042
and isinstance(dtype, np.dtype)
@@ -121,7 +123,7 @@ def is_nonempty(x) -> bool:
121123
# for axis=0
122124
if not single_dtype:
123125
target_dtype = find_common_type([x.dtype for x in to_concat])
124-
to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat]
126+
to_concat = [cast_to_common_type(arr, target_dtype) for arr in to_concat]
125127

126128
if isinstance(to_concat[0], ExtensionArray):
127129
cls = type(to_concat[0])

pandas/core/dtypes/dtypes.py

+17-13
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import pytz
1616

1717
from pandas._libs.interval import Interval
18+
from pandas._libs.properties import cache_readonly
1819
from pandas._libs.tslibs import (
1920
BaseOffset,
2021
NaT,
@@ -81,7 +82,7 @@ class PandasExtensionDtype(ExtensionDtype):
8182
base: DtypeObj | None = None
8283
isbuiltin = 0
8384
isnative = 0
84-
_cache: dict[str_type, PandasExtensionDtype] = {}
85+
_cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
8586

8687
def __str__(self) -> str_type:
8788
"""
@@ -105,7 +106,7 @@ def __getstate__(self) -> dict[str_type, Any]:
105106
@classmethod
106107
def reset_cache(cls) -> None:
107108
""" clear the cache """
108-
cls._cache = {}
109+
cls._cache_dtypes = {}
109110

110111

111112
class CategoricalDtypeType(type):
@@ -177,7 +178,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
177178
str = "|O08"
178179
base = np.dtype("O")
179180
_metadata = ("categories", "ordered")
180-
_cache: dict[str_type, PandasExtensionDtype] = {}
181+
_cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
181182

182183
def __init__(self, categories=None, ordered: Ordered = False):
183184
self._finalize(categories, ordered, fastpath=False)
@@ -355,7 +356,7 @@ def __hash__(self) -> int:
355356
else:
356357
return -2
357358
# We *do* want to include the real self.ordered here
358-
return int(self._hash_categories(self.categories, self.ordered))
359+
return int(self._hash_categories)
359360

360361
def __eq__(self, other: Any) -> bool:
361362
"""
@@ -429,14 +430,17 @@ def __repr__(self) -> str_type:
429430
data = data.rstrip(", ")
430431
return f"CategoricalDtype(categories={data}, ordered={self.ordered})"
431432

432-
@staticmethod
433-
def _hash_categories(categories, ordered: Ordered = True) -> int:
433+
@cache_readonly
434+
def _hash_categories(self) -> int:
434435
from pandas.core.util.hashing import (
435436
combine_hash_arrays,
436437
hash_array,
437438
hash_tuples,
438439
)
439440

441+
categories = self.categories
442+
ordered = self.ordered
443+
440444
if len(categories) and isinstance(categories[0], tuple):
441445
# assumes if any individual category is a tuple, then all are. ATM
442446
# I don't really want to support just some of the categories being
@@ -671,7 +675,7 @@ class DatetimeTZDtype(PandasExtensionDtype):
671675
na_value = NaT
672676
_metadata = ("unit", "tz")
673677
_match = re.compile(r"(datetime64|M8)\[(?P<unit>.+), (?P<tz>.+)\]")
674-
_cache: dict[str_type, PandasExtensionDtype] = {}
678+
_cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
675679

676680
def __init__(self, unit: str_type | DatetimeTZDtype = "ns", tz=None):
677681
if isinstance(unit, DatetimeTZDtype):
@@ -837,7 +841,7 @@ class PeriodDtype(dtypes.PeriodDtypeBase, PandasExtensionDtype):
837841
num = 102
838842
_metadata = ("freq",)
839843
_match = re.compile(r"(P|p)eriod\[(?P<freq>.+)\]")
840-
_cache: dict[str_type, PandasExtensionDtype] = {}
844+
_cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
841845

842846
def __new__(cls, freq=None):
843847
"""
@@ -859,12 +863,12 @@ def __new__(cls, freq=None):
859863
freq = cls._parse_dtype_strict(freq)
860864

861865
try:
862-
return cls._cache[freq.freqstr]
866+
return cls._cache_dtypes[freq.freqstr]
863867
except KeyError:
864868
dtype_code = freq._period_dtype_code
865869
u = dtypes.PeriodDtypeBase.__new__(cls, dtype_code)
866870
u._freq = freq
867-
cls._cache[freq.freqstr] = u
871+
cls._cache_dtypes[freq.freqstr] = u
868872
return u
869873

870874
def __reduce__(self):
@@ -1042,7 +1046,7 @@ class IntervalDtype(PandasExtensionDtype):
10421046
_match = re.compile(
10431047
r"(I|i)nterval\[(?P<subtype>[^,]+)(, (?P<closed>(right|left|both|neither)))?\]"
10441048
)
1045-
_cache: dict[str_type, PandasExtensionDtype] = {}
1049+
_cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
10461050

10471051
def __new__(cls, subtype=None, closed: str_type | None = None):
10481052
from pandas.core.dtypes.common import (
@@ -1099,12 +1103,12 @@ def __new__(cls, subtype=None, closed: str_type | None = None):
10991103

11001104
key = str(subtype) + str(closed)
11011105
try:
1102-
return cls._cache[key]
1106+
return cls._cache_dtypes[key]
11031107
except KeyError:
11041108
u = object.__new__(cls)
11051109
u._subtype = subtype
11061110
u._closed = closed
1107-
cls._cache[key] = u
1111+
cls._cache_dtypes[key] = u
11081112
return u
11091113

11101114
@property

pandas/core/internals/array_manager.py

+68-3
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,14 @@
1818
)
1919
from pandas._typing import (
2020
ArrayLike,
21+
DtypeObj,
2122
Hashable,
2223
)
2324
from pandas.util._validators import validate_bool_kwarg
2425

2526
from pandas.core.dtypes.cast import (
2627
astype_array_safe,
28+
ensure_dtype_can_hold_na,
2729
infer_dtype_from_scalar,
2830
soft_convert_objects,
2931
)
@@ -49,6 +51,7 @@
4951
from pandas.core.dtypes.missing import (
5052
array_equals,
5153
isna,
54+
na_value_for_dtype,
5255
)
5356

5457
import pandas.core.algorithms as algos
@@ -952,10 +955,18 @@ def reindex_indexer(
952955
# ignored keywords
953956
consolidate: bool = True,
954957
only_slice: bool = False,
958+
# ArrayManager specific keywords
959+
use_na_proxy: bool = False,
955960
) -> T:
956961
axis = self._normalize_axis(axis)
957962
return self._reindex_indexer(
958-
new_axis, indexer, axis, fill_value, allow_dups, copy
963+
new_axis,
964+
indexer,
965+
axis,
966+
fill_value,
967+
allow_dups,
968+
copy,
969+
use_na_proxy,
959970
)
960971

961972
def _reindex_indexer(
@@ -966,6 +977,7 @@ def _reindex_indexer(
966977
fill_value=None,
967978
allow_dups: bool = False,
968979
copy: bool = True,
980+
use_na_proxy: bool = False,
969981
) -> T:
970982
"""
971983
Parameters
@@ -1000,7 +1012,9 @@ def _reindex_indexer(
10001012
new_arrays = []
10011013
for i in indexer:
10021014
if i == -1:
1003-
arr = self._make_na_array(fill_value=fill_value)
1015+
arr = self._make_na_array(
1016+
fill_value=fill_value, use_na_proxy=use_na_proxy
1017+
)
10041018
else:
10051019
arr = self.arrays[i]
10061020
new_arrays.append(arr)
@@ -1051,7 +1065,11 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
10511065
new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True
10521066
)
10531067

1054-
def _make_na_array(self, fill_value=None):
1068+
def _make_na_array(self, fill_value=None, use_na_proxy=False):
1069+
if use_na_proxy:
1070+
assert fill_value is None
1071+
return NullArrayProxy(self.shape_proper[0])
1072+
10551073
if fill_value is None:
10561074
fill_value = np.nan
10571075

@@ -1271,3 +1289,50 @@ def set_values(self, values: ArrayLike):
12711289
valid for the current SingleArrayManager (length, dtype, etc).
12721290
"""
12731291
self.arrays[0] = values
1292+
1293+
1294+
class NullArrayProxy:
1295+
"""
1296+
Proxy object for an all-NA array.
1297+
1298+
Only stores the length of the array, and not the dtype. The dtype
1299+
will only be known when actually concatenating (after determining the
1300+
common dtype, for which this proxy is ignored).
1301+
Using this object means that internals/concat.py does not need to determine
1302+
the proper dtype and array type.
1303+
"""
1304+
1305+
ndim = 1
1306+
1307+
def __init__(self, n: int):
1308+
self.n = n
1309+
1310+
@property
1311+
def shape(self):
1312+
return (self.n,)
1313+
1314+
def to_array(self, dtype: DtypeObj) -> ArrayLike:
1315+
"""
1316+
Helper function to create the actual all-NA array from the NullArrayProxy
1317+
object.
1318+
1319+
Parameters
1320+
----------
1321+
dtype : DtypeObj
1322+
The dtype for the resulting array.
1323+
1324+
Returns
1325+
-------
1326+
np.ndarray or ExtensionArray
1327+
"""
1328+
if isinstance(dtype, ExtensionDtype):
1329+
empty = dtype.construct_array_type()._from_sequence([], dtype=dtype)
1330+
indexer = -np.ones(self.n, dtype=np.intp)
1331+
return empty.take(indexer, allow_fill=True)
1332+
else:
1333+
# when introducing missing values, int becomes float, bool becomes object
1334+
dtype = ensure_dtype_can_hold_na(dtype)
1335+
fill_value = na_value_for_dtype(dtype)
1336+
arr = np.empty(self.n, dtype=dtype)
1337+
arr.fill(fill_value)
1338+
return ensure_wrapped_if_datetimelike(arr)

0 commit comments

Comments
 (0)