Skip to content

Commit a5d8987

Browse files
committed
Merge remote-tracking branch 'upstream/master' into move-metadata-to-cfg
2 parents cb757e4 + 508b9f9 commit a5d8987

33 files changed

+509
-295
lines changed

asv_bench/benchmarks/categoricals.py

+17
Original file line numberDiff line numberDiff line change
@@ -118,12 +118,29 @@ def setup(self):
118118
self.a = pd.Categorical(list("aabbcd") * N)
119119
self.b = pd.Categorical(list("bbcdjk") * N)
120120

121+
self.idx_a = pd.CategoricalIndex(range(N), range(N))
122+
self.idx_b = pd.CategoricalIndex(range(N + 1), range(N + 1))
123+
self.df_a = pd.DataFrame(range(N), columns=["a"], index=self.idx_a)
124+
self.df_b = pd.DataFrame(range(N + 1), columns=["a"], index=self.idx_b)
125+
121126
def time_concat(self):
122127
pd.concat([self.s, self.s])
123128

124129
def time_union(self):
125130
union_categoricals([self.a, self.b])
126131

132+
def time_append_overlapping_index(self):
133+
self.idx_a.append(self.idx_a)
134+
135+
def time_append_non_overlapping_index(self):
136+
self.idx_a.append(self.idx_b)
137+
138+
def time_concat_overlapping_index(self):
139+
pd.concat([self.df_a, self.df_a])
140+
141+
def time_concat_non_overlapping_index(self):
142+
pd.concat([self.df_a, self.df_b])
143+
127144

128145
class ValueCounts:
129146

asv_bench/benchmarks/hash_functions.py

+9
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,15 @@ def time_isin_outside(self, dtype, exponent):
2525
self.s.isin(self.values_outside)
2626

2727

28+
class UniqueForLargePyObjectInts:
29+
def setup(self):
30+
lst = [x << 32 for x in range(5000)]
31+
self.arr = np.array(lst, dtype=np.object_)
32+
33+
def time_unique(self):
34+
pd.unique(self.arr)
35+
36+
2837
class IsinWithRandomFloat:
2938
params = [
3039
[np.float64, np.object],

doc/source/development/extending.rst

+4-14
Original file line numberDiff line numberDiff line change
@@ -329,21 +329,11 @@ Each data structure has several *constructor properties* for returning a new
329329
data structure as the result of an operation. By overriding these properties,
330330
you can retain subclasses through ``pandas`` data manipulations.
331331

332-
There are 3 constructor properties to be defined:
332+
There are 3 possible constructor properties to be defined on a subclass:
333333

334-
* ``_constructor``: Used when a manipulation result has the same dimensions as the original.
335-
* ``_constructor_sliced``: Used when a manipulation result has one lower dimension(s) as the original, such as ``DataFrame`` single columns slicing.
336-
* ``_constructor_expanddim``: Used when a manipulation result has one higher dimension as the original, such as ``Series.to_frame()``.
337-
338-
Following table shows how ``pandas`` data structures define constructor properties by default.
339-
340-
=========================== ======================= =============
341-
Property Attributes ``Series`` ``DataFrame``
342-
=========================== ======================= =============
343-
``_constructor`` ``Series`` ``DataFrame``
344-
``_constructor_sliced`` ``NotImplementedError`` ``Series``
345-
``_constructor_expanddim`` ``DataFrame`` ``NotImplementedError``
346-
=========================== ======================= =============
334+
* ``DataFrame/Series._constructor``: Used when a manipulation result has the same dimension as the original.
335+
* ``DataFrame._constructor_sliced``: Used when a ``DataFrame`` (sub-)class manipulation result should be a ``Series`` (sub-)class.
336+
* ``Series._constructor_expanddim``: Used when a ``Series`` (sub-)class manipulation result should be a ``DataFrame`` (sub-)class, e.g. ``Series.to_frame()``.
347337

348338
The example below shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties.
349339

doc/source/whatsnew/v1.2.2.rst

+4-1
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,14 @@ Fixed regressions
1717

1818
- Fixed regression in :func:`read_excel` that caused it to raise ``AttributeError`` when checking version of older xlrd versions (:issue:`38955`)
1919
- Fixed regression in :class:`DataFrame` constructor reordering element when construction from datetime ndarray with dtype not ``"datetime64[ns]"`` (:issue:`39422`)
20-
- Fixed regression in :class:`DataFrame.astype` and :class:`Series.astype` not casting to bytes dtype (:issue:`39474`)
20+
- Fixed regression in :meth:`DataFrame.astype` and :meth:`Series.astype` not casting to bytes dtype (:issue:`39474`)
2121
- Fixed regression in :meth:`~DataFrame.to_pickle` failing to create bz2/xz compressed pickle files with ``protocol=5`` (:issue:`39002`)
2222
- Fixed regression in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` always raising ``AssertionError`` when comparing extension dtypes (:issue:`39410`)
2323
- Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
24+
- Fixed regression in :meth:`~DataFrame.to_excel` creating corrupt files when appending (``mode="a"``) to an existing file (:issue:`39576`)
25+
- Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`)
2426
- Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
27+
- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`)
2528
-
2629

2730
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v1.3.0.rst

+6-3
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor
221221
Other API changes
222222
^^^^^^^^^^^^^^^^^
223223
- Partially initialized :class:`CategoricalDtype` objects (i.e. those with ``categories=None``) will no longer compare as equal to fully initialized dtype objects.
224-
-
224+
- Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`)
225225
-
226226

227227
.. ---------------------------------------------------------------------------
@@ -255,6 +255,7 @@ Performance improvements
255255
- Performance improvement in :meth:`DataFrame.corr` for method=kendall (:issue:`28329`)
256256
- Performance improvement in :meth:`core.window.rolling.Rolling.corr` and :meth:`core.window.rolling.Rolling.cov` (:issue:`39388`)
257257
- Performance improvement in :meth:`core.window.rolling.RollingGroupby.corr`, :meth:`core.window.rolling.RollingGroupby.cov`, :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` (:issue:`39591`)
258+
- Performance improvement in :func:`unique` for object data type (:issue:`37615`)
258259

259260
.. ---------------------------------------------------------------------------
260261
@@ -306,6 +307,7 @@ Numeric
306307
- Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`)
307308
- Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`)
308309
- Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`)
310+
- Bug in :meth:`DataFrame.select_dtypes` behaving differently between Windows and Linux with ``include="int"`` (:issue:`36569`)
309311
-
310312

311313
Conversion
@@ -340,7 +342,7 @@ Indexing
340342
- Bug in :meth:`Series.__setitem__` raising ``ValueError`` when setting a :class:`Series` with a scalar indexer (:issue:`38303`)
341343
- Bug in :meth:`DataFrame.loc` dropping levels of :class:`MultiIndex` when :class:`DataFrame` used as input has only one row (:issue:`10521`)
342344
- Bug in :meth:`DataFrame.__getitem__` and :meth:`Series.__getitem__` always raising ``KeyError`` when slicing an :class:`Index` with milliseconds using existing strings (:issue:`33589`)
343-
- Bug in setting ``timedelta64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`)
345+
- Bug in setting ``timedelta64`` or ``datetime64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`, :issue:`39619`)
344346
- Bug in setting :class:`Interval` values into a :class:`Series` or :class:`DataFrame` with mismatched :class:`IntervalDtype` incorrectly casting the new values to the existing dtype (:issue:`39120`)
345347
- Bug in setting ``datetime64`` values into a :class:`Series` with integer-dtype incorrectly casting the datetime64 values to integers (:issue:`39266`)
346348
- Bug in :meth:`Index.get_loc` not raising ``KeyError`` when method is specified for ``NaN`` value when ``NaN`` is not in :class:`Index` (:issue:`39382`)
@@ -447,10 +449,11 @@ Other
447449
- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`)
448450
- Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`)
449451
- Bug in :class:`Styler` which caused CSS to duplicate on multiple renders. (:issue:`39395`)
452+
- ``inspect.getmembers(Series)`` no longer raises an ``AbstractMethodError`` (:issue:`38782`)
450453
- :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. ``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`)
451454
- Bug in :func:`pandas.testing.assert_series_equal`, :func:`pandas.testing.assert_frame_equal`, :func:`pandas.testing.assert_index_equal` and :func:`pandas.testing.assert_extension_array_equal` incorrectly raising when an attribute has an unrecognized NA type (:issue:`39461`)
452455
- Bug in :class:`Styler` where ``subset`` arg in methods raised an error for some valid multiindex slices (:issue:`33562`)
453-
-
456+
- Minor alterations to the HTML output rendered by :class:`Styler` to support w3 good code standard (:issue:`39626`)
454457
-
455458

456459
.. ---------------------------------------------------------------------------

pandas/_libs/index_class_helper.pxi.in

+5-1
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,14 @@ cdef class {{name}}Engine(IndexEngine):
3434
cdef _make_hash_table(self, Py_ssize_t n):
3535
return _hash.{{name}}HashTable(n)
3636

37-
{{if name not in {'Float64', 'Float32'} }}
3837
cdef _check_type(self, object val):
38+
{{if name not in {'Float64', 'Float32'} }}
3939
if not util.is_integer_object(val):
4040
raise KeyError(val)
41+
{{else}}
42+
if util.is_bool_object(val):
43+
# avoid casting to True -> 1.0
44+
raise KeyError(val)
4145
{{endif}}
4246

4347
cdef void _call_map_locations(self, values):

pandas/_libs/src/klib/khash_python.h

+25-5
Original file line numberDiff line numberDiff line change
@@ -178,11 +178,31 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
178178
return result;
179179
}
180180

181-
// For PyObject_Hash holds:
182-
// hash(0.0) == 0 == hash(-0.0)
183-
// hash(X) == 0 if X is a NaN-value
184-
// so it is OK to use it directly
185-
#define kh_python_hash_func(key) (PyObject_Hash(key))
181+
182+
khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){
183+
// For PyObject_Hash holds:
184+
// hash(0.0) == 0 == hash(-0.0)
185+
// hash(X) == 0 if X is a NaN-value
186+
// so it is OK to use it directly for doubles
187+
Py_hash_t hash = PyObject_Hash(key);
188+
if (hash == -1) {
189+
PyErr_Clear();
190+
return 0;
191+
}
192+
#if SIZEOF_PY_HASH_T == 4
193+
// it is already 32bit value
194+
return hash;
195+
#else
196+
// for 64bit builds,
197+
// we need information of the upper 32bits as well
198+
// see GH 37615
199+
khuint64_t as_uint = (khuint64_t) hash;
200+
// uints avoid undefined behavior of signed ints
201+
return (as_uint>>32)^as_uint;
202+
#endif
203+
}
204+
205+
186206
#define kh_python_hash_equal(a, b) (pyobject_cmp(a, b))
187207

188208

pandas/core/aggregation.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -457,7 +457,7 @@ def transform(
457457

458458
# Functions that transform may return empty Series/DataFrame
459459
# when the dtype is not appropriate
460-
if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty:
460+
if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty and not obj.empty:
461461
raise ValueError("Transform function failed")
462462
if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
463463
obj.index

pandas/core/algorithms.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,8 @@ def unique(values):
324324
Hash table-based unique. Uniques are returned in order
325325
of appearance. This does NOT sort.
326326
327-
Significantly faster than numpy.unique. Includes NA values.
327+
Significantly faster than numpy.unique for long enough sequences.
328+
Includes NA values.
328329
329330
Parameters
330331
----------

pandas/core/apply.py

+20-20
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def frame_apply(
6363
raw: bool = False,
6464
result_type: Optional[str] = None,
6565
args=None,
66-
kwds=None,
66+
kwargs=None,
6767
) -> FrameApply:
6868
""" construct and return a row or column based frame apply object """
6969
axis = obj._get_axis_number(axis)
@@ -79,7 +79,7 @@ def frame_apply(
7979
raw=raw,
8080
result_type=result_type,
8181
args=args,
82-
kwds=kwds,
82+
kwargs=kwargs,
8383
)
8484

8585

@@ -88,14 +88,14 @@ def series_apply(
8888
func: AggFuncType,
8989
convert_dtype: bool = True,
9090
args=None,
91-
kwds=None,
91+
kwargs=None,
9292
) -> SeriesApply:
9393
return SeriesApply(
9494
obj,
9595
func,
9696
convert_dtype,
9797
args,
98-
kwds,
98+
kwargs,
9999
)
100100

101101

@@ -109,12 +109,12 @@ def __init__(
109109
raw: bool,
110110
result_type: Optional[str],
111111
args,
112-
kwds,
112+
kwargs,
113113
):
114114
self.obj = obj
115115
self.raw = raw
116116
self.args = args or ()
117-
self.kwds = kwds or {}
117+
self.kwargs = kwargs or {}
118118

119119
if result_type not in [None, "reduce", "broadcast", "expand"]:
120120
raise ValueError(
@@ -126,13 +126,13 @@ def __init__(
126126

127127
# curry if needed
128128
if (
129-
(kwds or args)
129+
(kwargs or args)
130130
and not isinstance(func, (np.ufunc, str))
131131
and not is_list_like(func)
132132
):
133133

134134
def f(x):
135-
return func(x, *args, **kwds)
135+
return func(x, *args, **kwargs)
136136

137137
else:
138138
f = func
@@ -163,7 +163,7 @@ def agg(self) -> Tuple[Optional[FrameOrSeriesUnion], Optional[bool]]:
163163
obj = self.obj
164164
arg = self.f
165165
args = self.args
166-
kwargs = self.kwds
166+
kwargs = self.kwargs
167167

168168
_axis = kwargs.pop("_axis", None)
169169
if _axis is None:
@@ -413,10 +413,10 @@ def maybe_apply_str(self) -> Optional[FrameOrSeriesUnion]:
413413
if callable(func):
414414
sig = inspect.getfullargspec(func)
415415
if "axis" in sig.args:
416-
self.kwds["axis"] = self.axis
416+
self.kwargs["axis"] = self.axis
417417
elif self.axis != 0:
418418
raise ValueError(f"Operation {f} does not support axis=1")
419-
return self.obj._try_aggregate_string_function(f, *self.args, **self.kwds)
419+
return self.obj._try_aggregate_string_function(f, *self.args, **self.kwargs)
420420

421421
def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]:
422422
"""
@@ -430,7 +430,7 @@ def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]:
430430
# Note: dict-likes are list-like
431431
if not is_list_like(self.f):
432432
return None
433-
return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwds)
433+
return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs)
434434

435435

436436
class FrameApply(Apply):
@@ -806,7 +806,7 @@ def __init__(
806806
func: AggFuncType,
807807
convert_dtype: bool,
808808
args,
809-
kwds,
809+
kwargs,
810810
):
811811
self.convert_dtype = convert_dtype
812812

@@ -816,7 +816,7 @@ def __init__(
816816
raw=False,
817817
result_type=None,
818818
args=args,
819-
kwds=kwds,
819+
kwargs=kwargs,
820820
)
821821

822822
def apply(self) -> FrameOrSeriesUnion:
@@ -877,17 +877,17 @@ def __init__(
877877
obj: Union[SeriesGroupBy, DataFrameGroupBy],
878878
func: AggFuncType,
879879
args,
880-
kwds,
880+
kwargs,
881881
):
882-
kwds = kwds.copy()
883-
self.axis = obj.obj._get_axis_number(kwds.get("axis", 0))
882+
kwargs = kwargs.copy()
883+
self.axis = obj.obj._get_axis_number(kwargs.get("axis", 0))
884884
super().__init__(
885885
obj,
886886
func,
887887
raw=False,
888888
result_type=None,
889889
args=args,
890-
kwds=kwds,
890+
kwargs=kwargs,
891891
)
892892

893893
def apply(self):
@@ -903,15 +903,15 @@ def __init__(
903903
obj: Union[Resampler, BaseWindow],
904904
func: AggFuncType,
905905
args,
906-
kwds,
906+
kwargs,
907907
):
908908
super().__init__(
909909
obj,
910910
func,
911911
raw=False,
912912
result_type=None,
913913
args=args,
914-
kwds=kwds,
914+
kwargs=kwargs,
915915
)
916916

917917
def apply(self):

pandas/core/construction.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -588,9 +588,13 @@ def _try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bo
588588
Otherwise an object array is returned.
589589
"""
590590
# perf shortcut as this is the most common case
591-
if isinstance(arr, np.ndarray):
592-
if maybe_castable(arr) and not copy and dtype is None:
593-
return arr
591+
if (
592+
isinstance(arr, np.ndarray)
593+
and maybe_castable(arr.dtype)
594+
and not copy
595+
and dtype is None
596+
):
597+
return arr
594598

595599
if isinstance(dtype, ExtensionDtype) and (dtype.kind != "M" or is_sparse(dtype)):
596600
# create an extension array from its dtype

0 commit comments

Comments
 (0)