Skip to content

Add a default zip file name in DataFrame.to_csv when using the 'infer' + 'zip' method #39647

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
1905fcc
enhancement to dataframe to_csv zip method, add this step to perform l…
CyberQin Feb 7, 2021
91de735
REGR: fix transform of empty DataFrame/Series (#39639)
jorisvandenbossche Feb 7, 2021
a8cf281
REG: read_excel with engine specified raises on non-path/non-buffer (…
rhshadrach Feb 7, 2021
23a9b52
DOC: Clarify behavior for Series with dict-like data and index (#39374)
phofl Feb 7, 2021
627e01f
Fixed comment for pandas.unique (#39557) (#39643)
simonjayhawkins Feb 7, 2021
c075383
CLN refactor maybe-castable (#39257)
MarcoGorelli Feb 7, 2021
4028341
CLN: Use kwargs instead of kwds in apply functions (#39625)
rhshadrach Feb 7, 2021
ba60690
[PERF] taking upper 32bit of PyObject_Hash into account (#39592)
realead Feb 7, 2021
56b1c59
REF: remove Float64Index get_loc, __contains__ (#39620)
jbrockmendel Feb 7, 2021
47e5b1c
BUG: inspect.getmembers(Series) (#38782)
topper-123 Feb 7, 2021
83de190
ASV: add benchmarks for concatenating and appending of CategoricalInd…
avinashpancham Feb 7, 2021
d23a66f
BUG: Series[int].__setitem__(mask, td64_or_dt64) incorrect casting (#…
jbrockmendel Feb 7, 2021
f63e09d
CLN: Styler HTML output adopt structured code standard (#39627)
attack68 Feb 7, 2021
362f039
Fix select_dtypes(include='int') for Windows. (#36808)
Feb 7, 2021
c7acbde
cln: redundant function (#39632)
attack68 Feb 7, 2021
8d8f763
REGR: appending to existing excel file created corrupt files (#39605)
twoertwein Feb 7, 2021
102a34c
DOC: typo in 1.2.2 whatsnew (#39646)
arw2019 Feb 7, 2021
dfb19fe
enhancement to dataframe to_csv zip method,add this step to perform l…
CyberQin Feb 7, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,29 @@ def setup(self):
self.a = pd.Categorical(list("aabbcd") * N)
self.b = pd.Categorical(list("bbcdjk") * N)

self.idx_a = pd.CategoricalIndex(range(N), range(N))
self.idx_b = pd.CategoricalIndex(range(N + 1), range(N + 1))
self.df_a = pd.DataFrame(range(N), columns=["a"], index=self.idx_a)
self.df_b = pd.DataFrame(range(N + 1), columns=["a"], index=self.idx_b)

def time_concat(self):
    # Timed ASV benchmark body: concatenate ``self.s`` with itself.
    # ``self.s`` is created in ``setup`` (not shown in this hunk) —
    # presumably a categorical-dtype Series; confirm against full file.
    pd.concat([self.s, self.s])

def time_union(self):
    # Timed ASV benchmark body: union of the two pd.Categorical objects
    # built in ``setup`` (``self.a`` and ``self.b`` have overlapping
    # category sets, so the union must merge categories).
    union_categoricals([self.a, self.b])

def time_append_overlapping_index(self):
    # Timed ASV benchmark body: append a CategoricalIndex to itself,
    # i.e. both operands share the exact same categories ("overlapping").
    self.idx_a.append(self.idx_a)

def time_append_non_overlapping_index(self):
    # Timed ASV benchmark body: append a CategoricalIndex whose category
    # set differs (``idx_b`` spans range(N + 1) vs ``idx_a``'s range(N)),
    # exercising the category-reconciliation path.
    self.idx_a.append(self.idx_b)

def time_concat_overlapping_index(self):
    # Timed ASV benchmark body: concat a DataFrame (CategoricalIndex-backed)
    # with itself — identical categories on both sides.
    pd.concat([self.df_a, self.df_a])

def time_concat_non_overlapping_index(self):
    # Timed ASV benchmark body: concat two DataFrames whose
    # CategoricalIndex category sets differ, forcing category merging.
    pd.concat([self.df_a, self.df_b])


class ValueCounts:

Expand Down
9 changes: 9 additions & 0 deletions asv_bench/benchmarks/hash_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,15 @@ def time_isin_outside(self, dtype, exponent):
self.s.isin(self.values_outside)


class UniqueForLargePyObjectInts:
    """ASV benchmark: ``pd.unique`` on Python ints wider than 32 bits.

    Exercises the object hash table with values whose low 32 bits are all
    zero, so only the upper 32 bits of the hash distinguish them (GH 37615).
    """

    def setup(self):
        # 5000 distinct ints of the form k << 32; stored as object dtype so
        # they stay Python ints rather than being cast to a numeric dtype.
        big_ints = [value << 32 for value in range(5000)]
        self.arr = np.array(big_ints, dtype=np.object_)

    def time_unique(self):
        # Timed body: hash-table based uniquing of the object array.
        pd.unique(self.arr)


class IsinWithRandomFloat:
params = [
[np.float64, np.object],
Expand Down
18 changes: 4 additions & 14 deletions doc/source/development/extending.rst
Original file line number Diff line number Diff line change
Expand Up @@ -329,21 +329,11 @@ Each data structure has several *constructor properties* for returning a new
data structure as the result of an operation. By overriding these properties,
you can retain subclasses through ``pandas`` data manipulations.

There are 3 constructor properties to be defined:
There are 3 possible constructor properties to be defined on a subclass:

* ``_constructor``: Used when a manipulation result has the same dimensions as the original.
* ``_constructor_sliced``: Used when a manipulation result has one lower dimension(s) as the original, such as ``DataFrame`` single columns slicing.
* ``_constructor_expanddim``: Used when a manipulation result has one higher dimension as the original, such as ``Series.to_frame()``.

Following table shows how ``pandas`` data structures define constructor properties by default.

=========================== ======================= =============
Property Attributes ``Series`` ``DataFrame``
=========================== ======================= =============
``_constructor`` ``Series`` ``DataFrame``
``_constructor_sliced`` ``NotImplementedError`` ``Series``
``_constructor_expanddim`` ``DataFrame`` ``NotImplementedError``
=========================== ======================= =============
* ``DataFrame/Series._constructor``: Used when a manipulation result has the same dimension as the original.
* ``DataFrame._constructor_sliced``: Used when a ``DataFrame`` (sub-)class manipulation result should be a ``Series`` (sub-)class.
* ``Series._constructor_expanddim``: Used when a ``Series`` (sub-)class manipulation result should be a ``DataFrame`` (sub-)class, e.g. ``Series.to_frame()``.

Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties.

Expand Down
5 changes: 4 additions & 1 deletion doc/source/whatsnew/v1.2.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,14 @@ Fixed regressions

- Fixed regression in :func:`read_excel` that caused it to raise ``AttributeError`` when checking version of older xlrd versions (:issue:`38955`)
- Fixed regression in :class:`DataFrame` constructor reordering element when construction from datetime ndarray with dtype not ``"datetime64[ns]"`` (:issue:`39422`)
- Fixed regression in :class:`DataFrame.astype` and :class:`Series.astype` not casting to bytes dtype (:issue:`39474`)
- Fixed regression in :meth:`DataFrame.astype` and :meth:`Series.astype` not casting to bytes dtype (:issue:`39474`)
- Fixed regression in :meth:`~DataFrame.to_pickle` failing to create bz2/xz compressed pickle files with ``protocol=5`` (:issue:`39002`)
- Fixed regression in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` always raising ``AssertionError`` when comparing extension dtypes (:issue:`39410`)
- Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
- Fixed regression in :meth:`~DataFrame.to_excel` creating corrupt files when appending (``mode="a"``) to an existing file (:issue:`39576`)
- Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`)
- Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`)
-

.. ---------------------------------------------------------------------------
Expand Down
9 changes: 6 additions & 3 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor
Other API changes
^^^^^^^^^^^^^^^^^
- Partially initialized :class:`CategoricalDtype` objects (i.e. those with ``categories=None``) will no longer compare as equal to fully initialized dtype objects.
-
- Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`)
-

.. ---------------------------------------------------------------------------
Expand Down Expand Up @@ -253,6 +253,7 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.corr` for method=kendall (:issue:`28329`)
- Performance improvement in :meth:`core.window.rolling.Rolling.corr` and :meth:`core.window.rolling.Rolling.cov` (:issue:`39388`)
- Performance improvement in :meth:`core.window.rolling.RollingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` (:issue:`39591`)
- Performance improvement in :func:`unique` for object data type (:issue:`37615`)

.. ---------------------------------------------------------------------------

Expand Down Expand Up @@ -304,6 +305,7 @@ Numeric
- Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`)
- Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`)
- Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`)
- Bug in :meth:`DataFrame.select_dtypes` resulting in different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`)
-

Conversion
Expand Down Expand Up @@ -338,7 +340,7 @@ Indexing
- Bug in :meth:`Series.__setitem__` raising ``ValueError`` when setting a :class:`Series` with a scalar indexer (:issue:`38303`)
- Bug in :meth:`DataFrame.loc` dropping levels of :class:`MultiIndex` when :class:`DataFrame` used as input has only one row (:issue:`10521`)
- Bug in :meth:`DataFrame.__getitem__` and :meth:`Series.__getitem__` always raising ``KeyError`` when slicing with existing strings an :class:`Index` with milliseconds (:issue:`33589`)
- Bug in setting ``timedelta64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`)
- Bug in setting ``timedelta64`` or ``datetime64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`, :issue:`39619`)
- Bug in setting :class:`Interval` values into a :class:`Series` or :class:`DataFrame` with mismatched :class:`IntervalDtype` incorrectly casting the new values to the existing dtype (:issue:`39120`)
- Bug in setting ``datetime64`` values into a :class:`Series` with integer-dtype incorrectly casting the datetime64 values to integers (:issue:`39266`)
- Bug in :meth:`Index.get_loc` not raising ``KeyError`` when method is specified for ``NaN`` value when ``NaN`` is not in :class:`Index` (:issue:`39382`)
Expand Down Expand Up @@ -445,10 +447,11 @@ Other
- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`)
- Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`)
- Bug in :class:`Styler` which caused CSS to duplicate on multiple renders. (:issue:`39395`)
- ``inspect.getmembers(Series)`` no longer raises an ``AbstractMethodError`` (:issue:`38782`)
- :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. ``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`)
- Bug in :func:`pandas.testing.assert_series_equal`, :func:`pandas.testing.assert_frame_equal`, :func:`pandas.testing.assert_index_equal` and :func:`pandas.testing.assert_extension_array_equal` incorrectly raising when an attribute has an unrecognized NA type (:issue:`39461`)
- Bug in :class:`Styler` where ``subset`` arg in methods raised an error for some valid multiindex slices (:issue:`33562`)
-
- :class:`Styler` rendered HTML output received minor alterations to conform to the W3C HTML code standard (:issue:`39626`)
-

.. ---------------------------------------------------------------------------
Expand Down
6 changes: 5 additions & 1 deletion pandas/_libs/index_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,14 @@ cdef class {{name}}Engine(IndexEngine):
cdef _make_hash_table(self, Py_ssize_t n):
return _hash.{{name}}HashTable(n)

{{if name not in {'Float64', 'Float32'} }}
cdef _check_type(self, object val):
{{if name not in {'Float64', 'Float32'} }}
if not util.is_integer_object(val):
raise KeyError(val)
{{else}}
if util.is_bool_object(val):
# avoid casting to True -> 1.0
raise KeyError(val)
{{endif}}

cdef void _call_map_locations(self, values):
Expand Down
30 changes: 25 additions & 5 deletions pandas/_libs/src/klib/khash_python.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,11 +178,31 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
return result;
}

// For PyObject_Hash holds:
// hash(0.0) == 0 == hash(-0.0)
// hash(X) == 0 if X is a NaN-value
// so it is OK to use it directly
#define kh_python_hash_func(key) (PyObject_Hash(key))

// Hash function for PyObject* keys used by the khash-based hash tables:
// reduces CPython's Py_hash_t (64-bit on most builds) to the 32-bit
// khint32_t width that khash uses.
khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){
    // For PyObject_Hash holds:
    // hash(0.0) == 0 == hash(-0.0)
    // hash(X) == 0 if X is a NaN-value
    // so it is OK to use it directly for doubles
    Py_hash_t hash = PyObject_Hash(key);
    if (hash == -1) {
        // PyObject_Hash reports failure as -1 with a Python exception set;
        // clear the exception and degrade to a constant hash (all such keys
        // collide into one bucket) instead of propagating the error.
        PyErr_Clear();
        return 0;
    }
    #if SIZEOF_PY_HASH_T == 4
    // it is already 32bit value
    return hash;
    #else
    // for 64bit builds,
    // we need information of the upper 32bits as well
    // see GH 37615
    khuint64_t as_uint = (khuint64_t) hash;
    // uints avoid undefined behavior of signed ints
    return (as_uint>>32)^as_uint;
    #endif
}


#define kh_python_hash_equal(a, b) (pyobject_cmp(a, b))


Expand Down
2 changes: 1 addition & 1 deletion pandas/core/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ def transform(

# Functions that transform may return empty Series/DataFrame
# when the dtype is not appropriate
if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty:
if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty and not obj.empty:
raise ValueError("Transform function failed")
if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
obj.index
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,8 @@ def unique(values):
Hash table-based unique. Uniques are returned in order
of appearance. This does NOT sort.

Significantly faster than numpy.unique. Includes NA values.
Significantly faster than numpy.unique for long enough sequences.
Includes NA values.

Parameters
----------
Expand Down
40 changes: 20 additions & 20 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def frame_apply(
raw: bool = False,
result_type: Optional[str] = None,
args=None,
kwds=None,
kwargs=None,
) -> FrameApply:
""" construct and return a row or column based frame apply object """
axis = obj._get_axis_number(axis)
Expand All @@ -79,7 +79,7 @@ def frame_apply(
raw=raw,
result_type=result_type,
args=args,
kwds=kwds,
kwargs=kwargs,
)


Expand All @@ -88,14 +88,14 @@ def series_apply(
func: AggFuncType,
convert_dtype: bool = True,
args=None,
kwds=None,
kwargs=None,
) -> SeriesApply:
return SeriesApply(
obj,
func,
convert_dtype,
args,
kwds,
kwargs,
)


Expand All @@ -109,12 +109,12 @@ def __init__(
raw: bool,
result_type: Optional[str],
args,
kwds,
kwargs,
):
self.obj = obj
self.raw = raw
self.args = args or ()
self.kwds = kwds or {}
self.kwargs = kwargs or {}

if result_type not in [None, "reduce", "broadcast", "expand"]:
raise ValueError(
Expand All @@ -126,13 +126,13 @@ def __init__(

# curry if needed
if (
(kwds or args)
(kwargs or args)
and not isinstance(func, (np.ufunc, str))
and not is_list_like(func)
):

def f(x):
return func(x, *args, **kwds)
return func(x, *args, **kwargs)

else:
f = func
Expand Down Expand Up @@ -163,7 +163,7 @@ def agg(self) -> Tuple[Optional[FrameOrSeriesUnion], Optional[bool]]:
obj = self.obj
arg = self.f
args = self.args
kwargs = self.kwds
kwargs = self.kwargs

_axis = kwargs.pop("_axis", None)
if _axis is None:
Expand Down Expand Up @@ -413,10 +413,10 @@ def maybe_apply_str(self) -> Optional[FrameOrSeriesUnion]:
if callable(func):
sig = inspect.getfullargspec(func)
if "axis" in sig.args:
self.kwds["axis"] = self.axis
self.kwargs["axis"] = self.axis
elif self.axis != 0:
raise ValueError(f"Operation {f} does not support axis=1")
return self.obj._try_aggregate_string_function(f, *self.args, **self.kwds)
return self.obj._try_aggregate_string_function(f, *self.args, **self.kwargs)

def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]:
"""
Expand All @@ -430,7 +430,7 @@ def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]:
# Note: dict-likes are list-like
if not is_list_like(self.f):
return None
return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwds)
return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs)


class FrameApply(Apply):
Expand Down Expand Up @@ -806,7 +806,7 @@ def __init__(
func: AggFuncType,
convert_dtype: bool,
args,
kwds,
kwargs,
):
self.convert_dtype = convert_dtype

Expand All @@ -816,7 +816,7 @@ def __init__(
raw=False,
result_type=None,
args=args,
kwds=kwds,
kwargs=kwargs,
)

def apply(self) -> FrameOrSeriesUnion:
Expand Down Expand Up @@ -877,17 +877,17 @@ def __init__(
obj: Union[SeriesGroupBy, DataFrameGroupBy],
func: AggFuncType,
args,
kwds,
kwargs,
):
kwds = kwds.copy()
self.axis = obj.obj._get_axis_number(kwds.get("axis", 0))
kwargs = kwargs.copy()
self.axis = obj.obj._get_axis_number(kwargs.get("axis", 0))
super().__init__(
obj,
func,
raw=False,
result_type=None,
args=args,
kwds=kwds,
kwargs=kwargs,
)

def apply(self):
Expand All @@ -903,15 +903,15 @@ def __init__(
obj: Union[Resampler, BaseWindow],
func: AggFuncType,
args,
kwds,
kwargs,
):
super().__init__(
obj,
func,
raw=False,
result_type=None,
args=args,
kwds=kwds,
kwargs=kwargs,
)

def apply(self):
Expand Down
10 changes: 7 additions & 3 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,9 +588,13 @@ def _try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bo
Otherwise an object array is returned.
"""
# perf shortcut as this is the most common case
if isinstance(arr, np.ndarray):
if maybe_castable(arr) and not copy and dtype is None:
return arr
if (
isinstance(arr, np.ndarray)
and maybe_castable(arr.dtype)
and not copy
and dtype is None
):
return arr

if isinstance(dtype, ExtensionDtype) and (dtype.kind != "M" or is_sparse(dtype)):
# create an extension array from its dtype
Expand Down
Loading