Merge remote-tracking branch 'upstream/master' into bisect

simonjayhawkins · simonjayhawkins · commit b7ac784ca0f5 · 2021-07-27T11:06:23.000+01:00
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
@@ -102,6 +102,7 @@ def setup(self, dtype):
         columns = np.arange(n)
         if dtype == "int":
             values = np.arange(m * m * n).reshape(m * m, n)
+            self.df = DataFrame(values, index, columns)
         else:
             # the category branch is ~20x slower than int. So we
             # cut down the size a bit. Now it's only ~3x slower.
@@ -111,7 +112,10 @@ def setup(self, dtype):
             values = np.take(list(string.ascii_letters), indices)
             values = [pd.Categorical(v) for v in values.T]
 
-        self.df = DataFrame(values, index, columns)
+            self.df = DataFrame(
+                {i: cat for i, cat in enumerate(values)}, index, columns
+            )
+
         self.df2 = self.df.iloc[:-1]
 
     def time_full_product(self, dtype):
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -121,7 +121,8 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
       pandas/io/parsers/ \
       pandas/io/sas/ \
       pandas/io/sql.py \
-      pandas/tseries/
+      pandas/tseries/ \
+      pandas/io/formats/style_render.py
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
 fi
diff --git a/ci/deps/actions-39-slow.yaml b/ci/deps/actions-39-slow.yaml
@@ -23,6 +23,7 @@ dependencies:
   - matplotlib
   - moto>=1.3.14
   - flask
+  - numba
   - numexpr
   - numpy
   - openpyxl
diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml
@@ -22,6 +22,7 @@ dependencies:
   - matplotlib
   - moto>=1.3.14
   - flask
+  - numba
   - numexpr
   - numpy
   - openpyxl
diff --git a/ci/deps/azure-windows-39.yaml b/ci/deps/azure-windows-39.yaml
@@ -23,6 +23,7 @@ dependencies:
   - matplotlib
   - moto>=1.3.14
   - flask
+  - numba
   - numexpr
   - numpy
   - openpyxl
diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst
@@ -14,7 +14,9 @@ including other versions of pandas.
 
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
--
+- Performance regression in :meth:`DataFrame.isin` and :meth:`Series.isin` for nullable data types (:issue:`42714`)
+- Regression in updating values of :class:`pandas.Series` using boolean index, created by using :meth:`pandas.DataFrame.pop` (:issue:`42530`)
+- Regression in :meth:`DataFrame.from_records` with empty records (:issue:`42456`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -166,6 +166,7 @@ Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 - Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`)
 - Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`)
+- Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`)
 
 .. ---------------------------------------------------------------------------
 
@@ -225,7 +226,6 @@ Indexing
 - Bug in :meth:`Series.loc` when with a :class:`MultiIndex` whose first level contains only ``np.nan`` values (:issue:`42055`)
 - Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` when passing a string, the return type depended on whether the index was monotonic (:issue:`24892`)
 - Bug in indexing on a :class:`MultiIndex` failing to drop scalar levels when the indexer is a tuple containing a datetime-like string (:issue:`42476`)
--
 
 Missing
 ^^^^^^^
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -417,7 +417,7 @@ def isin(self, values) -> BooleanArray:  # type: ignore[override]
             # see https://github.com/pandas-dev/pandas/pull/38379 for some discussion
             result[self._mask] = values_have_NA
 
-        mask = np.zeros_like(self, dtype=bool)
+        mask = np.zeros(self._data.shape, dtype=bool)
         return BooleanArray(result, mask, copy=False)
 
     def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3745,7 +3745,7 @@ def _set_item_mgr(self, key, value: ArrayLike) -> None:
         # try to set first as we want an invalid
         # value exception to occur first
         if len(self):
-            self._check_setitem_copy(stacklevel=5)
+            self._check_setitem_copy()
 
     def _iset_item(self, loc: int, value) -> None:
         arraylike = self._sanitize_column(value)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -67,6 +67,7 @@
     doc,
     rewrite_axis_style_signature,
 )
+from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import (
     validate_ascending,
     validate_bool_kwarg,
@@ -3506,7 +3507,7 @@ def _maybe_update_cacher(
         """
 
         if verify_is_copy:
-            self._check_setitem_copy(stacklevel=5, t="referent")
+            self._check_setitem_copy(t="referent")
 
         if clear:
             self._clear_item_cache()
@@ -3853,26 +3854,21 @@ def _check_is_chained_assignment_possible(self) -> bool_t:
         setting.
         """
         if self._is_copy:
-            self._check_setitem_copy(stacklevel=4, t="referent")
+            self._check_setitem_copy(t="referent")
         return False
 
     @final
-    def _check_setitem_copy(self, stacklevel=4, t="setting", force=False):
+    def _check_setitem_copy(self, t="setting", force=False):
         """
 
         Parameters
         ----------
-        stacklevel : int, default 4
-           the level to show of the stack when the error is output
         t : str, the type of setting error
         force : bool, default False
            If True, then force showing an error.
 
         validate if we are doing a setitem on a chained copy.
 
-        If you call this function, be sure to set the stacklevel such that the
-        user will see the error *at the level of setting*
-
         It is technically possible to figure out that we are setting on
         a copy even WITH a multi-dtyped pandas object. In other words, some
         blocks may be views while others are not. Currently _is_view will ALWAYS
@@ -3931,7 +3927,7 @@ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False):
         if value == "raise":
             raise com.SettingWithCopyError(t)
         elif value == "warn":
-            warnings.warn(t, com.SettingWithCopyWarning, stacklevel=stacklevel)
+            warnings.warn(t, com.SettingWithCopyWarning, stacklevel=find_stack_level())
 
     def __delitem__(self, key) -> None:
         """
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -2836,7 +2836,7 @@ def _maybe_to_slice(loc):
             try:
                 return self._engine.get_loc(key)
             except TypeError:
-                # e.g. partial string slicing
+                # e.g. partial string slicing (see test_partial_slicing_with_multiindex)
                 loc, _ = self.get_loc_level(key, list(range(self.nlevels)))
                 return loc
 
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -658,16 +658,7 @@ def _get_setitem_indexer(self, key):
         if isinstance(key, range):
             return list(key)
 
-        try:
-            return self._convert_to_indexer(key, axis=0, is_setter=True)
-        except TypeError as e:
-
-            # invalid indexer type vs 'other' indexing errors
-            if "cannot do" in str(e):
-                raise
-            elif "unhashable type" in str(e):
-                raise
-            raise IndexingError(key) from e
+        return self._convert_to_indexer(key, axis=0, is_setter=True)
 
     def _ensure_listlike_indexer(self, key, axis=None, value=None):
         """
@@ -1209,7 +1200,7 @@ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False):
         is_int_index = labels.is_integer()
         is_int_positional = is_integer(key) and not is_int_index
 
-        if is_scalar(key) or isinstance(labels, MultiIndex):
+        if is_scalar(key) or (isinstance(labels, MultiIndex) and is_hashable(key)):
             # Otherwise get_loc will raise InvalidIndexError
 
             # if we are a label return me
@@ -1224,8 +1215,6 @@ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False):
                 # GH35015, using datetime as column indices raises exception
                 if not isinstance(labels, MultiIndex):
                     raise
-            except TypeError:
-                pass
             except ValueError:
                 if not is_int_positional:
                     raise
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -25,7 +25,6 @@
 from pandas._libs.internals import BlockPlacement
 from pandas._typing import (
     ArrayLike,
-    Dtype,
     DtypeObj,
     F,
     Shape,
@@ -52,7 +51,6 @@
     is_list_like,
     is_sparse,
     is_string_dtype,
-    pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import (
     CategoricalDtype,
@@ -100,6 +98,7 @@
     TimedeltaArray,
 )
 from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
+from pandas.core.arrays.sparse import SparseDtype
 from pandas.core.base import PandasObject
 import pandas.core.common as com
 import pandas.core.computation.expressions as expressions
@@ -326,6 +325,8 @@ def getitem_block_columns(self, slicer, new_mgr_locs: BlockPlacement) -> Block:
 
         return type(self)(new_values, new_mgr_locs, self.ndim)
 
+    # NB: this cannot be made cache_readonly because in libreduction we pin
+    #  new .values that can have different shape GH#42631
     @property
     def shape(self) -> Shape:
         return self.values.shape
@@ -1255,7 +1256,7 @@ def where(self, other, cond, errors="raise") -> list[Block]:
 
         return result_blocks
 
-    def _unstack(self, unstacker, fill_value, new_placement):
+    def _unstack(self, unstacker, fill_value, new_placement, allow_fill: bool):
         """
         Return a list of unstacked blocks of self
 
@@ -1264,6 +1265,7 @@ def _unstack(self, unstacker, fill_value, new_placement):
         unstacker : reshape._Unstacker
         fill_value : int
             Only used in ExtensionBlock._unstack
+        allow_fill : bool
 
         Returns
         -------
@@ -1638,7 +1640,7 @@ def where(self, other, cond, errors="raise") -> list[Block]:
 
         return [self.make_block_same_class(result)]
 
-    def _unstack(self, unstacker, fill_value, new_placement):
+    def _unstack(self, unstacker, fill_value, new_placement, allow_fill: bool):
         # ExtensionArray-safe unstack.
         # We override ObjectBlock._unstack, which unstacks directly on the
         # values of the array. For EA-backed blocks, this would require
@@ -1655,7 +1657,7 @@ def _unstack(self, unstacker, fill_value, new_placement):
         blocks = [
             # TODO: could cast to object depending on fill_value?
             self.make_block_same_class(
-                self.values.take(indices, allow_fill=True, fill_value=fill_value),
+                self.values.take(indices, allow_fill=allow_fill, fill_value=fill_value),
                 BlockPlacement(place),
             )
             for indices, place in zip(new_values.T, new_placement)
@@ -1842,7 +1844,7 @@ class CategoricalBlock(ExtensionBlock):
 # Constructor Helpers
 
 
-def maybe_coerce_values(values) -> ArrayLike:
+def maybe_coerce_values(values: ArrayLike) -> ArrayLike:
     """
     Input validation for values passed to __init__. Ensure that
     any datetime64/timedelta64 dtypes are in nanoseconds.  Ensure
@@ -1874,7 +1876,7 @@ def maybe_coerce_values(values) -> ArrayLike:
     return values
 
 
-def get_block_type(values, dtype: Dtype | None = None):
+def get_block_type(values, dtype: DtypeObj | None = None):
     """
     Find the appropriate Block subclass to use for the given values and dtype.
 
@@ -1889,13 +1891,15 @@ def get_block_type(values, dtype: Dtype | None = None):
     """
     # We use vtype and kind checks because they are much more performant
     #  than is_foo_dtype
-    dtype = cast(np.dtype, pandas_dtype(dtype) if dtype else values.dtype)
+    if dtype is None:
+        dtype = values.dtype
+
     vtype = dtype.type
     kind = dtype.kind
 
     cls: type[Block]
 
-    if is_sparse(dtype):
+    if isinstance(dtype, SparseDtype):
         # Need this first(ish) so that Sparse[datetime] is sparse
         cls = ExtensionBlock
     elif isinstance(dtype, CategoricalDtype):
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -757,6 +757,14 @@ def to_arrays(
                 # i.e. numpy structured array
                 columns = ensure_index(data.dtype.names)
                 arrays = [data[name] for name in columns]
+
+                if len(data) == 0:
+                    # GH#42456 the indexing above results in list of 2D ndarrays
+                    # TODO: is that an issue with numpy?
+                    for i, arr in enumerate(arrays):
+                        if arr.ndim == 2:
+                            arrays[i] = arr[:, 0]
+
                 return arrays, columns
         return [], ensure_index([])
 
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
diff --git a/pandas/core/series.py b/pandas/core/series.py
diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py
diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py
diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py

Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,9 @@ including other versions of pandas.`
`14`	`14`
`15`	`15`	`Fixed regressions`
`16`	`16`	`~~~~~~~~~~~~~~~~~`
`17`		`--`
	`17`	+- Performance regression in :meth:`DataFrame.isin` and :meth:`Series.isin` for nullable data types (:issue:`42714`)
	`18`	+- Regression in updating values of :class:`pandas.Series` using boolean index, created by using :meth:`pandas.DataFrame.pop` (:issue:`42530`)
	`19`	+- Regression in :meth:`DataFrame.from_records` with empty records (:issue:`42456`)
`18`	`20`	`-`
`19`	`21`
`20`	`22`	`.. ---------------------------------------------------------------------------`