
Revert 42231 bug mi should compare #42649


6 changes: 3 additions & 3 deletions .github/workflows/ci.yml
@@ -133,15 +133,15 @@ jobs:
echo "${{ secrets.server_ssh_key }}" > ~/.ssh/id_rsa
chmod 600 ~/.ssh/id_rsa
echo "${{ secrets.server_ip }} ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBE1Kkopomm7FHG5enATf7SgnpICZ4W2bw+Ho+afqin+w7sMcrsa0je7sbztFAV8YchDkiBKnWTG4cRT+KZgZCaY=" > ~/.ssh/known_hosts
if: ${{github.event_name == 'push' && github.ref == 'refs/head/master'}}
if: ${{github.event_name == 'push' && github.ref == 'refs/heads/master'}}

- name: Upload web
run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' --exclude='Pandas_Cheat_Sheet*' web/build/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas
if: ${{github.event_name == 'push' && github.ref == 'refs/head/master'}}
if: ${{github.event_name == 'push' && github.ref == 'refs/heads/master'}}

- name: Upload dev docs
run: rsync -az --delete doc/build/html/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas/pandas-docs/dev
if: ${{github.event_name == 'push' && github.ref == 'refs/head/master'}}
if: ${{github.event_name == 'push' && github.ref == 'refs/heads/master'}}

- name: Move docs into site directory
run: mv doc/build/html web/build/docs
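All three `if:` fixes above correct the same typo: on a push event GitHub populates `github.ref` with the fully-qualified `refs/heads/<branch>`, so the old `refs/head/master` comparison could never match and the deploy steps were silently skipped. A minimal sketch of the condition (hypothetical helper name, not part of the workflow):

```python
# Hypothetical mirror of the workflow condition: on a push event GitHub sets
# github.ref to the fully-qualified "refs/heads/<branch>", so the old
# "refs/head/master" spelling could never match and deploys were skipped.
def is_master_push(event_name: str, ref: str) -> bool:
    return event_name == "push" and ref == "refs/heads/master"

print(is_master_push("push", "refs/heads/master"))  # True
print(is_master_push("push", "refs/head/master"))   # False: the old typo
```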
8 changes: 5 additions & 3 deletions azure-pipelines.yml
@@ -9,9 +9,11 @@ trigger:
- 'doc/*'

pr:
- master
- 1.2.x
- 1.3.x
autoCancel: true
branches:
include:
- master
- 1.3.x

variables:
PYTEST_WORKERS: auto
2 changes: 1 addition & 1 deletion ci/deps/actions-38-db.yaml
@@ -15,7 +15,7 @@ dependencies:
- beautifulsoup4
- botocore>=1.11
- dask
- fastparquet>=0.4.0
- fastparquet>=0.4.0, < 0.7.0
- fsspec>=0.7.4, <2021.6.0
- gcsfs>=0.6.0
- geopandas
2 changes: 1 addition & 1 deletion ci/deps/azure-windows-38.yaml
@@ -15,7 +15,7 @@ dependencies:
# pandas dependencies
- blosc
- bottleneck
- fastparquet>=0.4.0
- fastparquet>=0.4.0, <0.7.0
- flask
- fsspec>=0.8.0, <2021.6.0
- matplotlib=3.3.2
2 changes: 1 addition & 1 deletion doc/source/user_guide/visualization.rst
@@ -1740,7 +1740,7 @@ Starting in version 0.25, pandas can be extended with third-party plotting backe
main idea is letting users select a plotting backend different than the provided
one based on Matplotlib.

This can be done by passsing 'backend.module' as the argument ``backend`` in ``plot``
This can be done by passing 'backend.module' as the argument ``backend`` in ``plot``
function. For example:

.. code-block:: python
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.1.rst
@@ -24,6 +24,7 @@ Fixed regressions
- Fixed regression in indexing with a ``list`` subclass incorrectly raising ``TypeError`` (:issue:`42433`, :issue:`42461`)
- Fixed regression in :meth:`DataFrame.isin` and :meth:`Series.isin` raising ``TypeError`` with nullable data containing at least one missing value (:issue:`42405`)
- Regression in :func:`concat` between objects with bool dtype and integer dtype casting to object instead of to integer (:issue:`42092`)
- Bug in :class:`Series` constructor not accepting a ``dask.Array`` (:issue:`38645`)

.. ---------------------------------------------------------------------------

1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
@@ -256,6 +256,7 @@ Plotting

Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Fixed bug in :meth:`SeriesGroupBy.apply` where passing an unrecognized string argument failed to raise ``TypeError`` when the underlying ``Series`` is empty (:issue:`42021`)
- Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`42287`)
-

2 changes: 1 addition & 1 deletion environment.yml
@@ -99,7 +99,7 @@ dependencies:
- xlwt
- odfpy

- fastparquet>=0.4.0 # pandas.read_parquet, DataFrame.to_parquet
- fastparquet>=0.4.0, <0.7.0 # pandas.read_parquet, DataFrame.to_parquet
- pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
- python-snappy # required by pyarrow

7 changes: 5 additions & 2 deletions pandas/core/construction.py
@@ -560,8 +560,11 @@ def sanitize_array(
raise TypeError(f"'{type(data).__name__}' type is unordered")

# materialize e.g. generators, convert e.g. tuples, abc.ValueView
# TODO: non-standard array-likes we can convert to ndarray more efficiently?
data = list(data)
if hasattr(data, "__array__"):
# e.g. dask array GH#38645
data = np.asarray(data)
else:
data = list(data)

if dtype is not None or len(data) == 0:
subarr = _try_cast(data, dtype, copy, raise_cast_failure)
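The new branch can be sketched in isolation. `DaskLikeArray` below is a hypothetical stand-in for `dask.array.Array`: it is not list-like, but it exposes the `__array__` protocol, so `np.asarray` can materialize it directly instead of the slower iterate-into-a-list fallback.

```python
import numpy as np

class DaskLikeArray:
    # Hypothetical stand-in for a dask array (GH#38645): exposes __array__.
    def __init__(self, values):
        self._values = values

    def __array__(self, dtype=None):
        return np.asarray(self._values, dtype=dtype)

def materialize(data):
    # Mirrors the dispatch in sanitize_array above: prefer the __array__
    # protocol when available, otherwise fall back to list() for
    # generators, tuples, dict views and other plain iterables.
    if hasattr(data, "__array__"):
        return np.asarray(data)
    return list(data)

print(materialize(DaskLikeArray([1, 2, 3])))  # the ndarray path
print(materialize(x * 2 for x in range(3)))   # the list path: [0, 2, 4]
```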
56 changes: 16 additions & 40 deletions pandas/core/frame.py
@@ -55,6 +55,7 @@
ColspaceArgType,
CompressionOptions,
Dtype,
DtypeObj,
FilePathOrBuffer,
FillnaOptions,
FloatFormatType,
@@ -4297,50 +4298,25 @@ def check_int_infer_dtype(dtypes):
if not include.isdisjoint(exclude):
raise ValueError(f"include and exclude overlap on {(include & exclude)}")

# We raise when both include and exclude are empty
# Hence, we can just shrink the columns we want to keep
keep_these = np.full(self.shape[1], True)

def extract_unique_dtypes_from_dtypes_set(
dtypes_set: frozenset[Dtype], unique_dtypes: np.ndarray
) -> list[Dtype]:
extracted_dtypes = [
unique_dtype
for unique_dtype in unique_dtypes
if (
issubclass(
# error: Argument 1 to "tuple" has incompatible type
# "FrozenSet[Union[ExtensionDtype, Union[str, Any], Type[str],
# Type[float], Type[int], Type[complex], Type[bool],
# Type[object]]]"; expected "Iterable[Union[type, Tuple[Any,
# ...]]]"
unique_dtype.type,
tuple(dtypes_set), # type: ignore[arg-type]
)
or (
np.number in dtypes_set
and getattr(unique_dtype, "_is_numeric", False)
)
)
]
return extracted_dtypes
def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
return issubclass(dtype.type, tuple(dtypes_set)) or (
np.number in dtypes_set and getattr(dtype, "_is_numeric", False)
)

unique_dtypes = self.dtypes.unique()
def predicate(arr: ArrayLike) -> bool:
dtype = arr.dtype
if include:
if not dtype_predicate(dtype, include):
return False

if include:
included_dtypes = extract_unique_dtypes_from_dtypes_set(
include, unique_dtypes
)
keep_these &= self.dtypes.isin(included_dtypes)
if exclude:
if dtype_predicate(dtype, exclude):
return False

if exclude:
excluded_dtypes = extract_unique_dtypes_from_dtypes_set(
exclude, unique_dtypes
)
keep_these &= ~self.dtypes.isin(excluded_dtypes)
return True

# error: "ndarray" has no attribute "values"
return self.iloc[:, keep_these.values] # type: ignore[attr-defined]
mgr = self._mgr._get_data_subset(predicate)
return type(self)(mgr).__finalize__(self)
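The replacement predicate pair is small enough to check standalone. This sketch reimplements `dtype_predicate` against plain numpy dtypes, with no pandas internals, to show the two match conditions:

```python
import numpy as np

def dtype_predicate(dtype, dtypes_set):
    # A dtype is selected if its scalar type subclasses any requested type,
    # or if np.number was requested and the dtype advertises itself as
    # numeric (the _is_numeric hook used by pandas extension dtypes).
    return issubclass(dtype.type, tuple(dtypes_set)) or (
        np.number in dtypes_set and getattr(dtype, "_is_numeric", False)
    )

print(dtype_predicate(np.dtype("int64"), {np.number}))     # True
print(dtype_predicate(np.dtype("float32"), {np.floating})) # True
print(dtype_predicate(np.dtype("object"), {np.number}))    # False
```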

def insert(self, loc, column, value, allow_duplicates: bool = False) -> None:
"""
10 changes: 10 additions & 0 deletions pandas/core/groupby/groupby.py
@@ -1242,7 +1242,17 @@ def f(g):
raise ValueError(
"func must be a callable if args or kwargs are supplied"
)
elif isinstance(func, str):
if hasattr(self, func):
res = getattr(self, func)
if callable(res):
return res()
return res

else:
raise TypeError(f"apply func should be callable, not '{func}'")
else:
f = func

# ignore SettingWithCopy here in case the user mutates
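The new string branch can be sketched with a toy class (hypothetical names, not the real GroupBy API): a string argument resolves to the same-named attribute, non-callable attributes come back as-is, and unknown strings now raise the `TypeError` this PR adds.

```python
class ToyGroupBy:
    # Hypothetical miniature of the string dispatch added above.
    ngroups = 3  # stands in for a non-callable attribute/property

    def sum(self):
        return 42

    def apply(self, func):
        if isinstance(func, str):
            if hasattr(self, func):
                res = getattr(self, func)
                return res() if callable(res) else res
            raise TypeError(f"apply func should be callable, not '{func}'")
        return func(self)

g = ToyGroupBy()
print(g.apply("sum"))      # 42: resolved to the bound method, then called
print(g.apply("ngroups"))  # 3: non-callable attribute returned directly
```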
6 changes: 5 additions & 1 deletion pandas/core/internals/array_manager.py
@@ -474,7 +474,11 @@ def _get_data_subset(self: T, predicate: Callable) -> T:
indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)]
arrays = [self.arrays[i] for i in indices]
# TODO copy?
new_axes = [self._axes[0], self._axes[1][np.array(indices, dtype="intp")]]
# Note: using Index.take ensures we can retain e.g. DatetimeIndex.freq,
# see test_describe_datetime_columns
taker = np.array(indices, dtype="intp")
new_cols = self._axes[1].take(taker)
new_axes = [self._axes[0], new_cols]
return type(self)(arrays, new_axes, verify_integrity=False)
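A small demonstration of the `.take` detail (assuming pandas is importable; not part of the diff): `Index.take` routes through the DatetimeIndex machinery, which can convert a contiguous integer taker back into a slice and thereby retain `.freq`, whereas plain positional `__getitem__` with an integer array generally drops it.

```python
import numpy as np
import pandas as pd

idx = pd.date_range("2021-01-01", periods=4, freq="D")
taker = np.array([0, 1, 2], dtype="intp")

# Index.take with a contiguous taker keeps the index metadata intact.
subset = idx.take(taker)
print(subset)
print(subset.freq)
```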

def get_bool_data(self: T, copy: bool = False) -> T:
12 changes: 8 additions & 4 deletions pandas/core/internals/concat.py
@@ -123,12 +123,16 @@ def concat_arrays(to_concat: list) -> ArrayLike:
# ignore the all-NA proxies to determine the resulting dtype
to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]

single_dtype = len({x.dtype for x in to_concat_no_proxy}) == 1
dtypes = {x.dtype for x in to_concat_no_proxy}
single_dtype = len(dtypes) == 1

if not single_dtype:
target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])
else:
if single_dtype:
target_dtype = to_concat_no_proxy[0].dtype
elif all(x.kind in ["i", "u", "b"] and isinstance(x, np.dtype) for x in dtypes):
# GH#42092
target_dtype = np.find_common_type(list(dtypes), [])
else:
target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])

if target_dtype.kind in ["m", "M"]:
# for datetimelike use DatetimeArray/TimedeltaArray concatenation
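The new all-numpy integer/unsigned/bool branch can be exercised directly. Note that `np.find_common_type` has since been deprecated in NumPy; the sketch below uses `np.result_type`, which yields the same promotion for these dtypes:

```python
import numpy as np

# GH#42092: concatenating bool with int64 should promote to int64,
# not fall back to object.
dtypes = {np.dtype("bool"), np.dtype("int64")}
assert all(dt.kind in "iub" and isinstance(dt, np.dtype) for dt in dtypes)

target = np.result_type(*dtypes)
print(target)  # int64
```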
7 changes: 1 addition & 6 deletions pandas/io/stata.py
@@ -1673,12 +1673,7 @@ def read(
if self.dtyplist[i] is not None:
col = data.columns[i]
dtype = data[col].dtype
# error: Value of type variable "_DTypeScalar" of "dtype" cannot be
# "object"
if (
dtype != np.dtype(object) # type: ignore[type-var]
and dtype != self.dtyplist[i]
):
if dtype != np.dtype(object) and dtype != self.dtyplist[i]:
requires_type_conversion = True
data_formatted.append(
(col, Series(data[col], ix, self.dtyplist[i]))
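The dropped `type: ignore` relied on `np.dtype` comparisons simply working; a quick sketch of the two comparison forms involved:

```python
import numpy as np

dtype = np.dtype("float64")
# np.dtype objects compare against both other dtype objects and scalar
# types, so no mypy ignore is required for either form of the check.
print(dtype != np.dtype(object))  # True: float64 is not object
print(dtype != np.float64)        # False: the scalar type coerces to float64
```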
121 changes: 54 additions & 67 deletions pandas/tests/groupby/test_groupby.py
@@ -1770,13 +1770,9 @@ def test_empty_groupby(columns, keys, values, method, op, request):
isinstance(values, Categorical)
and not isinstance(columns, list)
and op in ["sum", "prod"]
and method != "apply"
):
# handled below GH#41291
pass
elif isinstance(values, Categorical) and len(keys) == 1 and method == "apply":
mark = pytest.mark.xfail(raises=TypeError, match="'str' object is not callable")
request.node.add_marker(mark)
elif (
isinstance(values, Categorical)
and len(keys) == 1
@@ -1808,21 +1804,16 @@ def test_empty_groupby(columns, keys, values, method, op, request):
isinstance(values, Categorical)
and len(keys) == 2
and op in ["min", "max", "sum"]
and method != "apply"
):
mark = pytest.mark.xfail(
raises=AssertionError, match="(DataFrame|Series) are different"
)
request.node.add_marker(mark)
elif (
isinstance(values, pd.core.arrays.BooleanArray)
and op in ["sum", "prod"]
and method != "apply"
):
elif isinstance(values, pd.core.arrays.BooleanArray) and op in ["sum", "prod"]:
# We expect to get Int64 back for these
override_dtype = "Int64"

if isinstance(values[0], bool) and op in ("prod", "sum") and method != "apply":
if isinstance(values[0], bool) and op in ("prod", "sum"):
# sum/product of bools is an integer
override_dtype = "int64"

@@ -1846,66 +1837,62 @@ def get_result():
# i.e. SeriesGroupBy
if op in ["prod", "sum"]:
# ops that require more than just ordered-ness
if method != "apply":
# FIXME: apply goes through different code path
if df.dtypes[0].kind == "M":
# GH#41291
# datetime64 -> prod and sum are invalid
msg = "datetime64 type does not support"
with pytest.raises(TypeError, match=msg):
get_result()

return
elif isinstance(values, Categorical):
# GH#41291
msg = "category type does not support"
with pytest.raises(TypeError, match=msg):
get_result()

return
if df.dtypes[0].kind == "M":
# GH#41291
# datetime64 -> prod and sum are invalid
msg = "datetime64 type does not support"
with pytest.raises(TypeError, match=msg):
get_result()

return
elif isinstance(values, Categorical):
# GH#41291
msg = "category type does not support"
with pytest.raises(TypeError, match=msg):
get_result()

return
else:
# ie. DataFrameGroupBy
if op in ["prod", "sum"]:
# ops that require more than just ordered-ness
if method != "apply":
# FIXME: apply goes through different code path
if df.dtypes[0].kind == "M":
# GH#41291
# datetime64 -> prod and sum are invalid
result = get_result()

# with numeric_only=True, these are dropped, and we get
# an empty DataFrame back
expected = df.set_index(keys)[[]]
tm.assert_equal(result, expected)
return

elif isinstance(values, Categorical):
# GH#41291
# Categorical doesn't implement sum or prod
result = get_result()

# with numeric_only=True, these are dropped, and we get
# an empty DataFrame back
expected = df.set_index(keys)[[]]
if len(keys) != 1 and op == "prod":
# TODO: why just prod and not sum?
# Categorical is special without 'observed=True'
lev = Categorical([0], dtype=values.dtype)
mi = MultiIndex.from_product([lev, lev], names=["A", "B"])
expected = DataFrame([], columns=[], index=mi)

tm.assert_equal(result, expected)
return

elif df.dtypes[0] == object:
# FIXME: the test is actually wrong here, xref #41341
result = get_result()
# In this case we have list-of-list, will raise TypeError,
# and subsequently be dropped as nuisance columns
expected = df.set_index(keys)[[]]
tm.assert_equal(result, expected)
return
if df.dtypes[0].kind == "M":
# GH#41291
# datetime64 -> prod and sum are invalid
result = get_result()

# with numeric_only=True, these are dropped, and we get
# an empty DataFrame back
expected = df.set_index(keys)[[]]
tm.assert_equal(result, expected)
return

elif isinstance(values, Categorical):
# GH#41291
# Categorical doesn't implement sum or prod
result = get_result()

# with numeric_only=True, these are dropped, and we get
# an empty DataFrame back
expected = df.set_index(keys)[[]]
if len(keys) != 1 and op == "prod":
# TODO: why just prod and not sum?
# Categorical is special without 'observed=True'
lev = Categorical([0], dtype=values.dtype)
mi = MultiIndex.from_product([lev, lev], names=["A", "B"])
expected = DataFrame([], columns=[], index=mi)

tm.assert_equal(result, expected)
return

elif df.dtypes[0] == object:
# FIXME: the test is actually wrong here, xref #41341
result = get_result()
# In this case we have list-of-list, will raise TypeError,
# and subsequently be dropped as nuisance columns
expected = df.set_index(keys)[[]]
tm.assert_equal(result, expected)
return

result = get_result()
expected = df.set_index(keys)[columns]