Commit e5ea765

Merge branch 'main' into issue-48949
2 parents aee587a + 85c2cb3

11 files changed: +125 -119 lines changed

.github/workflows/wheels.yml

Lines changed: 15 additions & 17 deletions
@@ -72,14 +72,22 @@ jobs:
         env:
           CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }}

-      # Used to test the built wheels
-      - uses: actions/setup-python@v4
+      # Used to test(Windows-only) and push the built wheels
+      # You might need to use setup-python separately
+      # if the new Python-dev version
+      # is unavailable on conda-forge.
+      - uses: conda-incubator/setup-miniconda@v2
         with:
+          auto-update-conda: true
           python-version: ${{ matrix.python[1] }}
+          activate-environment: test
+          channels: conda-forge, anaconda
+          channel-priority: true
+          mamba-version: "*"

       - name: Test wheels (Windows 64-bit only)
         if: ${{ matrix.buildplat[1] == 'win_amd64' }}
-        shell: cmd
+        shell: cmd /C CALL {0}
         run: |
           python ci/test_wheels.py wheelhouse

@@ -88,26 +96,15 @@ jobs:
           name: ${{ matrix.python[0] }}-${{ startsWith(matrix.buildplat[1], 'macosx') && 'macosx' || matrix.buildplat[1] }}
           path: ./wheelhouse/*.whl

-      # Used to push the built wheels
-      # TODO: once Python 3.11 is available on conda, de-dup with
-      # setup python above
-      - uses: conda-incubator/setup-miniconda@v2
-        with:
-          auto-update-conda: true
-          # Really doesn't matter what version we upload with
-          # just the version we test with
-          python-version: '3.8'
-          channels: conda-forge
-          channel-priority: true
-          mamba-version: "*"

       - name: Install anaconda client
         if: ${{ success() && (env.IS_SCHEDULE_DISPATCH == 'true' || env.IS_PUSH == 'true') }}
+        shell: bash -el {0}
         run: conda install -q -y anaconda-client


       - name: Upload wheels
-        if: success()
+        if: ${{ success() && (env.IS_SCHEDULE_DISPATCH == 'true' || env.IS_PUSH == 'true') }}
         shell: bash -el {0}
         env:
           PANDAS_STAGING_UPLOAD_TOKEN: ${{ secrets.PANDAS_STAGING_UPLOAD_TOKEN }}

@@ -180,11 +177,12 @@ jobs:

       - name: Install anaconda client
         if: ${{ success() && (env.IS_SCHEDULE_DISPATCH == 'true' || env.IS_PUSH == 'true') }}
+        shell: bash -el {0}
         run: |
           conda install -q -y anaconda-client

       - name: Upload sdist
-        if: success()
+        if: ${{ success() && (env.IS_SCHEDULE_DISPATCH == 'true' || env.IS_PUSH == 'true') }}
         shell: bash -el {0}
         env:
           PANDAS_STAGING_UPLOAD_TOKEN: ${{ secrets.PANDAS_STAGING_UPLOAD_TOKEN }}

README.md

Lines changed: 1 addition & 1 deletion
@@ -166,6 +166,6 @@ You can also triage issues which may include reproducing bug reports, or asking

 Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’...you can do something about it!

-Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas).
+Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Slack](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack).

 As contributors and maintainers to this project, you are expected to abide by pandas' code of conduct. More information can be found at: [Contributor Code of Conduct](https://github.com/pandas-dev/.github/blob/master/CODE_OF_CONDUCT.md)

ci/test_wheels.py

Lines changed: 1 addition & 2 deletions
@@ -1,12 +1,11 @@
 import glob
 import os
-import platform
 import shutil
 import subprocess
 import sys

 if os.name == "nt":
-    py_ver = platform.python_version()
+    py_ver = f"{sys.version_info.major}.{sys.version_info.minor}"
     is_32_bit = os.getenv("IS_32_BIT") == "true"
     try:
         wheel_dir = sys.argv[1]
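
Note on the import swap: platform.python_version() reports the full interpreter version string, which on pre-release builds carries a suffix that never appears in wheel tags, while sys.version_info gives the bare major/minor pair the script needs. A minimal sketch of the difference (printed values are illustrative):

    import platform
    import sys

    # Full version string; on a release candidate this can be e.g. "3.11.0rc2",
    # which would not match a wheel built for cp311.
    print(platform.python_version())

    # Bare "major.minor", e.g. "3.11" -- the form the wheel lookup expects.
    print(f"{sys.version_info.major}.{sys.version_info.minor}")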

doc/source/whatsnew/v2.0.0.rst

Lines changed: 17 additions & 4 deletions
@@ -28,10 +28,24 @@ Available optional dependencies (listed in order of appearance at `install guide
 ``[all, performance, computation, timezone, fss, aws, gcp, excel, parquet, feather, hdf5, spss, postgresql, mysql,
 sql-other, html, xml, plot, output_formatting, clipboard, compression, test]`` (:issue:`39164`).

-.. _whatsnew_200.enhancements.enhancement2:
+.. _whatsnew_200.enhancements.io_readers_nullable_pyarrow:

-enhancement2
-^^^^^^^^^^^^
+Configuration option, ``io.nullable_backend``, to return pyarrow-backed dtypes from IO functions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A new global configuration, ``io.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in :func:`read_parquet` and :func:`read_csv` (with ``engine="pyarrow"``)
+to return pyarrow-backed dtypes when set to ``"pyarrow"`` (:issue:`48957`).
+
+.. ipython:: python
+
+    import io
+    data = io.StringIO("""a,b,c,d,e,f,g,h,i
+    1,2.5,True,a,,,,,
+    3,4.5,False,b,6,7.5,True,a,
+    """)
+    with pd.option_context("io.nullable_backend", "pyarrow"):
+        df = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
+    df

 .. _whatsnew_200.enhancements.other:

@@ -42,7 +56,6 @@ Other enhancements
 - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`)
 - :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`)
 - Added new argument ``use_nullable_dtypes`` to :func:`read_csv` and :func:`read_excel` to enable automatic conversion to nullable dtypes (:issue:`36712`)
-- Added new global configuration, ``io.nullable_backend`` to allow ``use_nullable_dtypes=True`` to return pyarrow-backed dtypes when set to ``"pyarrow"`` in :func:`read_parquet` (:issue:`48957`)
 - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`)
 - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`)
 - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`)
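
Note: the whatsnew entry also names :func:`read_parquet`; a rough sketch of that path, assuming a pandas 2.0 development build with pyarrow installed (the file name is illustrative):

    import pandas as pd

    pd.DataFrame({"a": [1.5, None]}).to_parquet("example.parquet")
    with pd.option_context("io.nullable_backend", "pyarrow"):
        df = pd.read_parquet("example.parquet", use_nullable_dtypes=True)
    print(df.dtypes)  # expected: a    double[pyarrow]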

pandas/_libs/hashtable.pyx

Lines changed: 2 additions & 19 deletions
@@ -90,7 +90,7 @@ cdef class ObjectFactorizer(Factorizer):
         self.uniques = ObjectVector()

     def factorize(
-        self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None
+        self, ndarray[object] values, na_sentinel=-1, na_value=None
     ) -> np.ndarray:
         """

@@ -115,14 +115,6 @@ cdef class ObjectFactorizer(Factorizer):
             self.uniques = uniques
         labels = self.table.get_labels(values, self.uniques,
                                        self.count, na_sentinel, na_value)
-        mask = (labels == na_sentinel)
-        # sort on
-        if sort:
-            sorter = self.uniques.to_array().argsort()
-            reverse_indexer = np.empty(len(sorter), dtype=np.intp)
-            reverse_indexer.put(sorter, np.arange(len(sorter)))
-            labels = reverse_indexer.take(labels, mode='clip')
-            labels[mask] = na_sentinel
         self.count = len(self.uniques)
         return labels

@@ -136,7 +128,7 @@ cdef class Int64Factorizer(Factorizer):
         self.table = Int64HashTable(size_hint)
         self.uniques = Int64Vector()

-    def factorize(self, const int64_t[:] values, sort=False,
+    def factorize(self, const int64_t[:] values,
                   na_sentinel=-1, na_value=None) -> np.ndarray:
         """
         Returns

@@ -161,14 +153,5 @@ cdef class Int64Factorizer(Factorizer):
         labels = self.table.get_labels(values, self.uniques,
                                        self.count, na_sentinel,
                                        na_value=na_value)
-
-        # sort on
-        if sort:
-            sorter = self.uniques.to_array().argsort()
-            reverse_indexer = np.empty(len(sorter), dtype=np.intp)
-            reverse_indexer.put(sorter, np.arange(len(sorter)))
-
-            labels = reverse_indexer.take(labels)
-
         self.count = len(self.uniques)
         return labels
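
Note: the deleted ``sort`` branch was the standard argsort/reverse-indexer relabeling. For readers tracing the history, here is a standalone NumPy sketch of what it computed (variable names are illustrative, not pandas API):

    import numpy as np

    uniques = np.array([30, 10, 20])      # uniques in first-seen order
    labels = np.array([0, 1, 2, 1])       # codes pointing into ``uniques``

    sorter = uniques.argsort()            # positions that would sort ``uniques``
    reverse_indexer = np.empty(len(sorter), dtype=np.intp)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    # Remap each code so it points into the *sorted* uniques instead.
    print(reverse_indexer.take(labels))   # [2 0 1 0]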

pandas/core/indexes/category.py

Lines changed: 1 addition & 46 deletions
@@ -365,7 +365,6 @@ def __contains__(self, key: Any) -> bool:

         return contains(self, key, container=self._engine)

-    # TODO(2.0): remove reindex once non-unique deprecation is enforced
     def reindex(
         self, target, method=None, level=None, limit=None, tolerance=None
     ) -> tuple[Index, npt.NDArray[np.intp] | None]:

@@ -392,51 +391,7 @@ def reindex(
             raise NotImplementedError(
                 "argument limit is not implemented for CategoricalIndex.reindex"
             )
-
-        target = ibase.ensure_index(target)
-
-        if self.equals(target):
-            indexer = None
-            missing = np.array([], dtype=np.intp)
-        else:
-            indexer, missing = self.get_indexer_non_unique(target)
-            if not self.is_unique:
-                # GH#42568
-                raise ValueError("cannot reindex on an axis with duplicate labels")
-
-        new_target: Index
-        if len(self) and indexer is not None:
-            new_target = self.take(indexer)
-        else:
-            new_target = target
-
-        # filling in missing if needed
-        if len(missing):
-            cats = self.categories.get_indexer(target)
-
-            if not isinstance(target, CategoricalIndex) or (cats == -1).any():
-                new_target, indexer, _ = super()._reindex_non_unique(target)
-            else:
-                # error: "Index" has no attribute "codes"
-                codes = new_target.codes.copy()  # type: ignore[attr-defined]
-                codes[indexer == -1] = cats[missing]
-                cat = self._data._from_backing_data(codes)
-                new_target = type(self)._simple_new(cat, name=self.name)
-
-        # we always want to return an Index type here
-        # to be consistent with .reindex for other index types (e.g. they don't
-        # coerce based on the actual values, only on the dtype)
-        # unless we had an initial Categorical to begin with
-        # in which case we are going to conform to the passed Categorical
-        if is_categorical_dtype(target):
-            cat = Categorical(new_target, dtype=target.dtype)
-            new_target = type(self)._simple_new(cat, name=self.name)
-        else:
-            # e.g. test_reindex_with_categoricalindex, test_reindex_duplicate_target
-            new_target_array = np.asarray(new_target)
-            new_target = Index._with_infer(new_target_array, name=self.name)
-
-        return new_target, indexer
+        return super().reindex(target)

     # --------------------------------------------------------------------
     # Indexing Methods
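
Note: with the body collapsed to ``super().reindex(target)``, CategoricalIndex now reindexes like any other Index for unique targets, and the ``method``/``limit`` guards above still raise first. A rough behavior sketch, assuming a pandas 2.0 development build:

    import pandas as pd

    ci = pd.CategoricalIndex(["a", "b", "c"])
    new_index, indexer = ci.reindex(["b", "c", "d"])
    print(indexer)  # expected [ 1  2 -1]; "d" is absent from the original index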

pandas/core/series.py

Lines changed: 1 addition & 1 deletion
@@ -1982,7 +1982,7 @@ def groupby(
         self,
         by=None,
         axis: Axis = 0,
-        level: Level = None,
+        level: IndexLabel = None,
         as_index: bool = True,
         sort: bool = True,
         group_keys: bool | lib.NoDefault = no_default,
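
Note: the annotation widens because ``level`` accepts either a single level or a list of levels, which pandas' ``IndexLabel`` alias (roughly ``Hashable | Sequence[Hashable]``) captures and the scalar ``Level`` does not. Both accepted forms, as a sketch:

    import pandas as pd

    s = pd.Series(
        [1, 2, 3, 4],
        index=pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["x", "y"]),
    )
    s.groupby(level="x").sum()          # a single level: fits ``Level``
    s.groupby(level=["x", "y"]).sum()   # a list of levels: needs ``IndexLabel``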

pandas/io/parsers/arrow_parser_wrapper.py

Lines changed: 20 additions & 9 deletions
@@ -1,16 +1,17 @@
 from __future__ import annotations

-from typing import TYPE_CHECKING
-
 from pandas._typing import ReadBuffer
 from pandas.compat._optional import import_optional_dependency

 from pandas.core.dtypes.inference import is_integer

-from pandas.io.parsers.base_parser import ParserBase
+from pandas import (
+    DataFrame,
+    arrays,
+    get_option,
+)

-if TYPE_CHECKING:
-    from pandas import DataFrame
+from pandas.io.parsers.base_parser import ParserBase


 class ArrowParserWrapper(ParserBase):

@@ -77,7 +78,7 @@ def _get_pyarrow_options(self) -> None:
             else self.kwds["skiprows"],
         }

-    def _finalize_output(self, frame: DataFrame) -> DataFrame:
+    def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
         """
         Processes data read in based on kwargs.

@@ -148,6 +149,16 @@ def read(self) -> DataFrame:
             parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
             convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
         )
-
-        frame = table.to_pandas()
-        return self._finalize_output(frame)
+        if (
+            self.kwds["use_nullable_dtypes"]
+            and get_option("io.nullable_backend") == "pyarrow"
+        ):
+            frame = DataFrame(
+                {
+                    col_name: arrays.ArrowExtensionArray(pa_col)
+                    for col_name, pa_col in zip(table.column_names, table.itercolumns())
+                }
+            )
+        else:
+            frame = table.to_pandas()
+        return self._finalize_pandas_output(frame)
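
Note: the new branch skips the NumPy round-trip of ``Table.to_pandas()`` by wrapping each pyarrow column directly. A standalone sketch of the same conversion, assuming pandas 2.0 dev with pyarrow installed (the data is illustrative):

    import pandas as pd
    import pyarrow as pa

    table = pa.table({"a": [1, 2, None], "b": ["x", None, "z"]})

    # Each column from itercolumns() is a ChunkedArray, which
    # ArrowExtensionArray wraps without converting to NumPy.
    frame = pd.DataFrame(
        {
            name: pd.arrays.ArrowExtensionArray(col)
            for name, col in zip(table.column_names, table.itercolumns())
        }
    )
    print(frame.dtypes)  # expected: a -> int64[pyarrow], b -> string[pyarrow]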

pandas/io/parsers/readers.py

Lines changed: 10 additions & 0 deletions
@@ -24,6 +24,8 @@

 import numpy as np

+from pandas._config import get_option
+
 from pandas._libs import lib
 from pandas._libs.parsers import STR_NA_VALUES
 from pandas._typing import (

@@ -560,6 +562,14 @@ def _read(
         raise ValueError(
             "The 'chunksize' option is not supported with the 'pyarrow' engine"
         )
+    elif (
+        kwds.get("use_nullable_dtypes", False)
+        and get_option("io.nullable_backend") == "pyarrow"
+    ):
+        raise NotImplementedError(
+            f"use_nullable_dtypes=True and engine={kwds['engine']} with "
+            "io.nullable_backend set to 'pyarrow' is not implemented."
+        )
     else:
         chunksize = validate_integer("chunksize", chunksize, 1)
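
Note: under this guard, requesting the pyarrow backend with a non-pyarrow engine now fails loudly instead of silently returning NumPy-backed dtypes. A sketch of the error path (assuming this development build):

    import io

    import pandas as pd

    with pd.option_context("io.nullable_backend", "pyarrow"):
        pd.read_csv(io.StringIO("a\n1"), use_nullable_dtypes=True, engine="c")
    # Expected: NotImplementedError: use_nullable_dtypes=True and engine=c with
    # io.nullable_backend set to 'pyarrow' is not implemented.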
