sthagen · sthagen · Jun 25, 2021 · Jun 18, 2021 · Jun 19, 2021 · Jun 21, 2021
diff --git a/.github/workflows/sdist.yml b/.github/workflows/sdist.yml
@@ -0,0 +1,64 @@
+name: sdist
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+      - 1.2.x
+      - 1.3.x
+    paths-ignore:  # pull requests that only touch doc/ do not trigger this workflow
+      - "doc/**"
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    timeout-minutes: 60  # cancel the job if it runs longer than an hour
+    defaults:
+      run:
+        shell: bash -l {0}  # login shell for all run steps; presumably so the conda env from setup-miniconda is active - confirm
+
+    strategy:
+      fail-fast: false  # let the remaining Python versions finish when one fails
+      matrix:
+        python-version: ["3.7", "3.8", "3.9"]
+
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        fetch-depth: 0  # full history instead of a shallow clone; presumably needed for tag-based version detection - confirm
+
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip setuptools wheel
+
+        # GH 39416
+        pip install numpy
+
+    - name: Build pandas sdist
+      run: |
+        pip list
+        python setup.py sdist --formats=gztar
+
+    - uses: conda-incubator/setup-miniconda@v2
+      with:
+        activate-environment: pandas-sdist  # conda env used for the install/import steps below
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install pandas from sdist
+      run: |
+        conda list
+        python -m pip install dist/*.gz
+
+    - name: Import pandas
+      run: |
+        cd ..
+        conda list
+        python -c "import pandas; pandas.show_versions();"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -9,11 +9,11 @@ repos:
     -   id: absolufy-imports
         files: ^pandas/
 -   repo: https://github.com/python/black
-    rev: 21.5b2
+    rev: 21.6b0
     hooks:
     -   id: black
 -   repo: https://github.com/codespell-project/codespell
-    rev: v2.0.0
+    rev: v2.1.0
     hooks:
     -   id: codespell
         types_or: [python, rst, markdown]
@@ -53,16 +53,16 @@ repos:
         types: [text]
         args: [--append-config=flake8/cython-template.cfg]
 -   repo: https://github.com/PyCQA/isort
-    rev: 5.8.0
+    rev: 5.9.0
     hooks:
     -   id: isort
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v2.18.3
+    rev: v2.19.4
     hooks:
     -   id: pyupgrade
         args: [--py37-plus]
 -   repo: https://github.com/pre-commit/pygrep-hooks
-    rev: v1.8.0
+    rev: v1.9.0
     hooks:
       - id: rst-backticks
       - id: rst-directive-colons

diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py
@@ -325,3 +325,13 @@ def setup(self, dtype, series_type):
 
     def time_isin(self, dtypes, series_type):
         self.series.isin(self.values)
+
+
+class IsInWithLongTupples:  # benchmark: Series.isin where every element is a long tuple
+    def setup(self):
+        t = tuple(range(1000))  # one 1000-element tuple, reused for every row
+        self.series = Series([t] * 1000)  # 1000 rows, all holding that tuple
+        self.values = [t]  # single lookup value: the same tuple
+
+    def time_isin(self):  # body measured by the benchmark
+        self.series.isin(self.values)
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -38,10 +38,7 @@ function invgrep {
 }
 
 if [[ "$GITHUB_ACTIONS" == "true" ]]; then
-    FLAKE8_FORMAT="##[error]%(path)s:%(row)s:%(col)s:%(code)s:%(text)s"
     INVGREP_PREPEND="##[error]"
-else
-    FLAKE8_FORMAT="default"
 fi
 
 ### LINTING ###

diff --git a/ci/deps/actions-37-slow.yaml b/ci/deps/actions-37-slow.yaml
@@ -14,7 +14,7 @@ dependencies:
 
   # pandas dependencies
   - beautifulsoup4
-  - fsspec>=0.7.4
+  - fsspec>=0.7.4, <2021.6.0
   - html5lib
   - lxml
   - matplotlib

diff --git a/ci/deps/actions-38-slow.yaml b/ci/deps/actions-38-slow.yaml
@@ -13,7 +13,7 @@ dependencies:
 
   # pandas dependencies
   - beautifulsoup4
-  - fsspec>=0.7.4
+  - fsspec>=0.7.4, <2021.6.0
   - html5lib
   - lxml
   - matplotlib

diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml
@@ -17,7 +17,7 @@ dependencies:
   - bottleneck
   - fastparquet>=0.4.0
   - flask
-  - fsspec>=0.8.0
+  - fsspec>=0.8.0, <2021.6.0
   - matplotlib=3.1.3
   - moto>=1.3.14
   - numba

diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
@@ -1523,18 +1523,17 @@ Looking up values by index/column labels
 ----------------------------------------
 
 Sometimes you want to extract a set of values given a sequence of row labels
-and column labels, this can be achieved by ``DataFrame.melt`` combined by filtering the corresponding
-rows with ``DataFrame.loc``.  For instance:
+and column labels. This can be achieved with ``pandas.factorize`` and NumPy indexing.
+For instance:
 
 .. ipython:: python
 
     df = pd.DataFrame({'col': ["A", "A", "B", "B"],
                        'A': [80, 23, np.nan, 22],
                        'B': [80, 55, 76, 67]})
     df
-    melt = df.melt('col')
-    melt = melt.loc[melt['col'] == melt['variable'], 'value']
-    melt.reset_index(drop=True)
+    idx, cols = pd.factorize(df['col'])
+    df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx]
 
 Formerly this could be achieved with the dedicated ``DataFrame.lookup`` method
 which was deprecated in version 1.2.0.

diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst
@@ -1,7 +1,7 @@
 .. _whatsnew_125:
 
-What's new in 1.2.5 (May ??, 2021)
-----------------------------------
+What's new in 1.2.5 (June 22, 2021)
+-----------------------------------
 
 These are the changes in pandas 1.2.5. See :ref:`release` for a full changelog
 including other versions of pandas.
@@ -14,32 +14,12 @@ including other versions of pandas.
 
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
-- Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`)
+- Fixed regression in :func:`concat` between two :class:`DataFrame` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`)
 - Fixed regression in :meth:`DataFrame.sum` and :meth:`DataFrame.prod` when ``min_count`` and ``numeric_only`` are both given (:issue:`41074`)
-- Regression in :func:`read_csv` when using ``memory_map=True`` with an non-UTF8 encoding (:issue:`40986`)
-- Regression in :meth:`DataFrame.replace` and :meth:`Series.replace` when the values to replace is a NumPy float array (:issue:`40371`)
-- Regression in :func:`ExcelFile` when a corrupt file is opened but not closed (:issue:`41778`)
-
-.. ---------------------------------------------------------------------------
-
-
-.. _whatsnew_125.bug_fixes:
-
-Bug fixes
-~~~~~~~~~
-
--
--
-
-.. ---------------------------------------------------------------------------
-
-.. _whatsnew_125.other:
-
-Other
-~~~~~
-
--
--
+- Fixed regression in :func:`read_csv` when using ``memory_map=True`` with a non-UTF8 encoding (:issue:`40986`)
+- Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` when the value to replace is a NumPy float array (:issue:`40371`)
+- Fixed regression in :func:`ExcelFile` when a corrupt file is opened but not closed (:issue:`41778`)
+- Fixed regression in :meth:`DataFrame.astype` with ``dtype=str`` failing to convert ``NaN`` in categorical columns (:issue:`41797`)
 
 .. ---------------------------------------------------------------------------
 

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -269,12 +269,14 @@ Other enhancements
 - :meth:`read_csv` and :meth:`read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
 - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`)
 - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
+- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` no longer raise with ``object`` data containing ``pd.NA`` when ``skipna=True`` (:issue:`37501`)
 - :meth:`.GroupBy.rank` now supports object-dtype data (:issue:`38278`)
 - Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
 - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
 - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
 - :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`)
 - Improved error message in ``corr`` and ``cov`` methods on :class:`.Rolling`, :class:`.Expanding`, and :class:`.ExponentialMovingWindow` when ``other`` is not a :class:`DataFrame` or :class:`Series` (:issue:`41741`)
+- :meth:`DataFrame.explode` now supports exploding multiple columns. Its ``column`` argument now also accepts a list of str or tuples for exploding on multiple columns at the same time (:issue:`39240`)
 
 .. ---------------------------------------------------------------------------
 
@@ -914,13 +916,15 @@ Datetimelike
 - Bug in constructing a :class:`DataFrame` or :class:`Series` with mismatched ``datetime64`` data and ``timedelta64`` dtype, or vice-versa, failing to raise a ``TypeError`` (:issue:`38575`, :issue:`38764`, :issue:`38792`)
 - Bug in constructing a :class:`Series` or :class:`DataFrame` with a ``datetime`` object out of bounds for ``datetime64[ns]`` dtype or a ``timedelta`` object out of bounds for ``timedelta64[ns]`` dtype (:issue:`38792`, :issue:`38965`)
 - Bug in :meth:`DatetimeIndex.intersection`, :meth:`DatetimeIndex.symmetric_difference`, :meth:`PeriodIndex.intersection`, :meth:`PeriodIndex.symmetric_difference` always returning object-dtype when operating with :class:`CategoricalIndex` (:issue:`38741`)
+- Bug in :meth:`DatetimeIndex.intersection` giving incorrect results with non-Tick frequencies with ``n != 1`` (:issue:`42104`)
 - Bug in :meth:`Series.where` incorrectly casting ``datetime64`` values to ``int64`` (:issue:`37682`)
 - Bug in :class:`Categorical` incorrectly typecasting ``datetime`` object to ``Timestamp`` (:issue:`38878`)
 - Bug in comparisons between :class:`Timestamp` object and ``datetime64`` objects just outside the implementation bounds for nanosecond ``datetime64`` (:issue:`39221`)
 - Bug in :meth:`Timestamp.round`, :meth:`Timestamp.floor`, :meth:`Timestamp.ceil` for values near the implementation bounds of :class:`Timestamp` (:issue:`39244`)
 - Bug in :meth:`Timedelta.round`, :meth:`Timedelta.floor`, :meth:`Timedelta.ceil` for values near the implementation bounds of :class:`Timedelta` (:issue:`38964`)
 - Bug in :func:`date_range` incorrectly creating :class:`DatetimeIndex` containing ``NaT`` instead of raising ``OutOfBoundsDatetime`` in corner cases (:issue:`24124`)
 - Bug in :func:`infer_freq` incorrectly fails to infer 'H' frequency of :class:`DatetimeIndex` if the latter has a timezone and crosses DST boundaries (:issue:`39556`)
+- Bug in :class:`Series` backed by :class:`DatetimeArray` or :class:`TimedeltaArray` sometimes failing to set the array's ``freq`` to ``None`` (:issue:`41425`)
 
 Timedelta
 ^^^^^^^^^
@@ -951,7 +955,8 @@ Numeric
 - Bug in :class:`Series` and :class:`DataFrame` reductions with methods ``any`` and ``all`` not returning Boolean results for object data (:issue:`12863`, :issue:`35450`, :issue:`27709`)
 - Bug in :meth:`Series.clip` would fail if the Series contains NA values and has nullable int or float as a data type (:issue:`40851`)
 - Bug in :meth:`UInt64Index.where` and :meth:`UInt64Index.putmask` with an ``np.int64`` dtype ``other`` incorrectly raising ``TypeError`` (:issue:`41974`)
-
+- Bug in :meth:`DataFrame.agg()` not sorting the aggregated axis in the order of the provided aggregation functions when one or more aggregation functions fail to produce results (:issue:`33634`)
+- Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`)
 
 Conversion
 ^^^^^^^^^^
@@ -967,6 +972,12 @@ Conversion
 - Bug in :class:`DataFrame` and :class:`Series` construction with ``datetime64[ns]`` data and ``dtype=object`` resulting in ``datetime`` objects instead of :class:`Timestamp` objects (:issue:`41599`)
 - Bug in :class:`DataFrame` and :class:`Series` construction with ``timedelta64[ns]`` data and ``dtype=object`` resulting in ``np.timedelta64`` objects instead of :class:`Timedelta` objects (:issue:`41599`)
 - Bug in :class:`DataFrame` construction when given a two-dimensional object-dtype ``np.ndarray`` of :class:`Period` or :class:`Interval` objects failing to cast to :class:`PeriodDtype` or :class:`IntervalDtype`, respectively (:issue:`41812`)
+- Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`)
+- Bug in creating a :class:`Series` from a ``range`` object that does not fit in the bounds of ``int64`` dtype (:issue:`30173`)
+- Bug in creating a :class:`Series` from a ``dict`` with all-tuple keys and an :class:`Index` that requires reindexing (:issue:`41707`)
+- Bug in :func:`.infer_dtype` not recognizing Series, Index, or array with a Period dtype (:issue:`23553`)
+- Bug in :func:`.infer_dtype` raising an error for general :class:`.ExtensionArray` objects. It will now return ``"unknown-array"`` instead of raising (:issue:`37367`)
+- Bug in :meth:`DataFrame.convert_dtypes` incorrectly raised a ``ValueError`` when called on an empty DataFrame (:issue:`40393`)
 
 Strings
 ^^^^^^^
@@ -1028,6 +1039,8 @@ Indexing
 - Bug ``.loc.__getitem__`` with a :class:`UInt64Index` and negative-integer keys raising ``OverflowError`` instead of ``KeyError`` in some cases, wrapping around to positive integers in others (:issue:`41777`)
 - Bug in :meth:`Index.get_indexer` failing to raise ``ValueError`` in some cases with invalid ``method``, ``limit``, or ``tolerance`` arguments (:issue:`41918`)
 - Bug when slicing a :class:`Series` or :class:`DataFrame` with a :class:`TimedeltaIndex` when passing an invalid string raising ``ValueError`` instead of a ``TypeError`` (:issue:`41821`)
+- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`)
+- :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. ``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`)
 
 Missing
 ^^^^^^^
@@ -1197,24 +1210,14 @@ Styler
 
 Other
 ^^^^^
-- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`)
-- Bug in :func:`.infer_dtype` not recognizing Series, Index, or array with a Period dtype (:issue:`23553`)
-- Bug in :func:`.infer_dtype` raising an error for general :class:`.ExtensionArray` objects. It will now return ``"unknown-array"`` instead of raising (:issue:`37367`)
-- Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`)
 - ``inspect.getmembers(Series)`` no longer raises an ``AbstractMethodError`` (:issue:`38782`)
 - Bug in :meth:`Series.where` with numeric dtype and ``other=None`` not casting to ``nan`` (:issue:`39761`)
-- :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. ``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`)
 - Bug in :func:`.assert_series_equal`, :func:`.assert_frame_equal`, :func:`.assert_index_equal` and :func:`.assert_extension_array_equal` incorrectly raising when an attribute has an unrecognized NA type (:issue:`39461`)
 - Bug in :func:`.assert_index_equal` with ``exact=True`` not raising when comparing :class:`CategoricalIndex` instances with ``Int64Index`` and ``RangeIndex`` categories (:issue:`41263`)
 - Bug in :meth:`DataFrame.equals`, :meth:`Series.equals`, and :meth:`Index.equals` with object-dtype containing ``np.datetime64("NaT")`` or ``np.timedelta64("NaT")`` (:issue:`39650`)
 - Bug in :func:`show_versions` where console JSON output was not proper JSON (:issue:`39701`)
 - pandas can now compile on z/OS when using `xlc <https://www.ibm.com/products/xl-cpp-compiler-zos>`_ (:issue:`35826`)
-- Bug in :meth:`DataFrame.convert_dtypes` incorrectly raised a ``ValueError`` when called on an empty DataFrame (:issue:`40393`)
-- Bug in :meth:`DataFrame.agg()` not sorting the aggregated axis in the order of the provided aggragation functions when one or more aggregation function fails to produce results (:issue:`33634`)
-- Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`)
-- Bug in :class:`Series` backed by :class:`DatetimeArray` or :class:`TimedeltaArray` sometimes failing to set the array's ``freq`` to ``None`` (:issue:`41425`)
-- Bug in creating a :class:`Series` from a ``range`` object that does not fit in the bounds of ``int64`` dtype (:issue:`30173`)
-- Bug in creating a :class:`Series` from a ``dict`` with all-tuple keys and an :class:`Index` that requires reindexing (:issue:`41707`)
+- Bug in :func:`pandas.util.hash_pandas_object` not recognizing ``hash_key``, ``encoding`` and ``categorize`` when the input object type is a :class:`DataFrame` (:issue:`41404`)
 
 .. ---------------------------------------------------------------------------
 

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -96,7 +96,7 @@ Other API changes
 
 Deprecations
 ~~~~~~~~~~~~
--
+- Deprecated :meth:`Index.is_type_compatible` (:issue:`42113`)
 -
 
 .. ---------------------------------------------------------------------------

diff --git a/environment.yml b/environment.yml
@@ -24,7 +24,7 @@ dependencies:
   - flake8-bugbear=21.3.2  # used by flake8, find likely bugs
   - flake8-comprehensions=3.1.0  # used by flake8, linting of unnecessary comprehensions
   - isort>=5.2.1  # check that imports are in the right order
-  - mypy=0.812
+  - mypy=0.910
   - pre-commit>=2.9.2
   - pycodestyle  # used by flake8
   - pyupgrade
@@ -118,3 +118,7 @@ dependencies:
     - git+https://github.com/pydata/pydata-sphinx-theme.git@master
     - numpydoc < 1.2  # 2021-02-09 1.2dev breaking CI
     - pandas-dev-flaker==0.2.0
+    - types-python-dateutil
+    - types-PyMySQL
+    - types-pytz
+    - types-setuptools
diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
@@ -228,3 +228,5 @@ def ismember(
     arr: np.ndarray,
     values: np.ndarray,
 ) -> np.ndarray: ...  # np.ndarray[bool]
+def object_hash(obj) -> int: ...
+def objects_are_equal(a, b) -> bool: ...
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
@@ -34,6 +34,8 @@ from pandas._libs.khash cimport (
     are_equivalent_khcomplex64_t,
     are_equivalent_khcomplex128_t,
     kh_needed_n_buckets,
+    kh_python_hash_equal,
+    kh_python_hash_func,
     kh_str_t,
     khcomplex64_t,
     khcomplex128_t,
@@ -46,6 +48,14 @@ def get_hashtable_trace_domain():
     return KHASH_TRACE_DOMAIN
 
 
+def object_hash(obj):  # Python-callable wrapper around the khash object hash
+    return kh_python_hash_func(obj)  # cimported from pandas._libs.khash (khash_python.h)
+
+
+def objects_are_equal(a, b):  # Python-callable wrapper around the khash object equality check
+    return kh_python_hash_equal(a, b)  # cimported from pandas._libs.khash (khash_python.h)
+
+
 cdef int64_t NPY_NAT = util.get_nat()
 SIZE_HINT_LIMIT = (1 << 20) + 7
 

diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd
@@ -41,6 +41,9 @@ cdef extern from "khash_python.h":
     bint are_equivalent_float32_t \
     "kh_floats_hash_equal" (float32_t a, float32_t b) nogil
 
+    uint32_t kh_python_hash_func(object key)
+    bint kh_python_hash_equal(object a, object b)
+
     ctypedef struct kh_pymap_t:
         khuint_t n_buckets, size, n_occupied, upper_bound
         uint32_t *flags

diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
@@ -219,8 +219,7 @@ def array_equivalent_object(
     left: np.ndarray,  # object[:]
     right: np.ndarray,  # object[:]
 ) -> bool: ...
-def has_infs_f8(arr: np.ndarray) -> bool: ...  # const float64_t[:]
-def has_infs_f4(arr: np.ndarray) -> bool: ...  # const float32_t[:]
+def has_infs(arr: np.ndarray) -> bool: ...  # const floating[:]
 def get_reverse_indexer(
     indexer: np.ndarray,  # const intp_t[:]
     length: int,