Merge remote-tracking branch 'upstream/main'

Dr-Irv · Dr-Irv · commit 5761bf2cc847 · 2022-02-22T22:16:41.000-05:00
diff --git a/.github/workflows/posix.yml b/.github/workflows/posix.yml
@@ -26,6 +26,9 @@ jobs:
       matrix:
         env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml]
         pattern: ["not single_cpu", "single_cpu"]
+        # Don't test pyarrow v2/3: Causes timeouts in read_csv engine
+        # even if tests are skipped/xfailed
+        pyarrow_version: ["5", "6", "7"]
         include:
           - env_file: actions-38-downstream_compat.yaml
             pattern: "not slow and not network and not single_cpu"
@@ -65,7 +68,7 @@ jobs:
       COVERAGE: ${{ !contains(matrix.env_file, 'pypy') }}
     concurrency:
       # https://github.community/t/concurrecy-not-work-for-push/183068/7
-      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}
+      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.pyarrow_version || '' }}-${{ matrix.extra_apt || '' }}
       cancel-in-progress: true
 
     services:
@@ -133,6 +136,10 @@ jobs:
         use-only-tar-bz2: true
       if: ${{ env.IS_PYPY == 'false' }} # No pypy3.8 support
 
+    - name: Upgrade Arrow version
+      run: conda install -n pandas-dev -c conda-forge --no-update-deps pyarrow=${{ matrix.pyarrow_version }}
+      if: ${{ matrix.pyarrow_version }}
+
     - name: Setup PyPy
       uses: actions/setup-python@v2
       with:
diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
@@ -158,6 +158,19 @@ def time_left_outer_join_index(self):
         self.left.join(self.right, on="jim")
 
 
+class JoinEmpty:
+    def setup(self):
+        N = 100_000
+        self.df = DataFrame({"A": np.arange(N)})
+        self.df_empty = DataFrame(columns=["B", "C"], dtype="int64")
+
+    def time_inner_join_left_empty(self):
+        self.df_empty.join(self.df, how="inner")
+
+    def time_inner_join_right_empty(self):
+        self.df.join(self.df_empty, how="inner")
+
+
 class JoinNonUnique:
     # outer join of non-unique
     # GH 6329
diff --git a/doc/source/whatsnew/v1.4.2.rst b/doc/source/whatsnew/v1.4.2.rst
@@ -14,7 +14,7 @@ including other versions of pandas.
 
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
-- Fixed regression in :meth:`DataFrame.drop` and :meth:`Series.drop` when :class:`Index` had extension dtype and duplicates (:issue:`45820`)
+- Fixed regression in :meth:`DataFrame.drop` and :meth:`Series.drop` when :class:`Index` had extension dtype and duplicates (:issue:`45860`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -256,6 +256,7 @@ Performance improvements
 - Performance improvement in :meth:`.GroupBy.transform` for user-defined functions when only a single group exists (:issue:`44977`)
 - Performance improvement in :meth:`MultiIndex.get_locs` (:issue:`45681`)
 - Performance improvement in :func:`merge` when left and/or right are empty (:issue:`45838`)
+- Performance improvement in :meth:`DataFrame.join` when left and/or right are empty (:issue:`46015`)
 - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
 -
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -7448,7 +7448,11 @@ def combine_first(self, other: DataFrame) -> DataFrame:
 
         Combine two DataFrame objects by filling null values in one DataFrame
         with non-null values from other DataFrame. The row and column indexes
-        of the resulting DataFrame will be the union of the two.
+        of the resulting DataFrame will be the union of the two. Where both
+        ``first.loc[index, col]`` and ``second.loc[index, col]`` are present
+        and non-missing, ``first.combine_first(second)`` keeps the value from
+        ``first``; values from ``second`` are used only to fill positions
+        that are missing in ``first``.
 
         Parameters
         ----------
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -4542,15 +4542,25 @@ def join(
         if level is not None and (self._is_multi or other._is_multi):
             return self._join_level(other, level, how=how)
 
-        if len(other) == 0 and how in ("left", "outer"):
-            join_index = self._view()
-            rindexer = np.repeat(np.intp(-1), len(join_index))
-            return join_index, None, rindexer
-
-        if len(self) == 0 and how in ("right", "outer"):
-            join_index = other._view()
-            lindexer = np.repeat(np.intp(-1), len(join_index))
-            return join_index, lindexer, None
+        if len(other) == 0:
+            if how in ("left", "outer"):
+                join_index = self._view()
+                rindexer = np.broadcast_to(np.intp(-1), len(join_index))
+                return join_index, None, rindexer
+            elif how in ("right", "inner", "cross"):
+                join_index = other._view()
+                lindexer = np.array([])
+                return join_index, lindexer, None
+
+        if len(self) == 0:
+            if how in ("right", "outer"):
+                join_index = other._view()
+                lindexer = np.broadcast_to(np.intp(-1), len(join_index))
+                return join_index, lindexer, None
+            elif how in ("left", "inner", "cross"):
+                join_index = self._view()
+                rindexer = np.array([])
+                return join_index, None, rindexer
 
         if self._join_precedence < other._join_precedence:
             how = {"right": "left", "left": "right"}.get(how, how)
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -1880,6 +1880,7 @@ def encode(self, encoding, errors="strict"):
 
     Strip whitespaces (including newlines) or a set of specified characters
     from each string in the Series/Index from %(side)s.
+    Non-string elements in the Series are replaced with NaN.
     Equivalent to :meth:`str.%(method)s`.
 
     Parameters
@@ -1901,40 +1902,50 @@ def encode(self, encoding, errors="strict"):
 
     Examples
     --------
-    >>> s = pd.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', np.nan])
+    >>> s = pd.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', np.nan, 10, True])
     >>> s
     0    1. Ant.
     1    2. Bee!\n
     2    3. Cat?\t
     3          NaN
+    4           10
+    5         True
     dtype: object
 
     >>> s.str.strip()
     0    1. Ant.
     1    2. Bee!
     2    3. Cat?
     3        NaN
+    4        NaN
+    5        NaN
     dtype: object
 
     >>> s.str.lstrip('123.')
     0    Ant.
     1    Bee!\n
     2    Cat?\t
     3       NaN
+    4       NaN
+    5       NaN
     dtype: object
 
     >>> s.str.rstrip('.!? \n\t')
     0    1. Ant
     1    2. Bee
     2    3. Cat
     3       NaN
+    4       NaN
+    5       NaN
     dtype: object
 
     >>> s.str.strip('123.!? \n\t')
     0    Ant
     1    Bee
     2    Cat
     3    NaN
+    4    NaN
+    5    NaN
     dtype: object
     """
 
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
@@ -748,9 +748,9 @@ def pandasSQL_builder(con, schema: str | None = None):
         return SQLDatabase(con, schema=schema)
 
     warnings.warn(
-        "pandas only support SQLAlchemy connectable(engine/connection) or"
-        "database string URI or sqlite3 DBAPI2 connection"
-        "other DBAPI2 objects are not tested, please consider using SQLAlchemy",
+        "pandas only supports SQLAlchemy connectable (engine/connection) or "
+        "database string URI or sqlite3 DBAPI2 connection. "
+        "Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.",
         UserWarning,
     )
     return SQLiteDatabase(con)
diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py
@@ -314,3 +314,12 @@ def test_unary_int_operators(any_signed_int_ea_dtype, source, neg_target, abs_ta
     tm.assert_extension_array_equal(pos_result, arr)
     assert not tm.shares_memory(pos_result, arr)
     tm.assert_extension_array_equal(abs_result, abs_target)
+
+
+def test_values_multiplying_large_series_by_NA():
+    # GH#33701
+
+    result = pd.NA * pd.Series(np.zeros(10001))
+    expected = pd.Series([pd.NA] * 10001)
+
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py
@@ -891,6 +891,7 @@ def test_replace_input_formats_scalar(self):
         tm.assert_frame_equal(result, expected)
 
     def test_replace_limit(self):
+        # TODO
         pass
 
     def test_replace_dict_no_regex(self):
diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py
@@ -201,39 +201,42 @@ def test_reset_index_name(self):
         assert return_value is None
         assert df.index.name is None
 
-    def test_reset_index_level(self):
+    @pytest.mark.parametrize("levels", [["A", "B"], [0, 1]])
+    def test_reset_index_level(self, levels):
         df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "C", "D"])
 
-        for levels in ["A", "B"], [0, 1]:
-            # With MultiIndex
-            result = df.set_index(["A", "B"]).reset_index(level=levels[0])
-            tm.assert_frame_equal(result, df.set_index("B"))
+        # With MultiIndex
+        result = df.set_index(["A", "B"]).reset_index(level=levels[0])
+        tm.assert_frame_equal(result, df.set_index("B"))
 
-            result = df.set_index(["A", "B"]).reset_index(level=levels[:1])
-            tm.assert_frame_equal(result, df.set_index("B"))
+        result = df.set_index(["A", "B"]).reset_index(level=levels[:1])
+        tm.assert_frame_equal(result, df.set_index("B"))
 
-            result = df.set_index(["A", "B"]).reset_index(level=levels)
-            tm.assert_frame_equal(result, df)
+        result = df.set_index(["A", "B"]).reset_index(level=levels)
+        tm.assert_frame_equal(result, df)
 
-            result = df.set_index(["A", "B"]).reset_index(level=levels, drop=True)
-            tm.assert_frame_equal(result, df[["C", "D"]])
+        result = df.set_index(["A", "B"]).reset_index(level=levels, drop=True)
+        tm.assert_frame_equal(result, df[["C", "D"]])
 
-            # With single-level Index (GH 16263)
-            result = df.set_index("A").reset_index(level=levels[0])
-            tm.assert_frame_equal(result, df)
+        # With single-level Index (GH 16263)
+        result = df.set_index("A").reset_index(level=levels[0])
+        tm.assert_frame_equal(result, df)
 
-            result = df.set_index("A").reset_index(level=levels[:1])
-            tm.assert_frame_equal(result, df)
+        result = df.set_index("A").reset_index(level=levels[:1])
+        tm.assert_frame_equal(result, df)
 
-            result = df.set_index(["A"]).reset_index(level=levels[0], drop=True)
-            tm.assert_frame_equal(result, df[["B", "C", "D"]])
+        result = df.set_index(["A"]).reset_index(level=levels[0], drop=True)
+        tm.assert_frame_equal(result, df[["B", "C", "D"]])
 
+    @pytest.mark.parametrize("idx_lev", [["A", "B"], ["A"]])
+    def test_reset_index_level_missing(self, idx_lev):
         # Missing levels - for both MultiIndex and single-level Index:
-        for idx_lev in ["A", "B"], ["A"]:
-            with pytest.raises(KeyError, match=r"(L|l)evel \(?E\)?"):
-                df.set_index(idx_lev).reset_index(level=["A", "E"])
-            with pytest.raises(IndexError, match="Too many levels"):
-                df.set_index(idx_lev).reset_index(level=[0, 1, 2])
+        df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "C", "D"])
+
+        with pytest.raises(KeyError, match=r"(L|l)evel \(?E\)?"):
+            df.set_index(idx_lev).reset_index(level=["A", "E"])
+        with pytest.raises(IndexError, match="Too many levels"):
+            df.set_index(idx_lev).reset_index(level=[0, 1, 2])
 
     def test_reset_index_right_dtype(self):
         time = np.arange(0.0, 10, np.sqrt(2) / 2)
diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py
@@ -92,17 +92,17 @@ def test_shift_int(self, datetime_frame, frame_or_series):
         expected = ts.astype(float).shift(1)
         tm.assert_equal(shifted, expected)
 
-    def test_shift_32bit_take(self, frame_or_series):
+    @pytest.mark.parametrize("dtype", ["int32", "int64"])
+    def test_shift_32bit_take(self, frame_or_series, dtype):
         # 32-bit taking
         # GH#8129
         index = date_range("2000-01-01", periods=5)
-        for dtype in ["int32", "int64"]:
-            arr = np.arange(5, dtype=dtype)
-            s1 = frame_or_series(arr, index=index)
-            p = arr[1]
-            result = s1.shift(periods=p)
-            expected = frame_or_series([np.nan, 0, 1, 2, 3], index=index)
-            tm.assert_equal(result, expected)
+        arr = np.arange(5, dtype=dtype)
+        s1 = frame_or_series(arr, index=index)
+        p = arr[1]
+        result = s1.shift(periods=p)
+        expected = frame_or_series([np.nan, 0, 1, 2, 3], index=index)
+        tm.assert_equal(result, expected)
 
     @pytest.mark.parametrize("periods", [1, 2, 3, 4])
     def test_shift_preserve_freqstr(self, periods, frame_or_series):
@@ -141,11 +141,15 @@ def test_shift_dst(self, frame_or_series):
         tm.assert_equal(res, exp)
         assert tm.get_dtype(res) == "datetime64[ns, US/Eastern]"
 
-        for ex in [10, -10, 20, -20]:
-            res = obj.shift(ex)
-            exp = frame_or_series([NaT] * 10, dtype="datetime64[ns, US/Eastern]")
-            tm.assert_equal(res, exp)
-            assert tm.get_dtype(res) == "datetime64[ns, US/Eastern]"
+    @pytest.mark.parametrize("ex", [10, -10, 20, -20])
+    def test_shift_dst_beyond(self, frame_or_series, ex):
+        # GH#13926
+        dates = date_range("2016-11-06", freq="H", periods=10, tz="US/Eastern")
+        obj = frame_or_series(dates)
+        res = obj.shift(ex)
+        exp = frame_or_series([NaT] * 10, dtype="datetime64[ns, US/Eastern]")
+        tm.assert_equal(res, exp)
+        assert tm.get_dtype(res) == "datetime64[ns, US/Eastern]"
 
     def test_shift_by_zero(self, datetime_frame, frame_or_series):
         # shift by 0
diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
@@ -194,15 +194,19 @@ def test_empty_nonzero(self):
         df = DataFrame(index=["a", "b"], columns=["c", "d"]).dropna()
         assert df.empty
         assert df.T.empty
-        empty_frames = [
+
+    @pytest.mark.parametrize(
+        "df",
+        [
             DataFrame(),
             DataFrame(index=[1]),
             DataFrame(columns=[1]),
             DataFrame({1: []}),
-        ]
-        for df in empty_frames:
-            assert df.empty
-            assert df.T.empty
+        ],
+    )
+    def test_empty_like(self, df):
+        assert df.empty
+        assert df.T.empty
 
     def test_with_datetimelikes(self):
 
diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py
@@ -881,3 +881,44 @@ def test_join_multiindex_not_alphabetical_categorical(categories, values):
         }
     ).set_index(["first", "second"])
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "left_empty, how, exp",
+    [
+        (False, "left", "left"),
+        (False, "right", "empty"),
+        (False, "inner", "empty"),
+        (False, "outer", "left"),
+        (False, "cross", "empty"),
+        (True, "left", "empty"),
+        (True, "right", "right"),
+        (True, "inner", "empty"),
+        (True, "outer", "right"),
+        (True, "cross", "empty"),
+    ],
+)
+def test_join_empty(left_empty, how, exp):
+
+    left = DataFrame({"A": [2, 1], "B": [3, 4]}, dtype="int64").set_index("A")
+    right = DataFrame({"A": [1], "C": [5]}, dtype="int64").set_index("A")
+
+    if left_empty:
+        left = left.head(0)
+    else:
+        right = right.head(0)
+
+    result = left.join(right, how=how)
+
+    if exp == "left":
+        expected = DataFrame({"A": [2, 1], "B": [3, 4], "C": [np.nan, np.nan]})
+        expected = expected.set_index("A")
+    elif exp == "right":
+        expected = DataFrame({"B": [np.nan], "A": [1], "C": [5]})
+        expected = expected.set_index("A")
+    elif exp == "empty":
+        expected = DataFrame(index=Index([]), columns=["B", "C"], dtype="int64")
+        if how != "cross":
+            expected = expected.rename_axis("A")
+
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py

Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,7 @@ including other versions of pandas.`
`14`	`14`
`15`	`15`	`Fixed regressions`
`16`	`16`	`~~~~~~~~~~~~~~~~~`
`17`		-- Fixed regression in :meth:`DataFrame.drop` and :meth:`Series.drop` when :class:`Index` had extension dtype and duplicates (:issue:`45820`)
	`17`	+- Fixed regression in :meth:`DataFrame.drop` and :meth:`Series.drop` when :class:`Index` had extension dtype and duplicates (:issue:`45860`)
`18`	`18`	`-`
`19`	`19`
`20`	`20`	`.. ---------------------------------------------------------------------------`
Original file line number	Diff line number	Diff line change
`@@ -256,6 +256,7 @@ Performance improvements`
`256`	`256`	- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions when only a single group exists (:issue:`44977`)
`257`	`257`	- Performance improvement in :meth:`MultiIndex.get_locs` (:issue:`45681`)
`258`	`258`	- Performance improvement in :func:`merge` when left and/or right are empty (:issue:`45838`)
	`259`	+- Performance improvement in :meth:`DataFrame.join` when left and/or right are empty (:issue:`46015`)
`259`	`260`	- Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
`260`	`261`	`-`
`261`	`262`