Skip to content

Commit 5761bf2

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents 1ee6e00 + aafa7a9 commit 5761bf2

File tree

15 files changed

+167
-58
lines changed

15 files changed

+167
-58
lines changed

.github/workflows/posix.yml

+8-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ jobs:
2626
matrix:
2727
env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml]
2828
pattern: ["not single_cpu", "single_cpu"]
29+
# Don't test pyarrow v2/3: Causes timeouts in read_csv engine
30+
# even if tests are skipped/xfailed
31+
pyarrow_version: ["5", "6", "7"]
2932
include:
3033
- env_file: actions-38-downstream_compat.yaml
3134
pattern: "not slow and not network and not single_cpu"
@@ -65,7 +68,7 @@ jobs:
6568
COVERAGE: ${{ !contains(matrix.env_file, 'pypy') }}
6669
concurrency:
6770
# https://github.community/t/concurrecy-not-work-for-push/183068/7
68-
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}
71+
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.pyarrow_version || '' }}-${{ matrix.extra_apt || '' }}
6972
cancel-in-progress: true
7073

7174
services:
@@ -133,6 +136,10 @@ jobs:
133136
use-only-tar-bz2: true
134137
if: ${{ env.IS_PYPY == 'false' }} # No pypy3.8 support
135138

139+
- name: Upgrade Arrow version
140+
run: conda install -n pandas-dev -c conda-forge --no-update-deps pyarrow=${{ matrix.pyarrow_version }}
141+
if: ${{ matrix.pyarrow_version }}
142+
136143
- name: Setup PyPy
137144
uses: actions/setup-python@v2
138145
with:

asv_bench/benchmarks/join_merge.py

+13
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,19 @@ def time_left_outer_join_index(self):
158158
self.left.join(self.right, on="jim")
159159

160160

161+
class JoinEmpty:
162+
def setup(self):
163+
N = 100_000
164+
self.df = DataFrame({"A": np.arange(N)})
165+
self.df_empty = DataFrame(columns=["B", "C"], dtype="int64")
166+
167+
def time_inner_join_left_empty(self):
168+
self.df_empty.join(self.df, how="inner")
169+
170+
def time_inner_join_right_empty(self):
171+
self.df.join(self.df_empty, how="inner")
172+
173+
161174
class JoinNonUnique:
162175
# outer join of non-unique
163176
# GH 6329

doc/source/whatsnew/v1.4.2.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ including other versions of pandas.
1414

1515
Fixed regressions
1616
~~~~~~~~~~~~~~~~~
17-
- Fixed regression in :meth:`DataFrame.drop` and :meth:`Series.drop` when :class:`Index` had extension dtype and duplicates (:issue:`45820`)
17+
- Fixed regression in :meth:`DataFrame.drop` and :meth:`Series.drop` when :class:`Index` had extension dtype and duplicates (:issue:`45860`)
1818
-
1919

2020
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,7 @@ Performance improvements
256256
- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions when only a single group exists (:issue:`44977`)
257257
- Performance improvement in :meth:`MultiIndex.get_locs` (:issue:`45681`)
258258
- Performance improvement in :func:`merge` when left and/or right are empty (:issue:`45838`)
259+
- Performance improvement in :meth:`DataFrame.join` when left and/or right are empty (:issue:`46015`)
259260
- Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
260261
-
261262

pandas/core/frame.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -7448,7 +7448,11 @@ def combine_first(self, other: DataFrame) -> DataFrame:
74487448
74497449
Combine two DataFrame objects by filling null values in one DataFrame
74507450
with non-null values from other DataFrame. The row and column indexes
7451-
of the resulting DataFrame will be the union of the two.
7451+
of the resulting DataFrame will be the union of the two. The resulting
7452+
DataFrame keeps the values of the first DataFrame, which take precedence
7453+
over those of the second wherever both first.loc[index, col] and
7454+
second.loc[index, col] are not missing, when calling
7455+
first.combine_first(second).
74527456
74537457
Parameters
74547458
----------

pandas/core/indexes/base.py

+19-9
Original file line numberDiff line numberDiff line change
@@ -4542,15 +4542,25 @@ def join(
45424542
if level is not None and (self._is_multi or other._is_multi):
45434543
return self._join_level(other, level, how=how)
45444544

4545-
if len(other) == 0 and how in ("left", "outer"):
4546-
join_index = self._view()
4547-
rindexer = np.repeat(np.intp(-1), len(join_index))
4548-
return join_index, None, rindexer
4549-
4550-
if len(self) == 0 and how in ("right", "outer"):
4551-
join_index = other._view()
4552-
lindexer = np.repeat(np.intp(-1), len(join_index))
4553-
return join_index, lindexer, None
4545+
if len(other) == 0:
4546+
if how in ("left", "outer"):
4547+
join_index = self._view()
4548+
rindexer = np.broadcast_to(np.intp(-1), len(join_index))
4549+
return join_index, None, rindexer
4550+
elif how in ("right", "inner", "cross"):
4551+
join_index = other._view()
4552+
lindexer = np.array([])
4553+
return join_index, lindexer, None
4554+
4555+
if len(self) == 0:
4556+
if how in ("right", "outer"):
4557+
join_index = other._view()
4558+
lindexer = np.broadcast_to(np.intp(-1), len(join_index))
4559+
return join_index, lindexer, None
4560+
elif how in ("left", "inner", "cross"):
4561+
join_index = self._view()
4562+
rindexer = np.array([])
4563+
return join_index, None, rindexer
45544564

45554565
if self._join_precedence < other._join_precedence:
45564566
how = {"right": "left", "left": "right"}.get(how, how)

pandas/core/strings/accessor.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -1880,6 +1880,7 @@ def encode(self, encoding, errors="strict"):
18801880
18811881
Strip whitespaces (including newlines) or a set of specified characters
18821882
from each string in the Series/Index from %(side)s.
1883+
Replaces any non-strings in Series with NaNs.
18831884
Equivalent to :meth:`str.%(method)s`.
18841885
18851886
Parameters
@@ -1901,40 +1902,50 @@ def encode(self, encoding, errors="strict"):
19011902
19021903
Examples
19031904
--------
1904-
>>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan])
1905+
>>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan, 10, True])
19051906
>>> s
19061907
0 1. Ant.
19071908
1 2. Bee!\n
19081909
2 3. Cat?\t
19091910
3 NaN
1911+
4 10
1912+
5 True
19101913
dtype: object
19111914
19121915
>>> s.str.strip()
19131916
0 1. Ant.
19141917
1 2. Bee!
19151918
2 3. Cat?
19161919
3 NaN
1920+
4 NaN
1921+
5 NaN
19171922
dtype: object
19181923
19191924
>>> s.str.lstrip('123.')
19201925
0 Ant.
19211926
1 Bee!\n
19221927
2 Cat?\t
19231928
3 NaN
1929+
4 NaN
1930+
5 NaN
19241931
dtype: object
19251932
19261933
>>> s.str.rstrip('.!? \n\t')
19271934
0 1. Ant
19281935
1 2. Bee
19291936
2 3. Cat
19301937
3 NaN
1938+
4 NaN
1939+
5 NaN
19311940
dtype: object
19321941
19331942
>>> s.str.strip('123.!? \n\t')
19341943
0 Ant
19351944
1 Bee
19361945
2 Cat
19371946
3 NaN
1947+
4 NaN
1948+
5 NaN
19381949
dtype: object
19391950
"""
19401951

pandas/io/sql.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -748,9 +748,9 @@ def pandasSQL_builder(con, schema: str | None = None):
748748
return SQLDatabase(con, schema=schema)
749749

750750
warnings.warn(
751-
"pandas only support SQLAlchemy connectable(engine/connection) or"
752-
"database string URI or sqlite3 DBAPI2 connection"
753-
"other DBAPI2 objects are not tested, please consider using SQLAlchemy",
751+
"pandas only supports SQLAlchemy connectable (engine/connection) or "
752+
"database string URI or sqlite3 DBAPI2 connection. "
753+
"Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.",
754754
UserWarning,
755755
)
756756
return SQLiteDatabase(con)

pandas/tests/arrays/integer/test_arithmetic.py

+9
Original file line numberDiff line numberDiff line change
@@ -314,3 +314,12 @@ def test_unary_int_operators(any_signed_int_ea_dtype, source, neg_target, abs_ta
314314
tm.assert_extension_array_equal(pos_result, arr)
315315
assert not tm.shares_memory(pos_result, arr)
316316
tm.assert_extension_array_equal(abs_result, abs_target)
317+
318+
319+
def test_values_multiplying_large_series_by_NA():
320+
# GH#33701
321+
322+
result = pd.NA * pd.Series(np.zeros(10001))
323+
expected = pd.Series([pd.NA] * 10001)
324+
325+
tm.assert_series_equal(result, expected)

pandas/tests/frame/methods/test_replace.py

+1
Original file line numberDiff line numberDiff line change
@@ -891,6 +891,7 @@ def test_replace_input_formats_scalar(self):
891891
tm.assert_frame_equal(result, expected)
892892

893893
def test_replace_limit(self):
894+
# TODO
894895
pass
895896

896897
def test_replace_dict_no_regex(self):

pandas/tests/frame/methods/test_reset_index.py

+26-23
Original file line numberDiff line numberDiff line change
@@ -201,39 +201,42 @@ def test_reset_index_name(self):
201201
assert return_value is None
202202
assert df.index.name is None
203203

204-
def test_reset_index_level(self):
204+
@pytest.mark.parametrize("levels", [["A", "B"], [0, 1]])
205+
def test_reset_index_level(self, levels):
205206
df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "C", "D"])
206207

207-
for levels in ["A", "B"], [0, 1]:
208-
# With MultiIndex
209-
result = df.set_index(["A", "B"]).reset_index(level=levels[0])
210-
tm.assert_frame_equal(result, df.set_index("B"))
208+
# With MultiIndex
209+
result = df.set_index(["A", "B"]).reset_index(level=levels[0])
210+
tm.assert_frame_equal(result, df.set_index("B"))
211211

212-
result = df.set_index(["A", "B"]).reset_index(level=levels[:1])
213-
tm.assert_frame_equal(result, df.set_index("B"))
212+
result = df.set_index(["A", "B"]).reset_index(level=levels[:1])
213+
tm.assert_frame_equal(result, df.set_index("B"))
214214

215-
result = df.set_index(["A", "B"]).reset_index(level=levels)
216-
tm.assert_frame_equal(result, df)
215+
result = df.set_index(["A", "B"]).reset_index(level=levels)
216+
tm.assert_frame_equal(result, df)
217217

218-
result = df.set_index(["A", "B"]).reset_index(level=levels, drop=True)
219-
tm.assert_frame_equal(result, df[["C", "D"]])
218+
result = df.set_index(["A", "B"]).reset_index(level=levels, drop=True)
219+
tm.assert_frame_equal(result, df[["C", "D"]])
220220

221-
# With single-level Index (GH 16263)
222-
result = df.set_index("A").reset_index(level=levels[0])
223-
tm.assert_frame_equal(result, df)
221+
# With single-level Index (GH 16263)
222+
result = df.set_index("A").reset_index(level=levels[0])
223+
tm.assert_frame_equal(result, df)
224224

225-
result = df.set_index("A").reset_index(level=levels[:1])
226-
tm.assert_frame_equal(result, df)
225+
result = df.set_index("A").reset_index(level=levels[:1])
226+
tm.assert_frame_equal(result, df)
227227

228-
result = df.set_index(["A"]).reset_index(level=levels[0], drop=True)
229-
tm.assert_frame_equal(result, df[["B", "C", "D"]])
228+
result = df.set_index(["A"]).reset_index(level=levels[0], drop=True)
229+
tm.assert_frame_equal(result, df[["B", "C", "D"]])
230230

231+
@pytest.mark.parametrize("idx_lev", [["A", "B"], ["A"]])
232+
def test_reset_index_level_missing(self, idx_lev):
231233
# Missing levels - for both MultiIndex and single-level Index:
232-
for idx_lev in ["A", "B"], ["A"]:
233-
with pytest.raises(KeyError, match=r"(L|l)evel \(?E\)?"):
234-
df.set_index(idx_lev).reset_index(level=["A", "E"])
235-
with pytest.raises(IndexError, match="Too many levels"):
236-
df.set_index(idx_lev).reset_index(level=[0, 1, 2])
234+
df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "C", "D"])
235+
236+
with pytest.raises(KeyError, match=r"(L|l)evel \(?E\)?"):
237+
df.set_index(idx_lev).reset_index(level=["A", "E"])
238+
with pytest.raises(IndexError, match="Too many levels"):
239+
df.set_index(idx_lev).reset_index(level=[0, 1, 2])
237240

238241
def test_reset_index_right_dtype(self):
239242
time = np.arange(0.0, 10, np.sqrt(2) / 2)

pandas/tests/frame/methods/test_shift.py

+17-13
Original file line numberDiff line numberDiff line change
@@ -92,17 +92,17 @@ def test_shift_int(self, datetime_frame, frame_or_series):
9292
expected = ts.astype(float).shift(1)
9393
tm.assert_equal(shifted, expected)
9494

95-
def test_shift_32bit_take(self, frame_or_series):
95+
@pytest.mark.parametrize("dtype", ["int32", "int64"])
96+
def test_shift_32bit_take(self, frame_or_series, dtype):
9697
# 32-bit taking
9798
# GH#8129
9899
index = date_range("2000-01-01", periods=5)
99-
for dtype in ["int32", "int64"]:
100-
arr = np.arange(5, dtype=dtype)
101-
s1 = frame_or_series(arr, index=index)
102-
p = arr[1]
103-
result = s1.shift(periods=p)
104-
expected = frame_or_series([np.nan, 0, 1, 2, 3], index=index)
105-
tm.assert_equal(result, expected)
100+
arr = np.arange(5, dtype=dtype)
101+
s1 = frame_or_series(arr, index=index)
102+
p = arr[1]
103+
result = s1.shift(periods=p)
104+
expected = frame_or_series([np.nan, 0, 1, 2, 3], index=index)
105+
tm.assert_equal(result, expected)
106106

107107
@pytest.mark.parametrize("periods", [1, 2, 3, 4])
108108
def test_shift_preserve_freqstr(self, periods, frame_or_series):
@@ -141,11 +141,15 @@ def test_shift_dst(self, frame_or_series):
141141
tm.assert_equal(res, exp)
142142
assert tm.get_dtype(res) == "datetime64[ns, US/Eastern]"
143143

144-
for ex in [10, -10, 20, -20]:
145-
res = obj.shift(ex)
146-
exp = frame_or_series([NaT] * 10, dtype="datetime64[ns, US/Eastern]")
147-
tm.assert_equal(res, exp)
148-
assert tm.get_dtype(res) == "datetime64[ns, US/Eastern]"
144+
@pytest.mark.parametrize("ex", [10, -10, 20, -20])
145+
def test_shift_dst_beyond(self, frame_or_series, ex):
146+
# GH#13926
147+
dates = date_range("2016-11-06", freq="H", periods=10, tz="US/Eastern")
148+
obj = frame_or_series(dates)
149+
res = obj.shift(ex)
150+
exp = frame_or_series([NaT] * 10, dtype="datetime64[ns, US/Eastern]")
151+
tm.assert_equal(res, exp)
152+
assert tm.get_dtype(res) == "datetime64[ns, US/Eastern]"
149153

150154
def test_shift_by_zero(self, datetime_frame, frame_or_series):
151155
# shift by 0

pandas/tests/frame/test_api.py

+9-5
Original file line numberDiff line numberDiff line change
@@ -194,15 +194,19 @@ def test_empty_nonzero(self):
194194
df = DataFrame(index=["a", "b"], columns=["c", "d"]).dropna()
195195
assert df.empty
196196
assert df.T.empty
197-
empty_frames = [
197+
198+
@pytest.mark.parametrize(
199+
"df",
200+
[
198201
DataFrame(),
199202
DataFrame(index=[1]),
200203
DataFrame(columns=[1]),
201204
DataFrame({1: []}),
202-
]
203-
for df in empty_frames:
204-
assert df.empty
205-
assert df.T.empty
205+
],
206+
)
207+
def test_empty_like(self, df):
208+
assert df.empty
209+
assert df.T.empty
206210

207211
def test_with_datetimelikes(self):
208212

pandas/tests/reshape/merge/test_join.py

+41
Original file line numberDiff line numberDiff line change
@@ -881,3 +881,44 @@ def test_join_multiindex_not_alphabetical_categorical(categories, values):
881881
}
882882
).set_index(["first", "second"])
883883
tm.assert_frame_equal(result, expected)
884+
885+
886+
@pytest.mark.parametrize(
887+
"left_empty, how, exp",
888+
[
889+
(False, "left", "left"),
890+
(False, "right", "empty"),
891+
(False, "inner", "empty"),
892+
(False, "outer", "left"),
893+
(False, "cross", "empty"),
894+
(True, "left", "empty"),
895+
(True, "right", "right"),
896+
(True, "inner", "empty"),
897+
(True, "outer", "right"),
898+
(True, "cross", "empty"),
899+
],
900+
)
901+
def test_join_empty(left_empty, how, exp):
902+
903+
left = DataFrame({"A": [2, 1], "B": [3, 4]}, dtype="int64").set_index("A")
904+
right = DataFrame({"A": [1], "C": [5]}, dtype="int64").set_index("A")
905+
906+
if left_empty:
907+
left = left.head(0)
908+
else:
909+
right = right.head(0)
910+
911+
result = left.join(right, how=how)
912+
913+
if exp == "left":
914+
expected = DataFrame({"A": [2, 1], "B": [3, 4], "C": [np.nan, np.nan]})
915+
expected = expected.set_index("A")
916+
elif exp == "right":
917+
expected = DataFrame({"B": [np.nan], "A": [1], "C": [5]})
918+
expected = expected.set_index("A")
919+
elif exp == "empty":
920+
expected = DataFrame(index=Index([]), columns=["B", "C"], dtype="int64")
921+
if how != "cross":
922+
expected = expected.rename_axis("A")
923+
924+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)