Skip to content

Commit 4582c1c

Browse files
REGR: fix isin for large series with nan and mixed object dtype (causing regression in read_csv) (#37499)
1 parent d8c8cbb commit 4582c1c

File tree

4 files changed

+27
-1
lines changed

4 files changed

+27
-1
lines changed

doc/source/whatsnew/v1.1.4.rst

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ including other versions of pandas.
1515
Fixed regressions
1616
~~~~~~~~~~~~~~~~~
1717
- Fixed regression in :func:`read_csv` raising a ``ValueError`` when ``names`` was of type ``dict_keys`` (:issue:`36928`)
18+
- Fixed regression in :func:`read_csv` with more than 1M rows and specifying an ``index_col`` argument (:issue:`37094`)
1819
- Fixed regression where attempting to mutate a :class:`DateOffset` object would no longer raise an ``AttributeError`` (:issue:`36940`)
1920
- Fixed regression where :meth:`DataFrame.agg` would fail with :exc:`TypeError` when passed positional arguments to be passed on to the aggregation function (:issue:`36948`).
2021
- Fixed regression in :class:`RollingGroupby` with ``sort=False`` not being respected (:issue:`36889`)

pandas/core/algorithms.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -441,7 +441,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
441441
if len(comps) > 1_000_000 and not is_object_dtype(comps):
442442
# If the values include nan we need to check for nan explicitly
443443
# since np.nan is not equal to np.nan
444-
if np.isnan(values).any():
444+
if isna(values).any():
445445
f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c))
446446
else:
447447
f = np.in1d

pandas/tests/io/parser/test_index_col.py

+15
Original file line numberDiff line numberDiff line change
@@ -207,3 +207,18 @@ def test_header_with_index_col(all_parsers):
207207

208208
result = parser.read_csv(StringIO(data), index_col="I11", header=0)
209209
tm.assert_frame_equal(result, expected)
210+
211+
212+
@pytest.mark.slow
213+
def test_index_col_large_csv(all_parsers):
214+
# https://github.com/pandas-dev/pandas/issues/37094
215+
parser = all_parsers
216+
217+
N = 1_000_001
218+
df = DataFrame({"a": range(N), "b": np.random.randn(N)})
219+
220+
with tm.ensure_clean() as path:
221+
df.to_csv(path, index=False)
222+
result = parser.read_csv(path, index_col=[0])
223+
224+
tm.assert_frame_equal(result, df.set_index("a"))

pandas/tests/series/methods/test_isin.py

+10
Original file line numberDiff line numberDiff line change
@@ -89,3 +89,13 @@ def test_isin_read_only(self):
8989
result = s.isin(arr)
9090
expected = Series([True, True, True])
9191
tm.assert_series_equal(result, expected)
92+
93+
94+
@pytest.mark.slow
95+
def test_isin_large_series_mixed_dtypes_and_nan():
96+
# https://github.com/pandas-dev/pandas/issues/37094
97+
# combination of object dtype for the values and > 1_000_000 elements
98+
ser = Series([1, 2, np.nan] * 1_000_000)
99+
result = ser.isin({"foo", "bar"})
100+
expected = Series([False] * 3 * 1_000_000)
101+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)