From a62e4c0d42eba9c5a1a3161848818c7410f4405f Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Thu, 29 Oct 2020 22:32:07 +0100
Subject: [PATCH 1/4] REGR: fix isin for large series with nan and mixed object
 dtype (causing regression in read_csv)

---
 pandas/core/algorithms.py                |  6 +++++-
 pandas/tests/series/methods/test_isin.py | 10 ++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 9a3144d1ccbaa..317c9ac18dd79 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -438,7 +438,11 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
 
     # GH16012
     # Ensure np.in1d doesn't get object types or it *may* throw an exception
-    if len(comps) > 1_000_000 and not is_object_dtype(comps):
+    if (
+        len(comps) > 1_000_000
+        and not is_object_dtype(comps)
+        and not is_object_dtype(values)
+    ):
         # If the the values include nan we need to check for nan explicitly
         # since np.nan it not equal to np.nan
         if np.isnan(values).any():
diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py
index 62766c692f4df..7d89b083a809b 100644
--- a/pandas/tests/series/methods/test_isin.py
+++ b/pandas/tests/series/methods/test_isin.py
@@ -89,3 +89,13 @@ def test_isin_read_only(self):
         result = s.isin(arr)
         expected = Series([True, True, True])
         tm.assert_series_equal(result, expected)
+
+
+def test_isin_large_series_mixed_dtypes_and_nan():
+    # https://github.com/pandas-dev/pandas/issues/37094
+    # combination of object dtype for the values, > 1_000_000 elements
+    # and the presence of NaNs in the Series
+    ser = Series([1, 2, np.nan] * 1_000_000)
+    result = ser.isin({"foo", "bar"})
+    expected = Series([False] * 3 * 1_000_000)
+    tm.assert_series_equal(result, expected)

From d516b43ef07430d4cd4d57cf3dfb167dabc323dc Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Thu, 29 Oct 2020 22:37:53 +0100
Subject: [PATCH 2/4] add whatsnew

---
 doc/source/whatsnew/v1.1.4.rst           | 1 +
 pandas/tests/series/methods/test_isin.py | 3 +--
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst
index 6cb728800dc68..277edd15a9322 100644
--- a/doc/source/whatsnew/v1.1.4.rst
+++ b/doc/source/whatsnew/v1.1.4.rst
@@ -15,6 +15,7 @@ including other versions of pandas.
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
 - Fixed regression in :func:`read_csv` raising a ``ValueError`` when ``names`` was of type ``dict_keys`` (:issue:`36928`)
+- Fixed regression in :func:`read_csv` with more than 1M rows and specifying an ``index_col`` argument (:issue:`37094`)
 - Fixed regression where attempting to mutate a :class:`DateOffset` object would no longer raise an ``AttributeError`` (:issue:`36940`)
 - Fixed regression where :meth:`DataFrame.agg` would fail with :exc:`TypeError` when passed positional arguments to be passed on to the aggregation function (:issue:`36948`).
 - Fixed regression in :class:`RollingGroupby` with ``sort=False`` not being respected (:issue:`36889`)
diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py
index 7d89b083a809b..419b7b65a5aa2 100644
--- a/pandas/tests/series/methods/test_isin.py
+++ b/pandas/tests/series/methods/test_isin.py
@@ -93,8 +93,7 @@ def test_isin_read_only(self):
 
 def test_isin_large_series_mixed_dtypes_and_nan():
     # https://github.com/pandas-dev/pandas/issues/37094
-    # combination of object dtype for the values, > 1_000_000 elements
-    # and the presence of NaNs in the Series
+    # combination of object dtype for the valuesa and > 1_000_000 elements
     ser = Series([1, 2, np.nan] * 1_000_000)
     result = ser.isin({"foo", "bar"})
     expected = Series([False] * 3 * 1_000_000)

From 520cf1013823dd7d167245c844c1f9ef36802cea Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Thu, 29 Oct 2020 22:51:23 +0100
Subject: [PATCH 3/4] also add csv test

---
 pandas/tests/io/parser/test_index_col.py | 15 +++++++++++++++
 pandas/tests/series/methods/test_isin.py |  1 +
 2 files changed, 16 insertions(+)

diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py
index 4d64f2bf411bd..9c6cad4b41949 100644
--- a/pandas/tests/io/parser/test_index_col.py
+++ b/pandas/tests/io/parser/test_index_col.py
@@ -207,3 +207,18 @@ def test_header_with_index_col(all_parsers):
 
     result = parser.read_csv(StringIO(data), index_col="I11", header=0)
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.slow
+def test_index_col_large_csv(all_parsers):
+    # https://github.com/pandas-dev/pandas/issues/37094
+    parser = all_parsers
+
+    N = 1_000_001
+    df = DataFrame({"a": range(N), "b": np.random.randn(N)})
+
+    with tm.ensure_clean() as path:
+        df.to_csv(path, index=False)
+        result = parser.read_csv(path, index_col=[0])
+
+    tm.assert_frame_equal(result, df.set_index("a"))
diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py
index 419b7b65a5aa2..33d02379e5ee6 100644
--- a/pandas/tests/series/methods/test_isin.py
+++ b/pandas/tests/series/methods/test_isin.py
@@ -91,6 +91,7 @@ def test_isin_read_only(self):
         tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.slow
 def test_isin_large_series_mixed_dtypes_and_nan():
     # https://github.com/pandas-dev/pandas/issues/37094
     # combination of object dtype for the valuesa and > 1_000_000 elements

From 93c79cce89badfcefcd1da7330d5f4b7a0ec04f8 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 30 Oct 2020 09:23:57 +0100
Subject: [PATCH 4/4] use pd.isna instead of np.isnan

---
 pandas/core/algorithms.py                | 8 ++------
 pandas/tests/series/methods/test_isin.py | 2 +-
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 317c9ac18dd79..a310ec5312cf4 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -438,14 +438,10 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
 
     # GH16012
     # Ensure np.in1d doesn't get object types or it *may* throw an exception
-    if (
-        len(comps) > 1_000_000
-        and not is_object_dtype(comps)
-        and not is_object_dtype(values)
-    ):
+    if len(comps) > 1_000_000 and not is_object_dtype(comps):
         # If the the values include nan we need to check for nan explicitly
         # since np.nan it not equal to np.nan
-        if np.isnan(values).any():
+        if isna(values).any():
             f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c))
         else:
             f = np.in1d
diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py
index 33d02379e5ee6..86ea2b2f02a4d 100644
--- a/pandas/tests/series/methods/test_isin.py
+++ b/pandas/tests/series/methods/test_isin.py
@@ -94,7 +94,7 @@ def test_isin_read_only(self):
 @pytest.mark.slow
 def test_isin_large_series_mixed_dtypes_and_nan():
     # https://github.com/pandas-dev/pandas/issues/37094
-    # combination of object dtype for the valuesa and > 1_000_000 elements
+    # combination of object dtype for the values and > 1_000_000 elements
     ser = Series([1, 2, np.nan] * 1_000_000)
     result = ser.isin({"foo", "bar"})
     expected = Series([False] * 3 * 1_000_000)