From f22ff466b510d13b323c5e483cdeecbbf739dd4e Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 8 Feb 2020 20:24:35 -0800 Subject: [PATCH 01/95] add arrow engine to read_csv --- pandas/io/parsers.py | 132 +++++++++++++++++++++++++++++++------------ 1 file changed, 97 insertions(+), 35 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 84a8b5b2a94fe..f5c00f3f7d137 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -20,6 +20,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import FilePathOrBuffer +from pandas.compat._optional import import_optional_dependency from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -165,9 +166,10 @@ to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. -engine : {{'c', 'python'}}, optional - Parser engine to use. The C engine is faster while the python engine is - currently more feature-complete. +engine : {{'c', 'python', 'arrow'}}, optional + Parser engine to use. The C and arrow engines are faster while the python engine is + currently more feature-complete. The arrow engine requires ``pyarrow`` + as a dependency however. converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. 
@@ -506,7 +508,6 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skip_blank_lines": True, } - _c_parser_defaults = { "delim_whitespace": False, "na_filter": True, @@ -520,6 +521,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} +_arrow_unsupported = {"skipfooter", "low_memory", "float_precision"} _python_unsupported = {"low_memory", "float_precision"} _deprecated_defaults: Dict[str, Any] = {} @@ -705,7 +707,6 @@ def read_fwf( infer_nrows=100, **kwds, ): - r""" Read a table of fixed-width formatted lines into DataFrame. @@ -879,7 +880,8 @@ def __init__(self, f, engine=None, **kwds): self._make_engine(self.engine) def close(self): - self._engine.close() + if self.engine != "arrow": + self._engine.close() def _get_options_with_defaults(self, engine): kwds = self.orig_options @@ -945,16 +947,16 @@ def _clean_options(self, options, engine): delim_whitespace = options["delim_whitespace"] # C engine not supported yet - if engine == "c": + if engine == "c" or engine == "arrow": if options["skipfooter"] > 0: - fallback_reason = "the 'c' engine does not support skipfooter" + fallback_reason = f"the {engine} engine does not support skipfooter" engine = "python" encoding = sys.getfilesystemencoding() or "utf-8" if sep is None and not delim_whitespace: - if engine == "c": + if engine == "c" or engine == "arrow": fallback_reason = ( - "the 'c' engine does not support " + f"the {engine} engine does not support " "sep=None with delim_whitespace=False" ) engine = "python" @@ -1081,14 +1083,20 @@ def _clean_options(self, options, engine): na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) # handle skiprows; this is internally handled by the - # c-engine, so only need for python parsers + # c-engine, so only need for python parser if engine != "c": - if is_integer(skiprows): - skiprows = list(range(skiprows)) - if skiprows is None: - 
skiprows = set() - elif not callable(skiprows): - skiprows = set(skiprows) + if engine == "arrow": + if not is_integer(skiprows) and skiprows is not None: + raise ValueError( + "skiprows argument must be integer when using arrow engine" + ) + else: + if is_integer(skiprows): + skiprows = list(range(skiprows)) + if skiprows is None: + skiprows = set() + elif not callable(skiprows): + skiprows = set(skiprows) # put stuff back result["names"] = names @@ -1109,6 +1117,8 @@ def __next__(self): def _make_engine(self, engine="c"): if engine == "c": self._engine = CParserWrapper(self.f, **self.options) + elif engine == "arrow": + self._engine = ArrowParserWrapper(self.f, **self.options) else: if engine == "python": klass = PythonParser @@ -1125,29 +1135,32 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - nrows = _validate_integer("nrows", nrows) - ret = self._engine.read(nrows) + if self.engine == "arrow": + return self._engine.read(nrows) + else: + nrows = _validate_integer("nrows", nrows) + ret = self._engine.read(nrows) - # May alter columns / col_dict - index, columns, col_dict = self._create_index(ret) + # May alter columns / col_dict + index, columns, col_dict = self._create_index(ret) - if index is None: - if col_dict: - # Any column is actually fine: - new_rows = len(next(iter(col_dict.values()))) - index = RangeIndex(self._currow, self._currow + new_rows) + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) + else: + new_rows = 0 else: - new_rows = 0 - else: - new_rows = len(index) + new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + df = DataFrame(col_dict, columns=columns, index=index) - self._currow += new_rows + self._currow += new_rows - if self.squeeze and len(df.columns) == 1: - return df[df.columns[0]].copy() - return df + if self.squeeze and len(df.columns) 
== 1: + return df[df.columns[0]].copy() + return df def _create_index(self, ret): index, columns, col_dict = ret @@ -2135,6 +2148,56 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): return values +class ArrowParserWrapper(ParserBase): + """ + + """ + + def __init__(self, src, **kwds): + self.kwds = kwds + self.src = src + kwds = kwds.copy() + + ParserBase.__init__(self, kwds) + + # #2442 + kwds["allow_leading_cols"] = self.index_col is not False + + # GH20529, validate usecol arg before TextReader + self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) + kwds["usecols"] = self.usecols + + self.names = kwds["names"] + + def read(self, nrows=None): + pyarrow = import_optional_dependency( + "pyarrow.csv", extra="pyarrow is required to use arrow engine" + ) + nrows = _validate_integer("nrows", nrows) + table = pyarrow.read_csv( + self.src, + read_options=pyarrow.ReadOptions( + skip_rows=self.kwds.get("skiprows"), column_names=self.names + ), + parse_options=pyarrow.ParseOptions( + delimiter=self.kwds.get("delimiter"), + quote_char=self.kwds.get("quotechar"), + ), + convert_options=pyarrow.ConvertOptions( + include_columns=self.usecols, column_types=self.kwds.get("dtype") + ), + ) + if nrows: + table = table[:nrows] + table_width = len(table.column_names) + if self.names is None: + if self.prefix: + self.names = [f"{self.prefix}{i}" for i in range(table_width)] + if self.names: + table = table.rename_columns(self.names) + return table.to_pandas() + + def TextParser(*args, **kwds): """ Converts lists of lists/tuples into DataFrames with proper type inference @@ -3336,7 +3399,6 @@ def _try_convert_dates(parser, colspec, data_dict, columns): def _clean_na_values(na_values, keep_default_na=True): - if na_values is None: if keep_default_na: na_values = STR_NA_VALUES From 8ae43e44cdbec134771173b69a5d4c1a2400504f Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 8 Feb 2020 21:01:26 -0800 Subject: [PATCH 02/95] fix failing test --- 
pandas/io/parsers.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f5c00f3f7d137..75da1d991dc9b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1135,7 +1135,7 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - if self.engine == "arrow": + if isinstance(self._engine, ArrowParserWrapper): return self._engine.read(nrows) else: nrows = _validate_integer("nrows", nrows) @@ -2165,9 +2165,6 @@ def __init__(self, src, **kwds): # GH20529, validate usecol arg before TextReader self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) - kwds["usecols"] = self.usecols - - self.names = kwds["names"] def read(self, nrows=None): pyarrow = import_optional_dependency( From 09074df84e42eec3e7f7dd1ae7c710af53b386cc Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 9 Feb 2020 10:01:55 -0800 Subject: [PATCH 03/95] formatting and revert unnecessary change --- pandas/io/parsers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 75da1d991dc9b..ad60b223daa06 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -167,7 +167,7 @@ If converters are specified, they will be applied INSTEAD of dtype conversion. engine : {{'c', 'python', 'arrow'}}, optional - Parser engine to use. The C and arrow engines are faster while the python engine is + Parser engine to use. The C and arrow engines are faster, while the python engine is currently more feature-complete. The arrow engine requires ``pyarrow`` as a dependency however. 
converters : dict, optional @@ -508,6 +508,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skip_blank_lines": True, } + _c_parser_defaults = { "delim_whitespace": False, "na_filter": True, From 6be276db8c7c5e1384bfb45591534176d2f6bfe5 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 9 Feb 2020 10:07:03 -0800 Subject: [PATCH 04/95] remove bloat and more formatting changes --- pandas/io/parsers.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index ad60b223daa06..6d8764fef385c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -881,8 +881,7 @@ def __init__(self, f, engine=None, **kwds): self._make_engine(self.engine) def close(self): - if self.engine != "arrow": - self._engine.close() + self._engine.close() def _get_options_with_defaults(self, engine): kwds = self.orig_options @@ -1089,7 +1088,7 @@ def _clean_options(self, options, engine): if engine == "arrow": if not is_integer(skiprows) and skiprows is not None: raise ValueError( - "skiprows argument must be integer when using arrow engine" + "skiprows argument must be an integer when using engine='arrow'" ) else: if is_integer(skiprows): From df4fa7e2ac359f7e25031f8f92d312049972d1ec Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 9 Feb 2020 10:25:25 -0800 Subject: [PATCH 05/95] Whatsnew --- doc/source/whatsnew/v1.1.0.rst | 4 +++- pandas/io/parsers.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 920919755dc23..2c4f5dcfbcde8 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -42,7 +42,9 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) -- +- :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing + if pyarrow>0.11 is installed. 
However, the pyarrow engine is less feature-complete than its "c" or + "python" counterparts. - .. --------------------------------------------------------------------------- diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 6d8764fef385c..938bafa780d89 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -708,6 +708,7 @@ def read_fwf( infer_nrows=100, **kwds, ): + r""" Read a table of fixed-width formatted lines into DataFrame. @@ -3396,6 +3397,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns): def _clean_na_values(na_values, keep_default_na=True): + if na_values is None: if keep_default_na: na_values = STR_NA_VALUES From ecaf3fd036d38dfd34e5d9a5de45304dbdfacca4 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 9 Feb 2020 16:35:32 -0800 Subject: [PATCH 06/95] Get tests up and running --- pandas/io/parsers.py | 12 +++++++----- pandas/tests/io/parser/conftest.py | 12 ++++++++++-- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 371660b19b171..43272ef2cf600 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -508,7 +508,6 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skip_blank_lines": True, } - _c_parser_defaults = { "delim_whitespace": False, "na_filter": True, @@ -522,7 +521,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} -_arrow_unsupported = {"skipfooter", "low_memory", "float_precision"} +_arrow_unsupported = {"skipfooter", "low_memory", "float_precision", "chunksize"} _python_unsupported = {"low_memory", "float_precision"} _deprecated_defaults: Dict[str, Any] = {} @@ -708,7 +707,6 @@ def read_fwf( infer_nrows=100, **kwds, ): - r""" Read a table of fixed-width formatted lines into DataFrame. 
@@ -947,7 +945,12 @@ def _clean_options(self, options, engine): sep = options["delimiter"] delim_whitespace = options["delim_whitespace"] - # C engine not supported yet + # arrow engine not supported yet + if engine == "arrow": + if options["chunksize"] is not None: + fallback_reason = f"the arrow engine does not support chunksize" + engine = "python" + # C and arrow engine not supported yet if engine == "c" or engine == "arrow": if options["skipfooter"] > 0: fallback_reason = f"the {engine} engine does not support skipfooter" @@ -3401,7 +3404,6 @@ def _try_convert_dates(parser, colspec, data_dict, columns): def _clean_na_values(na_values, keep_default_na=True): - if na_values is None: if keep_default_na: na_values = STR_NA_VALUES diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 15967e3be176a..751db1d22e8ae 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -44,6 +44,11 @@ class PythonParser(BaseParser): float_precision_choices = [None] +class ArrowParser(BaseParser): + engine = "arrow" + float_precision_choices = [None] + + @pytest.fixture def csv_dir_path(datapath): """ @@ -63,14 +68,17 @@ def csv1(csv_dir_path): _cParserHighMemory = CParserHighMemory() _cParserLowMemory = CParserLowMemory() _pythonParser = PythonParser() +_arrowParser = ArrowParser() _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_all_parsers = [*_c_parsers_only, *_py_parsers_only] +_arrow_parsers_only = [_arrowParser] +_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_arrow_parsers_only] _py_parser_ids = ["python"] _c_parser_ids = ["c_high", "c_low"] -_all_parser_ids = [*_c_parser_ids, *_py_parser_ids] +_arrow_parser_ids = ["arrow"] +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_arrow_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) From b3c328723bb997a675e31cd8db84d77d75afa4f7 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 10 
Feb 2020 07:26:58 -0800 Subject: [PATCH 07/95] Some fixes --- pandas/io/parsers.py | 45 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 43272ef2cf600..d3f40a6b9df2b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -947,7 +947,7 @@ def _clean_options(self, options, engine): # arrow engine not supported yet if engine == "arrow": - if options["chunksize"] is not None: + if self.chunksize is not None: fallback_reason = f"the arrow engine does not support chunksize" engine = "python" # C and arrow engine not supported yet @@ -1087,10 +1087,11 @@ def _clean_options(self, options, engine): na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) # handle skiprows; this is internally handled by the - # c-engine, so only need for python parser + # c-engine, so only need for python and arrow parsers if engine != "c": if engine == "arrow": if not is_integer(skiprows) and skiprows is not None: + # pyarrow expects skiprows to be passed as an integer raise ValueError( "skiprows argument must be an integer when using engine='arrow'" ) @@ -1131,7 +1132,7 @@ def _make_engine(self, engine="c"): else: raise ValueError( f"Unknown engine: {engine} (valid options " - 'are "c", "python", or "python-fwf")' + 'are "c", "python", "arrow", or "python-fwf")' ) self._engine = klass(self.f, **self.options) @@ -1139,32 +1140,31 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - if isinstance(self._engine, ArrowParserWrapper): + nrows = _validate_integer("nrows", nrows) + if self.engine == "arrow": return self._engine.read(nrows) - else: - nrows = _validate_integer("nrows", nrows) - ret = self._engine.read(nrows) + ret = self._engine.read(nrows) - # May alter columns / col_dict - index, columns, col_dict = self._create_index(ret) + # May alter columns / col_dict + index, columns, col_dict = self._create_index(ret) - 
if index is None: - if col_dict: - # Any column is actually fine: - new_rows = len(next(iter(col_dict.values()))) - index = RangeIndex(self._currow, self._currow + new_rows) - else: - new_rows = 0 + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) else: - new_rows = len(index) + new_rows = 0 + else: + new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + df = DataFrame(col_dict, columns=columns, index=index) - self._currow += new_rows + self._currow += new_rows - if self.squeeze and len(df.columns) == 1: - return df[df.columns[0]].copy() - return df + if self.squeeze and len(df.columns) == 1: + return df[df.columns[0]].copy() + return df def _create_index(self, ret): index, columns, col_dict = ret @@ -2178,7 +2178,6 @@ def read(self, nrows=None): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use arrow engine" ) - nrows = _validate_integer("nrows", nrows) table = pyarrow.read_csv( self.src, read_options=pyarrow.ReadOptions( From 474baf4c83ee28330ef38b426f09617d2f8cfc9e Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 10 Feb 2020 20:35:38 -0800 Subject: [PATCH 08/95] Add asvs and xfail some tests --- asv_bench/benchmarks/io/csv.py | 10 ++++++++++ pandas/io/parsers.py | 8 +++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 9bcd125f56bbb..89c81a937090b 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -254,6 +254,16 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): names=list("abc"), ) + def time_read_csv_arrow_engine(self, sep, decimal, float_precision): + read_csv( + self.data(self.StringIO_input), + sep=sep, + header=None, + engine="arrow", + float_precision=None, + names=list("abc"), + ) + class ReadCSVCategorical(BaseIO): diff --git 
a/pandas/io/parsers.py b/pandas/io/parsers.py index d3f40a6b9df2b..dd2155d2d735b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -521,7 +521,13 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} -_arrow_unsupported = {"skipfooter", "low_memory", "float_precision", "chunksize"} +_arrow_unsupported = { + "skipfooter", + "low_memory", + "float_precision", + "chunksize", + "comment", +} _python_unsupported = {"low_memory", "float_precision"} _deprecated_defaults: Dict[str, Any] = {} From 2cd993771b6c07a8144c8472c710e164410c8e37 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 19 Feb 2020 16:57:52 -0800 Subject: [PATCH 09/95] address comments --- asv_bench/benchmarks/io/csv.py | 4 +- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/io/parsers.py | 63 +++++++++++++++++++----------- pandas/tests/io/parser/conftest.py | 14 +++---- 4 files changed, 50 insertions(+), 33 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 89c81a937090b..a4e6f94f326ba 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -254,12 +254,12 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): names=list("abc"), ) - def time_read_csv_arrow_engine(self, sep, decimal, float_precision): + def time_read_csv_pyarrow_engine(self, sep, decimal, float_precision): read_csv( self.data(self.StringIO_input), sep=sep, header=None, - engine="arrow", + engine="pyarrow", float_precision=None, names=list("abc"), ) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index fc0e486978ffb..297c561557053 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -43,7 +43,7 @@ Other enhancements - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) - :func:`pandas.read_csv` now accepts 
engine="arrow" as an argument, allowing for faster csv parsing - if pyarrow>0.11 is installed. However, the pyarrow engine is less feature-complete than its "c" or + if pyarrow>0.13 is installed. However, the pyarrow engine is less feature-complete than its "c" or "python" counterparts. - diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dd2155d2d735b..59678d675b0b1 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,7 +5,7 @@ from collections import abc, defaultdict import csv import datetime -from io import BufferedIOBase, RawIOBase, StringIO, TextIOWrapper +from io import BufferedIOBase, BytesIO, RawIOBase, StringIO, TextIOWrapper import re import sys from textwrap import fill @@ -166,10 +166,11 @@ to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. -engine : {{'c', 'python', 'arrow'}}, optional - Parser engine to use. The C and arrow engines are faster, while the python engine is - currently more feature-complete. The arrow engine requires ``pyarrow`` +engine : {{'c', 'python', 'pyarrow'}}, optional + Parser engine to use. The C and pyarrow engines are faster, while the python engine + is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 as a dependency however. + .. versionchanged(1.1) converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. 
@@ -521,9 +522,8 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} -_arrow_unsupported = { +_pyarrow_unsupported = { "skipfooter", - "low_memory", "float_precision", "chunksize", "comment", @@ -951,20 +951,29 @@ def _clean_options(self, options, engine): sep = options["delimiter"] delim_whitespace = options["delim_whitespace"] - # arrow engine not supported yet - if engine == "arrow": - if self.chunksize is not None: - fallback_reason = f"the arrow engine does not support chunksize" - engine = "python" - # C and arrow engine not supported yet - if engine == "c" or engine == "arrow": + # pyarrow engine not supported yet + if engine == "pyarrow": + for option in _pyarrow_unsupported: + if option != "chunksize" and option != "skipfooter": + if options[option] is not None: + fallback_reason = ( + f"the pyarrow engine does not support the {option} argumnet" + ) + engine = "python" + else: + if self.chunksize is not None: + fallback_reason = ( + "the pyarrow engine does not support using chunksize" + ) + # C and pyarrow engine not supported yet + if engine == "c" or "pyarrow": if options["skipfooter"] > 0: fallback_reason = f"the {engine} engine does not support skipfooter" engine = "python" encoding = sys.getfilesystemencoding() or "utf-8" if sep is None and not delim_whitespace: - if engine == "c" or engine == "arrow": + if engine == "c" or engine == "pyarrow": fallback_reason = ( f"the {engine} engine does not support " "sep=None with delim_whitespace=False" @@ -1093,13 +1102,14 @@ def _clean_options(self, options, engine): na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) # handle skiprows; this is internally handled by the - # c-engine, so only need for python and arrow parsers + # c-engine, so only need for python and pyarrow parsers if engine != "c": - if engine == "arrow": + if engine == "pyarrow": if not is_integer(skiprows) and 
skiprows is not None: # pyarrow expects skiprows to be passed as an integer raise ValueError( - "skiprows argument must be an integer when using engine='arrow'" + "skiprows argument must be an integer when using " + "engine='pyarrow'" ) else: if is_integer(skiprows): @@ -2164,7 +2174,7 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): class ArrowParserWrapper(ParserBase): """ - + Wrapper for the pyarrow engine for pd.read_csv() """ def __init__(self, src, **kwds): @@ -2174,12 +2184,13 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - # #2442 - kwds["allow_leading_cols"] = self.index_col is not False + encoding = kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8" - # GH20529, validate usecol arg before TextReader self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) + if isinstance(self.src, StringIO): + self.src = BytesIO(self.src.getvalue().encode(encoding)) + def read(self, nrows=None): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use arrow engine" @@ -2197,12 +2208,18 @@ def read(self, nrows=None): include_columns=self.usecols, column_types=self.kwds.get("dtype") ), ) - if nrows: - table = table[:nrows] + table_width = len(table.column_names) if self.names is None: if self.prefix: self.names = [f"{self.prefix}{i}" for i in range(table_width)] + elif self.header is not None: + if self.header == "infer": + header = 0 + else: + header = self.header + self.names = table[header] + del table[header] if self.names: table = table.rename_columns(self.names) return table.to_pandas() diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 751db1d22e8ae..327f87303aeb0 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -44,8 +44,8 @@ class PythonParser(BaseParser): float_precision_choices = [None] -class ArrowParser(BaseParser): - engine = "arrow" +class PyArrowParser(BaseParser): + 
engine = "pyarrow" float_precision_choices = [None] @@ -68,17 +68,17 @@ def csv1(csv_dir_path): _cParserHighMemory = CParserHighMemory() _cParserLowMemory = CParserLowMemory() _pythonParser = PythonParser() -_arrowParser = ArrowParser() +_pyarrowParser = PyArrowParser() _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_arrow_parsers_only = [_arrowParser] -_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_arrow_parsers_only] +_pyarrow_parsers_only = [_pyarrowParser] +_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _py_parser_ids = ["python"] _c_parser_ids = ["c_high", "c_low"] -_arrow_parser_ids = ["arrow"] -_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_arrow_parser_ids] +_pyarrow_parser_ids = ["pyarrow"] +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) From 3d15a5660d7779eb7638875a33882b3e9103b190 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 20 Feb 2020 10:57:11 -0800 Subject: [PATCH 10/95] fix typo --- pandas/io/parsers.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 59678d675b0b1..4d31ca3230df6 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1138,7 +1138,7 @@ def __next__(self): def _make_engine(self, engine="c"): if engine == "c": self._engine = CParserWrapper(self.f, **self.options) - elif engine == "arrow": + elif engine == "pyarrow": self._engine = ArrowParserWrapper(self.f, **self.options) else: if engine == "python": @@ -1157,7 +1157,7 @@ def _failover_to_python(self): def read(self, nrows=None): nrows = _validate_integer("nrows", nrows) - if self.engine == "arrow": + if self.engine == "pyarrow": return self._engine.read(nrows) ret = self._engine.read(nrows) @@ -2208,21 +2208,19 @@ def read(self, nrows=None): include_columns=self.usecols, 
column_types=self.kwds.get("dtype") ), ) - + frame = table.to_pandas() table_width = len(table.column_names) if self.names is None: if self.prefix: self.names = [f"{self.prefix}{i}" for i in range(table_width)] - elif self.header is not None: - if self.header == "infer": - header = 0 - else: - header = self.header - self.names = table[header] - del table[header] + elif self.header is not None and self.header != "infer": + header = self.header + self.names = frame.iloc[header] + frame = frame.drop(header, axis=0) + if self.names: - table = table.rename_columns(self.names) - return table.to_pandas() + frame = frame.rename(self.names, axis="columns") + return frame def TextParser(*args, **kwds): From 98aa134d85044ab84adade39f66639777d971eed Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 29 Feb 2020 08:59:43 -0800 Subject: [PATCH 11/95] some fixes --- pandas/io/parsers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3ab847ebd7e04..dbd55f2015d1c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2195,7 +2195,9 @@ def read(self, nrows=None): table = pyarrow.read_csv( self.src, read_options=pyarrow.ReadOptions( - skip_rows=self.kwds.get("skiprows"), column_names=self.names + skip_rows=self.kwds.get("skiprows"), + column_names=self.names, + autogenerate_column_names=True if self.header != 0 else False, ), parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), @@ -2215,8 +2217,7 @@ def read(self, nrows=None): self.names = frame.iloc[header] frame = frame.drop(header, axis=0) - if self.names: - frame = frame.rename(self.names, axis="columns") + frame = frame.rename(zip(frame.names, self.names), axis="columns") return frame From b9c6d2c0a2b177c12c94b30f7c1395d77d1d0242 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 4 Apr 2020 19:42:14 -0700 Subject: [PATCH 12/95] Fix bug --- pandas/io/parsers.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 
insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dbd55f2015d1c..ac7658d5b3772 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -526,6 +526,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "float_precision", "chunksize", "comment", + "nrows", } _python_unsupported = {"low_memory", "float_precision"} @@ -952,7 +953,11 @@ def _clean_options(self, options, engine): # pyarrow engine not supported yet if engine == "pyarrow": for option in _pyarrow_unsupported: - if option != "chunksize" and option != "skipfooter": + if ( + option != "chunksize" + and option != "skipfooter" + and option != "nrows" + ): if options[option] is not None: fallback_reason = ( f"the pyarrow engine does not support the {option} argumnet" @@ -963,6 +968,10 @@ def _clean_options(self, options, engine): fallback_reason = ( "the pyarrow engine does not support using chunksize" ) + if self.nrows is not None: + fallback_reason = ( + "the pyarrow engine does not support using skipfooter" + ) # C and pyarrow engine not supported yet if engine == "c" or "pyarrow": if options["skipfooter"] > 0: @@ -2171,7 +2180,7 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): class ArrowParserWrapper(ParserBase): """ - Wrapper for the pyarrow engine for pd.read_csv() + Wrapper for the pyarrow engine for read_csv() """ def __init__(self, src, **kwds): @@ -2208,16 +2217,22 @@ def read(self, nrows=None): ), ) frame = table.to_pandas() - table_width = len(table.column_names) + num_cols = len(frame.columns) if self.names is None: if self.prefix: - self.names = [f"{self.prefix}{i}" for i in range(table_width)] - elif self.header is not None and self.header != "infer": + self.names = [f"{self.prefix}{i}" for i in range(num_cols)] + frame = frame.rename( + dict(zip(frame.columns, self.names), axis="columns") + ) + elif self.header != 0: header = self.header self.names = frame.iloc[header] frame = frame.drop(header, axis=0) - - frame = 
frame.rename(zip(frame.names, self.names), axis="columns") + frame = frame.rename( + dict(zip(frame.columns, self.names), axis="columns") + ) + if self.kwds.get("squeeze"): + frame = frame.squeeze() return frame From 7f891a64d8887d69ca435d6b7093a81239ca95f3 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 10 Apr 2020 11:02:05 -0700 Subject: [PATCH 13/95] New benchmark and fix more tests --- asv_bench/benchmarks/io/csv.py | 37 ++++++++++------- pandas/io/parsers.py | 73 ++++++++++++++++++++++------------ 2 files changed, 71 insertions(+), 39 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index a4e6f94f326ba..047fc1fe5f7f7 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -10,7 +10,6 @@ class ToCSV(BaseIO): - fname = "__test__.csv" params = ["wide", "long", "mixed"] param_names = ["kind"] @@ -43,7 +42,6 @@ def time_frame(self, kind): class ToCSVDatetime(BaseIO): - fname = "__test__.csv" def setup(self): @@ -55,7 +53,6 @@ def time_frame_date_formatting(self): class ToCSVDatetimeBig(BaseIO): - fname = "__test__.csv" timeout = 1500 params = [1000, 10000, 100000] @@ -83,7 +80,6 @@ def data(self, stringio_object): class ReadCSVDInferDatetimeFormat(StringIORewind): - params = ([True, False], ["custom", "iso8601", "ymd"]) param_names = ["infer_datetime_format", "format"] @@ -108,7 +104,6 @@ def time_read_csv(self, infer_datetime_format, format): class ReadCSVConcatDatetime(StringIORewind): - iso8601 = "%Y-%m-%d %H:%M:%S" def setup(self): @@ -126,7 +121,6 @@ def time_read_csv(self): class ReadCSVConcatDatetimeBadDateValue(StringIORewind): - params = (["nan", "0", ""],) param_names = ["bad_date_value"] @@ -144,7 +138,6 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): - fname = "__test__.csv" params = [None, 10000] param_names = ["skiprows"] @@ -190,7 +183,6 @@ def time_read_uint64_na_values(self): class ReadCSVThousands(BaseIO): - fname = "__test__.csv" params = ([",", 
"|"], [None, ","]) param_names = ["sep", "thousands"] @@ -222,7 +214,6 @@ def time_comment(self): class ReadCSVFloatPrecision(StringIORewind): - params = ([",", ";"], [".", "_"], [None, "high", "round_trip"]) param_names = ["sep", "decimal", "float_precision"] @@ -254,19 +245,38 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): names=list("abc"), ) - def time_read_csv_pyarrow_engine(self, sep, decimal, float_precision): + def time_read_csv_arrow(self, sep): + read_csv( + self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), + ) + + +class ReadCSVEngine(StringIORewind): + def setup(self): + data = ["A,B,C"] + (["1,2,3"] * 100000) + self.StringIO_input = StringIO("\n".join(data)) + + def time_read_csv_c(self, sep): + read_csv( + self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), + ) + + def time_read_csv_arrow(self, sep): + read_csv( + self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), + ) + + def time_read_csv_python_engine(self, sep): read_csv( self.data(self.StringIO_input), sep=sep, header=None, - engine="pyarrow", - float_precision=None, + engine="python", names=list("abc"), ) class ReadCSVCategorical(BaseIO): - fname = "__test__.csv" def setup(self): @@ -335,7 +345,6 @@ def time_read_csv_cached(self, do_cache): class ReadCSVMemoryGrowth(BaseIO): - chunksize = 20 num_rows = 1000 fname = "__test__.csv" diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f17c1008e29a5..175dccf0633df 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,7 +5,7 @@ from collections import abc, defaultdict import csv import datetime -from io import BytesIO, StringIO, TextIOWrapper +from io import StringIO, TextIOBase, TextIOWrapper import itertools import re import sys @@ -172,7 +172,7 @@ Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 as a dependency however. 
- .. versionchanged(1.1) + .. versionchanged:: (1.1) converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. @@ -1167,27 +1167,28 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - nrows = _validate_integer("nrows", nrows) if self.engine == "pyarrow": - return self._engine.read(nrows) - ret = self._engine.read(nrows) + df = self._engine.read() + else: + nrows = _validate_integer("nrows", nrows) + ret = self._engine.read(nrows) - # May alter columns / col_dict - index, columns, col_dict = self._create_index(ret) + # May alter columns / col_dict + index, columns, col_dict = self._create_index(ret) - if index is None: - if col_dict: - # Any column is actually fine: - new_rows = len(next(iter(col_dict.values()))) - index = RangeIndex(self._currow, self._currow + new_rows) + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) + else: + new_rows = 0 else: - new_rows = 0 - else: - new_rows = len(index) + new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + df = DataFrame(col_dict, columns=columns, index=index) - self._currow += new_rows + self._currow += new_rows if self.squeeze and len(df.columns) == 1: return df[df.columns[0]].copy() @@ -2231,6 +2232,19 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): return values +class BytesIOWrapper: + def __init__(self, string_buffer, encoding="utf-8"): + self.string_buffer = string_buffer + self.encoding = encoding + + def __getattr__(self, attr): + return getattr(self.string_buffer, attr) + + def read(self, size=-1): + content = self.string_buffer.read(size) + return content.encode(self.encoding) + + class ArrowParserWrapper(ParserBase): """ Wrapper for the pyarrow engine for read_csv() @@ -2247,10 +2261,10 @@ def __init__(self, src, 
**kwds): self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) - if isinstance(self.src, StringIO): - self.src = BytesIO(self.src.getvalue().encode(encoding)) + if isinstance(self.src, TextIOBase): + self.src = BytesIOWrapper(self.src, encoding=encoding) - def read(self, nrows=None): + def read(self): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use arrow engine" ) @@ -2259,7 +2273,9 @@ def read(self, nrows=None): read_options=pyarrow.ReadOptions( skip_rows=self.kwds.get("skiprows"), column_names=self.names, - autogenerate_column_names=True if self.header != 0 else False, + autogenerate_column_names=True + if self.header != 0 or self.kwds.get("skiprows") != set() + else False, ), parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), @@ -2277,15 +2293,22 @@ def read(self, nrows=None): frame = frame.rename( dict(zip(frame.columns, self.names), axis="columns") ) - elif self.header != 0: + elif self.header is not None and self.header != 0: header = self.header self.names = frame.iloc[header] frame = frame.drop(header, axis=0) frame = frame.rename( - dict(zip(frame.columns, self.names), axis="columns") + columns=dict(zip(frame.columns, self.names), axis="columns") ) - if self.kwds.get("squeeze"): - frame = frame.squeeze() + elif self.header is None: + self.names = range(len(frame.columns)) + frame = frame.rename( + columns=dict(zip(frame.columns, self.names), axis="columns") + ) + + index_col = self.kwds.get("index_col")[0] # flatten list w/ 1 elem + if index_col is not None: + frame.set_index(frame.columns[index_col], drop=True, inplace=True) return frame From 23425f7be4840ac48ff35058ae9a64d064628537 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 10 Apr 2020 15:27:33 -0700 Subject: [PATCH 14/95] More cleanups --- asv_bench/benchmarks/io/csv.py | 22 +++++++--------------- doc/source/whatsnew/v1.1.0.rst | 6 +++--- pandas/io/parsers.py | 7 +++---- 3 files changed, 13 insertions(+), 22 
deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 047fc1fe5f7f7..b7d7c4e8c120a 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -245,7 +245,7 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): names=list("abc"), ) - def time_read_csv_arrow(self, sep): + def time_read_csv_arrow(self, sep, decimal, float_precision): read_csv( self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), ) @@ -256,23 +256,15 @@ def setup(self): data = ["A,B,C"] + (["1,2,3"] * 100000) self.StringIO_input = StringIO("\n".join(data)) - def time_read_csv_c(self, sep): - read_csv( - self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), - ) + def time_read_csv_c(self): + read_csv(self.data(self.StringIO_input)) - def time_read_csv_arrow(self, sep): - read_csv( - self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), - ) + def time_read_csv_arrow(self): + read_csv(self.data(self.StringIO_input), engine="pyarrow") - def time_read_csv_python_engine(self, sep): + def time_read_csv_python_engine(self): read_csv( - self.data(self.StringIO_input), - sep=sep, - header=None, - engine="python", - names=list("abc"), + self.data(self.StringIO_input), engine="python", ) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 4c44e35169ba7..b60a79a239628 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -88,9 +88,6 @@ Other enhancements - :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`). 
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the object inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`) -- :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing - if pyarrow>0.13 is installed. However, the pyarrow engine is less feature-complete than its "c" or - "python" counterparts. (:issue:`23697`) .. --------------------------------------------------------------------------- @@ -412,6 +409,9 @@ I/O - Bug in :meth:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`) - Bug in :meth:`read_csv` was raising an ``IndexError`` when header=None and 2 extra data columns - Bug in :meth:`DataFrame.to_sql` where an ``AttributeError`` was raised when saving an out of bounds date (:issue:`26761`) +- :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing + if pyarrow>0.13 is installed. However, the pyarrow engine is less feature-complete than its "c" or + "python" counterparts. (:issue:`23697`) Plotting ^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 175dccf0633df..455b7f748102d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -172,6 +172,7 @@ Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 as a dependency however. + .. versionchanged:: (1.1) converters : dict, optional Dict of functions for converting values in certain columns. 
Keys can either @@ -2266,16 +2267,14 @@ def __init__(self, src, **kwds): def read(self): pyarrow = import_optional_dependency( - "pyarrow.csv", extra="pyarrow is required to use arrow engine" + "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" ) table = pyarrow.read_csv( self.src, read_options=pyarrow.ReadOptions( skip_rows=self.kwds.get("skiprows"), column_names=self.names, - autogenerate_column_names=True - if self.header != 0 or self.kwds.get("skiprows") != set() - else False, + autogenerate_column_names=True if self.header != 0 else False, ), parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), From 01c03942b61f4ab38cf4712c4d078a52c4f27939 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 10 Apr 2020 19:46:34 -0700 Subject: [PATCH 15/95] Formatting fixes and typo correction --- asv_bench/benchmarks/io/csv.py | 9 +++++++++ doc/source/whatsnew/v1.1.0.rst | 2 ++ pandas/io/parsers.py | 6 +++--- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index b7d7c4e8c120a..8dec39091e322 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -10,6 +10,7 @@ class ToCSV(BaseIO): + fname = "__test__.csv" params = ["wide", "long", "mixed"] param_names = ["kind"] @@ -42,6 +43,7 @@ def time_frame(self, kind): class ToCSVDatetime(BaseIO): + fname = "__test__.csv" def setup(self): @@ -53,6 +55,7 @@ def time_frame_date_formatting(self): class ToCSVDatetimeBig(BaseIO): + fname = "__test__.csv" timeout = 1500 params = [1000, 10000, 100000] @@ -80,6 +83,7 @@ def data(self, stringio_object): class ReadCSVDInferDatetimeFormat(StringIORewind): + params = ([True, False], ["custom", "iso8601", "ymd"]) param_names = ["infer_datetime_format", "format"] @@ -104,6 +108,7 @@ def time_read_csv(self, infer_datetime_format, format): class ReadCSVConcatDatetime(StringIORewind): + iso8601 = "%Y-%m-%d %H:%M:%S" def setup(self): @@ -121,6 +126,7 @@ def 
time_read_csv(self): class ReadCSVConcatDatetimeBadDateValue(StringIORewind): + params = (["nan", "0", ""],) param_names = ["bad_date_value"] @@ -138,6 +144,7 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): + fname = "__test__.csv" params = [None, 10000] param_names = ["skiprows"] @@ -183,6 +190,7 @@ def time_read_uint64_na_values(self): class ReadCSVThousands(BaseIO): + fname = "__test__.csv" params = ([",", "|"], [None, ","]) param_names = ["sep", "thousands"] @@ -214,6 +222,7 @@ def time_comment(self): class ReadCSVFloatPrecision(StringIORewind): + params = ([",", ";"], [".", "_"], [None, "high", "round_trip"]) param_names = ["sep", "decimal", "float_precision"] diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 690df648ceada..1704f3c096801 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -527,6 +527,8 @@ I/O - :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing if pyarrow>0.13 is installed. However, the pyarrow engine is less feature-complete than its "c" or "python" counterparts. 
(:issue:`23697`) + + Plotting ^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 455b7f748102d..0cf148366cc1c 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -975,7 +975,7 @@ def _clean_options(self, options, engine): ) if self.nrows is not None: fallback_reason = ( - "the pyarrow engine does not support using skipfooter" + "the pyarrow engine does not support using nrows" ) # C and pyarrow engine not supported yet if engine == "c" or "pyarrow": @@ -2305,9 +2305,9 @@ def read(self): columns=dict(zip(frame.columns, self.names), axis="columns") ) - index_col = self.kwds.get("index_col")[0] # flatten list w/ 1 elem + index_col = self.kwds.get("index_col") # need to flatten since returns list if index_col is not None: - frame.set_index(frame.columns[index_col], drop=True, inplace=True) + frame.set_index(frame.columns[index_col[0]], drop=True, inplace=True) return frame From ba5620ff84a14baa0814f96d2499b652a30afdd8 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 11 Apr 2020 17:22:45 -0700 Subject: [PATCH 16/95] skip pyarrow tests if not installed --- asv_bench/benchmarks/io/csv.py | 1 + pandas/tests/io/parser/conftest.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 8dec39091e322..fef4fee047862 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -346,6 +346,7 @@ def time_read_csv_cached(self, do_cache): class ReadCSVMemoryGrowth(BaseIO): + chunksize = 20 num_rows = 1000 fname = "__test__.csv" diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 327f87303aeb0..87a34d728bc60 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -1,4 +1,5 @@ import os +import pkgutil from typing import List, Optional import pytest @@ -73,12 +74,17 @@ def csv1(csv_dir_path): _py_parsers_only = [_pythonParser] _c_parsers_only = 
[_cParserHighMemory, _cParserLowMemory] _pyarrow_parsers_only = [_pyarrowParser] -_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _py_parser_ids = ["python"] _c_parser_ids = ["c_high", "c_low"] _pyarrow_parser_ids = ["pyarrow"] -_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] + +if pkgutil.find_loader("pyarrow"): + _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] + _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] +else: + _all_parsers = [*_c_parsers_only, *_py_parsers_only] + _all_parser_ids = [*_c_parser_ids, *_py_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) From 2570c823f28eb722435929dd86ccfdfb2ff1a37b Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 11 Apr 2020 17:31:51 -0700 Subject: [PATCH 17/95] Address comments --- pandas/io/parsers.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 0cf148366cc1c..235cefd82f2d5 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -173,7 +173,8 @@ is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 as a dependency however. - .. versionchanged:: (1.1) + .. versionchanged:: 1.1 + The "pyarrow" engine was added. converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. 
@@ -958,11 +959,7 @@ def _clean_options(self, options, engine): # pyarrow engine not supported yet if engine == "pyarrow": for option in _pyarrow_unsupported: - if ( - option != "chunksize" - and option != "skipfooter" - and option != "nrows" - ): + if option not in ["chunksize", "skipfooter", "nrows"]: if options[option] is not None: fallback_reason = ( f"the pyarrow engine does not support the {option} argumnet" @@ -2274,11 +2271,12 @@ def read(self): read_options=pyarrow.ReadOptions( skip_rows=self.kwds.get("skiprows"), column_names=self.names, - autogenerate_column_names=True if self.header != 0 else False, + autogenerate_column_names=False if self.header == 0 else True, ), parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), quote_char=self.kwds.get("quotechar"), + ignore_empty_lines=self.kwds.get("skip_blank_lines"), ), convert_options=pyarrow.ConvertOptions( include_columns=self.usecols, column_types=self.kwds.get("dtype") @@ -2289,21 +2287,15 @@ def read(self): if self.names is None: if self.prefix: self.names = [f"{self.prefix}{i}" for i in range(num_cols)] - frame = frame.rename( - dict(zip(frame.columns, self.names), axis="columns") - ) + frame.columns = self.names elif self.header is not None and self.header != 0: header = self.header self.names = frame.iloc[header] frame = frame.drop(header, axis=0) - frame = frame.rename( - columns=dict(zip(frame.columns, self.names), axis="columns") - ) + frame.columns = self.names elif self.header is None: - self.names = range(len(frame.columns)) - frame = frame.rename( - columns=dict(zip(frame.columns, self.names), axis="columns") - ) + self.names = range(num_cols) + frame.columns = self.names index_col = self.kwds.get("index_col") # need to flatten since returns list if index_col is not None: From b3a1f6628879b8df819c82bc75686d6fd89f42d2 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 14 Apr 2020 14:24:28 -0700 Subject: [PATCH 18/95] Get some more tests to pass --- 
asv_bench/benchmarks/io/csv.py | 2 +- pandas/io/parsers.py | 45 ++++++++++++++++----------- pandas/tests/io/parser/test_common.py | 1 + 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index fef4fee047862..55bc8d35af432 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -262,7 +262,7 @@ def time_read_csv_arrow(self, sep, decimal, float_precision): class ReadCSVEngine(StringIORewind): def setup(self): - data = ["A,B,C"] + (["1,2,3"] * 100000) + data = ["A,B,C"] + (["1,2,3"] * 1000000) self.StringIO_input = StringIO("\n".join(data)) def time_read_csv_c(self): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 235cefd82f2d5..444582cbe723c 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -531,6 +531,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "chunksize", "comment", "nrows", + "thousands", } _python_unsupported = {"low_memory", "float_precision"} @@ -959,12 +960,11 @@ def _clean_options(self, options, engine): # pyarrow engine not supported yet if engine == "pyarrow": for option in _pyarrow_unsupported: - if option not in ["chunksize", "skipfooter", "nrows"]: + if option not in ["chunksize", "nrows"]: if options[option] is not None: fallback_reason = ( f"the pyarrow engine does not support the {option} argumnet" ) - engine = "python" else: if self.chunksize is not None: fallback_reason = ( @@ -974,10 +974,10 @@ def _clean_options(self, options, engine): fallback_reason = ( "the pyarrow engine does not support using nrows" ) - # C and pyarrow engine not supported yet - if engine == "c" or "pyarrow": + # C engine not supported yet + if engine == "c": if options["skipfooter"] > 0: - fallback_reason = f"the {engine} engine does not support skipfooter" + fallback_reason = f"the 'c' engine does not support skipfooter" engine = "python" encoding = sys.getfilesystemencoding() or "utf-8" @@ -1157,7 +1157,7 @@ def 
_make_engine(self, engine="c"): else: raise ValueError( f"Unknown engine: {engine} (valid options " - 'are "c", "python", "arrow", or "python-fwf")' + 'are "c", "python", "pyarrow", or "python-fwf")' ) self._engine = klass(self.f, **self.options) @@ -2266,13 +2266,24 @@ def read(self): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" ) - table = pyarrow.read_csv( - self.src, - read_options=pyarrow.ReadOptions( + try: + read_options = pyarrow.ReadOptions( skip_rows=self.kwds.get("skiprows"), - column_names=self.names, autogenerate_column_names=False if self.header == 0 else True, - ), + ) + except TypeError as e: + msg = "__init__() got an unexpected keyword argument" + if msg in str(e): + raise ImportError( + "Pyarrow version >= 0.15.0 is needed in order " + "to use skiprows kwarg with engine=pyarrow. " + "Please upgrade Pyarrow or switch engines." + ) + else: + raise e + table = pyarrow.read_csv( + self.src, + read_options=read_options, parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), quote_char=self.kwds.get("quotechar"), @@ -2287,17 +2298,13 @@ def read(self): if self.names is None: if self.prefix: self.names = [f"{self.prefix}{i}" for i in range(num_cols)] - frame.columns = self.names elif self.header is not None and self.header != 0: - header = self.header - self.names = frame.iloc[header] - frame = frame.drop(header, axis=0) - frame.columns = self.names + self.names = frame.iloc[self.header] + frame = frame.drop(self.header, axis=0) elif self.header is None: self.names = range(num_cols) - frame.columns = self.names - - index_col = self.kwds.get("index_col") # need to flatten since returns list + frame.columns = self.names + index_col = self.index_col # need to flatten since returns list if index_col is not None: frame.set_index(frame.columns[index_col[0]], drop=True, inplace=True) return frame diff --git a/pandas/tests/io/parser/test_common.py 
b/pandas/tests/io/parser/test_common.py index 5bf9587a6ca22..f27178cdc429f 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -63,6 +63,7 @@ def _set_noconvert_columns(self): "parse_dates": parse_dates, "delimiter": ",", } + parser.engine = "c" parser._engine = MyCParserWrapper(StringIO(data), **parser.options) result = parser.read() From d46ceed07a5197cc24748e09a92c3b8199ce7fa3 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 16 Apr 2020 20:20:22 -0700 Subject: [PATCH 19/95] Fix some bugs and cleanups --- pandas/io/parsers.py | 113 ++++++++++++++++++++++++++++++++----------- 1 file changed, 85 insertions(+), 28 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 444582cbe723c..39ee43f905950 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -532,6 +532,24 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "comment", "nrows", "thousands", + "memory_map", + "dialect", + "warn_bad_lines", + "error_bad_lines", + "delim_whitespace", + "quoting", + "lineterminator", + "converters", + "decimal", + "iterator", + "cache_dates", + "dayfirst", + "keep_date_col", + "infer_datetime_format", + "verbose", + "skipinitialspace", + "date_parser", + "cache_dates", } _python_unsupported = {"low_memory", "float_precision"} @@ -902,6 +920,16 @@ def _get_options_with_defaults(self, engine): for argname, default in _parser_defaults.items(): value = kwds.get(argname, default) + if argname in _pyarrow_unsupported: + if engine == "pyarrow" and value != default: + raise ValueError( + f"The {repr(argname)} option is not supported with the " + f"{repr(engine)} engine" + ) + if argname == "iterator" and engine == "pyarrow": + raise ValueError( + "The iterator option is not supported with the" "pyarrow engine" + ) # see gh-12935 if argname == "mangle_dupe_cols" and not value: raise ValueError("Setting mangle_dupe_cols=False is not supported yet") @@ -957,27 +985,10 @@ def _clean_options(self, options, 
engine): sep = options["delimiter"] delim_whitespace = options["delim_whitespace"] - # pyarrow engine not supported yet - if engine == "pyarrow": - for option in _pyarrow_unsupported: - if option not in ["chunksize", "nrows"]: - if options[option] is not None: - fallback_reason = ( - f"the pyarrow engine does not support the {option} argumnet" - ) - else: - if self.chunksize is not None: - fallback_reason = ( - "the pyarrow engine does not support using chunksize" - ) - if self.nrows is not None: - fallback_reason = ( - "the pyarrow engine does not support using nrows" - ) # C engine not supported yet if engine == "c": if options["skipfooter"] > 0: - fallback_reason = f"the 'c' engine does not support skipfooter" + fallback_reason = "the 'c' engine does not support skipfooter" engine = "python" encoding = sys.getfilesystemencoding() or "utf-8" @@ -2251,13 +2262,16 @@ class ArrowParserWrapper(ParserBase): def __init__(self, src, **kwds): self.kwds = kwds self.src = src - kwds = kwds.copy() + # kwds = kwds.copy() ParserBase.__init__(self, kwds) encoding = kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8" self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) + self.na_values = _clean_na_values( + kwds["na_values"], keep_default_na=kwds["keep_default_na"] + ) if isinstance(self.src, TextIOBase): self.src = BytesIOWrapper(self.src, encoding=encoding) @@ -2268,8 +2282,7 @@ def read(self): ) try: read_options = pyarrow.ReadOptions( - skip_rows=self.kwds.get("skiprows"), - autogenerate_column_names=False if self.header == 0 else True, + skip_rows=self.kwds.get("skiprows"), autogenerate_column_names=True, ) except TypeError as e: msg = "__init__() got an unexpected keyword argument" @@ -2287,10 +2300,14 @@ def read(self): parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), quote_char=self.kwds.get("quotechar"), + escape_char=self.kwds.get("escapechar"), ignore_empty_lines=self.kwds.get("skip_blank_lines"), ), 
convert_options=pyarrow.ConvertOptions( - include_columns=self.usecols, column_types=self.kwds.get("dtype") + include_columns=self.usecols, + null_values=self.kwds.get("na_values"), + true_values=self.kwds.get("true_values"), + false_values=self.kwds.get("false_values"), ), ) frame = table.to_pandas() @@ -2298,17 +2315,57 @@ def read(self): if self.names is None: if self.prefix: self.names = [f"{self.prefix}{i}" for i in range(num_cols)] - elif self.header is not None and self.header != 0: - self.names = frame.iloc[self.header] - frame = frame.drop(self.header, axis=0) + elif self.header is not None: + self.names = frame.iloc[self.header].tolist() + frame.drop(range(self.header + 1), axis=0, inplace=True) + frame.reset_index(drop=True, inplace=True) elif self.header is None: self.names = range(num_cols) frame.columns = self.names - index_col = self.index_col # need to flatten since returns list - if index_col is not None: - frame.set_index(frame.columns[index_col[0]], drop=True, inplace=True) + if self.index_col is not None: + index_col = [frame.columns[i] for i in self.index_col] + frame.set_index(index_col, drop=True, inplace=True) + if self.kwds.get("dtype") is not None: + frame = frame.astype(self.kwds.get("dtype")) + else: + frame = frame.infer_objects() return frame + def _clean_na_values(na_values, keep_default_na=True): + if na_values is None: + if keep_default_na: + na_values = STR_NA_VALUES + else: + na_values = set() + na_fvalues = set() + elif isinstance(na_values, dict): + old_na_values = na_values.copy() + na_values = {} # Prevent aliasing. + + # Convert the values in the na_values dictionary + # into array-likes for further use. This is also + # where we append the default NaN values, provided + # that `keep_default_na=True`. 
+ for k, v in old_na_values.items(): + if not is_list_like(v): + v = [v] + + if keep_default_na: + v = set(v) | STR_NA_VALUES + + na_values[k] = v + na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} + else: + if not is_list_like(na_values): + na_values = [na_values] + na_values = _stringify_na_values(na_values) + if keep_default_na: + na_values = na_values | STR_NA_VALUES + + na_fvalues = _floatify_na_values(na_values) + + return na_values, na_fvalues + def TextParser(*args, **kwds): """ From 637845922e829e9a6bc97c577b064935591f99ac Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 19 May 2020 20:40:57 -0700 Subject: [PATCH 20/95] Perform version checks for submodule imports too --- pandas/compat/_optional.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 7e253a52a9c00..139641f300980 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -1,5 +1,6 @@ import distutils.version import importlib +import sys import types import warnings @@ -92,10 +93,16 @@ def import_optional_dependency( raise ImportError(msg) from None else: return None - + # Grab parent module if submodule being imported + parent = name.split(".")[0] + if parent != name: + name = parent + module_to_get = sys.modules[name] + else: + module_to_get = module minimum_version = VERSIONS.get(name) if minimum_version: - version = _get_version(module) + version = _get_version(module_to_get) if distutils.version.LooseVersion(version) < minimum_version: assert on_version in {"warn", "raise", "ignore"} msg = ( From 9d648821b047419b9541381ad50c419f9f571847 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 19 May 2020 20:44:52 -0700 Subject: [PATCH 21/95] Refresh with newer pyarrow --- asv_bench/benchmarks/io/csv.py | 19 ++++-- pandas/io/parsers.py | 116 +++++++++++++-------------------- 2 files changed, 59 insertions(+), 76 deletions(-) diff --git 
a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 55bc8d35af432..52d88d20b6d52 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -262,20 +262,31 @@ def time_read_csv_arrow(self, sep, decimal, float_precision): class ReadCSVEngine(StringIORewind): def setup(self): - data = ["A,B,C"] + (["1,2,3"] * 1000000) + data = ["A,B,C"] + (["1,2,3"] * 100000) self.StringIO_input = StringIO("\n".join(data)) + # simulate reading from file + self.BytesIO_input = self.StringIO_input.read().encode("utf-8") - def time_read_csv_c(self): + def time_read_stringcsv_c(self): read_csv(self.data(self.StringIO_input)) - def time_read_csv_arrow(self): + def time_read_stringcsv_arrow(self): read_csv(self.data(self.StringIO_input), engine="pyarrow") - def time_read_csv_python_engine(self): + def time_read_stringcsv_python_engine(self): read_csv( self.data(self.StringIO_input), engine="python", ) + def time_read_bytescsv_c(self): + read_csv(self.BytesIO_input) + + def time_read_bytescsv_arrow(self): + read_csv(self.BytesIO_input, engine="pyarrow") + + def time_read_bytescsv_python_engine(self): + read_csv(self.BytesIO_input, engine="python") + class ReadCSVCategorical(BaseIO): fname = "__test__.csv" diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 39ee43f905950..40dbfc4c4956d 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -170,7 +170,7 @@ of dtype conversion. engine : {{'c', 'python', 'pyarrow'}}, optional Parser engine to use. The C and pyarrow engines are faster, while the python engine - is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 + is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.15 as a dependency however. .. 
versionchanged:: 1.1 @@ -919,7 +919,6 @@ def _get_options_with_defaults(self, engine): for argname, default in _parser_defaults.items(): value = kwds.get(argname, default) - if argname in _pyarrow_unsupported: if engine == "pyarrow" and value != default: raise ValueError( @@ -928,7 +927,7 @@ def _get_options_with_defaults(self, engine): ) if argname == "iterator" and engine == "pyarrow": raise ValueError( - "The iterator option is not supported with the" "pyarrow engine" + "The iterator option is not supported with the pyarrow engine" ) # see gh-12935 if argname == "mangle_dupe_cols" and not value: @@ -2262,17 +2261,22 @@ class ArrowParserWrapper(ParserBase): def __init__(self, src, **kwds): self.kwds = kwds self.src = src - # kwds = kwds.copy() ParserBase.__init__(self, kwds) encoding = kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8" self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) - self.na_values = _clean_na_values( - kwds["na_values"], keep_default_na=kwds["keep_default_na"] + na_values = kwds["na_values"] + if isinstance(na_values, dict): + raise ValueError( + "The pyarrow engine doesn't support passing a dict for na_values" + ) + self.na_values = list( + _clean_na_values( + kwds["na_values"], keep_default_na=kwds["keep_default_na"] + )[0] ) - if isinstance(self.src, TextIOBase): self.src = BytesIOWrapper(self.src, encoding=encoding) @@ -2280,48 +2284,51 @@ def read(self): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" ) + kwdscopy = {k: v for k, v in self.kwds.items() if v is not None} + # these are kwargs passed to pyarrow + parseoptions = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"} + convertoptions = { + "include_columns", + "null_values", + "true_values", + "false_values", + } + parse_options = {k: v for k, v in kwdscopy.items() if k in parseoptions} + convert_options = {k: v for k, v in kwdscopy.items() if k in convertoptions} + 
read_options = pyarrow.ReadOptions(autogenerate_column_names=True) + headerexists = True if self.header is not None and self.header >= 0 else False try: - read_options = pyarrow.ReadOptions( - skip_rows=self.kwds.get("skiprows"), autogenerate_column_names=True, - ) + skiprows = self.kwds.get("skiprows") + if skiprows is not None: + read_options = pyarrow.ReadOptions(skip_rows=skiprows) + elif self.header >= 0: + read_options = pyarrow.ReadOptions(skip_rows=self.header) except TypeError as e: msg = "__init__() got an unexpected keyword argument" if msg in str(e): raise ImportError( - "Pyarrow version >= 0.15.0 is needed in order " - "to use skiprows kwarg with engine=pyarrow. " - "Please upgrade Pyarrow or switch engines." + "pyarrow version >= 0.15.0 is required to use " + "read_csv with engine='pyarrow'" ) - else: - raise e table = pyarrow.read_csv( self.src, read_options=read_options, - parse_options=pyarrow.ParseOptions( - delimiter=self.kwds.get("delimiter"), - quote_char=self.kwds.get("quotechar"), - escape_char=self.kwds.get("escapechar"), - ignore_empty_lines=self.kwds.get("skip_blank_lines"), - ), - convert_options=pyarrow.ConvertOptions( - include_columns=self.usecols, - null_values=self.kwds.get("na_values"), - true_values=self.kwds.get("true_values"), - false_values=self.kwds.get("false_values"), - ), + parse_options=pyarrow.ParseOptions(**parse_options), + convert_options=pyarrow.ConvertOptions(**convert_options), ) frame = table.to_pandas() num_cols = len(frame.columns) - if self.names is None: - if self.prefix: - self.names = [f"{self.prefix}{i}" for i in range(num_cols)] - elif self.header is not None: - self.names = frame.iloc[self.header].tolist() - frame.drop(range(self.header + 1), axis=0, inplace=True) - frame.reset_index(drop=True, inplace=True) - elif self.header is None: - self.names = range(num_cols) - frame.columns = self.names + if not headerexists: + if self.names is None: + if self.prefix is not None: + self.names = [f"{self.prefix}{i}" 
for i in range(num_cols)] + # elif self.header is not None: + # self.names = frame.iloc[self.header].tolist() + # frame.drop(range(self.header + 1), axis=0, inplace=True) + # frame.reset_index(drop=True, inplace=True) + elif self.header is None: + self.names = range(num_cols) + frame.columns = self.names if self.index_col is not None: index_col = [frame.columns[i] for i in self.index_col] frame.set_index(index_col, drop=True, inplace=True) @@ -2331,41 +2338,6 @@ def read(self): frame = frame.infer_objects() return frame - def _clean_na_values(na_values, keep_default_na=True): - if na_values is None: - if keep_default_na: - na_values = STR_NA_VALUES - else: - na_values = set() - na_fvalues = set() - elif isinstance(na_values, dict): - old_na_values = na_values.copy() - na_values = {} # Prevent aliasing. - - # Convert the values in the na_values dictionary - # into array-likes for further use. This is also - # where we append the default NaN values, provided - # that `keep_default_na=True`. 
- for k, v in old_na_values.items(): - if not is_list_like(v): - v = [v] - - if keep_default_na: - v = set(v) | STR_NA_VALUES - - na_values[k] = v - na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} - else: - if not is_list_like(na_values): - na_values = [na_values] - na_values = _stringify_na_values(na_values) - if keep_default_na: - na_values = na_values | STR_NA_VALUES - - na_fvalues = _floatify_na_values(na_values) - - return na_values, na_fvalues - def TextParser(*args, **kwds): """ From 93382b421cf62c2ad2f1ede65bd702e2912e8db6 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 21 May 2020 11:55:20 -0700 Subject: [PATCH 22/95] Start xfailing tests --- asv_bench/benchmarks/io/csv.py | 4 +-- pandas/io/parsers.py | 4 --- pandas/tests/io/parser/conftest.py | 19 ++++++++-- pandas/tests/io/parser/test_common.py | 42 +++++++++++----------- pandas/tests/io/parser/test_compression.py | 15 +++++--- 5 files changed, 50 insertions(+), 34 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 52d88d20b6d52..6e166ec315df6 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -1,4 +1,4 @@ -from io import StringIO +from io import BytesIO, StringIO import random import string @@ -265,7 +265,7 @@ def setup(self): data = ["A,B,C"] + (["1,2,3"] * 100000) self.StringIO_input = StringIO("\n".join(data)) # simulate reading from file - self.BytesIO_input = self.StringIO_input.read().encode("utf-8") + self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8")) def time_read_stringcsv_c(self): read_csv(self.data(self.StringIO_input)) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5bcd9253abb72..e64ca0651e7c7 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2322,10 +2322,6 @@ def read(self): if self.names is None: if self.prefix is not None: self.names = [f"{self.prefix}{i}" for i in range(num_cols)] - # elif self.header is not None: - # self.names = 
frame.iloc[self.header].tolist() - # frame.drop(range(self.header + 1), axis=0, inplace=True) - # frame.reset_index(drop=True, inplace=True) elif self.header is None: self.names = range(num_cols) frame.columns = self.names diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 87a34d728bc60..8f473bded9225 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -1,7 +1,8 @@ +import distutils.version import os -import pkgutil from typing import List, Optional +import pkg_resources import pytest from pandas import read_csv, read_table @@ -79,7 +80,10 @@ def csv1(csv_dir_path): _c_parser_ids = ["c_high", "c_low"] _pyarrow_parser_ids = ["pyarrow"] -if pkgutil.find_loader("pyarrow"): +pyarrow_version = pkg_resources.get_distribution("pyarrow").version +if ( + distutils.version.LooseVersion(pyarrow_version) > "0.15.0" +): # TODO remove this if block once required pyarrow>0.15.0 _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] else: @@ -135,3 +139,14 @@ def encoding_fmt(request): Fixture for all possible string formats of a UTF encoding. """ return request.param + + +@pytest.fixture +def pyarrow_xfail(request): + """ + Fixture that xfails a test if the engine is pyarrow. + """ + if "all_parsers" in request.fixturenames: + parser = request.getfixturevalue("all_parsers") + if parser.engine == "pyarrow": + pytest.xfail("pyarrow doesn't support this.") diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index b6987dae5ed2b..e0b6d70b607d6 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -70,7 +70,7 @@ def _set_noconvert_columns(self): tm.assert_frame_equal(result, expected) -def test_empty_decimal_marker(all_parsers): +def test_empty_decimal_marker(all_parsers, pyarrow_xfail): data = """A|B|C 1|2,334|5 10|13|10. 
@@ -83,7 +83,7 @@ def test_empty_decimal_marker(all_parsers): parser.read_csv(StringIO(data), decimal="") -def test_bad_stream_exception(all_parsers, csv_dir_path): +def test_bad_stream_exception(all_parsers, csv_dir_path, pyarrow_xfail): # see gh-13652 # # This test validates that both the Python engine and C engine will @@ -169,7 +169,7 @@ def test_squeeze(all_parsers): assert not result._is_view -def test_malformed(all_parsers): +def test_malformed(all_parsers, pyarrow_xfail): # see gh-6607 parser = all_parsers data = """ignore @@ -184,7 +184,7 @@ def test_malformed(all_parsers): @pytest.mark.parametrize("nrows", [5, 3, None]) -def test_malformed_chunks(all_parsers, nrows): +def test_malformed_chunks(all_parsers, nrows, pyarrow_xfail): data = """ignore A,B,C skip @@ -203,7 +203,7 @@ def test_malformed_chunks(all_parsers, nrows): reader.read(nrows) -def test_unnamed_columns(all_parsers): +def test_unnamed_columns(all_parsers, pyarrow_xfail): data = """A,B,C,, 1,2,3,4,5 6,7,8,9,10 @@ -306,7 +306,7 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) -def test_read_csv_wrong_num_columns(all_parsers): +def test_read_csv_wrong_num_columns(all_parsers, pyarrow_xfail): # Too few columns. 
data = """A,B,C,D,E,F 1,2,3,4,5,6 @@ -422,7 +422,7 @@ def test_int_conversion(all_parsers): @pytest.mark.parametrize("nrows", [3, 3.0]) -def test_read_nrows(all_parsers, nrows): +def test_read_nrows(all_parsers, nrows, pyarrow_xfail): # see gh-10476 data = """index,A,B,C,D foo,2,3,4,5 @@ -443,7 +443,7 @@ def test_read_nrows(all_parsers, nrows): @pytest.mark.parametrize("nrows", [1.2, "foo", -1]) -def test_read_nrows_bad(all_parsers, nrows): +def test_read_nrows_bad(all_parsers, nrows, pyarrow_xfail): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -460,7 +460,7 @@ def test_read_nrows_bad(all_parsers, nrows): @pytest.mark.parametrize("index_col", [0, "index"]) -def test_read_chunksize_with_index(all_parsers, index_col): +def test_read_chunksize_with_index(all_parsers, index_col, pyarrow_xfail): parser = all_parsers data = """index,A,B,C,D foo,2,3,4,5 @@ -492,7 +492,7 @@ def test_read_chunksize_with_index(all_parsers, index_col): @pytest.mark.parametrize("chunksize", [1.3, "foo", 0]) -def test_read_chunksize_bad(all_parsers, chunksize): +def test_read_chunksize_bad(all_parsers, chunksize, pyarrow_xfail): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -509,7 +509,7 @@ def test_read_chunksize_bad(all_parsers, chunksize): @pytest.mark.parametrize("chunksize", [2, 8]) -def test_read_chunksize_and_nrows(all_parsers, chunksize): +def test_read_chunksize_and_nrows(all_parsers, chunksize, pyarrow_xfail): # see gh-15755 data = """index,A,B,C,D foo,2,3,4,5 @@ -527,7 +527,7 @@ def test_read_chunksize_and_nrows(all_parsers, chunksize): tm.assert_frame_equal(concat(reader), expected) -def test_read_chunksize_and_nrows_changing_size(all_parsers): +def test_read_chunksize_and_nrows_changing_size(all_parsers, pyarrow_xfail): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -549,7 +549,7 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers): reader.get_chunk(size=3) -def test_get_chunk_passed_chunksize(all_parsers): +def 
test_get_chunk_passed_chunksize(all_parsers, pyarrow_xfail): parser = all_parsers data = """A,B,C 1,2,3 @@ -565,7 +565,7 @@ def test_get_chunk_passed_chunksize(all_parsers): @pytest.mark.parametrize("kwargs", [dict(), dict(index_col=0)]) -def test_read_chunksize_compat(all_parsers, kwargs): +def test_read_chunksize_compat(all_parsers, kwargs, pyarrow_xfail): # see gh-12185 data = """index,A,B,C,D foo,2,3,4,5 @@ -582,7 +582,7 @@ def test_read_chunksize_compat(all_parsers, kwargs): tm.assert_frame_equal(concat(reader), result) -def test_read_chunksize_jagged_names(all_parsers): +def test_read_chunksize_jagged_names(all_parsers, pyarrow_xfail): # see gh-23509 parser = all_parsers data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) @@ -594,7 +594,7 @@ def test_read_chunksize_jagged_names(all_parsers): tm.assert_frame_equal(result, expected) -def test_read_data_list(all_parsers): +def test_read_data_list(all_parsers, pyarrow_xfail): parser = all_parsers kwargs = dict(index_col=0) data = "A,B,C\nfoo,1,2,3\nbar,4,5,6" @@ -608,7 +608,7 @@ def test_read_data_list(all_parsers): tm.assert_frame_equal(result, expected) -def test_iterator(all_parsers): +def test_iterator(all_parsers, pyarrow_xfail): # see gh-6607 data = """index,A,B,C,D foo,2,3,4,5 @@ -631,7 +631,7 @@ def test_iterator(all_parsers): tm.assert_frame_equal(last_chunk, expected[3:]) -def test_iterator2(all_parsers): +def test_iterator2(all_parsers, pyarrow_xfail): parser = all_parsers data = """A,B,C foo,1,2,3 @@ -694,7 +694,7 @@ def test_reader_list_skiprows(all_parsers): tm.assert_frame_equal(chunks[0], expected[1:3]) -def test_iterator_stop_on_chunksize(all_parsers): +def test_iterator_stop_on_chunksize(all_parsers, pyarrow_xfail): # gh-3967: stopping iteration when chunksize is specified parser = all_parsers data = """A,B,C @@ -718,7 +718,7 @@ def test_iterator_stop_on_chunksize(all_parsers): @pytest.mark.parametrize( "kwargs", [dict(iterator=True, chunksize=1), dict(iterator=True), dict(chunksize=1)] ) -def 
test_iterator_skipfooter_errors(all_parsers, kwargs): +def test_iterator_skipfooter_errors(all_parsers, kwargs, pyarrow_xfail): msg = "'skipfooter' not supported for 'iteration'" parser = all_parsers data = "a\n1\n2" @@ -727,7 +727,7 @@ def test_iterator_skipfooter_errors(all_parsers, kwargs): parser.read_csv(StringIO(data), skipfooter=1, **kwargs) -def test_nrows_skipfooter_errors(all_parsers): +def test_nrows_skipfooter_errors(all_parsers, pyarrow_xfail): msg = "'skipfooter' not supported with 'nrows'" data = "a\n1\n2\n3\n4\n5\n6" parser = all_parsers diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index b773664adda72..22bba9bd3f98a 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -29,7 +29,7 @@ def parser_and_data(all_parsers, csv1): @pytest.mark.parametrize("compression", ["zip", "infer", "zip2"]) -def test_zip(parser_and_data, compression): +def test_zip(parser_and_data, compression, pyarrow_xfail): parser, data, expected = parser_and_data with tm.ensure_clean("test_file.zip") as path: @@ -46,7 +46,7 @@ def test_zip(parser_and_data, compression): @pytest.mark.parametrize("compression", ["zip", "infer"]) -def test_zip_error_multiple_files(parser_and_data, compression): +def test_zip_error_multiple_files(parser_and_data, compression, pyarrow_xfail): parser, data, expected = parser_and_data with tm.ensure_clean("combined_zip.zip") as path: @@ -60,7 +60,7 @@ def test_zip_error_multiple_files(parser_and_data, compression): parser.read_csv(path, compression=compression) -def test_zip_error_no_files(parser_and_data): +def test_zip_error_no_files(parser_and_data, pyarrow_xfail): parser, _, _ = parser_and_data with tm.ensure_clean() as path: @@ -71,7 +71,7 @@ def test_zip_error_no_files(parser_and_data): parser.read_csv(path, compression="zip") -def test_zip_error_invalid_zip(parser_and_data): +def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): 
parser, _, _ = parser_and_data with tm.ensure_clean() as path: @@ -86,6 +86,11 @@ def test_compression(parser_and_data, compression_only, buffer, filename): compress_type = compression_only ext = "gz" if compress_type == "gzip" else compress_type + pyarrow_unsupported_exts = {"bz2", "zip", "xz"} + if ext in pyarrow_unsupported_exts and parser.engine == "pyarrow": + # need to skip since this test will hang forever and not fail + pytest.skip(f"The pyarrow package doesn't come with {ext} support") + filename = filename if filename is None else filename.format(ext=ext) if filename and buffer: @@ -141,7 +146,7 @@ def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding @pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) -def test_invalid_compression(all_parsers, invalid_compression): +def test_invalid_compression(all_parsers, invalid_compression, pyarrow_xfail): parser = all_parsers compress_kwargs = dict(compression=invalid_compression) From f1bb4e25c77f4b672ddd5dfc7afc2af51abc9e32 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 27 May 2020 10:57:57 -0700 Subject: [PATCH 23/95] Get all tests to run & some fixes --- pandas/io/parsers.py | 37 ++++++++++++---------- pandas/tests/io/parser/conftest.py | 7 ++-- pandas/tests/io/parser/test_common.py | 4 +-- pandas/tests/io/parser/test_compression.py | 11 +++---- pandas/tests/io/parser/test_unsupported.py | 19 +++++++++++ 5 files changed, 50 insertions(+), 28 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e64ca0651e7c7..2f9e4ec11187e 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -21,7 +21,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import FilePathOrBuffer -from pandas.compat._optional import import_optional_dependency +from pandas.compat._optional import import_optional_dependency, VERSIONS from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ 
-444,7 +444,14 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # Extract some of the arguments (pass chunksize on). iterator = kwds.get("iterator", False) - chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1) + chunksize = kwds.get("chunksize", None) + if kwds.get("engine") == "pyarrow": # chunksize not supported for pyarrow + if iterator: + raise ValueError("The 'iterator' option is not supported with the 'pyarrow' engine") + if chunksize is not None: + raise ValueError("The 'chunksize' option is not supported with the 'pyarrow' engine") + else: + chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1) nrows = kwds.get("nrows", None) # Check for duplicates in names. @@ -830,6 +837,9 @@ def __init__(self, f, engine=None, **kwds): self._engine_specified = kwds.get("engine_specified", engine_specified) if kwds.get("dialect") is not None: + if engine == "pyarrow": + raise ValueError("The 'dialect' option is not supported with the 'pyarrow' engine") + dialect = kwds["dialect"] if dialect in csv.list_dialects(): dialect = csv.get_dialect(dialect) @@ -923,11 +933,11 @@ def _get_options_with_defaults(self, engine): if engine == "pyarrow" and value != default: raise ValueError( f"The {repr(argname)} option is not supported with the " - f"{repr(engine)} engine" + f"'pyarrow' engine" ) if argname == "iterator" and engine == "pyarrow": raise ValueError( - "The iterator option is not supported with the pyarrow engine" + "The iterator option is not supported with the 'pyarrow' engine" ) # see gh-12935 if argname == "mangle_dupe_cols" and not value: @@ -2281,6 +2291,7 @@ def __init__(self, src, **kwds): self.src = BytesIOWrapper(self.src, encoding=encoding) def read(self): + VERSIONS["pyarrow"] = "0.15.0" pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" ) @@ -2297,19 +2308,11 @@ def read(self): convert_options = {k: v for k, v in kwdscopy.items() if k in convertoptions} 
read_options = pyarrow.ReadOptions(autogenerate_column_names=True) headerexists = True if self.header is not None and self.header >= 0 else False - try: - skiprows = self.kwds.get("skiprows") - if skiprows is not None: - read_options = pyarrow.ReadOptions(skip_rows=skiprows) - elif self.header >= 0: - read_options = pyarrow.ReadOptions(skip_rows=self.header) - except TypeError as e: - msg = "__init__() got an unexpected keyword argument" - if msg in str(e): - raise ImportError( - "pyarrow version >= 0.15.0 is required to use " - "read_csv with engine='pyarrow'" - ) + skiprows = self.kwds.get("skiprows") + if skiprows is not None: + read_options = pyarrow.ReadOptions(skip_rows=skiprows) + elif headerexists: + read_options = pyarrow.ReadOptions(skip_rows=self.header) table = pyarrow.read_csv( self.src, read_options=read_options, diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 8f473bded9225..09379ac1b6922 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -80,10 +80,13 @@ def csv1(csv_dir_path): _c_parser_ids = ["c_high", "c_low"] _pyarrow_parser_ids = ["pyarrow"] -pyarrow_version = pkg_resources.get_distribution("pyarrow").version +try: + pyarrow_version = pkg_resources.get_distribution("pyarrow").version +except pkg_resources.DistributionNotFound: + pyarrow_version = None if ( distutils.version.LooseVersion(pyarrow_version) > "0.15.0" -): # TODO remove this if block once required pyarrow>0.15.0 +): _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] else: diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index e0b6d70b607d6..f35da606110fe 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1517,7 +1517,7 @@ def test_uneven_lines_with_usecols(all_parsers, usecols): ), ], ) -def 
test_read_empty_with_usecols(all_parsers, data, kwargs, expected): +def test_read_empty_with_usecols(all_parsers, data, kwargs, expected, pyarrow_xfail): # see gh-12493 parser = all_parsers @@ -2082,7 +2082,7 @@ def test_read_table_equivalency_to_read_csv(all_parsers): tm.assert_frame_equal(result, expected) -def test_first_row_bom(all_parsers): +def test_first_row_bom(all_parsers, pyarrow_xfail): # see gh-26545 parser = all_parsers data = '''\ufeff"Head1" "Head2" "Head3"''' diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 22bba9bd3f98a..2c5f1b61370a5 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -81,16 +81,11 @@ def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): @pytest.mark.parametrize("filename", [None, "test.{ext}"]) -def test_compression(parser_and_data, compression_only, buffer, filename): +def test_compression(parser_and_data, compression_only, buffer, filename, pyarrow_xfail): parser, data, expected = parser_and_data compress_type = compression_only ext = "gz" if compress_type == "gzip" else compress_type - pyarrow_unsupported_exts = {"bz2", "zip", "xz"} - if ext in pyarrow_unsupported_exts and parser.engine == "pyarrow": - # need to skip since this test will hang forever and not fail - pytest.skip(f"The pyarrow package doesn't come with {ext} support") - filename = filename if filename is None else filename.format(ext=ext) if filename and buffer: @@ -118,6 +113,8 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): expected = parser.read_csv(csv1, **kwargs) kwargs["compression"] = "infer" + if ext == "bz2": + pytest.xfail("pyarrow wheels don't have bz2 codec support") if buffer: with open(csv1) as f: result = parser.read_csv(f, **kwargs) @@ -128,7 +125,7 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) -def test_compression_utf_encoding(all_parsers, csv_dir_path, 
utf_value, encoding_fmt): +def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt, pyarrow_xfail): # see gh-18071, gh-24130 parser = all_parsers encoding = encoding_fmt.format(utf_value) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 267fae760398a..44865d61d1b05 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -121,3 +121,22 @@ def read(self): with pytest.raises(ValueError, match=msg): read_csv(NoNextBuffer(data), engine=python_engine) + + def test_pyarrow_engine(self): + from pandas.io.parsers import _pyarrow_unsupported as pa_unsupported + + data = """1,2,3,, + 1,2,3,4, + 1,2,3,4,5 + 1,2,,, + 1,2,3,4,""" + + for default in pa_unsupported: + msg = ( + f"The {repr(default)} option is not " + f"supported with the 'pyarrow' engine" + ) + print(default) + kwargs = {default: object()} + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), engine="pyarrow", **kwargs) From 7876b4ef795150510837f74538fdc10b1c38333e Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 29 May 2020 15:57:58 -0700 Subject: [PATCH 24/95] Lint and CI --- pandas/io/parsers.py | 15 +++++++++++---- pandas/tests/io/parser/conftest.py | 6 ++---- pandas/tests/io/parser/test_common.py | 2 +- pandas/tests/io/parser/test_compression.py | 8 ++++++-- pandas/tests/io/parser/test_dtypes.py | 2 +- pandas/tests/io/parser/test_unsupported.py | 1 - 6 files changed, 21 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2f9e4ec11187e..f1a89da794849 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -21,7 +21,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import FilePathOrBuffer -from pandas.compat._optional import import_optional_dependency, VERSIONS +from pandas.compat._optional import VERSIONS, import_optional_dependency from 
pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -447,9 +447,13 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): chunksize = kwds.get("chunksize", None) if kwds.get("engine") == "pyarrow": # chunksize not supported for pyarrow if iterator: - raise ValueError("The 'iterator' option is not supported with the 'pyarrow' engine") + raise ValueError( + "The 'iterator' option is not supported with the 'pyarrow' engine" + ) if chunksize is not None: - raise ValueError("The 'chunksize' option is not supported with the 'pyarrow' engine") + raise ValueError( + "The 'chunksize' option is not supported with the 'pyarrow' engine" + ) else: chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1) nrows = kwds.get("nrows", None) @@ -557,6 +561,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skipinitialspace", "date_parser", "cache_dates", + "parse_dates", } _python_unsupported = {"low_memory", "float_precision"} @@ -838,7 +843,9 @@ def __init__(self, f, engine=None, **kwds): if kwds.get("dialect") is not None: if engine == "pyarrow": - raise ValueError("The 'dialect' option is not supported with the 'pyarrow' engine") + raise ValueError( + "The 'dialect' option is not supported with the 'pyarrow' engine" + ) dialect = kwds["dialect"] if dialect in csv.list_dialects(): diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 09379ac1b6922..9aa23bd739d24 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -83,10 +83,8 @@ def csv1(csv_dir_path): try: pyarrow_version = pkg_resources.get_distribution("pyarrow").version except pkg_resources.DistributionNotFound: - pyarrow_version = None -if ( - distutils.version.LooseVersion(pyarrow_version) > "0.15.0" -): + pyarrow_version = "0" # represents pyarrow not found +if distutils.version.LooseVersion(pyarrow_version) > "0.15.0": _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] 
_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] else: diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index f35da606110fe..96410f626952b 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1561,7 +1561,7 @@ def test_trailing_spaces(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) -def test_raise_on_sep_with_delim_whitespace(all_parsers): +def test_raise_on_sep_with_delim_whitespace(all_parsers, pyarrow_xfail): # see gh-6607 data = "a b c\n1 2 3" parser = all_parsers diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 2c5f1b61370a5..ecc35dd6644c8 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -81,7 +81,9 @@ def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): @pytest.mark.parametrize("filename", [None, "test.{ext}"]) -def test_compression(parser_and_data, compression_only, buffer, filename, pyarrow_xfail): +def test_compression( + parser_and_data, compression_only, buffer, filename, pyarrow_xfail +): parser, data, expected = parser_and_data compress_type = compression_only @@ -125,7 +127,9 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) -def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt, pyarrow_xfail): +def test_compression_utf_encoding( + all_parsers, csv_dir_path, utf_value, encoding_fmt, pyarrow_xfail +): # see gh-18071, gh-24130 parser = all_parsers encoding = encoding_fmt.format(utf_value) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index d1ed85cc6f466..626d4febd7ddf 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -403,7 +403,7 @@ def test_empty_with_multi_index_pass_dtype(all_parsers): tm.assert_frame_equal(result, 
expected) -def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): +def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers, pyarrow_xfail): parser = all_parsers data = "one,one" diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 44865d61d1b05..2e6165619f318 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -136,7 +136,6 @@ def test_pyarrow_engine(self): f"The {repr(default)} option is not " f"supported with the 'pyarrow' engine" ) - print(default) kwargs = {default: object()} with pytest.raises(ValueError, match=msg): read_csv(StringIO(data), engine="pyarrow", **kwargs) From 008acab51559e76c1646bd659146d6b79081b99d Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 3 Jun 2020 14:20:56 -0700 Subject: [PATCH 25/95] parse_dates support and fixups of some tests --- asv_bench/benchmarks/io/csv.py | 2 +- pandas/io/parsers.py | 8 +++----- pandas/tests/io/parser/test_unsupported.py | 1 + 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 6e166ec315df6..f2462184abb37 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -262,7 +262,7 @@ def time_read_csv_arrow(self, sep, decimal, float_precision): class ReadCSVEngine(StringIORewind): def setup(self): - data = ["A,B,C"] + (["1,2,3"] * 100000) + data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 1000000) self.StringIO_input = StringIO("\n".join(data)) # simulate reading from file self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8")) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f1a89da794849..24aff9ddba376 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -555,13 +555,10 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "iterator", "cache_dates", "dayfirst", - "keep_date_col", "infer_datetime_format", "verbose", "skipinitialspace", - 
"date_parser", "cache_dates", - "parse_dates", } _python_unsupported = {"low_memory", "float_precision"} @@ -2338,10 +2335,11 @@ def read(self): if self.index_col is not None: index_col = [frame.columns[i] for i in self.index_col] frame.set_index(index_col, drop=True, inplace=True) + + frame.columns, frame = self._do_date_conversions(frame.columns, frame) + if self.kwds.get("dtype") is not None: frame = frame.astype(self.kwds.get("dtype")) - else: - frame = frame.infer_objects() return frame diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 2e6165619f318..d2ae4c160d519 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -132,6 +132,7 @@ def test_pyarrow_engine(self): 1,2,3,4,""" for default in pa_unsupported: + print(default) msg = ( f"The {repr(default)} option is not " f"supported with the 'pyarrow' engine" From 2dddae747d4d612ab8e78761bd058ff76a13a5eb Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 12 Jun 2020 21:33:34 -0700 Subject: [PATCH 26/95] Date parsing fixes and address comments --- asv_bench/benchmarks/io/csv.py | 68 +++++++++---------- doc/source/user_guide/io.rst | 8 ++- doc/source/whatsnew/v1.1.0.rst | 6 +- pandas/io/parsers.py | 102 ++++++++++++++++++++++++----- pandas/tests/io/parser/conftest.py | 16 ++--- 5 files changed, 130 insertions(+), 70 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index f2462184abb37..3681cd4df481f 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): fname = "__test__.csv" - params = [None, 10000] - param_names = ["skiprows"] + params = ([None, 10000], ["c", "pyarrow"]) + param_names = ["skiprows", "engine"] - def setup(self, skiprows): + def setup(self, skiprows, engine): N = 20000 index = tm.makeStringIndex(N) df = DataFrame( @@ -164,8 +164,8 
@@ def setup(self, skiprows): ) df.to_csv(self.fname) - def time_skipprows(self, skiprows): - read_csv(self.fname, skiprows=skiprows) + def time_skipprows(self, skiprows, engine): + read_csv(self.fname, skiprows=skiprows, engine=engine) class ReadUint64Integers(StringIORewind): @@ -261,31 +261,20 @@ def time_read_csv_arrow(self, sep, decimal, float_precision): class ReadCSVEngine(StringIORewind): - def setup(self): - data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 1000000) + params = ["c", "python", "pyarrow"] + param_names = ["engine"] + + def setup(self, engine): + data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 100000) self.StringIO_input = StringIO("\n".join(data)) # simulate reading from file self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8")) - def time_read_stringcsv_c(self): - read_csv(self.data(self.StringIO_input)) - - def time_read_stringcsv_arrow(self): - read_csv(self.data(self.StringIO_input), engine="pyarrow") - - def time_read_stringcsv_python_engine(self): - read_csv( - self.data(self.StringIO_input), engine="python", - ) - - def time_read_bytescsv_c(self): - read_csv(self.BytesIO_input) - - def time_read_bytescsv_arrow(self): - read_csv(self.BytesIO_input, engine="pyarrow") + def time_read_stringcsv(self, engine): + read_csv(self.data(self.StringIO_input), engine=engine) - def time_read_bytescsv_python_engine(self): - read_csv(self.BytesIO_input, engine="python") + def time_read_bytescsv(self, engine): + read_csv(self.data(self.BytesIO_input), engine=engine) class ReadCSVCategorical(BaseIO): @@ -305,7 +294,10 @@ def time_convert_direct(self): class ReadCSVParseDates(StringIORewind): - def setup(self): + params = ["c", "pyarrow", "python"] + param_names = ["engine"] + + def setup(self, engine): data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n {},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n {},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n @@ -316,18 +308,20 @@ def setup(self): data = 
data.format(*two_cols) self.StringIO_input = StringIO(data) - def time_multiple_date(self): + def time_multiple_date(self, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, names=list(string.digits[:9]), parse_dates=[[1, 2], [1, 3]], ) - def time_baseline(self): + def time_baseline(self, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, parse_dates=[1], @@ -336,17 +330,18 @@ def time_baseline(self): class ReadCSVCachedParseDates(StringIORewind): - params = ([True, False],) - param_names = ["do_cache"] + params = ([True, False], ["c", "pyarrow", "python"]) + param_names = ["do_cache", "engine"] - def setup(self, do_cache): + def setup(self, do_cache, engine): data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10 self.StringIO_input = StringIO(data) - def time_read_csv_cached(self, do_cache): + def time_read_csv_cached(self, do_cache, engine): try: read_csv( self.data(self.StringIO_input), + engine=engine, header=None, parse_dates=[0], cache_dates=do_cache, @@ -376,22 +371,23 @@ def mem_parser_chunks(self): class ReadCSVParseSpecialDate(StringIORewind): - params = (["mY", "mdY", "hm"],) - param_names = ["value"] + params = (["mY", "mdY", "hm"], ["c", "pyarrow", "python"]) + param_names = ["value", "engine"] objects = { "mY": "01-2019\n10-2019\n02/2000\n", "mdY": "12/02/2010\n", "hm": "21:34\n", } - def setup(self, value): + def setup(self, value, engine): count_elem = 10000 data = self.objects[value] * count_elem self.StringIO_input = StringIO(data) - def time_read_special_date(self, value): + def time_read_special_date(self, value, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, names=["Date"], diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index df6b44ac654ce..9ff714a8211bb 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -160,9 +160,11 @@ dtype : Type name or dict of 
column -> type, default ``None`` (unsupported with ``engine='python'``). Use `str` or `object` together with suitable ``na_values`` settings to preserve and not interpret dtype. -engine : {``'c'``, ``'python'``} - Parser engine to use. The C engine is faster while the Python engine is - currently more feature-complete. +engine : {``'c'``, ``'pyarrow'``,``'python'``} + Parser engine to use. In terms of performance, the pyarrow engine, + which requires pyarrow>=0.15.0, is faster than the C engine, which + is faster than the python engine. However, the pyarrow and C engines + are currently less feature complete than their Python counterpart. converters : dict, default ``None`` Dict of functions for converting values in certain columns. Keys can either be integers or column labels. diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 44a56e0818ae8..dee66257f2d56 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -288,6 +288,9 @@ Other enhancements - :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`). - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`) - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). +- :func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing + if pyarrow>=0.15 is installed. However, the pyarrow engine is less feature-complete than its "c" or + "python" counterparts. See the :doc:`I/O docs ` for more info. (:issue:`23697`) .. 
--------------------------------------------------------------------------- @@ -901,9 +904,6 @@ I/O - Bug in :meth:`~DataFrame.read_feather` was raising an `ArrowIOError` when reading an s3 or http file path (:issue:`29055`) - Bug in :meth:`~DataFrame.to_excel` could not handle the column name `render` and was raising an ``KeyError`` (:issue:`34331`) - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`) -- :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing - if pyarrow>0.15 is installed. However, the pyarrow engine is less feature-complete than its "c" or - "python" counterparts. (:issue:`23697`) Plotting ^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 24aff9ddba376..d8ef6488dc02a 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -170,9 +170,8 @@ of dtype conversion. engine : {{'c', 'python', 'pyarrow'}}, optional Parser engine to use. The C and pyarrow engines are faster, while the python engine - is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.15 + is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15 as a dependency however. - .. versionchanged:: 1.1 The "pyarrow" engine was added. converters : dict, optional @@ -445,7 +444,8 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # Extract some of the arguments (pass chunksize on). 
iterator = kwds.get("iterator", False) chunksize = kwds.get("chunksize", None) - if kwds.get("engine") == "pyarrow": # chunksize not supported for pyarrow + # chunksize and iterator not supported for pyarrow + if kwds.get("engine") == "pyarrow": if iterator: raise ValueError( "The 'iterator' option is not supported with the 'pyarrow' engine" @@ -523,6 +523,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skip_blank_lines": True, } + _c_parser_defaults = { "delim_whitespace": False, "na_filter": True, @@ -553,12 +554,11 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "converters", "decimal", "iterator", - "cache_dates", "dayfirst", "infer_datetime_format", "verbose", "skipinitialspace", - "cache_dates", + "low_memory", } _python_unsupported = {"low_memory", "float_precision"} @@ -939,10 +939,6 @@ def _get_options_with_defaults(self, engine): f"The {repr(argname)} option is not supported with the " f"'pyarrow' engine" ) - if argname == "iterator" and engine == "pyarrow": - raise ValueError( - "The iterator option is not supported with the 'pyarrow' engine" - ) # see gh-12935 if argname == "mangle_dupe_cols" and not value: raise ValueError("Setting mangle_dupe_cols=False is not supported yet") @@ -2255,14 +2251,18 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): class BytesIOWrapper: - def __init__(self, string_buffer, encoding="utf-8"): + """ + Allows the pyarrow engine for read_csv() to read from string buffers + """ + + def __init__(self, string_buffer: StringIO, encoding: str = "utf-8"): self.string_buffer = string_buffer self.encoding = encoding - def __getattr__(self, attr): + def __getattr__(self, attr: str): return getattr(self.string_buffer, attr) - def read(self, size=-1): + def read(self, size: int = -1): content = self.string_buffer.read(size) return content.encode(self.encoding) @@ -2332,16 +2332,85 @@ def read(self): elif self.header is None: self.names = range(num_cols) frame.columns = self.names - if 
self.index_col is not None: - index_col = [frame.columns[i] for i in self.index_col] - frame.set_index(index_col, drop=True, inplace=True) - frame.columns, frame = self._do_date_conversions(frame.columns, frame) + frame = self._date_conversion( + frame, self._date_conv, self.parse_dates, keep_date_col=self.keep_date_col + ) + + if self.index_col is not None: + for i, item in enumerate(self.index_col): + if is_integer(item): + self.index_col[i] = frame.columns[item] + frame.set_index(self.index_col, drop=True, inplace=True) if self.kwds.get("dtype") is not None: frame = frame.astype(self.kwds.get("dtype")) return frame + def _date_conversion( + self, data, converter, parse_spec, keep_date_col=False, + ): + + orig_names = data.columns + columns = list(data.columns) + + date_cols = set() + + if parse_spec is None or isinstance(parse_spec, bool): + return data, columns + + if isinstance(parse_spec, list): + # list of column lists + for colspec in parse_spec: + if is_scalar(colspec): + if isinstance(colspec, int) and colspec not in data: + colspec = orig_names[colspec] + data[colspec] = converter(data[colspec].values) + else: + new_name, col, old_names = self._try_convert_dates( + converter, colspec, data, orig_names + ) + if new_name in data: + raise ValueError(f"New date column already in dict {new_name}") + data[new_name] = col + date_cols.update(old_names) + + elif isinstance(parse_spec, dict): + # dict of new name to column list + for new_name, colspec in parse_spec.items(): + if new_name in data: + raise ValueError(f"Date column {new_name} already in dict") + + _, col, old_names = self._try_convert_dates( + converter, colspec, data, orig_names + ) + + data[new_name] = col + date_cols.update(old_names) + + if not keep_date_col: + data = data.drop(date_cols, axis=1) + + return data + + def _try_convert_dates(self, parser, colspec, data, columns): + colset = set(columns) + colnames = [] + + for c in colspec: + if c in colset: + colnames.append(c) + elif isinstance(c, 
int) and c not in columns: + colnames.append(columns[c]) + else: + colnames.append(c) + + new_name = "_".join(str(x) for x in colnames) + to_parse = [data[c].values for c in colnames if c in data] + + new_col = parser(*to_parse) + return new_name, new_col, colnames + def TextParser(*args, **kwds): """ @@ -3548,6 +3617,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns): def _clean_na_values(na_values, keep_default_na=True): + if na_values is None: if keep_default_na: na_values = STR_NA_VALUES diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 9aa23bd739d24..11710fda521f1 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -1,8 +1,6 @@ -import distutils.version import os from typing import List, Optional -import pkg_resources import pytest from pandas import read_csv, read_table @@ -80,16 +78,8 @@ def csv1(csv_dir_path): _c_parser_ids = ["c_high", "c_low"] _pyarrow_parser_ids = ["pyarrow"] -try: - pyarrow_version = pkg_resources.get_distribution("pyarrow").version -except pkg_resources.DistributionNotFound: - pyarrow_version = "0" # represents pyarrow not found -if distutils.version.LooseVersion(pyarrow_version) > "0.15.0": - _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] - _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] -else: - _all_parsers = [*_c_parsers_only, *_py_parsers_only] - _all_parser_ids = [*_c_parser_ids, *_py_parser_ids] +_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) @@ -97,6 +87,8 @@ def all_parsers(request): """ Fixture all of the CSV parsers. 
""" + if request.param.engine == "pyarrow": + pytest.importorskip("pyarrow", "0.15.0") return request.param From 88e200a108985baa5ac05e5c07287b8971ea091d Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 29 Jun 2020 11:04:49 -0700 Subject: [PATCH 27/95] Clean/Address comments/Update docs --- asv_bench/benchmarks/io/csv.py | 2 +- doc/source/whatsnew/v1.1.0.rst | 11 ++- pandas/compat/_optional.py | 16 ++-- pandas/io/parsers.py | 108 ++++++----------------- pandas/tests/test_optional_dependency.py | 7 +- 5 files changed, 51 insertions(+), 93 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 3681cd4df481f..8792fff5300d3 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -294,7 +294,7 @@ def time_convert_direct(self): class ReadCSVParseDates(StringIORewind): - params = ["c", "pyarrow", "python"] + params = ["c", "python"] param_names = ["engine"] def setup(self, engine): diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7c0a707c964c5..d54935c2bdc08 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -245,6 +245,14 @@ If needed you can adjust the bins with the argument ``offset`` (a Timedelta) tha For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. +.. _whatsnew_110.enhancements.read_csv_pyarrow_engine_support: + +read_csv() now accepts pyarrow as an engine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines +with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) + .. _whatsnew_110.enhancements.other: @@ -293,9 +301,6 @@ Other enhancements - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`). - :meth `~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). 
- :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). -- :func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing - if pyarrow>=0.15 is installed. However, the pyarrow engine is less feature-complete than its "c" or - "python" counterparts. See the :doc:`I/O docs ` for more info. (:issue:`23697`) .. --------------------------------------------------------------------------- diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index ed025ec36dafd..f65d53c05257c 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -2,6 +2,7 @@ import importlib import sys import types +from typing import Optional import warnings # Update install.rst when updating versions! @@ -46,7 +47,11 @@ def _get_version(module: types.ModuleType) -> str: def import_optional_dependency( - name: str, extra: str = "", raise_on_missing: bool = True, on_version: str = "raise" + name: str, + extra: str = "", + raise_on_missing: bool = True, + on_version: str = "raise", + min_version: Optional[str] = None, ): """ Import an optional dependency. @@ -58,8 +63,7 @@ def import_optional_dependency( Parameters ---------- name : str - The module name. This should be top-level only, so that the - version may be checked. + The module name. extra : str Additional text to include in the ImportError message. raise_on_missing : bool, default True @@ -73,6 +77,8 @@ def import_optional_dependency( * ignore: Return the module, even if the version is too old. It's expected that users validate the version locally when using ``on_version="ignore"`` (see. 
``io/html.py``) + min_version: Optional[str] + Specify the minimum version Returns ------- @@ -93,14 +99,14 @@ def import_optional_dependency( raise ImportError(msg) from None else: return None - # Grab parent module if submodule being imported + # Handle submodules: if we have submodule, grab parent module from sys.modules parent = name.split(".")[0] if parent != name: name = parent module_to_get = sys.modules[name] else: module_to_get = module - minimum_version = VERSIONS.get(name) + minimum_version = min_version if min_version is not None else VERSIONS.get(name) if minimum_version: version = _get_version(module_to_get) if distutils.version.LooseVersion(version) < minimum_version: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3563a1ea0f04e..ebaefafd8b5b8 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -21,7 +21,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import FilePathOrBuffer, Union -from pandas.compat._optional import VERSIONS, import_optional_dependency +from pandas.compat._optional import import_optional_dependency from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -172,6 +172,7 @@ Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15 as a dependency however. + .. versionchanged:: 1.1 The "pyarrow" engine was added. 
converters : dict, optional @@ -1015,7 +1016,7 @@ def _clean_options(self, options, engine): elif engine not in ("python", "python-fwf"): # wait until regex engine integrated fallback_reason = ( - "the 'c' engine does not support " + f"the '{engine}' engine does not support " "regex separators (separators > 1 char and " r"different from '\s+' are interpreted as regex)" ) @@ -2302,9 +2303,10 @@ def __init__(self, src, **kwds): self.src = BytesIOWrapper(self.src, encoding=encoding) def read(self): - VERSIONS["pyarrow"] = "0.15.0" pyarrow = import_optional_dependency( - "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" + "pyarrow.csv", + min_version="0.15.0", + extra="pyarrow is required to use the pyarrow engine", ) kwdscopy = {k: v for k, v in self.kwds.items() if v is not None} # these are kwargs passed to pyarrow @@ -2315,15 +2317,26 @@ def read(self): "true_values", "false_values", } + # rename some arguments to pass to pyarrow + kwdscopy["include_columns"] = kwdscopy.get("usecols") + kwdscopy["null_values"] = kwdscopy.get("na_values") + kwdscopy["escape_char"] = kwdscopy.get("escapechar") + kwdscopy["ignore_empty_lines"] = kwdscopy.get("skip_blank_lines") + parse_options = {k: v for k, v in kwdscopy.items() if k in parseoptions} convert_options = {k: v for k, v in kwdscopy.items() if k in convertoptions} - read_options = pyarrow.ReadOptions(autogenerate_column_names=True) - headerexists = True if self.header is not None and self.header >= 0 else False + headerexists = True if self.header is not None else False + read_options = dict() + skiprows = self.kwds.get("skiprows") - if skiprows is not None: - read_options = pyarrow.ReadOptions(skip_rows=skiprows) - elif headerexists: - read_options = pyarrow.ReadOptions(skip_rows=self.header) + if headerexists: + read_options["skip_rows"] = self.header + read_options["autogenerate_column_names"] = False + else: + if skiprows is not None: + read_options["skip_rows"] = skiprows + 
read_options["autogenerate_column_names"] = True + read_options = pyarrow.ReadOptions(**read_options) table = pyarrow.read_csv( self.src, read_options=read_options, @@ -2339,11 +2352,8 @@ def read(self): elif self.header is None: self.names = range(num_cols) frame.columns = self.names - - frame = self._date_conversion( - frame, self._date_conv, self.parse_dates, keep_date_col=self.keep_date_col - ) - + # we only need the frame not the names + frame.columns, frame = self._do_date_conversions(frame.columns, frame) if self.index_col is not None: for i, item in enumerate(self.index_col): if is_integer(item): @@ -2354,70 +2364,6 @@ def read(self): frame = frame.astype(self.kwds.get("dtype")) return frame - def _date_conversion( - self, data, converter, parse_spec, keep_date_col=False, - ): - - orig_names = data.columns - columns = list(data.columns) - - date_cols = set() - - if parse_spec is None or isinstance(parse_spec, bool): - return data, columns - - if isinstance(parse_spec, list): - # list of column lists - for colspec in parse_spec: - if is_scalar(colspec): - if isinstance(colspec, int) and colspec not in data: - colspec = orig_names[colspec] - data[colspec] = converter(data[colspec].values) - else: - new_name, col, old_names = self._try_convert_dates( - converter, colspec, data, orig_names - ) - if new_name in data: - raise ValueError(f"New date column already in dict {new_name}") - data[new_name] = col - date_cols.update(old_names) - - elif isinstance(parse_spec, dict): - # dict of new name to column list - for new_name, colspec in parse_spec.items(): - if new_name in data: - raise ValueError(f"Date column {new_name} already in dict") - - _, col, old_names = self._try_convert_dates( - converter, colspec, data, orig_names - ) - - data[new_name] = col - date_cols.update(old_names) - - if not keep_date_col: - data = data.drop(date_cols, axis=1) - - return data - - def _try_convert_dates(self, parser, colspec, data, columns): - colset = set(columns) - colnames = 
[] - - for c in colspec: - if c in colset: - colnames.append(c) - elif isinstance(c, int) and c not in columns: - colnames.append(columns[c]) - else: - colnames.append(c) - - new_name = "_".join(str(x) for x in colnames) - to_parse = [data[c].values for c in colnames if c in data] - - new_col = parser(*to_parse) - return new_name, new_col, colnames - def TextParser(*args, **kwds): """ @@ -3568,7 +3514,7 @@ def _isindex(colspec): colspec = orig_names[colspec] if _isindex(colspec): continue - data_dict[colspec] = converter(data_dict[colspec]) + data_dict[colspec] = converter(np.array(data_dict[colspec])) else: new_name, col, old_names = _try_convert_dates( converter, colspec, data_dict, orig_names @@ -3617,7 +3563,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns): colnames.append(c) new_name = "_".join(str(x) for x in colnames) - to_parse = [data_dict[c] for c in colnames if c in data_dict] + to_parse = [np.array(data_dict[c]) for c in colnames if c in data_dict] new_col = parser(*to_parse) return new_name, new_col, colnames diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index e5ed69b7703b1..61dbd81e2cee5 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -27,14 +27,15 @@ def test_bad_version(monkeypatch): module = types.ModuleType(name) module.__version__ = "0.9.0" sys.modules[name] = module - monkeypatch.setitem(VERSIONS, name, "1.0.0") match = "Pandas requires .*1.0.0.* of .fakemodule.*'0.9.0'" with pytest.raises(ImportError, match=match): - import_optional_dependency("fakemodule") + import_optional_dependency("fakemodule", min_version="1.0.0") with tm.assert_produces_warning(UserWarning): - result = import_optional_dependency("fakemodule", on_version="warn") + result = import_optional_dependency( + "fakemodule", min_version="1.0.0", on_version="warn" + ) assert result is None module.__version__ = "1.0.0" # exact match is OK From 
ede279925c591f42a1585d0aae9e186a3b936cd0 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 29 Jun 2020 11:08:18 -0700 Subject: [PATCH 28/95] Fix typo Co-authored-by: Joris Van den Bossche --- pandas/io/parsers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b3127d4f84cd8..de2a833e51ea0 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -173,6 +173,7 @@ is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15 as a dependency however. + .. versionchanged:: 1.1 The "pyarrow" engine was added. converters : dict, optional From e8eff08c8b939539ecbe6e9466f9248722fd0927 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 8 Jul 2020 16:46:46 -0700 Subject: [PATCH 29/95] Fix doc failures --- doc/source/user_guide/io.rst | 21 ++++++++++++++------- pandas/io/parsers.py | 1 - 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 2fcffcd814195..e4da778ee7378 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -160,9 +160,9 @@ dtype : Type name or dict of column -> type, default ``None`` (unsupported with ``engine='python'``). Use `str` or `object` together with suitable ``na_values`` settings to preserve and not interpret dtype. -engine : {``'c'``, ``'pyarrow'``,``'python'``} +engine : {``'c'``, ``'pyarrow'``, ``'python'``} Parser engine to use. In terms of performance, the pyarrow engine, - which requires pyarrow>=0.15.0, is faster than the C engine, which + which requires ``pyarrow`` >= 0.15.0, is faster than the C engine, which is faster than the python engine. However, the pyarrow and C engines are currently less feature complete than their Python counterpart. 
converters : dict, default ``None`` @@ -1621,11 +1621,18 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object: Specifying the parser engine '''''''''''''''''''''''''''' -Under the hood pandas uses a fast and efficient parser implemented in C as well -as a Python implementation which is currently more feature-complete. Where -possible pandas uses the C parser (specified as ``engine='c'``), but may fall -back to Python if C-unsupported options are specified. Currently, C-unsupported -options include: +Currently, pandas supports using three engines, the C engine, the python engine, +and an optional pyarrow engine(requires ``pyarrow`` >= 0.15). In terms of performance +the pyarrow engine is fastest, followed by the C and Python engines. However, +the pyarrow engine is much less robust than the C engine, which in turn lacks a +couple of features present in the Python parser. + +Where possible pandas uses the C parser (specified as ``engine='c'``), but may fall +back to Python if C-unsupported options are specified. If pyarrow unsupported options are +specified while using ``engine='pyarrow'``, the parser will error out +(a full list of unsupported options is available at ``pandas.io.parsers._pyarrow_unsupported``). + +Currently, C-unsupported options include: * ``sep`` other than a single character (e.g. regex separators) * ``skipfooter`` diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index de2a833e51ea0..b3127d4f84cd8 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -173,7 +173,6 @@ is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15 as a dependency however. - .. versionchanged:: 1.1 The "pyarrow" engine was added. 
converters : dict, optional From 55139ee19a512c3bd83b3c07caa4c44a92a49a59 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 22 Oct 2020 16:35:14 +0100 Subject: [PATCH 30/95] wip --- pandas/tests/io/parser/conftest.py | 14 +++++- pandas/tests/io/parser/test_comment.py | 2 + pandas/tests/io/parser/test_common.py | 64 ++++++++++++++++++++++++-- 3 files changed, 76 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 05fae470f5a88..a179c1b82baae 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -142,4 +142,16 @@ def pyarrow_xfail(request): if "all_parsers" in request.fixturenames: parser = request.getfixturevalue("all_parsers") if parser.engine == "pyarrow": - pytest.xfail("pyarrow doesn't support this.") + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.node.add_marker(mark) + + +@pytest.fixture +def pyarrow_skip(request): + """ + Fixture that skips a test if the engine is pyarrow. 
+ """ + if "all_parsers" in request.fixturenames: + parser = request.getfixturevalue("all_parsers") + if parser.engine == "pyarrow": + pytest.skip("pyarrow doesn't support this.") diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index 60e32d7c27200..a9a03f006668b 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -10,6 +10,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.mark.parametrize("na_values", [None, ["NaN"]]) def test_comment(all_parsers, na_values): diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 753189ea7c8d2..1295f0061f808 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -23,6 +23,9 @@ from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + def test_override_set_noconvert_columns(): # see gh-17351 @@ -84,7 +87,8 @@ def test_empty_decimal_marker(all_parsers, pyarrow_xfail): parser.read_csv(StringIO(data), decimal="") -def test_bad_stream_exception(all_parsers, csv_dir_path, pyarrow_xfail): +@skip_pyarrow +def test_bad_stream_exception(all_parsers, csv_dir_path): # see gh-13652 # # This test validates that both the Python engine and C engine will @@ -139,6 +143,7 @@ def test_read_csv_local(all_parsers, csv1): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_1000_sep(all_parsers): parser = all_parsers data = """A|B|C @@ -232,6 +237,7 @@ def test_csv_mixed_type(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_low_memory_no_rows_with_index(all_parsers): # see gh-21141 parser = all_parsers @@ -280,6 +286,7 @@ def test_read_csv_dataframe(all_parsers, csv1): tm.assert_frame_equal(result, expected) +@xfail_pyarrow 
def test_read_csv_no_index_name(all_parsers, csv_dir_path): parser = all_parsers csv2 = os.path.join(csv_dir_path, "test2.csv") @@ -348,6 +355,7 @@ def test_read_duplicate_index_explicit(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_duplicate_index_implicit(all_parsers): data = """A,B,C,D foo,2,3,4,5 @@ -728,7 +736,7 @@ def test_iterator_skipfooter_errors(all_parsers, kwargs, pyarrow_xfail): parser.read_csv(StringIO(data), skipfooter=1, **kwargs) -def test_nrows_skipfooter_errors(all_parsers, pyarrow_xfail): +def test_nrows_skipfooter_errors(all_parsers): msg = "'skipfooter' not supported with 'nrows'" data = "a\n1\n2\n3\n4\n5\n6" parser = all_parsers @@ -799,6 +807,7 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_no_level_names(all_parsers, index_col): data = """index1,index2,A,B,C,D @@ -823,6 +832,7 @@ def test_multi_index_no_level_names(all_parsers, index_col): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multi_index_no_level_names_implicit(all_parsers): parser = all_parsers data = """A,B,C,D @@ -856,6 +866,7 @@ def test_multi_index_no_level_names_implicit(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected,header", [ @@ -877,6 +888,7 @@ def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_no_unnamed_index(all_parsers): parser = all_parsers data = """ id c0 c1 c2 @@ -939,6 +951,7 @@ def test_local_file(all_parsers, csv_dir_path): pytest.skip("Failing on: " + " ".join(platform.uname())) +@xfail_pyarrow def test_path_path_lib(all_parsers): parser = all_parsers df = tm.makeDataFrame() @@ -946,6 +959,7 @@ def test_path_path_lib(all_parsers): tm.assert_frame_equal(df, result) +@xfail_pyarrow def 
test_path_local_path(all_parsers): parser = all_parsers df = tm.makeDataFrame() @@ -955,6 +969,7 @@ def test_path_local_path(all_parsers): tm.assert_frame_equal(df, result) +@xfail_pyarrow def test_nonexistent_path(all_parsers): # gh-2428: pls no segfault # gh-14086: raise more helpful FileNotFoundError @@ -968,6 +983,7 @@ def test_nonexistent_path(all_parsers): assert path == e.value.filename +@xfail_pyarrow @td.skip_if_windows # os.chmod does not work in windows def test_no_permission(all_parsers): # GH 23784 @@ -990,6 +1006,7 @@ def test_no_permission(all_parsers): assert path == e.value.filename +@xfail_pyarrow def test_missing_trailing_delimiters(all_parsers): parser = all_parsers data = """A,B,C,D @@ -1005,6 +1022,7 @@ def test_missing_trailing_delimiters(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_skip_initial_space(all_parsers): data = ( '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' @@ -1065,6 +1083,7 @@ def test_skip_initial_space(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_trailing_delimiters(all_parsers): # see gh-2442 data = """A,B,C @@ -1168,6 +1187,7 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): assert df.a.dtype == object +@skip_pyarrow @pytest.mark.parametrize("sep", [" ", r"\s+"]) def test_integer_overflow_bug(all_parsers, sep): # see gh-2601 @@ -1179,6 +1199,7 @@ def test_integer_overflow_bug(all_parsers, sep): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_catch_too_many_names(all_parsers): # see gh-5156 data = """\ @@ -1198,6 +1219,7 @@ def test_catch_too_many_names(all_parsers): parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) +@xfail_pyarrow def test_ignore_leading_whitespace(all_parsers): # see gh-3374, gh-6607 parser = all_parsers @@ -1218,6 +1240,7 @@ def test_chunk_begins_with_newline_whitespace(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_index(all_parsers): 
# see gh-10184 data = "x,y" @@ -1228,6 +1251,7 @@ def test_empty_with_index(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_multi_index(all_parsers): # see gh-10467 data = "x,y,z" @@ -1240,6 +1264,7 @@ def test_empty_with_multi_index(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_reversed_multi_index(all_parsers): data = "x,y,z" parser = all_parsers @@ -1251,6 +1276,7 @@ def test_empty_with_reversed_multi_index(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_float_parser(all_parsers): # see gh-9565 parser = all_parsers @@ -1272,6 +1298,7 @@ def test_scientific_no_exponent(all_parsers): tm.assert_frame_equal(df_roundtrip, df) +@xfail_pyarrow @pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) def test_int64_overflow(all_parsers, conv): data = """ID @@ -1315,6 +1342,7 @@ def test_int64_overflow(all_parsers, conv): parser.read_csv(StringIO(data), converters={"ID": conv}) +@xfail_pyarrow @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] ) @@ -1328,6 +1356,7 @@ def test_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] ) @@ -1341,6 +1370,7 @@ def test_outside_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("exp_data", [[str(-1), str(2 ** 63)], [str(2 ** 63), str(-1)]]) def test_numeric_range_too_wide(all_parsers, exp_data): # No numerical dtype can hold both negative and uint64 @@ -1353,6 +1383,7 @@ def test_numeric_range_too_wide(all_parsers, exp_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("iterator", [True, False]) def test_empty_with_nrows_chunksize(all_parsers, iterator): # see gh-9535 @@ -1370,6 +1401,7 @@ def 
test_empty_with_nrows_chunksize(all_parsers, iterator): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected,msg", [ @@ -1477,6 +1509,7 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) def test_uneven_lines_with_usecols(all_parsers, usecols): # see gh-12203 @@ -1531,6 +1564,7 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected, pyarrow_xf tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -1562,7 +1596,7 @@ def test_trailing_spaces(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) -def test_raise_on_sep_with_delim_whitespace(all_parsers, pyarrow_xfail): +def test_raise_on_sep_with_delim_whitespace(all_parsers): # see gh-6607 data = "a b c\n1 2 3" parser = all_parsers @@ -1571,6 +1605,7 @@ def test_raise_on_sep_with_delim_whitespace(all_parsers, pyarrow_xfail): parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) +@xfail_pyarrow @pytest.mark.parametrize("delim_whitespace", [True, False]) def test_single_char_leading_whitespace(all_parsers, delim_whitespace): # see gh-9710 @@ -1589,6 +1624,7 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "sep,skip_blank_lines,exp_data", [ @@ -1628,6 +1664,7 @@ def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_whitespace_lines(all_parsers): parser = all_parsers data = """ @@ -1643,6 +1680,7 @@ def test_whitespace_lines(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -1671,6 +1709,7 @@ def test_whitespace_regex_separator(all_parsers, data, expected): tm.assert_frame_equal(result, 
expected) +@xfail_pyarrow def test_verbose_read(all_parsers, capsys): parser = all_parsers data = """a,b,c,d @@ -1694,6 +1733,7 @@ def test_verbose_read(all_parsers, capsys): assert captured.out == "Filled 3 NA values in column a\n" +@xfail_pyarrow def test_verbose_read2(all_parsers, capsys): parser = all_parsers data = """a,b,c,d @@ -1735,6 +1775,7 @@ def test_iteration_open_handle(all_parsers): tm.assert_series_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,thousands,decimal", [ @@ -1766,6 +1807,7 @@ def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_euro_decimal_format(all_parsers): parser = all_parsers data = """Id;Number1;Number2;Text1;Text2;Number3 @@ -1785,6 +1827,7 @@ def test_euro_decimal_format(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_inf_parsing(all_parsers, na_filter): parser = all_parsers @@ -1808,6 +1851,7 @@ def test_inf_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_infinity_parsing(all_parsers, na_filter): parser = all_parsers @@ -1825,6 +1869,7 @@ def test_infinity_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) def test_raise_on_no_columns(all_parsers, nrows): parser = all_parsers @@ -1835,6 +1880,7 @@ def test_raise_on_no_columns(all_parsers, nrows): parser.read_csv(StringIO(data)) +@xfail_pyarrow @td.check_file_leaks def test_memory_map(all_parsers, csv_dir_path): mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") @@ -1848,6 +1894,7 @@ def test_memory_map(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_null_byte_char(all_parsers): # see gh-2741 data = "\x00,foo" @@ -1864,6 +1911,7 @@ def 
test_null_byte_char(all_parsers): parser.read_csv(StringIO(data), names=names) +@xfail_pyarrow def test_temporary_file(all_parsers): # see gh-13398 parser = all_parsers @@ -1985,6 +2033,7 @@ def seek(self, pos, whence=0): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [dict(), dict(error_bad_lines=True)], # Default is True. # Explicitly pass in. @@ -2003,6 +2052,7 @@ def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): parser.read_csv(StringIO(data), **kwargs) +@xfail_pyarrow def test_warn_bad_lines(all_parsers, capsys): # see gh-15925 parser = all_parsers @@ -2017,6 +2067,7 @@ def test_warn_bad_lines(all_parsers, capsys): assert "Skipping line 5" in captured.err +@xfail_pyarrow def test_suppress_error_output(all_parsers, capsys): # see gh-15925 parser = all_parsers @@ -2045,6 +2096,7 @@ def test_filename_with_special_chars(all_parsers, filename): tm.assert_frame_equal(result, df) +@xfail_pyarrow def test_read_csv_memory_growth_chunksize(all_parsers): # see gh-24805 # @@ -2127,6 +2179,7 @@ def test_first_row_bom(all_parsers, pyarrow_xfail): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_first_row_bom_unquoted(all_parsers): # see gh-36343 parser = all_parsers @@ -2147,6 +2200,7 @@ def test_integer_precision(all_parsers): tm.assert_series_equal(result, expected) +@xfail_pyarrow def test_file_descriptor_leak(all_parsers): # GH 31488 @@ -2160,6 +2214,7 @@ def test(): td.check_file_leaks(test)() +@xfail_pyarrow @pytest.mark.parametrize("nrows", range(1, 6)) def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): # GH 28071 @@ -2173,6 +2228,7 @@ def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): tm.assert_frame_equal(df, ref[:nrows]) +@xfail_pyarrow def test_no_header_two_extra_columns(all_parsers): # GH 26218 column_names = ["one", "two", "three"] @@ -2203,6 +2259,7 @@ def test_read_csv_with_use_inf_as_na(all_parsers): tm.assert_frame_equal(result, expected) 
+@xfail_pyarrow def test_read_table_delim_whitespace_default_sep(all_parsers): # GH: 35958 f = StringIO("a b c\n1 -2 -3\n4 5 6") @@ -2244,6 +2301,7 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): parser.read_table(f, delim_whitespace=True, delimiter=delimiter) +@xfail_pyarrow def test_dict_keys_as_names(all_parsers): # GH: 36928 data = "1,2" From c1aeecf20a519d3ae5b198097a4746291942c936 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 22 Oct 2020 20:27:33 +0100 Subject: [PATCH 31/95] more xfails and skips --- pandas/tests/io/parser/test_common.py | 8 ++--- pandas/tests/io/parser/test_compression.py | 5 +++- pandas/tests/io/parser/test_converters.py | 2 ++ pandas/tests/io/parser/test_dialect.py | 2 ++ pandas/tests/io/parser/test_dtypes.py | 25 ++++++++++++++++ pandas/tests/io/parser/test_encoding.py | 13 ++++++++ pandas/tests/io/parser/test_header.py | 18 +++++++++++ pandas/tests/io/parser/test_index_col.py | 11 +++++++ pandas/tests/io/parser/test_mangle_dupes.py | 6 ++++ pandas/tests/io/parser/test_multi_thread.py | 2 ++ pandas/tests/io/parser/test_na_values.py | 24 +++++++++++++++ pandas/tests/io/parser/test_parse_dates.py | 33 +++++++++++++++++++++ pandas/tests/io/parser/test_quoting.py | 10 +++++++ pandas/tests/io/parser/test_skiprows.py | 13 ++++++++ pandas/tests/io/parser/test_usecols.py | 25 ++++++++++++++++ 15 files changed, 192 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 1295f0061f808..cbf474ad5e5c6 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1240,7 +1240,7 @@ def test_chunk_begins_with_newline_whitespace(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_index(all_parsers): # see gh-10184 data = "x,y" @@ -1264,7 +1264,7 @@ def test_empty_with_multi_index(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow 
+@skip_pyarrow def test_empty_with_reversed_multi_index(all_parsers): data = "x,y,z" parser = all_parsers @@ -1869,7 +1869,7 @@ def test_infinity_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) def test_raise_on_no_columns(all_parsers, nrows): parser = all_parsers @@ -2301,7 +2301,7 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): parser.read_table(f, delim_whitespace=True, delimiter=delimiter) -@xfail_pyarrow +@skip_pyarrow def test_dict_keys_as_names(all_parsers): # GH: 36928 data = "1,2" diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index ecc35dd6644c8..e23b91373f611 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -11,6 +11,8 @@ import pandas as pd import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + @pytest.fixture(params=[True, False]) def buffer(request): @@ -80,6 +82,7 @@ def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): parser.read_csv(f, compression="zip") +@skip_pyarrow @pytest.mark.parametrize("filename", [None, "test.{ext}"]) def test_compression( parser_and_data, compression_only, buffer, filename, pyarrow_xfail @@ -147,7 +150,7 @@ def test_compression_utf_encoding( @pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) -def test_invalid_compression(all_parsers, invalid_compression, pyarrow_xfail): +def test_invalid_compression(all_parsers, invalid_compression): parser = all_parsers compress_kwargs = dict(compression=invalid_compression) diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 88b400d9a11df..a70fe847b6ae9 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -12,6 +12,8 @@ from pandas import DataFrame, Index import 
pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + def test_converters_type_must_be_dict(all_parsers): parser = all_parsers diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index cc65def0fd096..7a65e46ba670f 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -13,6 +13,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.fixture def custom_dialect(): diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 1ba6f0ea0a342..8e6462767513a 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -16,7 +16,11 @@ from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) def test_dtype_all_columns(all_parsers, dtype, check_orig): @@ -43,6 +47,7 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_dtype_all_columns_empty(all_parsers): # see gh-12048 parser = all_parsers @@ -52,6 +57,7 @@ def test_dtype_all_columns_empty(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_dtype_per_column(all_parsers): parser = all_parsers data = """\ @@ -70,6 +76,7 @@ def test_dtype_per_column(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_invalid_dtype_per_column(all_parsers): parser = all_parsers data = """\ @@ -83,6 +90,7 @@ def test_invalid_dtype_per_column(all_parsers): parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) +@xfail_pyarrow @pytest.mark.parametrize( "dtype", [ @@ -109,6 
+117,7 @@ def test_categorical_dtype(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@skip_pyarrow @pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) def test_categorical_dtype_single(all_parsers, dtype): # see gh-10153 @@ -124,6 +133,7 @@ def test_categorical_dtype_single(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_unsorted(all_parsers): # see gh-10153 parser = all_parsers @@ -142,6 +152,7 @@ def test_categorical_dtype_unsorted(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_missing(all_parsers): # see gh-10153 parser = all_parsers @@ -160,6 +171,7 @@ def test_categorical_dtype_missing(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow @pytest.mark.slow def test_categorical_dtype_high_cardinality_numeric(all_parsers): # see gh-18186 @@ -187,6 +199,7 @@ def test_categorical_dtype_latin1(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_utf16(all_parsers, csv_dir_path): # see gh-10153 pth = os.path.join(csv_dir_path, "utf16_ex.txt") @@ -201,6 +214,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_chunksize_infer_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -219,6 +233,7 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_chunksize_explicit_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -320,6 +335,7 @@ def test_categorical_coerces_timestamp(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_categorical_coerces_timedelta(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} @@ -361,6 +377,7 @@ def 
test_categorical_unexpected_categories(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_pass_dtype(all_parsers): parser = all_parsers @@ -374,6 +391,7 @@ def test_empty_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_index_pass_dtype(all_parsers): parser = all_parsers @@ -388,6 +406,7 @@ def test_empty_with_index_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_multi_index_pass_dtype(all_parsers): parser = all_parsers @@ -416,6 +435,7 @@ def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers, pyarrow_xfai tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): parser = all_parsers @@ -429,6 +449,7 @@ def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): # see gh-9424 parser = all_parsers @@ -457,6 +478,7 @@ def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) +@xfail_pyarrow def test_raise_on_passed_int_dtype_with_nas(all_parsers): # see gh-2631 parser = all_parsers @@ -474,6 +496,7 @@ def test_raise_on_passed_int_dtype_with_nas(all_parsers): parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) +@xfail_pyarrow def test_dtype_with_converters(all_parsers): parser = all_parsers data = """a,b @@ -489,6 +512,7 @@ def test_dtype_with_converters(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "dtype,expected", [ @@ -553,6 +577,7 @@ def test_numeric_dtype(all_parsers, dtype): tm.assert_frame_equal(expected, result) +@xfail_pyarrow def test_boolean_dtype(all_parsers): parser = all_parsers data = "\n".join( diff --git 
a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 876696ecdad9c..eac906601876b 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -13,7 +13,11 @@ from pandas import DataFrame, read_csv import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_bytes_io_input(all_parsers): encoding = "cp1255" parser = all_parsers @@ -25,6 +29,7 @@ def test_bytes_io_input(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_unicode(all_parsers): parser = all_parsers data = BytesIO("\u0141aski, Jan;1".encode()) @@ -34,6 +39,7 @@ def test_read_csv_unicode(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("sep", [",", "\t"]) @pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) def test_utf16_bom_skiprows(all_parsers, sep, encoding): @@ -68,6 +74,7 @@ def test_utf16_bom_skiprows(all_parsers, sep, encoding): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_utf16_example(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "utf16_ex.txt") parser = all_parsers @@ -75,6 +82,7 @@ def test_utf16_example(all_parsers, csv_dir_path): assert len(result) == 50 +@xfail_pyarrow def test_unicode_encoding(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "unicode_series.csv") parser = all_parsers @@ -87,6 +95,7 @@ def test_unicode_encoding(all_parsers, csv_dir_path): assert got == expected +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -120,6 +129,7 @@ def _encode_data_with_bom(_data): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): # see gh-13549 expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) @@ -132,6 +142,7 @@ def test_read_csv_utf_aliases(all_parsers, 
utf_value, encoding_fmt): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "file_path,encoding", [ @@ -163,6 +174,7 @@ def test_binary_mode_file_buffers( tm.assert_frame_equal(expected, result) +@skip_pyarrow @pytest.mark.parametrize("pass_encoding", [True, False]) def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding): # see gh-24130 @@ -179,6 +191,7 @@ def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding) tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_encoding_named_temp_file(all_parsers): # see gh-31819 parser = all_parsers diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 4cd110136d7b0..34eaf6ae306b4 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -14,7 +14,11 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_read_with_bad_header(all_parsers): parser = all_parsers msg = r"but only \d+ lines in file" @@ -82,6 +86,7 @@ def test_no_header_prefix(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_with_index_col(all_parsers): parser = all_parsers data = """foo,1,2,3 @@ -119,6 +124,7 @@ def test_header_not_first_line(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_multi_index(all_parsers): parser = all_parsers expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) @@ -184,6 +190,7 @@ def test_header_multi_index_invalid(all_parsers, kwargs, msg): _TestTuple = namedtuple("names", ["first", "second"]) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -231,6 +238,7 @@ def test_header_multi_index_common_format1(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow 
@pytest.mark.parametrize( "kwargs", [ @@ -277,6 +285,7 @@ def test_header_multi_index_common_format2(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -324,6 +333,7 @@ def test_header_multi_index_common_format3(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_multi_index_common_format_malformed1(all_parsers): parser = all_parsers expected = DataFrame( @@ -344,6 +354,7 @@ def test_header_multi_index_common_format_malformed1(all_parsers): tm.assert_frame_equal(expected, result) +@xfail_pyarrow def test_header_multi_index_common_format_malformed2(all_parsers): parser = all_parsers expected = DataFrame( @@ -365,6 +376,7 @@ def test_header_multi_index_common_format_malformed2(all_parsers): tm.assert_frame_equal(expected, result) +@xfail_pyarrow def test_header_multi_index_common_format_malformed3(all_parsers): parser = all_parsers expected = DataFrame( @@ -385,6 +397,7 @@ def test_header_multi_index_common_format_malformed3(all_parsers): tm.assert_frame_equal(expected, result) +@skip_pyarrow @pytest.mark.parametrize( "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)] ) @@ -397,6 +410,7 @@ def test_header_names_backward_compat(all_parsers, data, header): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("kwargs", [dict(), dict(index_col=False)]) def test_read_only_header_no_rows(all_parsers, kwargs): # See gh-7773 @@ -442,6 +456,7 @@ def test_non_int_header(all_parsers, header): parser.read_csv(StringIO(data), header=header) +@xfail_pyarrow def test_singleton_header(all_parsers): # see gh-7757 data = """a,b,c\n0,1,2\n1,2,3""" @@ -452,6 +467,7 @@ def test_singleton_header(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -498,6 +514,7 @@ def test_mangles_multi_index(all_parsers, data, expected): tm.assert_frame_equal(result, expected) 
+@xfail_pyarrow @pytest.mark.parametrize("index_col", [None, [0]]) @pytest.mark.parametrize( "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])] @@ -541,6 +558,7 @@ def test_multi_index_unnamed(all_parsers, index_col, columns): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_multiindex_columns(all_parsers): # GH#6051 parser = all_parsers diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 4d64f2bf411bd..a0a4fdbc25d49 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -11,7 +11,11 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@skip_pyarrow @pytest.mark.parametrize("with_header", [True, False]) def test_index_col_named(all_parsers, with_header): parser = all_parsers @@ -66,6 +70,7 @@ def test_index_col_is_true(all_parsers): parser.read_csv(StringIO(data), index_col=True) +@xfail_pyarrow def test_infer_index_col(all_parsers): data = """A,B,C foo,1,2,3 @@ -83,6 +88,7 @@ def test_infer_index_col(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "index_col,kwargs", [ @@ -127,6 +133,7 @@ def test_index_col_empty_data(all_parsers, index_col, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_index_col_false(all_parsers): # see gh-10413 data = "x,y" @@ -137,6 +144,7 @@ def test_empty_with_index_col_false(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "index_names", [ @@ -161,6 +169,7 @@ def test_multi_index_naming(all_parsers, index_names): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multi_index_naming_not_all_at_beginning(all_parsers): parser = all_parsers data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" @@ -175,6 
+184,7 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_no_multi_index_level_names_empty(all_parsers): # GH 10984 parser = all_parsers @@ -186,6 +196,7 @@ def test_no_multi_index_level_names_empty(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_with_index_col(all_parsers): # GH 33476 parser = all_parsers diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 5c4e642115798..cc88a1d974767 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -10,7 +10,10 @@ from pandas import DataFrame import pandas._testing as tm +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)]) def test_basic(all_parsers, kwargs): # TODO: add test for condition "mangle_dupe_cols=False" @@ -24,6 +27,7 @@ def test_basic(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_basic_names(all_parsers): # See gh-7160 parser = all_parsers @@ -44,6 +48,7 @@ def test_basic_names_raise(all_parsers): parser.read_csv(StringIO(data), names=["a", "b", "a"]) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -111,6 +116,7 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) +@xfail_pyarrow def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 orig_key = "0" diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index d50560c684084..06f14e28435ef 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -12,6 +12,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + def _construct_dataframe(num_rows): """ 
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 9f86bbd65640e..9e7a445234a45 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -12,7 +12,11 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_string_nas(all_parsers): parser = all_parsers data = """A,B,C @@ -28,6 +32,7 @@ def test_string_nas(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_detect_string_na(all_parsers): parser = all_parsers data = """A,B @@ -42,6 +47,7 @@ def test_detect_string_na(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "na_values", [ @@ -79,6 +85,7 @@ def test_non_string_na_values(all_parsers, data, na_values): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_default_na_values(all_parsers): _NA_VALUES = { "-1.#IND", @@ -126,6 +133,7 @@ def f(i, v): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_values", ["baz", ["baz"]]) def test_custom_na_values(all_parsers, na_values): parser = all_parsers @@ -159,6 +167,7 @@ def test_bool_na_values(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_na_value_dict(all_parsers): data = """A,B,C foo,bar,NA @@ -177,6 +186,7 @@ def test_na_value_dict(all_parsers): tm.assert_frame_equal(df, expected) +@xfail_pyarrow @pytest.mark.parametrize( "index_col,expected", [ @@ -210,6 +220,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -297,6 +308,7 @@ def test_no_na_values_no_keep_default(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def 
test_no_keep_default_na_dict_na_values(all_parsers): # see gh-19227 data = "a,b\n,2" @@ -308,6 +320,7 @@ def test_no_keep_default_na_dict_na_values(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_no_keep_default_na_dict_na_scalar_values(all_parsers): # see gh-19227 # @@ -319,6 +332,7 @@ def test_no_keep_default_na_dict_na_scalar_values(all_parsers): tm.assert_frame_equal(df, expected) +@xfail_pyarrow @pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): # see gh-19227 @@ -348,6 +362,7 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "na_filter,row_data", [ @@ -369,6 +384,7 @@ def test_na_values_na_filter_override(all_parsers, na_filter, row_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_na_trailing_columns(all_parsers): parser = all_parsers data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax @@ -396,6 +412,7 @@ def test_na_trailing_columns(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "na_values,row_data", [ @@ -414,6 +431,7 @@ def test_na_values_scalar(all_parsers, na_values, row_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_na_values_dict_aliasing(all_parsers): parser = all_parsers na_values = {"a": 2, "b": 1} @@ -429,6 +447,7 @@ def test_na_values_dict_aliasing(all_parsers): tm.assert_dict_equal(na_values, na_values_copy) +@xfail_pyarrow def test_na_values_dict_col_index(all_parsers): # see gh-14203 data = "a\nfoo\n1" @@ -440,6 +459,7 @@ def test_na_values_dict_col_index(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -469,6 +489,7 @@ def test_empty_na_values_no_default_with_index(all_parsers): tm.assert_frame_equal(result, expected) 
+@skip_pyarrow @pytest.mark.parametrize( "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])] ) @@ -497,6 +518,7 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): # see gh-20377 @@ -512,6 +534,7 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data, na_values", [ @@ -540,6 +563,7 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): ) +@xfail_pyarrow def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 662659982c0b3..722170c9b76df 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -34,7 +34,10 @@ else: date_strategy = st.datetimes() +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_separator_date_conflict(all_parsers): # Regression test for gh-4678 # @@ -56,6 +59,7 @@ def test_separator_date_conflict(all_parsers): tm.assert_frame_equal(df, expected) +@xfail_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col_custom(all_parsers, keep_date_col): data = """\ @@ -199,6 +203,7 @@ def date_parser(*date_cols): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("container", [list, tuple, Index, Series]) @pytest.mark.parametrize("dim", [1, 2]) def test_concat_date_col_fail(container, dim): @@ -211,6 +216,7 @@ def test_concat_date_col_fail(container, dim): parsing.concat_date_cols(date_cols) +@xfail_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col(all_parsers, keep_date_col): data = """\ @@ -370,6 +376,7 @@ def 
test_date_col_as_index_col(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -434,6 +441,7 @@ def test_multiple_date_cols_int_cast(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multiple_date_col_timestamp_parse(all_parsers): parser = all_parsers data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 @@ -468,6 +476,7 @@ def test_multiple_date_col_timestamp_parse(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multiple_date_cols_with_header(all_parsers): parser = all_parsers data = """\ @@ -637,6 +646,7 @@ def test_date_parser_int_bug(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_nat_parse(all_parsers): # see gh-3062 parser = all_parsers @@ -652,6 +662,7 @@ def test_nat_parse(all_parsers): tm.assert_frame_equal(result, df) +@xfail_pyarrow def test_csv_custom_parser(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -666,6 +677,7 @@ def test_csv_custom_parser(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_parse_dates_implicit_first_col(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -679,6 +691,7 @@ def test_parse_dates_implicit_first_col(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_parse_dates_string(all_parsers): data = """date,A,B,C 20090101,a,1,2 @@ -723,6 +736,7 @@ def test_yy_format_with_year_first(all_parsers, parse_dates): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]]) def test_parse_dates_column_list(all_parsers, parse_dates): data = "a,b,c\n01/01/2010,1,15/02/2010" @@ -739,6 +753,7 @@ def test_parse_dates_column_list(all_parsers, parse_dates): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def 
test_multi_index_parse_dates(all_parsers, index_col): data = """index1,index2,A,B,C @@ -784,6 +799,7 @@ def test_multi_index_parse_dates(all_parsers, index_col): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("kwargs", [dict(dayfirst=True), dict(day_first=True)]) def test_parse_dates_custom_euro_format(all_parsers, kwargs): parser = all_parsers @@ -828,6 +844,7 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs): ) +@xfail_pyarrow def test_parse_tz_aware(all_parsers): # See gh-1693 parser = all_parsers @@ -841,6 +858,7 @@ def test_parse_tz_aware(all_parsers): assert result.index.tz is pytz.utc +@xfail_pyarrow @pytest.mark.parametrize( "parse_dates,index_col", [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], @@ -941,6 +959,7 @@ def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multiple_date_cols_chunked(all_parsers): parser = all_parsers data = """\ @@ -1033,6 +1052,7 @@ def test_multiple_date_cols_chunked(all_parsers): tm.assert_frame_equal(chunks[2], expected[4:]) +@xfail_pyarrow def test_multiple_date_col_named_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1056,6 +1076,7 @@ def test_multiple_date_col_named_index_compat(all_parsers): tm.assert_frame_equal(with_indices, with_names) +@xfail_pyarrow def test_multiple_date_col_multiple_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1123,6 +1144,7 @@ def test_bad_date_parse(all_parsers, cache_dates, value): ) +@xfail_pyarrow def test_parse_dates_empty_string(all_parsers): # see gh-2263 parser = all_parsers @@ -1135,6 +1157,7 @@ def test_parse_dates_empty_string(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -1174,6 +1197,7 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) 
+@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -1202,6 +1226,7 @@ def test_parse_date_time_multi_level_column_name(all_parsers, date_parser, warni tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -1290,6 +1315,7 @@ def test_parse_date_time(all_parsers, data, kwargs, expected, date_parser, warni tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_fields, FutureWarning], [pd.to_datetime, None]), @@ -1312,6 +1338,7 @@ def test_parse_date_fields(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ( @@ -1343,6 +1370,7 @@ def test_parse_date_all_fields(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ( @@ -1374,6 +1402,7 @@ def test_datetime_fractional_seconds(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_generic(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." 
@@ -1392,6 +1421,7 @@ def test_generic(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_date_parser_resolution_if_not_ns(all_parsers): # see gh-10245 parser = all_parsers @@ -1489,6 +1519,7 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_string", ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], @@ -1500,6 +1531,7 @@ def test_invalid_parse_delimited_date(all_parsers, date_string): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_string,dayfirst,expected", [ @@ -1565,6 +1597,7 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti assert result == expected +@xfail_pyarrow @pytest.mark.parametrize( "names, usecols, parse_dates, missing_cols", [ diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index 14773dfbea20e..8b010df470386 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -13,7 +13,11 @@ from pandas import DataFrame import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,msg", [ @@ -33,6 +37,7 @@ def test_bad_quote_char(all_parsers, kwargs, msg): parser.read_csv(StringIO(data), **kwargs) +@xfail_pyarrow @pytest.mark.parametrize( "quoting,msg", [ @@ -57,6 +62,7 @@ def test_quote_char_basic(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) def test_quote_char_various(all_parsers, quote_char): parser = all_parsers @@ -69,6 +75,7 @@ def test_quote_char_various(all_parsers, quote_char): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) 
@pytest.mark.parametrize("quote_char", ["", None]) def test_null_quote_char(all_parsers, quoting, quote_char): @@ -88,6 +95,7 @@ def test_null_quote_char(all_parsers, quoting, quote_char): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,exp_data", [ @@ -114,6 +122,7 @@ def test_quoting_various(all_parsers, kwargs, exp_data): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] ) @@ -137,6 +146,7 @@ def test_quotechar_unicode(all_parsers, quotechar): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("balanced", [True, False]) def test_unbalanced_quoting(all_parsers, balanced): # see gh-22789. diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index fdccef1127c7e..732f2eb18fdd9 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -14,7 +14,10 @@ from pandas import DataFrame, Index import pandas._testing as tm +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize("skiprows", [list(range(6)), 6]) def test_skip_rows_bug(all_parsers, skiprows): # see gh-505 @@ -42,6 +45,7 @@ def test_skip_rows_bug(all_parsers, skiprows): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_deep_skip_rows(all_parsers): # see gh-4382 parser = all_parsers @@ -57,6 +61,7 @@ def test_deep_skip_rows(all_parsers): tm.assert_frame_equal(result, condensed_result) +@xfail_pyarrow def test_skip_rows_blank(all_parsers): # see gh-9832 parser = all_parsers @@ -83,6 +88,7 @@ def test_skip_rows_blank(all_parsers): tm.assert_frame_equal(data, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -123,6 +129,7 @@ def test_skip_row_with_newline(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def 
test_skip_row_with_quote(all_parsers): # see gh-12775 and gh-10911 parser = all_parsers @@ -138,6 +145,7 @@ def test_skip_row_with_quote(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,exp_data", [ @@ -173,6 +181,7 @@ def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "line_terminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" ) @@ -209,6 +218,7 @@ def test_skiprows_lineterminator(all_parsers, line_terminator): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_skiprows_infield_quote(all_parsers): # see gh-14459 parser = all_parsers @@ -219,6 +229,7 @@ def test_skiprows_infield_quote(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -234,6 +245,7 @@ def test_skip_rows_callable(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_skip_rows_skip_all(all_parsers): parser = all_parsers data = "a\n1\n2\n3\n4\n5" @@ -243,6 +255,7 @@ def test_skip_rows_skip_all(all_parsers): parser.read_csv(StringIO(data), skiprows=lambda x: True) +@xfail_pyarrow def test_skip_rows_bad_callable(all_parsers): msg = "by zero" parser = all_parsers diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index 7e9c9866a666d..0f2e5882439f8 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -12,6 +12,9 @@ from pandas import DataFrame, Index import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + _msg_validate_usecols_arg = ( "'usecols' must either be list-like " "of all strings, all unicode, all " @@ -22,6 +25,7 @@ ) +@skip_pyarrow def test_raise_on_mixed_dtype_usecols(all_parsers): # See gh-12678 data = """a,b,c @@ -35,6 +39,7 @@ def 
test_raise_on_mixed_dtype_usecols(all_parsers): parser.read_csv(StringIO(data), usecols=usecols) +@skip_pyarrow @pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")]) def test_usecols(all_parsers, usecols): data = """\ @@ -50,6 +55,7 @@ def test_usecols(all_parsers, usecols): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_usecols_with_names(all_parsers): data = """\ a,b,c @@ -65,6 +71,7 @@ def test_usecols_with_names(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] ) @@ -81,6 +88,7 @@ def test_usecols_relative_to_names(all_parsers, names, usecols): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_usecols_relative_to_names2(all_parsers): # see gh-5766 data = """\ @@ -97,6 +105,7 @@ def test_usecols_relative_to_names2(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_usecols_name_length_conflict(all_parsers): data = """\ 1,2,3 @@ -125,6 +134,7 @@ def test_usecols_single_string(all_parsers): parser.read_csv(StringIO(data), usecols="foo") +@xfail_pyarrow @pytest.mark.parametrize( "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] ) @@ -138,6 +148,7 @@ def test_usecols_index_col_false(all_parsers, data): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("index_col", ["b", 0]) @pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]]) def test_usecols_index_col_conflict(all_parsers, usecols, index_col): @@ -164,6 +175,7 @@ def test_usecols_index_col_conflict2(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_usecols_implicit_index_col(all_parsers): # see gh-2654 parser = all_parsers @@ -174,6 +186,7 @@ def test_usecols_implicit_index_col(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_usecols_regex_sep(all_parsers): # see gh-2733 parser = all_parsers @@ -184,6 +197,7 @@ def 
test_usecols_regex_sep(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_usecols_with_whitespace(all_parsers): parser = all_parsers data = "a b c\n4 apple bat 5.7\n8 orange cow 10" @@ -193,6 +207,7 @@ def test_usecols_with_whitespace(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "usecols,expected", [ @@ -212,6 +227,7 @@ def test_usecols_with_integer_like_header(all_parsers, usecols, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) def test_usecols_with_parse_dates(all_parsers, usecols): # see gh-9755 @@ -230,6 +246,7 @@ def test_usecols_with_parse_dates(all_parsers, usecols): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_usecols_with_parse_dates2(all_parsers): # see gh-13604 parser = all_parsers @@ -290,6 +307,7 @@ def test_usecols_with_parse_dates3(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_usecols_with_parse_dates4(all_parsers): data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" usecols = list("abcdefghij") @@ -313,6 +331,7 @@ def test_usecols_with_parse_dates4(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) @pytest.mark.parametrize( "names", @@ -406,6 +425,7 @@ def test_usecols_with_multi_byte_characters(all_parsers, usecols): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_usecols(all_parsers): data = "a,b,c\n1,2,3\n4,5,6" expected = DataFrame() @@ -426,6 +446,7 @@ def test_np_array_usecols(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "usecols,expected", [ @@ -458,6 +479,7 @@ def test_callable_usecols(all_parsers, usecols, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) def 
test_incomplete_first_row(all_parsers, usecols): # see gh-6710 @@ -470,6 +492,7 @@ def test_incomplete_first_row(all_parsers, usecols): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,usecols,kwargs,expected", [ @@ -502,6 +525,7 @@ def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "usecols,kwargs,expected,msg", [ @@ -558,6 +582,7 @@ def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request): if all_parsers.engine != "c": From b53a620b8fb77e1ab804a18e01662d85cf653bf7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 28 Oct 2020 04:07:45 +0000 Subject: [PATCH 32/95] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- asv_bench/benchmarks/io/csv.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 8792fff5300d3..c1fad1efde082 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -256,7 +256,10 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): def time_read_csv_arrow(self, sep, decimal, float_precision): read_csv( - self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), + self.data(self.StringIO_input), + sep=sep, + header=None, + names=list("abc"), ) From f13113d37ccad7f16d493931dac876d4cd246d96 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 28 Oct 2020 10:39:52 -0700 Subject: [PATCH 33/95] Fix typos --- pandas/compat/_optional.py | 30 +++++++++++----------- pandas/io/parsers.py | 10 +++----- 
pandas/tests/io/parser/test_unsupported.py | 3 ++- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 6f00c8ddb37af..6569b077069e2 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -45,6 +45,7 @@ "pandas_gbq": "pandas-gbq", "sqlalchemy": "SQLAlchemy", "jinja2": "Jinja2", + "pyarrow.csv": "pyarrow", } @@ -119,23 +120,22 @@ def import_optional_dependency( # Handle submodules: if we have submodule, grab parent module from sys.modules parent = name.split(".")[0] if parent != name: - name = parent - module_to_get = sys.modules[name] + install_name = parent + module_to_get = sys.modules[install_name] else: module_to_get = module minimum_version = min_version if min_version is not None else VERSIONS.get(name) - if minimum_version: - version = _get_version(module_to_get) - if distutils.version.LooseVersion(version) < minimum_version: - assert on_version in {"warn", "raise", "ignore"} - msg = ( - f"Pandas requires version '{minimum_version}' or newer of '{name}' " - f"(version '{version}' currently installed)." - ) - if on_version == "warn": - warnings.warn(msg, UserWarning) - return None - elif on_version == "raise": - raise ImportError(msg) + version = _get_version(module_to_get) + if distutils.version.LooseVersion(version) < minimum_version: + assert on_version in {"warn", "raise", "ignore"} + msg = ( + f"Pandas requires version '{minimum_version}' or newer of '{name}' " + f"(version '{version}' currently installed)." 
+ ) + if on_version == "warn": + warnings.warn(msg, UserWarning) + return None + elif on_version == "raise": + raise ImportError(msg) return module diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 03a70615591a1..75c1d7b06b635 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -839,7 +839,7 @@ def __init__(self, f, engine=None, **kwds): if engine == "pyarrow": raise ValueError( "The 'dialect' option is not supported with the 'pyarrow' engine" - + ) kwds = _merge_with_dialect_properties(dialect, kwds) if kwds.get("header", "infer") == "infer": @@ -2223,11 +2223,7 @@ def __init__(self, src, **kwds): self.src = BytesIOWrapper(self.src, encoding=encoding) def read(self): - pyarrow = import_optional_dependency( - "pyarrow.csv", - min_version="0.15.0", - extra="pyarrow is required to use the pyarrow engine", - ) + pyarrow = import_optional_dependency("pyarrow.csv", min_version="0.15.0") kwdscopy = {k: v for k, v in self.kwds.items() if v is not None} # these are kwargs passed to pyarrow parseoptions = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"} @@ -3434,7 +3430,7 @@ def _isindex(colspec): colspec = orig_names[colspec] if _isindex(colspec): continue - data_dict[colspec] = converter(np.array(data_dict[colspec])) + data_dict[colspec] = converter(np.asarray(data_dict[colspec])) else: new_name, col, old_names = _try_convert_dates( converter, colspec, data_dict, orig_names diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index d2ae4c160d519..6e9cdacd40586 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -132,11 +132,12 @@ def test_pyarrow_engine(self): 1,2,3,4,""" for default in pa_unsupported: - print(default) msg = ( f"The {repr(default)} option is not " f"supported with the 'pyarrow' engine" ) kwargs = {default: object()} + if default == "dialect": + kwargs[default] = "excel" # test a random dialect with 
pytest.raises(ValueError, match=msg): read_csv(StringIO(data), engine="pyarrow", **kwargs) From f9ce2e46838a0aec07d180dc8e909573b5408918 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 28 Oct 2020 11:47:47 -0700 Subject: [PATCH 34/95] Doc fixes and more typo fixes --- doc/source/whatsnew/v1.1.0.rst | 8 -------- doc/source/whatsnew/v1.2.0.rst | 6 ++++++ pandas/compat/_optional.py | 23 ++++++++++++----------- pandas/io/parsers.py | 7 ++++--- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a0383d7248624..50443f8810e5f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -270,14 +270,6 @@ change, as ``fsspec`` will still bring in the same packages as before. .. _fsspec docs: https://filesystem-spec.readthedocs.io/en/latest/ - -read_csv() now accepts pyarrow as an engine -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines -with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) - - .. _whatsnew_110.enhancements.other: Other enhancements diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f1f24ab7a101b..16b0324acaf6c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -203,6 +203,12 @@ example where the index name is preserved: The same is true for :class:`MultiIndex`, but the logic is applied separately on a level-by-level basis. +read_csv() now accepts pyarrow as an engine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines +with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) + .. 
_whatsnew_120.enhancements.other: Other enhancements diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 6569b077069e2..a6a14fcbee757 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -126,16 +126,17 @@ def import_optional_dependency( module_to_get = module minimum_version = min_version if min_version is not None else VERSIONS.get(name) version = _get_version(module_to_get) - if distutils.version.LooseVersion(version) < minimum_version: - assert on_version in {"warn", "raise", "ignore"} - msg = ( - f"Pandas requires version '{minimum_version}' or newer of '{name}' " - f"(version '{version}' currently installed)." - ) - if on_version == "warn": - warnings.warn(msg, UserWarning) - return None - elif on_version == "raise": - raise ImportError(msg) + if minimum_version: + if distutils.version.LooseVersion(version) < minimum_version: + assert on_version in {"warn", "raise", "ignore"} + msg = ( + f"Pandas requires version '{minimum_version}' or newer of '{name}' " + f"(version '{version}' currently installed)." + ) + if on_version == "warn": + warnings.warn(msg, UserWarning) + return None + elif on_version == "raise": + raise ImportError(msg) return module diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 75c1d7b06b635..5c70e31aca041 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -165,10 +165,11 @@ of dtype conversion. engine : {{'c', 'python', 'pyarrow'}}, optional Parser engine to use. The C and pyarrow engines are faster, while the python engine - is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15 - as a dependency however. + is currently more feature-complete. The pyarrow engine also supports multithreading + something that is not present in the C or python engines. It requires + ``pyarrow`` >= 0.15 as a dependency however. - .. versionchanged:: 1.1 + .. versionchanged:: 1.2 The "pyarrow" engine was added. 
converters : dict, optional Dict of functions for converting values in certain columns. Keys can either From 4158d6af395ba4335a59001010621ae0479abf48 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 2 Nov 2020 09:59:01 -0800 Subject: [PATCH 35/95] Green? --- pandas/compat/_optional.py | 2 +- pandas/tests/io/parser/test_dialect.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index a6a14fcbee757..28741c1560543 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -125,8 +125,8 @@ def import_optional_dependency( else: module_to_get = module minimum_version = min_version if min_version is not None else VERSIONS.get(name) - version = _get_version(module_to_get) if minimum_version: + version = _get_version(module_to_get) if distutils.version.LooseVersion(version) < minimum_version: assert on_version in {"warn", "raise", "ignore"} msg = ( diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index 7a65e46ba670f..afdd7548ed0dd 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -13,7 +13,7 @@ from pandas import DataFrame import pandas._testing as tm -pytestmark = pytest.mark.usefixtures("pyarrow_xfail") +pytestmark = pytest.mark.usefixtures("pyarrow_skip") @pytest.fixture From 10be581b3da43373c0b28fd928aa692e4a847e1c Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 8 Dec 2020 12:39:35 -0500 Subject: [PATCH 36/95] xfail tests --- pandas/tests/io/parser/test_common.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 1f4419988fe5a..2bdc43b892e42 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -109,6 +109,7 @@ def test_bad_stream_exception(all_parsers, csv_dir_path): parser.read_csv(stream) +@skip_pyarrow def 
test_read_csv_local(all_parsers, csv1): prefix = "file:///" if compat.is_platform_windows() else "file://" parser = all_parsers @@ -915,6 +916,7 @@ def test_read_csv_parse_simple_list(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @tm.network def test_url(all_parsers, csv_dir_path): # TODO: FTP testing @@ -932,6 +934,7 @@ def test_url(all_parsers, csv_dir_path): tm.assert_frame_equal(url_result, local_result) +@skip_pyarrow @pytest.mark.slow def test_local_file(all_parsers, csv_dir_path): parser = all_parsers @@ -1986,6 +1989,7 @@ def test_file_handles_with_open(all_parsers, csv1): assert not f.closed +@skip_pyarrow def test_invalid_file_buffer_class(all_parsers): # see gh-15337 class InvalidBuffer: @@ -1998,6 +2002,7 @@ class InvalidBuffer: parser.read_csv(InvalidBuffer()) +@skip_pyarrow def test_invalid_file_buffer_mock(all_parsers): # see gh-15337 parser = all_parsers @@ -2332,6 +2337,7 @@ def test_read_csv_file_handle(all_parsers, io_class, encoding): assert not handle.closed +@skip_pyarrow def test_memory_map_file_handle_silent_fallback(all_parsers, compression): """ Do not fail for buffers with memory_map=True (cannot memory map BytesIO). @@ -2351,6 +2357,7 @@ def test_memory_map_file_handle_silent_fallback(all_parsers, compression): ) +@skip_pyarrow def test_memory_map_compression(all_parsers, compression): """ Support memory map for compressed files. 
@@ -2369,6 +2376,7 @@ def test_memory_map_compression(all_parsers, compression): ) +@skip_pyarrow def test_context_manager(all_parsers, datapath): # make sure that opened files are closed parser = all_parsers @@ -2385,6 +2393,7 @@ def test_context_manager(all_parsers, datapath): assert reader._engine.handles.handle.closed +@skip_pyarrow def test_context_manageri_user_provided(all_parsers, datapath): # make sure that user-provided handles are not closed parser = all_parsers From fcc7e043e730bb13de7dd58dcbe6519aff870793 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 8 Dec 2020 12:41:11 -0500 Subject: [PATCH 37/95] xfail test --- pandas/tests/io/parser/test_mangle_dupes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index f151ee1d27f99..bef2b08a308f6 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -12,6 +12,8 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize("kwargs", [{}, {"mangle_dupe_cols": True}]) def test_basic(all_parsers, kwargs): # TODO: add test for condition "mangle_dupe_cols=False" From d7959a102acdecadcaf88dcf47acc752548d7db5 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 8 Dec 2020 12:41:30 -0500 Subject: [PATCH 38/95] fix import --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bcd5e33cc2f67..6c44f1ff0077a 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,7 +5,7 @@ from collections import abc, defaultdict import csv import datetime -from io import StringIO +from io import StringIO, TextIOBase import itertools import re import sys From e37d12698a1c76e0594b7891b3c9371d8fa0a14b Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 8 Dec 2020 12:49:15 -0500 Subject: [PATCH 39/95] xfail tests --- 
pandas/tests/io/parser/test_compression.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 1c6726f13b843..0af10c4124072 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -150,6 +150,7 @@ def test_compression_utf_encoding( tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) def test_invalid_compression(all_parsers, invalid_compression): parser = all_parsers From 3bc4775068baf7dd1c70ad807bf064b136d9b0ee Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 13:29:01 -0500 Subject: [PATCH 40/95] skip tests --- pandas/tests/io/parser/test_common.py | 93 +++++++++++++-------------- 1 file changed, 46 insertions(+), 47 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 2bdc43b892e42..a9ebd9004e9f8 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -24,7 +24,6 @@ from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") def test_override_set_noconvert_columns(): @@ -144,7 +143,7 @@ def test_read_csv_local(all_parsers, csv1): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_1000_sep(all_parsers): parser = all_parsers data = """A|B|C @@ -237,7 +236,7 @@ def test_csv_mixed_type(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_read_csv_low_memory_no_rows_with_index(all_parsers): # see gh-21141 parser = all_parsers @@ -286,7 +285,7 @@ def test_read_csv_dataframe(all_parsers, csv1): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_read_csv_no_index_name(all_parsers, csv_dir_path): parser = all_parsers 
csv2 = os.path.join(csv_dir_path, "test2.csv") @@ -355,7 +354,7 @@ def test_read_duplicate_index_explicit(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_read_duplicate_index_implicit(all_parsers): data = """A,B,C,D foo,2,3,4,5 @@ -806,7 +805,7 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_no_level_names(all_parsers, index_col): data = """index1,index2,A,B,C,D @@ -831,7 +830,7 @@ def test_multi_index_no_level_names(all_parsers, index_col): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_multi_index_no_level_names_implicit(all_parsers): parser = all_parsers data = """A,B,C,D @@ -865,7 +864,7 @@ def test_multi_index_no_level_names_implicit(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,expected,header", [ @@ -887,7 +886,7 @@ def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_no_unnamed_index(all_parsers): parser = all_parsers data = """ id c0 c1 c2 @@ -952,7 +951,7 @@ def test_local_file(all_parsers, csv_dir_path): pytest.skip("Failing on: " + " ".join(platform.uname())) -@xfail_pyarrow +@skip_pyarrow def test_path_path_lib(all_parsers): parser = all_parsers df = tm.makeDataFrame() @@ -960,7 +959,7 @@ def test_path_path_lib(all_parsers): tm.assert_frame_equal(df, result) -@xfail_pyarrow +@skip_pyarrow def test_path_local_path(all_parsers): parser = all_parsers df = tm.makeDataFrame() @@ -970,7 +969,7 @@ def test_path_local_path(all_parsers): tm.assert_frame_equal(df, result) -@xfail_pyarrow +@skip_pyarrow def test_nonexistent_path(all_parsers): # gh-2428: pls no segfault # gh-14086: raise more helpful FileNotFoundError @@ -984,7 +983,7 @@ def 
test_nonexistent_path(all_parsers): assert path == e.value.filename -@xfail_pyarrow +@skip_pyarrow @td.skip_if_windows # os.chmod does not work in windows def test_no_permission(all_parsers): # GH 23784 @@ -1007,7 +1006,7 @@ def test_no_permission(all_parsers): assert path == e.value.filename -@xfail_pyarrow +@skip_pyarrow def test_missing_trailing_delimiters(all_parsers): parser = all_parsers data = """A,B,C,D @@ -1023,7 +1022,7 @@ def test_missing_trailing_delimiters(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_skip_initial_space(all_parsers): data = ( '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' @@ -1084,7 +1083,7 @@ def test_skip_initial_space(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_trailing_delimiters(all_parsers): # see gh-2442 data = """A,B,C @@ -1200,7 +1199,7 @@ def test_integer_overflow_bug(all_parsers, sep): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_catch_too_many_names(all_parsers): # see gh-5156 data = """\ @@ -1220,7 +1219,7 @@ def test_catch_too_many_names(all_parsers): parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) -@xfail_pyarrow +@skip_pyarrow def test_ignore_leading_whitespace(all_parsers): # see gh-3374, gh-6607 parser = all_parsers @@ -1252,7 +1251,7 @@ def test_empty_with_index(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_multi_index(all_parsers): # see gh-10467 data = "x,y,z" @@ -1299,7 +1298,7 @@ def test_scientific_no_exponent(all_parsers): tm.assert_frame_equal(df_roundtrip, df) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) def test_int64_overflow(all_parsers, conv): data = """ID @@ -1343,7 +1342,7 @@ def test_int64_overflow(all_parsers, conv): parser.read_csv(StringIO(data), converters={"ID": conv}) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( 
"val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] ) @@ -1357,7 +1356,7 @@ def test_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] ) @@ -1371,7 +1370,7 @@ def test_outside_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("exp_data", [[str(-1), str(2 ** 63)], [str(2 ** 63), str(-1)]]) def test_numeric_range_too_wide(all_parsers, exp_data): # No numerical dtype can hold both negative and uint64 @@ -1384,7 +1383,7 @@ def test_numeric_range_too_wide(all_parsers, exp_data): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("iterator", [True, False]) def test_empty_with_nrows_chunksize(all_parsers, iterator): # see gh-9535 @@ -1511,7 +1510,7 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) def test_uneven_lines_with_usecols(all_parsers, usecols): # see gh-12203 @@ -1566,7 +1565,7 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected, pyarrow_xf tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -1607,7 +1606,7 @@ def test_raise_on_sep_with_delim_whitespace(all_parsers): parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("delim_whitespace", [True, False]) def test_single_char_leading_whitespace(all_parsers, delim_whitespace): # see gh-9710 @@ -1666,7 +1665,7 @@ def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_whitespace_lines(all_parsers): parser = all_parsers 
data = """ @@ -1682,7 +1681,7 @@ def test_whitespace_lines(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -1711,7 +1710,7 @@ def test_whitespace_regex_separator(all_parsers, data, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_verbose_read(all_parsers, capsys): parser = all_parsers data = """a,b,c,d @@ -1735,7 +1734,7 @@ def test_verbose_read(all_parsers, capsys): assert captured.out == "Filled 3 NA values in column a\n" -@xfail_pyarrow +@skip_pyarrow def test_verbose_read2(all_parsers, capsys): parser = all_parsers data = """a,b,c,d @@ -1777,7 +1776,7 @@ def test_iteration_open_handle(all_parsers): tm.assert_series_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,thousands,decimal", [ @@ -1809,7 +1808,7 @@ def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_euro_decimal_format(all_parsers): parser = all_parsers data = """Id;Number1;Number2;Text1;Text2;Number3 @@ -1829,7 +1828,7 @@ def test_euro_decimal_format(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_inf_parsing(all_parsers, na_filter): parser = all_parsers @@ -1853,7 +1852,7 @@ def test_inf_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_infinity_parsing(all_parsers, na_filter): parser = all_parsers @@ -1882,7 +1881,7 @@ def test_raise_on_no_columns(all_parsers, nrows): parser.read_csv(StringIO(data)) -@xfail_pyarrow +@skip_pyarrow @td.check_file_leaks def test_memory_map(all_parsers, csv_dir_path): mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") @@ -1896,7 +1895,7 @@ def test_memory_map(all_parsers, csv_dir_path): 
tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_null_byte_char(all_parsers): # see gh-2741 data = "\x00,foo" @@ -1913,7 +1912,7 @@ def test_null_byte_char(all_parsers): parser.read_csv(StringIO(data), names=names) -@xfail_pyarrow +@skip_pyarrow def test_temporary_file(all_parsers): # see gh-13398 parser = all_parsers @@ -2037,7 +2036,7 @@ def seek(self, pos, whence=0): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "kwargs", [dict(), dict(error_bad_lines=True)], # Default is True. # Explicitly pass in. @@ -2056,7 +2055,7 @@ def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): parser.read_csv(StringIO(data), **kwargs) -@xfail_pyarrow +@skip_pyarrow def test_warn_bad_lines(all_parsers, capsys): # see gh-15925 parser = all_parsers @@ -2071,7 +2070,7 @@ def test_warn_bad_lines(all_parsers, capsys): assert "Skipping line 5" in captured.err -@xfail_pyarrow +@skip_pyarrow def test_suppress_error_output(all_parsers, capsys): # see gh-15925 parser = all_parsers @@ -2100,7 +2099,7 @@ def test_filename_with_special_chars(all_parsers, filename): tm.assert_frame_equal(result, df) -@xfail_pyarrow +@skip_pyarrow def test_read_csv_memory_growth_chunksize(all_parsers): # see gh-24805 # @@ -2182,7 +2181,7 @@ def test_first_row_bom(all_parsers, pyarrow_xfail): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_first_row_bom_unquoted(all_parsers): # see gh-36343 parser = all_parsers @@ -2203,7 +2202,7 @@ def test_integer_precision(all_parsers): tm.assert_series_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_file_descriptor_leak(all_parsers): # GH 31488 @@ -2217,7 +2216,7 @@ def test(): td.check_file_leaks(test)() -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("nrows", range(1, 6)) def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): # GH 28071 @@ -2231,7 +2230,7 @@ def 
test_blank_lines_between_header_and_data_rows(all_parsers, nrows): tm.assert_frame_equal(df, ref[:nrows]) -@xfail_pyarrow +@skip_pyarrow def test_no_header_two_extra_columns(all_parsers): # GH 26218 column_names = ["one", "two", "three"] @@ -2262,7 +2261,7 @@ def test_read_csv_with_use_inf_as_na(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_read_table_delim_whitespace_default_sep(all_parsers): # GH: 35958 f = StringIO("a b c\n1 -2 -3\n4 5 6") From 17a502d806063ce77bafc75a5367e450a4cb609b Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 13:43:37 -0500 Subject: [PATCH 41/95] skip tests --- pandas/tests/io/parser/test_dtypes.py | 43 +++++++++++++-------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 23d4fef424624..4ef609cb87980 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -17,10 +17,9 @@ import pandas._testing as tm skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) def test_dtype_all_columns(all_parsers, dtype, check_orig): @@ -47,7 +46,7 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_dtype_all_columns_empty(all_parsers): # see gh-12048 parser = all_parsers @@ -57,7 +56,7 @@ def test_dtype_all_columns_empty(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_dtype_per_column(all_parsers): parser = all_parsers data = """\ @@ -76,7 +75,7 @@ def test_dtype_per_column(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_invalid_dtype_per_column(all_parsers): parser = all_parsers 
data = """\ @@ -90,7 +89,7 @@ def test_invalid_dtype_per_column(all_parsers): parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "dtype", [ @@ -133,7 +132,7 @@ def test_categorical_dtype_single(all_parsers, dtype): tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@skip_pyarrow def test_categorical_dtype_unsorted(all_parsers): # see gh-10153 parser = all_parsers @@ -152,7 +151,7 @@ def test_categorical_dtype_unsorted(all_parsers): tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@skip_pyarrow def test_categorical_dtype_missing(all_parsers): # see gh-10153 parser = all_parsers @@ -171,7 +170,7 @@ def test_categorical_dtype_missing(all_parsers): tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.slow def test_categorical_dtype_high_cardinality_numeric(all_parsers): # see gh-18186 @@ -199,7 +198,7 @@ def test_categorical_dtype_latin1(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@skip_pyarrow def test_categorical_dtype_utf16(all_parsers, csv_dir_path): # see gh-10153 pth = os.path.join(csv_dir_path, "utf16_ex.txt") @@ -214,7 +213,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@skip_pyarrow def test_categorical_dtype_chunksize_infer_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -234,7 +233,7 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers): tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@skip_pyarrow def test_categorical_dtype_chunksize_explicit_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -335,7 +334,7 @@ def test_categorical_coerces_timestamp(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_categorical_coerces_timedelta(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} @@ 
-377,7 +376,7 @@ def test_categorical_unexpected_categories(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_pass_dtype(all_parsers): parser = all_parsers @@ -391,7 +390,7 @@ def test_empty_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_index_pass_dtype(all_parsers): parser = all_parsers @@ -406,7 +405,7 @@ def test_empty_with_index_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_multi_index_pass_dtype(all_parsers): parser = all_parsers @@ -435,7 +434,7 @@ def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers, pyarrow_xfai tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): parser = all_parsers @@ -449,7 +448,7 @@ def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): # see gh-9424 parser = all_parsers @@ -478,7 +477,7 @@ def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) -@xfail_pyarrow +@skip_pyarrow def test_raise_on_passed_int_dtype_with_nas(all_parsers): # see gh-2631 parser = all_parsers @@ -496,7 +495,7 @@ def test_raise_on_passed_int_dtype_with_nas(all_parsers): parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) -@xfail_pyarrow +@skip_pyarrow def test_dtype_with_converters(all_parsers): parser = all_parsers data = """a,b @@ -512,7 +511,7 @@ def test_dtype_with_converters(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "dtype,expected", [ @@ -577,7 +576,7 @@ def test_numeric_dtype(all_parsers, dtype): 
tm.assert_frame_equal(expected, result) -@xfail_pyarrow +@skip_pyarrow def test_boolean_dtype(all_parsers): parser = all_parsers data = "\n".join( From e27d7ef5d3b3b043d547593a8612681aafb253d7 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 13:51:12 -0500 Subject: [PATCH 42/95] C408 failure --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 6c44f1ff0077a..e7089f708d47f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2292,7 +2292,7 @@ def read(self): parse_options = {k: v for k, v in kwdscopy.items() if k in parseoptions} convert_options = {k: v for k, v in kwdscopy.items() if k in convertoptions} headerexists = True if self.header is not None else False - read_options = dict() + read_options = {} skiprows = self.kwds.get("skiprows") if headerexists: From 4e638e9019cf530e53b2670b24204fac8b432db5 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 17:27:47 -0500 Subject: [PATCH 43/95] skip tests --- pandas/tests/io/parser/test_common.py | 1 + pandas/tests/io/parser/test_encoding.py | 12 ++-- pandas/tests/io/parser/test_header.py | 26 ++++----- pandas/tests/io/parser/test_index_col.py | 13 +++-- pandas/tests/io/parser/test_mangle_dupes.py | 10 ++-- pandas/tests/io/parser/test_na_values.py | 40 ++++++------- pandas/tests/io/parser/test_parse_dates.py | 64 ++++++++++----------- pandas/tests/io/parser/test_quoting.py | 10 ++-- pandas/tests/io/parser/test_skiprows.py | 24 ++++---- pandas/tests/io/parser/test_usecols.py | 32 +++++------ 10 files changed, 117 insertions(+), 115 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 8c8ef1ef26de5..443af3a33be18 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -73,6 +73,7 @@ def _set_noconvert_columns(self): tm.assert_frame_equal(result, expected) +@skip_pyarrow def 
test_empty_decimal_marker(all_parsers, pyarrow_xfail): data = """A|B|C 1|2,334|5 diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index c248a878a9d23..dde8277f1732a 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -17,7 +17,7 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@xfail_pyarrow +@skip_pyarrow def test_bytes_io_input(all_parsers): encoding = "cp1255" parser = all_parsers @@ -29,7 +29,7 @@ def test_bytes_io_input(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_read_csv_unicode(all_parsers): parser = all_parsers data = BytesIO("\u0141aski, Jan;1".encode()) @@ -39,7 +39,7 @@ def test_read_csv_unicode(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("sep", [",", "\t"]) @pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) def test_utf16_bom_skiprows(all_parsers, sep, encoding): @@ -74,7 +74,7 @@ def test_utf16_bom_skiprows(all_parsers, sep, encoding): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_utf16_example(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "utf16_ex.txt") parser = all_parsers @@ -82,7 +82,7 @@ def test_utf16_example(all_parsers, csv_dir_path): assert len(result) == 50 -@xfail_pyarrow +@skip_pyarrow def test_unicode_encoding(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "unicode_series.csv") parser = all_parsers @@ -194,7 +194,7 @@ def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding) tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_encoding_named_temp_file(all_parsers): # see gh-31819 parser = all_parsers diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 95d9a23eb4d92..0de6e389dd09b 100644 --- a/pandas/tests/io/parser/test_header.py +++ 
b/pandas/tests/io/parser/test_header.py @@ -18,7 +18,7 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@xfail_pyarrow +@skip_pyarrow def test_read_with_bad_header(all_parsers): parser = all_parsers msg = r"but only \d+ lines in file" @@ -86,7 +86,7 @@ def test_no_header_prefix(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_header_with_index_col(all_parsers): parser = all_parsers data = """foo,1,2,3 @@ -124,7 +124,7 @@ def test_header_not_first_line(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_header_multi_index(all_parsers): parser = all_parsers expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) @@ -190,7 +190,7 @@ def test_header_multi_index_invalid(all_parsers, kwargs, msg): _TestTuple = namedtuple("names", ["first", "second"]) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -238,7 +238,7 @@ def test_header_multi_index_common_format1(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -285,7 +285,7 @@ def test_header_multi_index_common_format2(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -333,7 +333,7 @@ def test_header_multi_index_common_format3(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_header_multi_index_common_format_malformed1(all_parsers): parser = all_parsers expected = DataFrame( @@ -354,7 +354,7 @@ def test_header_multi_index_common_format_malformed1(all_parsers): tm.assert_frame_equal(expected, result) -@xfail_pyarrow +@skip_pyarrow def test_header_multi_index_common_format_malformed2(all_parsers): parser = all_parsers expected = DataFrame( @@ -376,7 +376,7 @@ def test_header_multi_index_common_format_malformed2(all_parsers): tm.assert_frame_equal(expected, result) 
-@xfail_pyarrow +@skip_pyarrow def test_header_multi_index_common_format_malformed3(all_parsers): parser = all_parsers expected = DataFrame( @@ -456,7 +456,7 @@ def test_non_int_header(all_parsers, header): parser.read_csv(StringIO(data), header=header) -@xfail_pyarrow +@skip_pyarrow def test_singleton_header(all_parsers): # see gh-7757 data = """a,b,c\n0,1,2\n1,2,3""" @@ -467,7 +467,7 @@ def test_singleton_header(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -514,7 +514,7 @@ def test_mangles_multi_index(all_parsers, data, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("index_col", [None, [0]]) @pytest.mark.parametrize( "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])] @@ -558,7 +558,7 @@ def test_multi_index_unnamed(all_parsers, index_col, columns): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_read_csv_multiindex_columns(all_parsers): # GH#6051 parser = all_parsers diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index a17b858940b2f..0ef11f8a91576 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -70,7 +70,7 @@ def test_index_col_is_true(all_parsers): parser.read_csv(StringIO(data), index_col=True) -@xfail_pyarrow +@skip_pyarrow def test_infer_index_col(all_parsers): data = """A,B,C foo,1,2,3 @@ -88,7 +88,7 @@ def test_infer_index_col(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "index_col,kwargs", [ @@ -137,7 +137,7 @@ def test_index_col_empty_data(all_parsers, index_col, kwargs): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_index_col_false(all_parsers): # see gh-10413 data = "x,y" @@ -173,7 +173,7 @@ def 
test_multi_index_naming(all_parsers, index_names): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_multi_index_naming_not_all_at_beginning(all_parsers): parser = all_parsers data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" @@ -188,7 +188,7 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_no_multi_index_level_names_empty(all_parsers): # GH 10984 parser = all_parsers @@ -200,7 +200,7 @@ def test_no_multi_index_level_names_empty(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_header_with_index_col(all_parsers): # GH 33476 parser = all_parsers @@ -224,6 +224,7 @@ def test_header_with_index_col(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.slow def test_index_col_large_csv(all_parsers): # https://github.com/pandas-dev/pandas/issues/37094 diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index bef2b08a308f6..8fb7f3c093ae0 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -10,10 +10,10 @@ from pandas import DataFrame import pandas._testing as tm -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("kwargs", [{}, {"mangle_dupe_cols": True}]) def test_basic(all_parsers, kwargs): # TODO: add test for condition "mangle_dupe_cols=False" @@ -27,7 +27,7 @@ def test_basic(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_basic_names(all_parsers): # See gh-7160 parser = all_parsers @@ -48,7 +48,7 @@ def test_basic_names_raise(all_parsers): parser.read_csv(StringIO(data), names=["a", "b", "a"]) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -116,7 +116,7 
@@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) -@xfail_pyarrow +@skip_pyarrow def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 orig_key = "0" diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index d84a886e2451b..6e56d325efdad 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -16,7 +16,7 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@xfail_pyarrow +@skip_pyarrow def test_string_nas(all_parsers): parser = all_parsers data = """A,B,C @@ -32,7 +32,7 @@ def test_string_nas(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_detect_string_na(all_parsers): parser = all_parsers data = """A,B @@ -47,7 +47,7 @@ def test_detect_string_na(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "na_values", [ @@ -85,7 +85,7 @@ def test_non_string_na_values(all_parsers, data, na_values): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_default_na_values(all_parsers): _NA_VALUES = { "-1.#IND", @@ -133,7 +133,7 @@ def f(i, v): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("na_values", ["baz", ["baz"]]) def test_custom_na_values(all_parsers, na_values): parser = all_parsers @@ -167,7 +167,7 @@ def test_bool_na_values(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_na_value_dict(all_parsers): data = """A,B,C foo,bar,NA @@ -186,7 +186,7 @@ def test_na_value_dict(all_parsers): tm.assert_frame_equal(df, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "index_col,expected", [ @@ -220,7 +220,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow 
@pytest.mark.parametrize( "kwargs,expected", [ @@ -308,7 +308,7 @@ def test_no_na_values_no_keep_default(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_no_keep_default_na_dict_na_values(all_parsers): # see gh-19227 data = "a,b\n,2" @@ -320,7 +320,7 @@ def test_no_keep_default_na_dict_na_values(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_no_keep_default_na_dict_na_scalar_values(all_parsers): # see gh-19227 # @@ -332,7 +332,7 @@ def test_no_keep_default_na_dict_na_scalar_values(all_parsers): tm.assert_frame_equal(df, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): # see gh-19227 @@ -362,7 +362,7 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "na_filter,row_data", [ @@ -384,7 +384,7 @@ def test_na_values_na_filter_override(all_parsers, na_filter, row_data): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_na_trailing_columns(all_parsers): parser = all_parsers data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax @@ -412,7 +412,7 @@ def test_na_trailing_columns(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "na_values,row_data", [ @@ -431,7 +431,7 @@ def test_na_values_scalar(all_parsers, na_values, row_data): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_na_values_dict_aliasing(all_parsers): parser = all_parsers na_values = {"a": 2, "b": 1} @@ -447,7 +447,7 @@ def test_na_values_dict_aliasing(all_parsers): tm.assert_dict_equal(na_values, na_values_copy) -@xfail_pyarrow +@skip_pyarrow def test_na_values_dict_col_index(all_parsers): # see gh-14203 data = 
"a\nfoo\n1" @@ -459,7 +459,7 @@ def test_na_values_dict_col_index(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -518,7 +518,7 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): # see gh-20377 @@ -534,7 +534,7 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data, na_values", [ @@ -563,7 +563,7 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): ) -@xfail_pyarrow +@skip_pyarrow def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 07faab37c6997..d4c0d28214849 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -34,10 +34,10 @@ else: date_strategy = st.datetimes() -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@xfail_pyarrow +@skip_pyarrow def test_separator_date_conflict(all_parsers): # Regression test for gh-4678 # @@ -59,7 +59,7 @@ def test_separator_date_conflict(all_parsers): tm.assert_frame_equal(df, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col_custom(all_parsers, keep_date_col): data = """\ @@ -203,7 +203,7 @@ def date_parser(*date_cols): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("container", [list, tuple, Index, Series]) @pytest.mark.parametrize("dim", [1, 2]) def test_concat_date_col_fail(container, dim): @@ -216,7 +216,7 @@ def 
test_concat_date_col_fail(container, dim): parsing.concat_date_cols(date_cols) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col(all_parsers, keep_date_col): data = """\ @@ -376,7 +376,7 @@ def test_date_col_as_index_col(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -441,7 +441,7 @@ def test_multiple_date_cols_int_cast(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_multiple_date_col_timestamp_parse(all_parsers): parser = all_parsers data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 @@ -476,7 +476,7 @@ def test_multiple_date_col_timestamp_parse(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_multiple_date_cols_with_header(all_parsers): parser = all_parsers data = """\ @@ -646,7 +646,7 @@ def test_date_parser_int_bug(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_nat_parse(all_parsers): # see gh-3062 parser = all_parsers @@ -662,7 +662,7 @@ def test_nat_parse(all_parsers): tm.assert_frame_equal(result, df) -@xfail_pyarrow +@skip_pyarrow def test_csv_custom_parser(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -677,7 +677,7 @@ def test_csv_custom_parser(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_parse_dates_implicit_first_col(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -691,7 +691,7 @@ def test_parse_dates_implicit_first_col(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_parse_dates_string(all_parsers): data = """date,A,B,C 20090101,a,1,2 @@ -736,7 +736,7 @@ def test_yy_format_with_year_first(all_parsers, parse_dates): tm.assert_frame_equal(result, expected) -@xfail_pyarrow 
+@skip_pyarrow @pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]]) def test_parse_dates_column_list(all_parsers, parse_dates): data = "a,b,c\n01/01/2010,1,15/02/2010" @@ -753,7 +753,7 @@ def test_parse_dates_column_list(all_parsers, parse_dates): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_parse_dates(all_parsers, index_col): data = """index1,index2,A,B,C @@ -799,7 +799,7 @@ def test_multi_index_parse_dates(all_parsers, index_col): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("kwargs", [{"dayfirst": True}, {"day_first": True}]) def test_parse_dates_custom_euro_format(all_parsers, kwargs): parser = all_parsers @@ -844,7 +844,7 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs): ) -@xfail_pyarrow +@skip_pyarrow def test_parse_tz_aware(all_parsers): # See gh-1693 parser = all_parsers @@ -858,7 +858,7 @@ def test_parse_tz_aware(all_parsers): assert result.index.tz is pytz.utc -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "parse_dates,index_col", [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], @@ -959,7 +959,7 @@ def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_multiple_date_cols_chunked(all_parsers): parser = all_parsers data = """\ @@ -1052,7 +1052,7 @@ def test_multiple_date_cols_chunked(all_parsers): tm.assert_frame_equal(chunks[2], expected[4:]) -@xfail_pyarrow +@skip_pyarrow def test_multiple_date_col_named_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1076,7 +1076,7 @@ def test_multiple_date_col_named_index_compat(all_parsers): tm.assert_frame_equal(with_indices, with_names) -@xfail_pyarrow +@skip_pyarrow def test_multiple_date_col_multiple_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1144,7 +1144,7 @@ def 
test_bad_date_parse(all_parsers, cache_dates, value): ) -@xfail_pyarrow +@skip_pyarrow def test_parse_dates_empty_string(all_parsers): # see gh-2263 parser = all_parsers @@ -1157,7 +1157,7 @@ def test_parse_dates_empty_string(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -1197,7 +1197,7 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -1226,7 +1226,7 @@ def test_parse_date_time_multi_level_column_name(all_parsers, date_parser, warni tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -1315,7 +1315,7 @@ def test_parse_date_time(all_parsers, data, kwargs, expected, date_parser, warni tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_fields, FutureWarning], [pd.to_datetime, None]), @@ -1338,7 +1338,7 @@ def test_parse_date_fields(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ( @@ -1370,7 +1370,7 @@ def test_parse_date_all_fields(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ( @@ -1402,7 +1402,7 @@ def test_datetime_fractional_seconds(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_generic(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." 
@@ -1421,7 +1421,7 @@ def test_generic(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_date_parser_resolution_if_not_ns(all_parsers): # see gh-10245 parser = all_parsers @@ -1519,7 +1519,7 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "date_string", ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], @@ -1531,7 +1531,7 @@ def test_invalid_parse_delimited_date(all_parsers, date_string): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "date_string,dayfirst,expected", [ @@ -1597,7 +1597,7 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti assert result == expected -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "names, usecols, parse_dates, missing_cols", [ diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index 30ffc598563e7..a93dbde24b001 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -17,7 +17,7 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "kwargs,msg", [ @@ -37,7 +37,7 @@ def test_bad_quote_char(all_parsers, kwargs, msg): parser.read_csv(StringIO(data), **kwargs) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "quoting,msg", [ @@ -62,7 +62,7 @@ def test_quote_char_basic(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) def test_quote_char_various(all_parsers, quote_char): parser = all_parsers @@ -75,7 +75,7 @@ def test_quote_char_various(all_parsers, quote_char): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) 
@pytest.mark.parametrize("quote_char", ["", None]) def test_null_quote_char(all_parsers, quoting, quote_char): @@ -95,7 +95,7 @@ def test_null_quote_char(all_parsers, quoting, quote_char): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "kwargs,exp_data", [ diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index cf5eb3f813169..6d85e01c6fd4a 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -14,10 +14,10 @@ from pandas import DataFrame, Index import pandas._testing as tm -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("skiprows", [list(range(6)), 6]) def test_skip_rows_bug(all_parsers, skiprows): # see gh-505 @@ -45,7 +45,7 @@ def test_skip_rows_bug(all_parsers, skiprows): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_deep_skip_rows(all_parsers): # see gh-4382 parser = all_parsers @@ -61,7 +61,7 @@ def test_deep_skip_rows(all_parsers): tm.assert_frame_equal(result, condensed_result) -@xfail_pyarrow +@skip_pyarrow def test_skip_rows_blank(all_parsers): # see gh-9832 parser = all_parsers @@ -88,7 +88,7 @@ def test_skip_rows_blank(all_parsers): tm.assert_frame_equal(data, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -129,7 +129,7 @@ def test_skip_row_with_newline(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_skip_row_with_quote(all_parsers): # see gh-12775 and gh-10911 parser = all_parsers @@ -145,7 +145,7 @@ def test_skip_row_with_quote(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,exp_data", [ @@ -181,7 +181,7 @@ def test_skip_row_with_newline_and_quote(all_parsers, data, 
exp_data): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "line_terminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" ) @@ -218,7 +218,7 @@ def test_skiprows_lineterminator(all_parsers, line_terminator): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_skiprows_infield_quote(all_parsers): # see gh-14459 parser = all_parsers @@ -229,7 +229,7 @@ def test_skiprows_infield_quote(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -245,7 +245,7 @@ def test_skip_rows_callable(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_skip_rows_skip_all(all_parsers): parser = all_parsers data = "a\n1\n2\n3\n4\n5" @@ -255,7 +255,7 @@ def test_skip_rows_skip_all(all_parsers): parser.read_csv(StringIO(data), skiprows=lambda x: True) -@xfail_pyarrow +@skip_pyarrow def test_skip_rows_bad_callable(all_parsers): msg = "by zero" parser = all_parsers diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index 8f72a32ed99e5..a3a2b3e984339 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -55,7 +55,7 @@ def test_usecols(all_parsers, usecols): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_usecols_with_names(all_parsers): data = """\ a,b,c @@ -71,7 +71,7 @@ def test_usecols_with_names(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] ) @@ -88,7 +88,7 @@ def test_usecols_relative_to_names(all_parsers, names, usecols): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_usecols_relative_to_names2(all_parsers): # see gh-5766 data = """\ @@ -105,7 +105,7 @@ def 
test_usecols_relative_to_names2(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_usecols_name_length_conflict(all_parsers): data = """\ 1,2,3 @@ -134,7 +134,7 @@ def test_usecols_single_string(all_parsers): parser.read_csv(StringIO(data), usecols="foo") -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] ) @@ -175,7 +175,7 @@ def test_usecols_index_col_conflict2(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_usecols_implicit_index_col(all_parsers): # see gh-2654 parser = all_parsers @@ -186,7 +186,7 @@ def test_usecols_implicit_index_col(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_usecols_regex_sep(all_parsers): # see gh-2733 parser = all_parsers @@ -197,7 +197,7 @@ def test_usecols_regex_sep(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_usecols_with_whitespace(all_parsers): parser = all_parsers data = "a b c\n4 apple bat 5.7\n8 orange cow 10" @@ -227,7 +227,7 @@ def test_usecols_with_integer_like_header(all_parsers, usecols, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) def test_usecols_with_parse_dates(all_parsers, usecols): # see gh-9755 @@ -307,7 +307,7 @@ def test_usecols_with_parse_dates3(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_usecols_with_parse_dates4(all_parsers): data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" usecols = list("abcdefghij") @@ -331,7 +331,7 @@ def test_usecols_with_parse_dates4(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) @pytest.mark.parametrize( "names", @@ -425,7 +425,7 @@ def 
test_usecols_with_multi_byte_characters(all_parsers, usecols): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_usecols(all_parsers): data = "a,b,c\n1,2,3\n4,5,6" expected = DataFrame() @@ -446,7 +446,7 @@ def test_np_array_usecols(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "usecols,expected", [ @@ -479,7 +479,7 @@ def test_callable_usecols(all_parsers, usecols, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) def test_incomplete_first_row(all_parsers, usecols): # see gh-6710 @@ -492,7 +492,7 @@ def test_incomplete_first_row(all_parsers, usecols): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,usecols,kwargs,expected", [ @@ -582,7 +582,7 @@ def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request): if all_parsers.engine != "c": From 4f7ebd05133cb6340c70d6226741ce1dbf7d199a Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 23:24:00 -0500 Subject: [PATCH 44/95] simplify import_optional_dependency code --- pandas/compat/_optional.py | 26 ++++++-------------------- pandas/io/parsers.py | 2 +- 2 files changed, 7 insertions(+), 21 deletions(-) diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 89b36e07f3c1d..533e67acfa2f4 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -1,8 +1,6 @@ import distutils.version import importlib -import sys import types -from typing import Optional import warnings # Update install.rst when updating versions! 
@@ -45,7 +43,6 @@ "pandas_gbq": "pandas-gbq", "sqlalchemy": "SQLAlchemy", "jinja2": "Jinja2", - "pyarrow.csv": "pyarrow", } @@ -61,11 +58,7 @@ def _get_version(module: types.ModuleType) -> str: def import_optional_dependency( - name: str, - extra: str = "", - raise_on_missing: bool = True, - on_version: str = "raise", - min_version: Optional[str] = None, + name: str, extra: str = "", raise_on_missing: bool = True, on_version: str = "raise" ): """ Import an optional dependency. @@ -77,7 +70,8 @@ def import_optional_dependency( Parameters ---------- name : str - The module name. + The module name. This should be top-level only, so that the + version may be checked. extra : str Additional text to include in the ImportError message. raise_on_missing : bool, default True @@ -91,8 +85,6 @@ def import_optional_dependency( * ignore: Return the module, even if the version is too old. It's expected that users validate the version locally when using ``on_version="ignore"`` (see. ``io/html.py``) - min_version: Optional[str] - Specify the minimum version Returns ------- @@ -117,16 +109,10 @@ def import_optional_dependency( raise ImportError(msg) from None else: return None - # Handle submodules: if we have submodule, grab parent module from sys.modules - parent = name.split(".")[0] - if parent != name: - install_name = parent - module_to_get = sys.modules[install_name] - else: - module_to_get = module - minimum_version = min_version if min_version is not None else VERSIONS.get(name) + + minimum_version = VERSIONS.get(name) if minimum_version: - version = _get_version(module_to_get) + version = _get_version(module) if distutils.version.LooseVersion(version) < minimum_version: assert on_version in {"warn", "raise", "ignore"} msg = ( diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e7089f708d47f..a0bc537dce6dc 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2273,7 +2273,7 @@ def __init__(self, src, **kwds): self.src = BytesIOWrapper(self.src, 
encoding=encoding) def read(self): - pyarrow = import_optional_dependency("pyarrow.csv", min_version="0.15.0") + pyarrow = import_optional_dependency("pyarrow.csv") kwdscopy = {k: v for k, v in self.kwds.items() if v is not None} # these are kwargs passed to pyarrow parseoptions = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"} From 69b3b42ef3521bb3fb81e78b640bcc96745271db Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 23:29:06 -0500 Subject: [PATCH 45/95] move whatsnew to 1.3 --- doc/source/whatsnew/v1.1.0.rst | 1 - doc/source/whatsnew/v1.2.0.rst | 5 ----- doc/source/whatsnew/v1.3.0.rst | 6 ++++++ 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 50443f8810e5f..e054ac830ce41 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -252,7 +252,6 @@ If needed you can adjust the bins with the argument ``offset`` (a :class:`Timede For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. - fsspec now used for filesystem handling ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f6493c68e5aa4..af9219bc25931 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -249,11 +249,6 @@ example where the index name is preserved: The same is true for :class:`MultiIndex`, but the logic is applied separately on a level-by-level basis. -read_csv() now accepts pyarrow as an engine -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines -with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) .. 
_whatsnew_120.groupby_ewm: Groupby supports EWM operations directly diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 90f611c55e710..130ee90349cfe 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -13,6 +13,12 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +read_csv() now accepts pyarrow as an engine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines +with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) + .. _whatsnew_130.enhancements.other: From 9d5cf249e74dce82a30656cedd0cf514aa9c3960 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 23:40:15 -0500 Subject: [PATCH 46/95] clean _get_options_with_defaults --- pandas/io/parsers.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a0bc537dce6dc..68657a86ea27f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -868,14 +868,17 @@ def _get_options_with_defaults(self, engine): for argname, default in _parser_defaults.items(): value = kwds.get(argname, default) - if argname in _pyarrow_unsupported: - if engine == "pyarrow" and value != default: - raise ValueError( - f"The {repr(argname)} option is not supported with the " - f"'pyarrow' engine" - ) - # see gh-12935 - if argname == "mangle_dupe_cols" and not value: + if ( + engine == "pyarrow" + and argname in _pyarrow_unsupported + and value != default + ): + raise ValueError( + f"The {repr(argname)} option is not supported with the " + f"'pyarrow' engine" + ) + elif argname == "mangle_dupe_cols" and value is False: + # GH12935 raise ValueError("Setting mangle_dupe_cols=False is not supported yet") else: options[argname] = value From 2d4a0aa70ae26bc48a328bcb5f240d4e8d677b34 Mon Sep 17 00:00:00 
2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 23:46:10 -0500 Subject: [PATCH 47/95] clean _clean_options --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 68657a86ea27f..5a111385c4455 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -936,7 +936,7 @@ def _clean_options(self, options, engine): delim_whitespace = options["delim_whitespace"] if sep is None and not delim_whitespace: - if engine == "c" or engine == "pyarrow": + if engine in ("c", "pyarrow"): fallback_reason = ( f"the {engine} engine does not support " "sep=None with delim_whitespace=False" From e46b95d7b4432ec5b00a1094dbbf9f984d8d8e7e Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 23:54:48 -0500 Subject: [PATCH 48/95] clean _read --- pandas/io/parsers.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5a111385c4455..bf33040b075d4 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -452,20 +452,20 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): kwds["parse_dates"] = True # Extract some of the arguments (pass chunksize on). 
+ iterator = kwds.get("iterator", False) + if kwds.get("engine") == "pyarrow" and iterator is True: + raise ValueError( + "The 'iterator' option is not supported with the 'pyarrow' engine" + ) + chunksize = kwds.get("chunksize", None) - # chunksize and iterator not supported for pyarrow - if kwds.get("engine") == "pyarrow": - if iterator: - raise ValueError( - "The 'iterator' option is not supported with the 'pyarrow' engine" - ) - if chunksize is not None: - raise ValueError( - "The 'chunksize' option is not supported with the 'pyarrow' engine" - ) - else: - chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1) + if kwds.get("engine") == "pyarrow" and chunksize is not None: + raise ValueError( + "The 'chunksize' option is not supported with the 'pyarrow' engine" + ) + chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1) + nrows = kwds.get("nrows", None) # Check for duplicates in names. From 1844a6c48ebdedfb77e50facb19462f3b7f49727 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Thu, 10 Dec 2020 01:20:52 -0500 Subject: [PATCH 49/95] extract kwd validation from __init__ --- pandas/io/parsers.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bf33040b075d4..4122d2d263d3f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2259,7 +2259,16 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - encoding = kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8" + self._validate_kwds() + + if isinstance(self.src, TextIOBase): + self.src = BytesIOWrapper(self.src, encoding=self.encoding) + + def _validate_kwds(self): + kwds = self.kwds + self.encoding = ( + kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8" + ) self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) na_values = kwds["na_values"] @@ -2272,8 +2281,6 @@ def __init__(self, src, **kwds): kwds["na_values"], 
keep_default_na=kwds["keep_default_na"] )[0] ) - if isinstance(self.src, TextIOBase): - self.src = BytesIOWrapper(self.src, encoding=encoding) def read(self): pyarrow = import_optional_dependency("pyarrow.csv") From 94178e467697793af549e858bf2c5d7164c45353 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 11 Dec 2020 17:13:47 -0500 Subject: [PATCH 50/95] revert mistaken refactor --- pandas/io/parsers.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4122d2d263d3f..fd674d301d9ac 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -454,17 +454,19 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # Extract some of the arguments (pass chunksize on). iterator = kwds.get("iterator", False) - if kwds.get("engine") == "pyarrow" and iterator is True: - raise ValueError( - "The 'iterator' option is not supported with the 'pyarrow' engine" - ) - chunksize = kwds.get("chunksize", None) - if kwds.get("engine") == "pyarrow" and chunksize is not None: - raise ValueError( - "The 'chunksize' option is not supported with the 'pyarrow' engine" - ) - chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1) + if kwds.get("engine") == "pyarrow": + if iterator: + raise ValueError( + "The 'iterator' option is not supported with the 'pyarrow' engine" + ) + + if chunksize is not None: + raise ValueError( + "The 'chunksize' option is not supported with the 'pyarrow' engine" + ) + else: + chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1) nrows = kwds.get("nrows", None) From 13a24880e8c575b37e119073c78405b294adb66d Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 11 Dec 2020 17:17:59 -0500 Subject: [PATCH 51/95] typing --- pandas/io/parsers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index fd674d301d9ac..c3b0b2d46d7ee 100644 --- a/pandas/io/parsers.py 
+++ b/pandas/io/parsers.py @@ -2238,7 +2238,11 @@ class BytesIOWrapper: Allows the pyarrow engine for read_csv() to read from string buffers """ - def __init__(self, string_buffer: StringIO, encoding: str = "utf-8"): + def __init__( + self, + string_buffer: Union[StringIO, TextIOBase], + encoding: Optional[str] = "utf-8", + ): self.string_buffer = string_buffer self.encoding = encoding From a98cffd33a89ec98ed1f1ea88e8534fdc56d6095 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 11 Dec 2020 17:25:18 -0500 Subject: [PATCH 52/95] REF: ArrowParserWrapper.read --- pandas/io/parsers.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c3b0b2d46d7ee..93d5171fbad45 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2290,7 +2290,7 @@ def _validate_kwds(self): def read(self): pyarrow = import_optional_dependency("pyarrow.csv") - kwdscopy = {k: v for k, v in self.kwds.items() if v is not None} + self.kwds = {k: v for k, v in self.kwds.items() if v is not None} # these are kwargs passed to pyarrow parseoptions = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"} convertoptions = { @@ -2300,13 +2300,13 @@ def read(self): "false_values", } # rename some arguments to pass to pyarrow - kwdscopy["include_columns"] = kwdscopy.get("usecols") - kwdscopy["null_values"] = kwdscopy.get("na_values") - kwdscopy["escape_char"] = kwdscopy.get("escapechar") - kwdscopy["ignore_empty_lines"] = kwdscopy.get("skip_blank_lines") + self.kwds["include_columns"] = self.kwds.pop("usecols") + self.kwds["null_values"] = self.kwds.pop("na_values") + self.kwds["escape_char"] = self.kwds.pop("escapechar") + self.kwds["ignore_empty_lines"] = self.kwds.pop("skip_blank_lines") - parse_options = {k: v for k, v in kwdscopy.items() if k in parseoptions} - convert_options = {k: v for k, v in kwdscopy.items() if k in convertoptions} + parse_options = {k: v for k, v in self.kwds.items() if k in 
parseoptions} + convert_options = {k: v for k, v in self.kwds.items() if k in convertoptions} headerexists = True if self.header is not None else False read_options = {} From a32e3a595114650e7272a212552dd1ad7a6f7e22 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 11 Dec 2020 18:07:52 -0500 Subject: [PATCH 53/95] REF: ArrowParserWrapper.read --- pandas/io/parsers.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 93d5171fbad45..5efedb1d5a026 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2292,21 +2292,26 @@ def read(self): pyarrow = import_optional_dependency("pyarrow.csv") self.kwds = {k: v for k, v in self.kwds.items() if v is not None} # these are kwargs passed to pyarrow - parseoptions = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"} - convertoptions = { + parse_kwargs = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"} + convert_kwargs = { "include_columns", "null_values", "true_values", "false_values", } # rename some arguments to pass to pyarrow - self.kwds["include_columns"] = self.kwds.pop("usecols") - self.kwds["null_values"] = self.kwds.pop("na_values") - self.kwds["escape_char"] = self.kwds.pop("escapechar") - self.kwds["ignore_empty_lines"] = self.kwds.pop("skip_blank_lines") + mapping = { + "usecols": "include_columns", + "na_values": "null_values", + "escapechar": "escape_char", + "skip_blank_lines": "ignore_empty_lines", + } + for pandas_name, pyarrow_name in mapping.items(): + if pandas_name in self.kwds: + self.kwds[pyarrow_name] = self.kwds.pop(pandas_name) - parse_options = {k: v for k, v in self.kwds.items() if k in parseoptions} - convert_options = {k: v for k, v in self.kwds.items() if k in convertoptions} + parse_options = {k: v for k, v in self.kwds.items() if k in parse_kwargs} + convert_options = {k: v for k, v in self.kwds.items() if k in convert_kwargs} headerexists = True if self.header 
is not None else False read_options = {} From 89416cc217dc61b510da8b796511d6ea24fbff36 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 12:16:23 -0500 Subject: [PATCH 54/95] remove optional dependency code --- pandas/tests/test_optional_dependency.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index 61dbd81e2cee5..e5ed69b7703b1 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -27,15 +27,14 @@ def test_bad_version(monkeypatch): module = types.ModuleType(name) module.__version__ = "0.9.0" sys.modules[name] = module + monkeypatch.setitem(VERSIONS, name, "1.0.0") match = "Pandas requires .*1.0.0.* of .fakemodule.*'0.9.0'" with pytest.raises(ImportError, match=match): - import_optional_dependency("fakemodule", min_version="1.0.0") + import_optional_dependency("fakemodule") with tm.assert_produces_warning(UserWarning): - result = import_optional_dependency( - "fakemodule", min_version="1.0.0", on_version="warn" - ) + result = import_optional_dependency("fakemodule", on_version="warn") assert result is None module.__version__ = "1.0.0" # exact match is OK From 9687990b0c2ed662af78bcdf0314ca7cfce5bd2f Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 13:08:31 -0500 Subject: [PATCH 55/95] REF: ArrowParserWrapper.read --- pandas/io/parsers.py | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5efedb1d5a026..41ed74309a934 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2292,13 +2292,6 @@ def read(self): pyarrow = import_optional_dependency("pyarrow.csv") self.kwds = {k: v for k, v in self.kwds.items() if v is not None} # these are kwargs passed to pyarrow - parse_kwargs = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"} - 
convert_kwargs = { - "include_columns", - "null_values", - "true_values", - "false_values", - } # rename some arguments to pass to pyarrow mapping = { "usecols": "include_columns", @@ -2310,19 +2303,25 @@ def read(self): if pandas_name in self.kwds: self.kwds[pyarrow_name] = self.kwds.pop(pandas_name) - parse_options = {k: v for k, v in self.kwds.items() if k in parse_kwargs} - convert_options = {k: v for k, v in self.kwds.items() if k in convert_kwargs} - headerexists = True if self.header is not None else False - read_options = {} + parse_options = { + k: v + for k, v in self.kwds.items() + if k is not None + and k in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines") + } + convert_options = { + k: v + for k, v in self.kwds.items() + if k is not None + and k in ("include_columns", "null_values", "true_values", "false_values") + } - skiprows = self.kwds.get("skiprows") - if headerexists: + read_options = {"autogenerate_column_names": self.header is None} + if self.header is not None: read_options["skip_rows"] = self.header - read_options["autogenerate_column_names"] = False - else: - if skiprows is not None: - read_options["skip_rows"] = skiprows - read_options["autogenerate_column_names"] = True + elif self.kwds.get("skiprows") is not None: + read_options["skip_rows"] = self.kwds.get("skiprows") + read_options = pyarrow.ReadOptions(**read_options) table = pyarrow.read_csv( self.src, @@ -2332,7 +2331,7 @@ def read(self): ) frame = table.to_pandas() num_cols = len(frame.columns) - if not headerexists: + if self.header is None: if self.names is None: if self.prefix is not None: self.names = [f"{self.prefix}{i}" for i in range(num_cols)] From 98f20617aa22a8443339962f3ba6f1ad955a4246 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 13:14:47 -0500 Subject: [PATCH 56/95] REF: ArrowParserWrapper.read --- pandas/io/parsers.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/io/parsers.py 
b/pandas/io/parsers.py index 41ed74309a934..f2544c078f2c3 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2301,19 +2301,23 @@ def read(self): } for pandas_name, pyarrow_name in mapping.items(): if pandas_name in self.kwds: - self.kwds[pyarrow_name] = self.kwds.pop(pandas_name) + value = self.kwds.pop(pandas_name) + if value is not None: + self.kwds[pyarrow_name] = value parse_options = { - k: v - for k, v in self.kwds.items() - if k is not None - and k in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines") + option_name: option_value + for option_name, option_value in self.kwds.items() + if option_value is not None + and option_name + in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines") } convert_options = { - k: v - for k, v in self.kwds.items() - if k is not None - and k in ("include_columns", "null_values", "true_values", "false_values") + option_name: option_value + for option_name, option_value in self.kwds.items() + if option_value is not None + and option_name + in ("include_columns", "null_values", "true_values", "false_values") } read_options = {"autogenerate_column_names": self.header is None} From ec01fad2dc7864955d0edd34bfeb71b849e656ea Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 13:34:05 -0500 Subject: [PATCH 57/95] REF: ArrowParserWrapper.read --- pandas/io/parsers.py | 46 +++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f2544c078f2c3..474171223292e 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2288,10 +2288,7 @@ def _validate_kwds(self): )[0] ) - def read(self): - pyarrow = import_optional_dependency("pyarrow.csv") - self.kwds = {k: v for k, v in self.kwds.items() if v is not None} - # these are kwargs passed to pyarrow + def _get_pyarrow_options(self): # rename some arguments to pass to pyarrow mapping = { "usecols": "include_columns", @@ -2300,40 
+2297,30 @@ def read(self): "skip_blank_lines": "ignore_empty_lines", } for pandas_name, pyarrow_name in mapping.items(): - if pandas_name in self.kwds: - value = self.kwds.pop(pandas_name) - if value is not None: - self.kwds[pyarrow_name] = value + if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None: + self.kwds[pyarrow_name] = self.kwds.pop(pandas_name) - parse_options = { + self.parse_options = { option_name: option_value for option_name, option_value in self.kwds.items() if option_value is not None and option_name in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines") } - convert_options = { + self.convert_options = { option_name: option_value for option_name, option_value in self.kwds.items() if option_value is not None and option_name in ("include_columns", "null_values", "true_values", "false_values") } - - read_options = {"autogenerate_column_names": self.header is None} + self.read_options = {"autogenerate_column_names": self.header is None} if self.header is not None: - read_options["skip_rows"] = self.header + self.read_options["skip_rows"] = self.header elif self.kwds.get("skiprows") is not None: - read_options["skip_rows"] = self.kwds.get("skiprows") + self.read_options["skip_rows"] = self.kwds.get("skiprows") - read_options = pyarrow.ReadOptions(**read_options) - table = pyarrow.read_csv( - self.src, - read_options=read_options, - parse_options=pyarrow.ParseOptions(**parse_options), - convert_options=pyarrow.ConvertOptions(**convert_options), - ) - frame = table.to_pandas() + def _finalize_output(self, frame): num_cols = len(frame.columns) if self.header is None: if self.names is None: @@ -2354,6 +2341,21 @@ def read(self): frame = frame.astype(self.kwds.get("dtype")) return frame + def read(self): + pyarrow = import_optional_dependency("pyarrow.csv") + + self._get_pyarrow_options() + + table = pyarrow.read_csv( + self.src, + read_options=pyarrow.ReadOptions(**self.read_options), + 
parse_options=pyarrow.ParseOptions(**self.parse_options), + convert_options=pyarrow.ConvertOptions(**self.convert_options), + ) + + frame = table.to_pandas() + return self._finalize_output(frame) + def TextParser(*args, **kwds): """ From 7b9572b960492a2ee6d2d6fd25dab031130ee260 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 13:48:40 -0500 Subject: [PATCH 58/95] rewrite docs --- doc/source/user_guide/io.rst | 21 ++++++++++----------- pandas/io/parsers.py | 7 +++---- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 6828dbf319a7e..47295f2cb5bc1 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -159,10 +159,10 @@ dtype : Type name or dict of column -> type, default ``None`` with suitable ``na_values`` settings to preserve and not interpret dtype. engine : {``'c'``, ``'pyarrow'``, ``'python'``} - Parser engine to use. In terms of performance, the pyarrow engine, - which requires ``pyarrow`` >= 0.15.0, is faster than the C engine, which - is faster than the python engine. However, the pyarrow and C engines - are currently less feature complete than their Python counterpart. + Parser engine to use. The pyarrow engine is the most performant, followed by + the C engine, which in turn is faster than the python engine. However, the + pyarrow and C engine are currently less feature complete than their Python + counterpart. converters : dict, default ``None`` Dict of functions for converting values in certain columns. Keys can either be integers or column labels. @@ -1604,15 +1604,14 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object: Specifying the parser engine '''''''''''''''''''''''''''' -Currently, pandas supports using three engines, the C engine, the python engine, -and an optional pyarrow engine(requires ``pyarrow`` >= 0.15). 
In terms of performance -the pyarrow engine is fastest, followed by the C and Python engines. However, -the pyarrow engine is much less robust than the C engine, which in turn lacks a -couple of features present in the Python parser. +Pandas currently supports three engines, the C engine, the python engine, and an optional +pyarrow engine. The pyarrow engine is fastest, followed by the C and Python engines. However, +the pyarrow engine is much less robust than the C engine, and the C engine is less feature-rich +than the Python engine. -Where possible pandas uses the C parser (specified as ``engine='c'``), but may fall +Where possible pandas uses the C parser (specified as ``engine='c'``), but it may fall back to Python if C-unsupported options are specified. If pyarrow unsupported options are -specified while using ``engine='pyarrow'``, the parser will error out +specified while using ``engine='pyarrow'``, the parser will throw an error. (a full list of unsupported options is available at ``pandas.io.parsers._pyarrow_unsupported``). Currently, C-unsupported options include: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 474171223292e..13e9939f850e8 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -176,11 +176,10 @@ of dtype conversion. engine : {{'c', 'python', 'pyarrow'}}, optional Parser engine to use. The C and pyarrow engines are faster, while the python engine - is currently more feature-complete. The pyarrow engine also supports multithreading - something that is not present in the C or python engines. It requires - ``pyarrow`` >= 0.15 as a dependency however. + is currently more feature-complete. Multithreading is currently only supported by + the pyarrow engine. - .. versionchanged:: 1.2 + .. versionchanged:: 1.3 The "pyarrow" engine was added. converters : dict, optional Dict of functions for converting values in certain columns. 
Keys can either From 6773a719ab06e4c5ab443086d4a6c634ebe5a53d Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 13:50:15 -0500 Subject: [PATCH 59/95] rewrite docs --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index b2cc1450d1efb..a45d651440ffb 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -17,7 +17,7 @@ read_csv() now accepts pyarrow as an engine ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines -with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) +with pyarrow installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) .. _whatsnew_130.enhancements.other: From d63f5d0b5d87174f50703464b4dddca6b7b551d4 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 14:05:46 -0500 Subject: [PATCH 60/95] remove datetime hadling --- pandas/io/parsers.py | 4 ++-- pandas/tests/io/parser/test_parse_dates.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 13e9939f850e8..a9ad6bba531e1 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -3557,7 +3557,7 @@ def _isindex(colspec): colspec = orig_names[colspec] if _isindex(colspec): continue - data_dict[colspec] = converter(np.asarray(data_dict[colspec])) + data_dict[colspec] = converter(data_dict[colspec]) else: new_name, col, old_names = _try_convert_dates( converter, colspec, data_dict, orig_names @@ -3606,7 +3606,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns): colnames.append(c) new_name = "_".join(str(x) for x in colnames) - to_parse = [np.array(data_dict[c]) for c in colnames if c in data_dict] + to_parse = [data_dict[c] for c in colnames if c in 
data_dict] new_col = parser(*to_parse) return new_name, new_col, colnames diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index d4c0d28214849..641579922e506 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -339,6 +339,7 @@ def test_multiple_date_col(all_parsers, keep_date_col): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_date_col_as_index_col(all_parsers): data = """\ KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 @@ -566,6 +567,7 @@ def test_multiple_date_cols_with_header(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,parse_dates,msg", [ @@ -594,6 +596,7 @@ def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): parser.read_csv(StringIO(data), parse_dates=parse_dates) +@skip_pyarrow def test_date_parser_int_bug(all_parsers): # see gh-3071 parser = all_parsers @@ -713,6 +716,7 @@ def test_parse_dates_string(all_parsers): # Bug in https://github.com/dateutil/dateutil/issues/217 # has been addressed, but we just don't pass in the `yearfirst` +@skip_pyarrow @pytest.mark.xfail(reason="yearfirst is not surfaced in read_*") @pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]]) def test_yy_format_with_year_first(all_parsers, parse_dates): @@ -1126,6 +1130,7 @@ def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): parser.read_csv(StringIO(data), parse_dates=(1,)) +@skip_pyarrow @pytest.mark.parametrize("cache_dates", [True, False]) @pytest.mark.parametrize("value", ["nan", "0", ""]) def test_bad_date_parse(all_parsers, cache_dates, value): @@ -1455,6 +1460,7 @@ def date_parser(dt, time): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_parse_date_column_with_empty_string(all_parsers): # see gh-6428 parser = all_parsers @@ -1466,6 +1472,7 @@ def 
test_parse_date_column_with_empty_string(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -1491,6 +1498,7 @@ def test_parse_date_float(all_parsers, data, expected, parse_dates): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_parse_timezone(all_parsers): # see gh-22256 parser = all_parsers @@ -1554,6 +1562,7 @@ def test_parse_delimited_date_swap(all_parsers, date_string, dayfirst, expected) tm.assert_frame_equal(result, expected) +@skip_pyarrow def _helper_hypothesis_delimited_date(call, date_string, **kwargs): msg, result = None, None try: @@ -1564,6 +1573,7 @@ def _helper_hypothesis_delimited_date(call, date_string, **kwargs): return msg, result +@skip_pyarrow @given(date_strategy) @settings(deadline=None) @pytest.mark.parametrize("delimiter", list(" -./")) From 9ff95ad01cab0d715ad9c3b867569c9d3d795922 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 14:39:13 -0500 Subject: [PATCH 61/95] skiprows cannot be None --- pandas/io/parsers.py | 16 +++++++++++----- pandas/tests/io/parser/test_usecols.py | 1 + 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a9ad6bba531e1..61b546a4ccf31 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2313,11 +2313,17 @@ def _get_pyarrow_options(self): and option_name in ("include_columns", "null_values", "true_values", "false_values") } - self.read_options = {"autogenerate_column_names": self.header is None} - if self.header is not None: - self.read_options["skip_rows"] = self.header - elif self.kwds.get("skiprows") is not None: - self.read_options["skip_rows"] = self.kwds.get("skiprows") + # self.read_options = {"autogenerate_column_names": self.header is None} + # if self.header is not None: + # self.read_options["skip_rows"] = self.header + # elif self.kwds.get("skiprows") is not None: + # self.read_options["skip_rows"] = self.kwds.get("skiprows") + 
self.read_options = { + "autogenerate_column_names": self.header is None, + "skip_rows": self.header + if self.header is not None + else self.kwds["skiprows"], + } def _finalize_output(self, frame): num_cols = len(frame.columns) diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index a3a2b3e984339..bdfe121bae179 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -280,6 +280,7 @@ def test_usecols_with_parse_dates2(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_usecols_with_parse_dates3(all_parsers): # see gh-14792 parser = all_parsers From 6133a4c0d60bcbf559f8665231c726a790c3954a Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 16:34:09 -0500 Subject: [PATCH 62/95] REF: ArrowParserWrapper.read --- pandas/io/parsers.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 61b546a4ccf31..14e0ad591db07 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2240,7 +2240,7 @@ class BytesIOWrapper: def __init__( self, string_buffer: Union[StringIO, TextIOBase], - encoding: Optional[str] = "utf-8", + encoding: str = "utf-8", ): self.string_buffer = string_buffer self.encoding = encoding @@ -2264,29 +2264,27 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - self._validate_kwds() + self._parse_kwds() - if isinstance(self.src, TextIOBase): - self.src = BytesIOWrapper(self.src, encoding=self.encoding) - - def _validate_kwds(self): - kwds = self.kwds - self.encoding = ( - kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8" - ) + def _parse_kwds(self): + encoding: Optional[str] = self.kwds.get("encoding") + self.encoding = "utf-8" if encoding is None else encoding - self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) - na_values = kwds["na_values"] + self.usecols, 
self.usecols_dtype = _validate_usecols_arg(self.kwds["usecols"]) + na_values = self.kwds["na_values"] if isinstance(na_values, dict): raise ValueError( "The pyarrow engine doesn't support passing a dict for na_values" ) self.na_values = list( _clean_na_values( - kwds["na_values"], keep_default_na=kwds["keep_default_na"] + self.kwds["na_values"], keep_default_na=self.kwds["keep_default_na"] )[0] ) + if isinstance(self.src, TextIOBase): + self.src = BytesIOWrapper(self.src, encoding=self.encoding) + def _get_pyarrow_options(self): # rename some arguments to pass to pyarrow mapping = { From 454892f6420cd7bb85f549983d274d4d21ce0fd7 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 16:38:00 -0500 Subject: [PATCH 63/95] REF: ArrowParserWrapper.read --- pandas/io/parsers.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 14e0ad591db07..dfef7e32836ab 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2311,11 +2311,6 @@ def _get_pyarrow_options(self): and option_name in ("include_columns", "null_values", "true_values", "false_values") } - # self.read_options = {"autogenerate_column_names": self.header is None} - # if self.header is not None: - # self.read_options["skip_rows"] = self.header - # elif self.kwds.get("skiprows") is not None: - # self.read_options["skip_rows"] = self.kwds.get("skiprows") self.read_options = { "autogenerate_column_names": self.header is None, "skip_rows": self.header From e0503945d704d7d8e8db28ecc2f8f13fd7b8edb5 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 16:54:50 -0500 Subject: [PATCH 64/95] skip all pyarrow csv datetime tests --- pandas/tests/io/parser/test_parse_dates.py | 43 +--------------------- 1 file changed, 1 insertion(+), 42 deletions(-) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 641579922e506..77c0e3a9c4f6f 100644 --- 
a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -34,10 +34,9 @@ else: date_strategy = st.datetimes() -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +pytestmark = pytest.mark.usefixtures("pyarrow_skip") -@skip_pyarrow def test_separator_date_conflict(all_parsers): # Regression test for gh-4678 # @@ -59,7 +58,6 @@ def test_separator_date_conflict(all_parsers): tm.assert_frame_equal(df, expected) -@skip_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col_custom(all_parsers, keep_date_col): data = """\ @@ -203,7 +201,6 @@ def date_parser(*date_cols): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("container", [list, tuple, Index, Series]) @pytest.mark.parametrize("dim", [1, 2]) def test_concat_date_col_fail(container, dim): @@ -216,7 +213,6 @@ def test_concat_date_col_fail(container, dim): parsing.concat_date_cols(date_cols) -@skip_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col(all_parsers, keep_date_col): data = """\ @@ -339,7 +335,6 @@ def test_multiple_date_col(all_parsers, keep_date_col): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_date_col_as_index_col(all_parsers): data = """\ KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 @@ -377,7 +372,6 @@ def test_date_col_as_index_col(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -442,7 +436,6 @@ def test_multiple_date_cols_int_cast(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_multiple_date_col_timestamp_parse(all_parsers): parser = all_parsers data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 @@ -477,7 +470,6 @@ def test_multiple_date_col_timestamp_parse(all_parsers): tm.assert_frame_equal(result, 
expected) -@skip_pyarrow def test_multiple_date_cols_with_header(all_parsers): parser = all_parsers data = """\ @@ -567,7 +559,6 @@ def test_multiple_date_cols_with_header(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "data,parse_dates,msg", [ @@ -596,7 +587,6 @@ def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): parser.read_csv(StringIO(data), parse_dates=parse_dates) -@skip_pyarrow def test_date_parser_int_bug(all_parsers): # see gh-3071 parser = all_parsers @@ -649,7 +639,6 @@ def test_date_parser_int_bug(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_nat_parse(all_parsers): # see gh-3062 parser = all_parsers @@ -665,7 +654,6 @@ def test_nat_parse(all_parsers): tm.assert_frame_equal(result, df) -@skip_pyarrow def test_csv_custom_parser(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -680,7 +668,6 @@ def test_csv_custom_parser(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_parse_dates_implicit_first_col(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -694,7 +681,6 @@ def test_parse_dates_implicit_first_col(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_parse_dates_string(all_parsers): data = """date,A,B,C 20090101,a,1,2 @@ -716,7 +702,6 @@ def test_parse_dates_string(all_parsers): # Bug in https://github.com/dateutil/dateutil/issues/217 # has been addressed, but we just don't pass in the `yearfirst` -@skip_pyarrow @pytest.mark.xfail(reason="yearfirst is not surfaced in read_*") @pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]]) def test_yy_format_with_year_first(all_parsers, parse_dates): @@ -740,7 +725,6 @@ def test_yy_format_with_year_first(all_parsers, parse_dates): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]]) def test_parse_dates_column_list(all_parsers, parse_dates): data = 
"a,b,c\n01/01/2010,1,15/02/2010" @@ -757,7 +741,6 @@ def test_parse_dates_column_list(all_parsers, parse_dates): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_parse_dates(all_parsers, index_col): data = """index1,index2,A,B,C @@ -803,7 +786,6 @@ def test_multi_index_parse_dates(all_parsers, index_col): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("kwargs", [{"dayfirst": True}, {"day_first": True}]) def test_parse_dates_custom_euro_format(all_parsers, kwargs): parser = all_parsers @@ -848,7 +830,6 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs): ) -@skip_pyarrow def test_parse_tz_aware(all_parsers): # See gh-1693 parser = all_parsers @@ -862,7 +843,6 @@ def test_parse_tz_aware(all_parsers): assert result.index.tz is pytz.utc -@skip_pyarrow @pytest.mark.parametrize( "parse_dates,index_col", [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], @@ -963,7 +943,6 @@ def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_multiple_date_cols_chunked(all_parsers): parser = all_parsers data = """\ @@ -1056,7 +1035,6 @@ def test_multiple_date_cols_chunked(all_parsers): tm.assert_frame_equal(chunks[2], expected[4:]) -@skip_pyarrow def test_multiple_date_col_named_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1080,7 +1058,6 @@ def test_multiple_date_col_named_index_compat(all_parsers): tm.assert_frame_equal(with_indices, with_names) -@skip_pyarrow def test_multiple_date_col_multiple_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1130,7 +1107,6 @@ def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): parser.read_csv(StringIO(data), parse_dates=(1,)) -@skip_pyarrow @pytest.mark.parametrize("cache_dates", [True, False]) @pytest.mark.parametrize("value", ["nan", "0", ""]) def 
test_bad_date_parse(all_parsers, cache_dates, value): @@ -1149,7 +1125,6 @@ def test_bad_date_parse(all_parsers, cache_dates, value): ) -@skip_pyarrow def test_parse_dates_empty_string(all_parsers): # see gh-2263 parser = all_parsers @@ -1162,7 +1137,6 @@ def test_parse_dates_empty_string(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -1202,7 +1176,6 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -1231,7 +1204,6 @@ def test_parse_date_time_multi_level_column_name(all_parsers, date_parser, warni tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -1320,7 +1292,6 @@ def test_parse_date_time(all_parsers, data, kwargs, expected, date_parser, warni tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_fields, FutureWarning], [pd.to_datetime, None]), @@ -1343,7 +1314,6 @@ def test_parse_date_fields(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ( @@ -1375,7 +1345,6 @@ def test_parse_date_all_fields(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ( @@ -1407,7 +1376,6 @@ def test_datetime_fractional_seconds(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_generic(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." 
@@ -1426,7 +1394,6 @@ def test_generic(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_date_parser_resolution_if_not_ns(all_parsers): # see gh-10245 parser = all_parsers @@ -1460,7 +1427,6 @@ def date_parser(dt, time): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_parse_date_column_with_empty_string(all_parsers): # see gh-6428 parser = all_parsers @@ -1472,7 +1438,6 @@ def test_parse_date_column_with_empty_string(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -1498,7 +1463,6 @@ def test_parse_date_float(all_parsers, data, expected, parse_dates): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_parse_timezone(all_parsers): # see gh-22256 parser = all_parsers @@ -1527,7 +1491,6 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_string", ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], @@ -1539,7 +1502,6 @@ def test_invalid_parse_delimited_date(all_parsers, date_string): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_string,dayfirst,expected", [ @@ -1562,7 +1524,6 @@ def test_parse_delimited_date_swap(all_parsers, date_string, dayfirst, expected) tm.assert_frame_equal(result, expected) -@skip_pyarrow def _helper_hypothesis_delimited_date(call, date_string, **kwargs): msg, result = None, None try: @@ -1573,7 +1534,6 @@ def _helper_hypothesis_delimited_date(call, date_string, **kwargs): return msg, result -@skip_pyarrow @given(date_strategy) @settings(deadline=None) @pytest.mark.parametrize("delimiter", list(" -./")) @@ -1607,7 +1567,6 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti assert result == expected -@skip_pyarrow @pytest.mark.parametrize( "names, usecols, parse_dates, missing_cols", [ From 09fca60e1634a6be4231c6d10b5090d7e5f453e6 Mon Sep 17 
00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 17:54:29 -0500 Subject: [PATCH 65/95] rewrite benchmarks --- asv_bench/benchmarks/io/csv.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index c1fad1efde082..db271c84ceed0 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -254,7 +254,7 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): names=list("abc"), ) - def time_read_csv_arrow(self, sep, decimal, float_precision): + def time_read_csv_pyarrow(self, sep, decimal, float_precision): read_csv( self.data(self.StringIO_input), sep=sep, @@ -333,7 +333,7 @@ def time_baseline(self, engine): class ReadCSVCachedParseDates(StringIORewind): - params = ([True, False], ["c", "pyarrow", "python"]) + params = ([True, False], ["c", "python"]) param_names = ["do_cache", "engine"] def setup(self, do_cache, engine): @@ -374,7 +374,7 @@ def mem_parser_chunks(self): class ReadCSVParseSpecialDate(StringIORewind): - params = (["mY", "mdY", "hm"], ["c", "pyarrow", "python"]) + params = (["mY", "mdY", "hm"], ["c", "python"]) param_names = ["value", "engine"] objects = { "mY": "01-2019\n10-2019\n02/2000\n", From f9bf5f1dbaf62958bb84d552b75faefcaec97ec5 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 01:19:55 -0500 Subject: [PATCH 66/95] typo --- asv_bench/benchmarks/io/csv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index f5fad16d5afbb..83eb1bea42a14 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -263,6 +263,7 @@ def time_read_csv_pyarrow(self, sep, decimal, float_precision): sep=sep, header=None, names=list("abc"), + engine="pyarrow", ) From 922bf4fa87dac6daaf83faddc32bf9abf10ef92f Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 01:20:45 -0500 Subject: [PATCH 67/95] typo --- 
asv_bench/benchmarks/io/csv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 83eb1bea42a14..287f1d997d665 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -285,6 +285,7 @@ def time_read_bytescsv(self, engine): class ReadCSVCategorical(BaseIO): + fname = "__test__.csv" params = ["c", "python"] param_names = ["engine"] From 1252a054e3bc61e2daef1c75600977123121676a Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 01:38:34 -0500 Subject: [PATCH 68/95] test reorg --- pandas/tests/io/parser/test_usecols.py | 1014 ++++++++++++------------ 1 file changed, 501 insertions(+), 513 deletions(-) diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index d774ca4113ab3..d0de6e5c2c95f 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -26,566 +26,554 @@ @skip_pyarrow -def test_raise_on_mixed_dtype_usecols(all_parsers): - # See gh-12678 - data = """a,b,c - 1000,2000,3000 - 4000,5000,6000 - """ - usecols = [0, "b", 2] - parser = all_parsers - - with pytest.raises(ValueError, match=_msg_validate_usecols_arg): - parser.read_csv(StringIO(data), usecols=usecols) - - -@skip_pyarrow -@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")]) -def test_usecols(all_parsers, usecols): - data = """\ -a,b,c -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - parser = all_parsers - result = parser.read_csv(StringIO(data), usecols=usecols) - - expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_usecols_with_names(all_parsers): - data = """\ -a,b,c -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - parser = all_parsers - names = ["foo", "bar"] - result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0) - - expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names) - 
tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] -) -def test_usecols_relative_to_names(all_parsers, names, usecols): - data = """\ -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - parser = all_parsers - result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols) - - expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) - tm.assert_frame_equal(result, expected) +class TestParserUsecolsBasic: + def test_raise_on_mixed_dtype_usecols(self, all_parsers): + # See gh-12678 + data = """a,b,c + 1000,2000,3000 + 4000,5000,6000 + """ + usecols = [0, "b", 2] + parser = all_parsers + + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols=usecols) + + @pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")]) + def test_usecols(self, all_parsers, usecols): + data = """\ + a,b,c + 1,2,3 + 4,5,6 + 7,8,9 + 10,11,12""" + parser = all_parsers + result = parser.read_csv(StringIO(data), usecols=usecols) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) + tm.assert_frame_equal(result, expected) + def test_usecols_with_names(self, all_parsers): + data = """\ + a,b,c + 1,2,3 + 4,5,6 + 7,8,9 + 10,11,12""" + parser = all_parsers + names = ["foo", "bar"] + result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -def test_usecols_relative_to_names2(all_parsers): - # see gh-5766 - data = """\ -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - parser = all_parsers - result = parser.read_csv( - StringIO(data), names=["a", "b"], header=None, usecols=[0, 1] + @pytest.mark.parametrize( + "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] ) + def test_usecols_relative_to_names(self, all_parsers, names, usecols): + 
data = """\ + 1,2,3 + 4,5,6 + 7,8,9 + 10,11,12""" + parser = all_parsers + result = parser.read_csv( + StringIO(data), names=names, header=None, usecols=usecols + ) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) + tm.assert_frame_equal(result, expected) - expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_usecols_name_length_conflict(all_parsers): - data = """\ -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - parser = all_parsers - msg = "Number of passed names did not match number of header fields in the file" - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1]) - - -def test_usecols_single_string(all_parsers): - # see gh-20558 - parser = all_parsers - data = """foo, bar, baz -1000, 2000, 3000 -4000, 5000, 6000""" - - with pytest.raises(ValueError, match=_msg_validate_usecols_arg): - parser.read_csv(StringIO(data), usecols="foo") - - -@skip_pyarrow -@pytest.mark.parametrize( - "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] -) -def test_usecols_index_col_false(all_parsers, data): - # see gh-9082 - parser = all_parsers - usecols = ["a", "c", "d"] - expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]}) - - result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("index_col", ["b", 0]) -@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]]) -def test_usecols_index_col_conflict(all_parsers, usecols, index_col): - # see gh-4201: test that index_col as integer reflects usecols - parser = all_parsers - data = "a,b,c,d\nA,a,1,one\nB,b,2,two" - expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b")) + def test_usecols_relative_to_names2(self, all_parsers): + # see gh-5766 + data = """\ + 1,2,3 + 4,5,6 + 7,8,9 + 10,11,12""" + parser = 
all_parsers + result = parser.read_csv( + StringIO(data), names=["a", "b"], header=None, usecols=[0, 1] + ) + + expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) - tm.assert_frame_equal(result, expected) + def test_usecols_name_length_conflict(self, all_parsers): + data = """\ + 1,2,3 + 4,5,6 + 7,8,9 + 10,11,12""" + parser = all_parsers + msg = "Number of passed names did not match number of header fields in the file" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1]) -def test_usecols_index_col_conflict2(all_parsers): - # see gh-4201: test that index_col as integer reflects usecols - parser = all_parsers - data = "a,b,c,d\nA,a,1,one\nB,b,2,two" + def test_usecols_single_string(self, all_parsers): + # see gh-20558 + parser = all_parsers + data = """foo, bar, baz + 1000, 2000, 3000 + 4000, 5000, 6000""" - expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")}) - expected = expected.set_index(["b", "c"]) + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols="foo") - result = parser.read_csv( - StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"] + @pytest.mark.parametrize( + "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_usecols_implicit_index_col(all_parsers): - # see gh-2654 - parser = all_parsers - data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" - - result = parser.read_csv(StringIO(data), usecols=["a", "b"]) - expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) - tm.assert_frame_equal(result, expected) + def test_usecols_index_col_false(self, all_parsers, data): + # see gh-9082 + parser = all_parsers + usecols = ["a", "c", "d"] + expected 
= DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]}) + result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -def test_usecols_regex_sep(all_parsers): - # see gh-2733 - parser = all_parsers - data = "a b c\n4 apple bat 5.7\n8 orange cow 10" - result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) + @pytest.mark.parametrize("index_col", ["b", 0]) + @pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]]) + def test_usecols_index_col_conflict(self, all_parsers, usecols, index_col): + # see gh-4201: test that index_col as integer reflects usecols + parser = all_parsers + data = "a,b,c,d\nA,a,1,one\nB,b,2,two" + expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b")) - expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) + tm.assert_frame_equal(result, expected) + def test_usecols_index_col_conflict2(self, all_parsers): + # see gh-4201: test that index_col as integer reflects usecols + parser = all_parsers + data = "a,b,c,d\nA,a,1,one\nB,b,2,two" -@skip_pyarrow -def test_usecols_with_whitespace(all_parsers): - parser = all_parsers - data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")}) + expected = expected.set_index(["b", "c"]) - result = parser.read_csv(StringIO(data), delim_whitespace=True, usecols=("a", "b")) - expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) - tm.assert_frame_equal(result, expected) + result = parser.read_csv( + StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"] + ) + tm.assert_frame_equal(result, expected) + def test_usecols_implicit_index_col(self, all_parsers): + # see gh-2654 + parser = all_parsers + data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" -@skip_pyarrow 
-@pytest.mark.parametrize( - "usecols,expected", - [ - # Column selection by index. - ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), - # Column selection by name. - (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"])), - ], -) -def test_usecols_with_integer_like_header(all_parsers, usecols, expected): - parser = all_parsers - data = """2,0,1 -1000,2000,3000 -4000,5000,6000""" + result = parser.read_csv(StringIO(data), usecols=["a", "b"]) + expected = DataFrame( + {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8] + ) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) + def test_usecols_regex_sep(self, all_parsers): + # see gh-2733 + parser = all_parsers + data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) + expected = DataFrame( + {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8] + ) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -def test_usecols_with_parse_dates(all_parsers, usecols): - # see gh-9755 - data = """a,b,c,d,e -0,1,20140101,0900,4 -0,1,20140102,1000,4""" - parser = all_parsers - parse_dates = [[1, 2]] - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) - tm.assert_frame_equal(result, expected) + def test_usecols_with_whitespace(self, all_parsers): + parser = all_parsers + data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + result = parser.read_csv( + StringIO(data), delim_whitespace=True, usecols=("a", "b") + ) + expected = DataFrame( + {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8] + ) + tm.assert_frame_equal(result, expected) 
-@skip_pyarrow -def test_usecols_with_parse_dates2(all_parsers): - # see gh-13604 - parser = all_parsers - data = """2008-02-07 09:40,1032.43 -2008-02-07 09:50,1042.54 -2008-02-07 10:00,1051.65""" - - names = ["date", "values"] - usecols = names[:] - parse_dates = [0] - - index = Index( + @pytest.mark.parametrize( + "usecols,expected", [ - Timestamp("2008-02-07 09:40"), - Timestamp("2008-02-07 09:50"), - Timestamp("2008-02-07 10:00"), + # Column selection by index. + ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), + # Column selection by name. + ( + ["0", "1"], + DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]), + ), ], - name="date", ) - cols = {"values": [1032.43, 1042.54, 1051.65]} - expected = DataFrame(cols, index=index) - - result = parser.read_csv( - StringIO(data), - parse_dates=parse_dates, - index_col=0, - usecols=usecols, - header=None, - names=names, - ) - tm.assert_frame_equal(result, expected) + def test_usecols_with_integer_like_header(self, all_parsers, usecols, expected): + parser = all_parsers + data = """2,0,1 + 1000,2000,3000 + 4000,5000,6000""" + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -def test_usecols_with_parse_dates3(all_parsers): - # see gh-14792 - parser = all_parsers - data = """a,b,c,d,e,f,g,h,i,j -2016/09/21,1,1,2,3,4,5,6,7,8""" - - usecols = list("abcdefghij") - parse_dates = [0] - - cols = { - "a": Timestamp("2016-09-21"), - "b": [1], - "c": [1], - "d": [2], - "e": [3], - "f": [4], - "g": [5], - "h": [6], - "i": [7], - "j": [8], - } - expected = DataFrame(cols, columns=usecols) - - result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) - tm.assert_frame_equal(result, expected) + def test_empty_usecols(self, all_parsers): + data = "a,b,c\n1,2,3\n4,5,6" + expected = DataFrame() + parser = all_parsers + result = parser.read_csv(StringIO(data), usecols=set()) + tm.assert_frame_equal(result, 
expected) -@skip_pyarrow -def test_usecols_with_parse_dates4(all_parsers): - data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" - usecols = list("abcdefghij") - parse_dates = [[0, 1]] - parser = all_parsers - - cols = { - "a_b": "2016/09/21 1", - "c": [1], - "d": [2], - "e": [3], - "f": [4], - "g": [5], - "h": [6], - "i": [7], - "j": [8], - } - expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) - - result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) - tm.assert_frame_equal(result, expected) + def test_np_array_usecols(self, all_parsers): + # see gh-12546 + parser = all_parsers + data = "a,b,c\n1,2,3" + usecols = np.array(["a", "b"]) + expected = DataFrame([[1, 2]], columns=usecols) + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -@pytest.mark.parametrize( - "names", - [ - list("abcde"), # Names span all columns in original data. - list("acd"), # Names span only the selected columns. 
- ], -) -def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names): - # see gh-9755 - s = """0,1,20140101,0900,4 -0,1,20140102,1000,4""" - parse_dates = [[1, 2]] - parser = all_parsers - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - - result = parser.read_csv( - StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols + @pytest.mark.parametrize( + "usecols,expected", + [ + ( + lambda x: x.upper() in ["AAA", "BBB", "DDD"], + DataFrame( + { + "AaA": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "bBb": {0: 8, 1: 2, 2: 7}, + "ddd": {0: "a", 1: "b", 2: "a"}, + } + ), + ), + (lambda x: False, DataFrame()), + ], ) - tm.assert_frame_equal(result, expected) - - -def test_usecols_with_unicode_strings(all_parsers): - # see gh-13219 - data = """AAA,BBB,CCC,DDD -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers - - exp_data = { - "AAA": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, - "BBB": {0: 8, 1: 2, 2: 7}, - } - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"]) - tm.assert_frame_equal(result, expected) - - -def test_usecols_with_single_byte_unicode_strings(all_parsers): - # see gh-13219 - data = """A,B,C,D -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers - - exp_data = { - "A": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, - "B": {0: 8, 1: 2, 2: 7}, - } - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), usecols=["A", "B"]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]]) -def test_usecols_with_mixed_encoding_strings(all_parsers, usecols): - data = """AAA,BBB,CCC,DDD -0.056674973,8,True,a -2.613230982,2,False,b 
-3.568935038,7,False,a""" - parser = all_parsers - - with pytest.raises(ValueError, match=_msg_validate_usecols_arg): - parser.read_csv(StringIO(data), usecols=usecols) - - -@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]]) -def test_usecols_with_multi_byte_characters(all_parsers, usecols): - data = """あああ,いい,ううう,ええええ -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers - - exp_data = { - "あああ": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, - "いい": {0: 8, 1: 2, 2: 7}, - } - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_empty_usecols(all_parsers): - data = "a,b,c\n1,2,3\n4,5,6" - expected = DataFrame() - parser = all_parsers - - result = parser.read_csv(StringIO(data), usecols=set()) - tm.assert_frame_equal(result, expected) - - -def test_np_array_usecols(all_parsers): - # see gh-12546 - parser = all_parsers - data = "a,b,c\n1,2,3" - usecols = np.array(["a", "b"]) + def test_callable_usecols(self, all_parsers, usecols, expected): + # see gh-14154 + data = """AaA,bBb,CCC,ddd + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a""" + parser = all_parsers + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) - expected = DataFrame([[1, 2]], columns=usecols) - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) + def test_incomplete_first_row(self, all_parsers, usecols): + # see gh-6710 + data = "1,2\n1,2,3" + parser = all_parsers + names = ["a", "b", "c"] + expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]}) + result = parser.read_csv(StringIO(data), names=names, usecols=usecols) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize( - 
"usecols,expected", - [ - ( - lambda x: x.upper() in ["AAA", "BBB", "DDD"], - DataFrame( - { - "AaA": { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002, - }, - "bBb": {0: 8, 1: 2, 2: 7}, - "ddd": {0: "a", 1: "b", 2: "a"}, - } + @pytest.mark.parametrize( + "data,usecols,kwargs,expected", + [ + # see gh-8985 + ( + "19,29,39\n" * 2 + "10,20,30,40", + [0, 1, 2], + {"header": None}, + DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]), ), - ), - (lambda x: False, DataFrame()), - ], -) -def test_callable_usecols(all_parsers, usecols, expected): - # see gh-14154 - data = """AaA,bBb,CCC,ddd -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers + # see gh-9549 + ( + ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"), + ["A", "B", "C"], + {}, + DataFrame( + { + "A": [1, 3, 1, 1, 1, 5], + "B": [2, 4, 2, 2, 2, 6], + "C": [3, 5, 4, 3, 3, 7], + } + ), + ), + ], + ) + def test_uneven_length_cols(self, all_parsers, data, usecols, kwargs, expected): + # see gh-8985 + parser = all_parsers + result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "usecols,kwargs,expected,msg", + [ + ( + ["a", "b", "c", "d"], + {}, + DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), + None, + ), + ( + ["a", "b", "c", "f"], + {}, + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + (["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")), + ( + ["a", "b", "f", "g"], + {}, + None, + _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"), + ), + # see gh-14671 + ( + None, + {"header": 0, "names": ["A", "B", "C", "D"]}, + DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}), + None, + ), + ( + ["A", "B", "C", "f"], + {"header": 0, "names": ["A", "B", "C", "D"]}, + None, + 
_msg_validate_usecols_names.format(r"\['f'\]"), + ), + ( + ["A", "B", "f"], + {"names": ["A", "B", "C", "D"]}, + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + ], + ) + def test_raises_on_usecols_names_mismatch( + self, all_parsers, usecols, kwargs, expected, msg + ): + data = "a,b,c,d\n1,2,3,4\n5,6,7,8" + kwargs.update(usecols=usecols) + parser = all_parsers + + if expected is None: + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) + def test_usecols_subset_names_mismatch_orig_columns(self, all_parsers, usecols): + data = "a,b,c,d\n1,2,3,4\n5,6,7,8" + names = ["A", "B", "C", "D"] + parser = all_parsers + + result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols) + expected = DataFrame({"A": [1, 5], "C": [3, 7]}) + tm.assert_frame_equal(result, expected) @skip_pyarrow -@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) -def test_incomplete_first_row(all_parsers, usecols): - # see gh-6710 - data = "1,2\n1,2,3" - parser = all_parsers - names = ["a", "b", "c"] - expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]}) +class TestUsecolsParseDates: + @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) + def test_usecols_with_parse_dates(self, all_parsers, usecols): + # see gh-9755 + data = """a,b,c,d,e + 0,1,20140101,0900,4 + 0,1,20140102,1000,4""" + parser = all_parsers + parse_dates = [[1, 2]] + + cols = { + "a": [0, 0], + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], + } + expected = DataFrame(cols, columns=["c_d", "a"]) + result = parser.read_csv( + StringIO(data), usecols=usecols, parse_dates=parse_dates + ) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), names=names, usecols=usecols) - tm.assert_frame_equal(result, expected) + 
def test_usecols_with_parse_dates2(self, all_parsers): + # see gh-13604 + parser = all_parsers + data = """2008-02-07 09:40,1032.43 + 2008-02-07 09:50,1042.54 + 2008-02-07 10:00,1051.65""" + + names = ["date", "values"] + usecols = names[:] + parse_dates = [0] + + index = Index( + [ + Timestamp("2008-02-07 09:40"), + Timestamp("2008-02-07 09:50"), + Timestamp("2008-02-07 10:00"), + ], + name="date", + ) + cols = {"values": [1032.43, 1042.54, 1051.65]} + expected = DataFrame(cols, index=index) + + result = parser.read_csv( + StringIO(data), + parse_dates=parse_dates, + index_col=0, + usecols=usecols, + header=None, + names=names, + ) + tm.assert_frame_equal(result, expected) + def test_usecols_with_parse_dates3(self, all_parsers): + # see gh-14792 + parser = all_parsers + data = """a,b,c,d,e,f,g,h,i,j + 2016/09/21,1,1,2,3,4,5,6,7,8""" + + usecols = list("abcdefghij") + parse_dates = [0] + + cols = { + "a": Timestamp("2016-09-21"), + "b": [1], + "c": [1], + "d": [2], + "e": [3], + "f": [4], + "g": [5], + "h": [6], + "i": [7], + "j": [8], + } + expected = DataFrame(cols, columns=usecols) + + result = parser.read_csv( + StringIO(data), usecols=usecols, parse_dates=parse_dates + ) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize( - "data,usecols,kwargs,expected", - [ - # see gh-8985 - ( - "19,29,39\n" * 2 + "10,20,30,40", - [0, 1, 2], - {"header": None}, - DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]), - ), - # see gh-9549 - ( - ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"), - ["A", "B", "C"], - {}, - DataFrame( - { - "A": [1, 3, 1, 1, 1, 5], - "B": [2, 4, 2, 2, 2, 6], - "C": [3, 5, 4, 3, 3, 7], - } - ), - ), - ], -) -def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected): - # see gh-8985 - parser = all_parsers - result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs) - tm.assert_frame_equal(result, expected) + def test_usecols_with_parse_dates4(self, all_parsers): + data = 
"a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" + usecols = list("abcdefghij") + parse_dates = [[0, 1]] + parser = all_parsers + + cols = { + "a_b": "2016/09/21 1", + "c": [1], + "d": [2], + "e": [3], + "f": [4], + "g": [5], + "h": [6], + "i": [7], + "j": [8], + } + expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) + + result = parser.read_csv( + StringIO(data), usecols=usecols, parse_dates=parse_dates + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) + @pytest.mark.parametrize( + "names", + [ + list("abcde"), # Names span all columns in original data. + list("acd"), # Names span only the selected columns. + ], + ) + def test_usecols_with_parse_dates_and_names(self, all_parsers, usecols, names): + # see gh-9755 + s = """0,1,20140101,0900,4 + 0,1,20140102,1000,4""" + parse_dates = [[1, 2]] + parser = all_parsers + + cols = { + "a": [0, 0], + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], + } + expected = DataFrame(cols, columns=["c_d", "a"]) + + result = parser.read_csv( + StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols + ) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize( - "usecols,kwargs,expected,msg", - [ - ( - ["a", "b", "c", "d"], - {}, - DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), - None, - ), - ( - ["a", "b", "c", "f"], - {}, - None, - _msg_validate_usecols_names.format(r"\['f'\]"), - ), - (["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")), - ( - ["a", "b", "f", "g"], - {}, - None, - _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"), - ), - # see gh-14671 - ( - None, - {"header": 0, "names": ["A", "B", "C", "D"]}, - DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}), - None, - ), - ( - ["A", "B", "C", "f"], - {"header": 0, "names": ["A", "B", "C", "D"]}, - None, - _msg_validate_usecols_names.format(r"\['f'\]"), - ), - ( - ["A", "B", 
"f"], - {"names": ["A", "B", "C", "D"]}, - None, - _msg_validate_usecols_names.format(r"\['f'\]"), - ), - ], -) -def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected, msg): - data = "a,b,c,d\n1,2,3,4\n5,6,7,8" - kwargs.update(usecols=usecols) - parser = all_parsers - if expected is None: - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - else: - result = parser.read_csv(StringIO(data), **kwargs) +class TestUsecolsStrings: + def test_usecols_with_unicode_strings(self, all_parsers): + # see gh-13219 + data = """AAA,BBB,CCC,DDD + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "AAA": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "BBB": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"]) tm.assert_frame_equal(result, expected) + def test_usecols_with_single_byte_unicode_strings(self, all_parsers): + # see gh-13219 + data = """A,B,C,D + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "A": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "B": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=["A", "B"]) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) -def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols): - data = "a,b,c,d\n1,2,3,4\n5,6,7,8" - names = ["A", "B", "C", "D"] - parser = all_parsers - - result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols) - expected = DataFrame({"A": [1, 5], "C": [3, 7]}) - tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]]) + def 
test_usecols_with_mixed_encoding_strings(self, all_parsers, usecols): + data = """AAA,BBB,CCC,DDD + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a""" + parser = all_parsers + + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols=usecols) + + @pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]]) + def test_usecols_with_multi_byte_characters(self, all_parsers, usecols): + data = """あああ,いい,ううう,ええええ + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "あああ": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "いい": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) From 0af7291047c5a64418634ab6fe23f2dcd8bc1df8 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 02:05:33 -0500 Subject: [PATCH 69/95] test reorg --- pandas/tests/io/parser/test_skiprows.py | 450 ++++++++++++------------ 1 file changed, 222 insertions(+), 228 deletions(-) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 6d85e01c6fd4a..f043861b36e4a 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -18,248 +18,242 @@ @skip_pyarrow -@pytest.mark.parametrize("skiprows", [list(range(6)), 6]) -def test_skip_rows_bug(all_parsers, skiprows): - # see gh-505 - parser = all_parsers - text = """#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -1/1/2000,1.,2.,3. 
-1/2/2000,4,5,6 -1/3/2000,7,8,9 -""" - result = parser.read_csv( - StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True - ) - index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 - ) - - expected = DataFrame( - np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_deep_skip_rows(all_parsers): - # see gh-4382 - parser = all_parsers - data = "a,b,c\n" + "\n".join( - [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)] - ) - condensed_data = "a,b,c\n" + "\n".join( - [",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]] +class TestParserSkiprows: + @pytest.mark.parametrize("skiprows", [list(range(6)), 6]) + def test_skip_rows_bug(self, all_parsers, skiprows): + # see gh-505 + parser = all_parsers + text = """#foo,a,b,c + #foo,a,b,c + #foo,a,b,c + #foo,a,b,c + #foo,a,b,c + #foo,a,b,c + 1/1/2000,1.,2.,3. + 1/2/2000,4,5,6 + 1/3/2000,7,8,9 + """ + result = parser.read_csv( + StringIO(text), + skiprows=skiprows, + header=None, + index_col=0, + parse_dates=True, + ) + index = Index( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + ) + + expected = DataFrame( + np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index + ) + tm.assert_frame_equal(result, expected) + + def test_deep_skip_rows(self, all_parsers): + # see gh-4382 + parser = all_parsers + data = "a,b,c\n" + "\n".join( + [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)] + ) + condensed_data = "a,b,c\n" + "\n".join( + [ + ",".join([str(i), str(i + 1), str(i + 2)]) + for i in [0, 1, 2, 3, 4, 6, 8, 9] + ] + ) + + result = parser.read_csv(StringIO(data), skiprows=[6, 8]) + condensed_result = parser.read_csv(StringIO(condensed_data)) + tm.assert_frame_equal(result, condensed_result) + + def test_skip_rows_blank(self, all_parsers): + # see gh-9832 + parser = all_parsers + text = 
"""#foo,a,b,c + #foo,a,b,c + + #foo,a,b,c + #foo,a,b,c + + 1/1/2000,1.,2.,3. + 1/2/2000,4,5,6 + 1/3/2000,7,8,9 + """ + data = parser.read_csv( + StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True + ) + index = Index( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + ) + + expected = DataFrame( + np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index + ) + tm.assert_frame_equal(data, expected) + + def test_skip_row_with_quote(self, all_parsers): + # see gh-12775 and gh-10911 + parser = all_parsers + data = """id,text,num_lines + 1,"line '11' line 12",2 + 2,"line '21' line 22",2 + 3,"line '31' line 32",1""" + + exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]] + expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) + + result = parser.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(result, expected) + + def test_skip_rows_skip_all(self, all_parsers): + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + msg = "No columns to parse from file" + + with pytest.raises(EmptyDataError, match=msg): + parser.read_csv(StringIO(data), skiprows=lambda x: True) + + def test_skip_rows_bad_callable(self, all_parsers): + msg = "by zero" + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + + with pytest.raises(ZeroDivisionError, match=msg): + parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0) + + def test_skiprows_infield_quote(self, all_parsers): + # see gh-14459 + parser = all_parsers + data = 'a"\nb"\na\n1' + expected = DataFrame({"a": [1]}) + + result = parser.read_csv(StringIO(data), skiprows=2) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "kwargs,expected", + [ + ({}, DataFrame({"1": [3, 5]})), + ({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})), + ], ) + def test_skip_rows_callable(self, all_parsers, kwargs, expected): + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" - result = parser.read_csv(StringIO(data), 
skiprows=[6, 8]) - condensed_result = parser.read_csv(StringIO(condensed_data)) - tm.assert_frame_equal(result, condensed_result) - - -@skip_pyarrow -def test_skip_rows_blank(all_parsers): - # see gh-9832 - parser = all_parsers - text = """#foo,a,b,c -#foo,a,b,c - -#foo,a,b,c -#foo,a,b,c + result = parser.read_csv( + StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs + ) + tm.assert_frame_equal(result, expected) -1/1/2000,1.,2.,3. -1/2/2000,4,5,6 -1/3/2000,7,8,9 -""" - data = parser.read_csv( - StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True - ) - index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + @pytest.mark.parametrize( + "line_terminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" ) - - expected = DataFrame( - np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index + def test_skiprows_lineterminator(self, all_parsers, line_terminator): + # see gh-9079 + parser = all_parsers + data = "\n".join( + [ + "SMOSMANIA ThetaProbe-ML2X ", + "2007/01/01 01:00 0.2140 U M ", + "2007/01/01 02:00 0.2141 M O ", + "2007/01/01 04:00 0.2142 D M ", + ] + ) + expected = DataFrame( + [ + ["2007/01/01", "01:00", 0.2140, "U", "M"], + ["2007/01/01", "02:00", 0.2141, "M", "O"], + ["2007/01/01", "04:00", 0.2142, "D", "M"], + ], + columns=["date", "time", "var", "flag", "oflag"], + ) + + if parser.engine == "python" and line_terminator == "\r": + pytest.skip("'CR' not respect with the Python parser yet") + + data = data.replace("\n", line_terminator) + result = parser.read_csv( + StringIO(data), + skiprows=1, + delim_whitespace=True, + names=["date", "time", "var", "flag", "oflag"], + ) + tm.assert_frame_equal(result, expected) + + @skip_pyarrow + @pytest.mark.parametrize( + "data,exp_data", + [ + ( + """id,text,num_lines + 1,"line \n'11' line 12",2 + 2,"line \n'21' line 22",2 + 3,"line \n'31' line 32",1""", + [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]], + ), + ( + """id,text,num_lines 
+ 1,"line '11\n' line 12",2 + 2,"line '21\n' line 22",2 + 3,"line '31\n' line 32",1""", + [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]], + ), + ( + """id,text,num_lines + 1,"line '11\n' \r\tline 12",2 + 2,"line '21\n' \r\tline 22",2 + 3,"line '31\n' \r\tline 32",1""", + [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]], + ), + ], ) - tm.assert_frame_equal(data, expected) + def test_skip_row_with_newline_and_quote(self, all_parsers, data, exp_data): + # see gh-12775 and gh-10911 + parser = all_parsers + result = parser.read_csv(StringIO(data), skiprows=[1]) + expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - ( - """id,text,num_lines + @skip_pyarrow + @pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + """id,text,num_lines 1,"line 11 line 12",2 2,"line 21 line 22",2 3,"line 31",1""", - {"skiprows": [1]}, - DataFrame( - [[2, "line 21\nline 22", 2], [3, "line 31", 1]], - columns=["id", "text", "num_lines"], + {"skiprows": [1]}, + DataFrame( + [[2, "line 21\nline 22", 2], [3, "line 31", 1]], + columns=["id", "text", "num_lines"], + ), ), - ), - ( - "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~", - {"quotechar": "~", "skiprows": [2]}, - DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]), - ), - ( ( - "Text,url\n~example\n " - "sentence\n one~,url1\n~" - "example\n sentence\n two~,url2\n~" - "example\n sentence\n three~,url3" + "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~", + {"quotechar": "~", "skiprows": [2]}, + DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]), + ), + ( + ( + "Text,url\n~example\n " + "sentence\n one~,url1\n~" + "example\n sentence\n two~,url2\n~" + "example\n sentence\n three~,url3" + ), + {"quotechar": "~", "skiprows": [1, 3]}, + DataFrame( + [["example\n sentence\n two", "url2"]], columns=["Text", "url"] + ), ), - {"quotechar": 
"~", "skiprows": [1, 3]}, - DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]), - ), - ], -) -def test_skip_row_with_newline(all_parsers, data, kwargs, expected): - # see gh-12775 and gh-10911 - parser = all_parsers - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_skip_row_with_quote(all_parsers): - # see gh-12775 and gh-10911 - parser = all_parsers - data = """id,text,num_lines -1,"line '11' line 12",2 -2,"line '21' line 22",2 -3,"line '31' line 32",1""" - - exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]] - expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) - - result = parser.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "data,exp_data", - [ - ( - """id,text,num_lines -1,"line \n'11' line 12",2 -2,"line \n'21' line 22",2 -3,"line \n'31' line 32",1""", - [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]], - ), - ( - """id,text,num_lines -1,"line '11\n' line 12",2 -2,"line '21\n' line 22",2 -3,"line '31\n' line 32",1""", - [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]], - ), - ( - """id,text,num_lines -1,"line '11\n' \r\tline 12",2 -2,"line '21\n' \r\tline 22",2 -3,"line '31\n' \r\tline 32",1""", - [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]], - ), - ], -) -def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data): - # see gh-12775 and gh-10911 - parser = all_parsers - result = parser.read_csv(StringIO(data), skiprows=[1]) - - expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "line_terminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" -) -def test_skiprows_lineterminator(all_parsers, line_terminator): - # see gh-9079 - parser = all_parsers - data = "\n".join( - [ - 
"SMOSMANIA ThetaProbe-ML2X ", - "2007/01/01 01:00 0.2140 U M ", - "2007/01/01 02:00 0.2141 M O ", - "2007/01/01 04:00 0.2142 D M ", - ] - ) - expected = DataFrame( - [ - ["2007/01/01", "01:00", 0.2140, "U", "M"], - ["2007/01/01", "02:00", 0.2141, "M", "O"], - ["2007/01/01", "04:00", 0.2142, "D", "M"], ], - columns=["date", "time", "var", "flag", "oflag"], - ) - - if parser.engine == "python" and line_terminator == "\r": - pytest.skip("'CR' not respect with the Python parser yet") - - data = data.replace("\n", line_terminator) - result = parser.read_csv( - StringIO(data), - skiprows=1, - delim_whitespace=True, - names=["date", "time", "var", "flag", "oflag"], ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_skiprows_infield_quote(all_parsers): - # see gh-14459 - parser = all_parsers - data = 'a"\nb"\na\n1' - expected = DataFrame({"a": [1]}) - - result = parser.read_csv(StringIO(data), skiprows=2) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "kwargs,expected", - [ - ({}, DataFrame({"1": [3, 5]})), - ({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})), - ], -) -def test_skip_rows_callable(all_parsers, kwargs, expected): - parser = all_parsers - data = "a\n1\n2\n3\n4\n5" - - result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_skip_rows_skip_all(all_parsers): - parser = all_parsers - data = "a\n1\n2\n3\n4\n5" - msg = "No columns to parse from file" - - with pytest.raises(EmptyDataError, match=msg): - parser.read_csv(StringIO(data), skiprows=lambda x: True) - - -@skip_pyarrow -def test_skip_rows_bad_callable(all_parsers): - msg = "by zero" - parser = all_parsers - data = "a\n1\n2\n3\n4\n5" - - with pytest.raises(ZeroDivisionError, match=msg): - parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0) + def test_skip_row_with_newline(self, all_parsers, data, kwargs, expected): + # see gh-12775 
and gh-10911 + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) From 361aab6548ca66826880c37ec6c81f80906c626e Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 02:09:14 -0500 Subject: [PATCH 70/95] test reorg --- pandas/tests/io/parser/test_quoting.py | 256 ++++++++++++------------- 1 file changed, 121 insertions(+), 135 deletions(-) diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index a93dbde24b001..1617160d2089a 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -14,156 +14,142 @@ import pandas._testing as tm skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @skip_pyarrow -@pytest.mark.parametrize( - "kwargs,msg", - [ - ({"quotechar": "foo"}, '"quotechar" must be a(n)? 1-character string'), - ( - {"quotechar": None, "quoting": csv.QUOTE_MINIMAL}, - "quotechar must be set if quoting enabled", - ), - ({"quotechar": 2}, '"quotechar" must be string, not int'), - ], -) -def test_bad_quote_char(all_parsers, kwargs, msg): - data = "1,2,3" - parser = all_parsers - - with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), **kwargs) +class TestParserQuoting: + @pytest.mark.parametrize( + "kwargs,msg", + [ + ({"quotechar": "foo"}, '"quotechar" must be a(n)? 
1-character string'), + ( + {"quotechar": None, "quoting": csv.QUOTE_MINIMAL}, + "quotechar must be set if quoting enabled", + ), + ({"quotechar": 2}, '"quotechar" must be string, not int'), + ], + ) + def test_bad_quote_char(self, all_parsers, kwargs, msg): + data = "1,2,3" + parser = all_parsers + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), **kwargs) -@skip_pyarrow -@pytest.mark.parametrize( - "quoting,msg", - [ - ("foo", '"quoting" must be an integer'), - (5, 'bad "quoting" value'), # quoting must be in the range [0, 3] - ], -) -def test_bad_quoting(all_parsers, quoting, msg): - data = "1,2,3" - parser = all_parsers - - with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), quoting=quoting) - - -def test_quote_char_basic(all_parsers): - parser = all_parsers - data = 'a,b,c\n1,2,"cat"' - expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) - - result = parser.read_csv(StringIO(data), quotechar='"') - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) -def test_quote_char_various(all_parsers, quote_char): - parser = all_parsers - expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) - - data = 'a,b,c\n1,2,"cat"' - new_data = data.replace('"', quote_char) - - result = parser.read_csv(StringIO(new_data), quotechar=quote_char) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) -@pytest.mark.parametrize("quote_char", ["", None]) -def test_null_quote_char(all_parsers, quoting, quote_char): - kwargs = {"quotechar": quote_char, "quoting": quoting} - data = "a,b,c\n1,2,3" - parser = all_parsers - - if quoting != csv.QUOTE_NONE: - # Sanity checking. 
- msg = "quotechar must be set if quoting enabled" + @pytest.mark.parametrize( + "quoting,msg", + [ + ("foo", '"quoting" must be an integer'), + (5, 'bad "quoting" value'), # quoting must be in the range [0, 3] + ], + ) + def test_bad_quoting(self, all_parsers, quoting, msg): + data = "1,2,3" + parser = all_parsers with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - else: - expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) + parser.read_csv(StringIO(data), quoting=quoting) + def test_quote_char_basic(self, all_parsers): + parser = all_parsers + data = 'a,b,c\n1,2,"cat"' + expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) -@skip_pyarrow -@pytest.mark.parametrize( - "kwargs,exp_data", - [ - ({}, [[1, 2, "foo"]]), # Test default. - # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. - ({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]), - # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. - ({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]), - # QUOTE_NONE tells the reader to do no special handling - # of quote characters and leave them alone. 
- ({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]), - # QUOTE_NONNUMERIC tells the reader to cast - # all non-quoted fields to float - ({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]), - ], -) -def test_quoting_various(all_parsers, kwargs, exp_data): - data = '1,2,"foo"' - parser = all_parsers - columns = ["a", "b", "c"] - - result = parser.read_csv(StringIO(data), names=columns, **kwargs) - expected = DataFrame(exp_data, columns=columns) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), quotechar='"') + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) + def test_quote_char_various(self, all_parsers, quote_char): + parser = all_parsers + expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) -@skip_pyarrow -@pytest.mark.parametrize( - "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] -) -def test_double_quote(all_parsers, doublequote, exp_data): - parser = all_parsers - data = 'a,b\n3,"4 "" 5"' + data = 'a,b,c\n1,2,"cat"' + new_data = data.replace('"', quote_char) - result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote) - expected = DataFrame(exp_data, columns=["a", "b"]) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(new_data), quotechar=quote_char) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) + @pytest.mark.parametrize("quote_char", ["", None]) + def test_null_quote_char(self, all_parsers, quoting, quote_char): + kwargs = {"quotechar": quote_char, "quoting": quoting} + data = "a,b,c\n1,2,3" + parser = all_parsers + + if quoting != csv.QUOTE_NONE: + # Sanity checking. 
+ msg = "quotechar must be set if quoting enabled" + + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "kwargs,exp_data", + [ + ({}, [[1, 2, "foo"]]), # Test default. + # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. + ({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]), + # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. + ({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]), + # QUOTE_NONE tells the reader to do no special handling + # of quote characters and leave them alone. + ({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]), + # QUOTE_NONNUMERIC tells the reader to cast + # all non-quoted fields to float + ({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]), + ], + ) + def test_quoting_various(self, all_parsers, kwargs, exp_data): + data = '1,2,"foo"' + parser = all_parsers + columns = ["a", "b", "c"] + + result = parser.read_csv(StringIO(data), names=columns, **kwargs) + expected = DataFrame(exp_data, columns=columns) + tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("quotechar", ['"', "\u0001"]) -def test_quotechar_unicode(all_parsers, quotechar): - # see gh-14477 - data = "a\n1" - parser = all_parsers - expected = DataFrame({"a": [1]}) + @pytest.mark.parametrize( + "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] + ) + def test_double_quote(self, all_parsers, doublequote, exp_data): + parser = all_parsers + data = 'a,b\n3,"4 "" 5"' - result = parser.read_csv(StringIO(data), quotechar=quotechar) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote) + expected = DataFrame(exp_data, columns=["a", "b"]) + 
tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("quotechar", ['"', "\u0001"]) + def test_quotechar_unicode(self, all_parsers, quotechar): + # see gh-14477 + data = "a\n1" + parser = all_parsers + expected = DataFrame({"a": [1]}) -@skip_pyarrow -@pytest.mark.parametrize("balanced", [True, False]) -def test_unbalanced_quoting(all_parsers, balanced): - # see gh-22789. - parser = all_parsers - data = 'a,b,c\n1,2,"3' - - if balanced: - # Re-balance the quoting and read in without errors. - expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) - result = parser.read_csv(StringIO(data + '"')) + result = parser.read_csv(StringIO(data), quotechar=quotechar) tm.assert_frame_equal(result, expected) - else: - msg = ( - "EOF inside string starting at row 1" - if parser.engine == "c" - else "unexpected end of data" - ) - - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data)) + + @pytest.mark.parametrize("balanced", [True, False]) + def test_unbalanced_quoting(self, all_parsers, balanced): + # see gh-22789. + parser = all_parsers + data = 'a,b,c\n1,2,"3' + + if balanced: + # Re-balance the quoting and read in without errors. 
+ expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + result = parser.read_csv(StringIO(data + '"')) + tm.assert_frame_equal(result, expected) + else: + msg = ( + "EOF inside string starting at row 1" + if parser.engine == "c" + else "unexpected end of data" + ) + + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data)) From 75de071b6699cec5d3ef484a42889e33e47a34db Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 02:12:08 -0500 Subject: [PATCH 71/95] test reorg --- pandas/tests/io/parser/test_quoting.py | 256 +++++++++++++------------ 1 file changed, 131 insertions(+), 125 deletions(-) diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index 1617160d2089a..6995965467d05 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -13,143 +13,149 @@ from pandas import DataFrame import pandas._testing as tm -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") - - -@skip_pyarrow -class TestParserQuoting: - @pytest.mark.parametrize( - "kwargs,msg", - [ - ({"quotechar": "foo"}, '"quotechar" must be a(n)? 1-character string'), - ( - {"quotechar": None, "quoting": csv.QUOTE_MINIMAL}, - "quotechar must be set if quoting enabled", - ), - ({"quotechar": 2}, '"quotechar" must be string, not int'), - ], - ) - def test_bad_quote_char(self, all_parsers, kwargs, msg): - data = "1,2,3" - parser = all_parsers +pytestmark = pytest.mark.usefixtures("pyarrow_skip") - with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - @pytest.mark.parametrize( - "quoting,msg", - [ - ("foo", '"quoting" must be an integer'), - (5, 'bad "quoting" value'), # quoting must be in the range [0, 3] - ], - ) - def test_bad_quoting(self, all_parsers, quoting, msg): - data = "1,2,3" - parser = all_parsers +@pytest.mark.parametrize( + "kwargs,msg", + [ + ({"quotechar": "foo"}, '"quotechar" must be a(n)? 
1-character string'), + ( + {"quotechar": None, "quoting": csv.QUOTE_MINIMAL}, + "quotechar must be set if quoting enabled", + ), + ({"quotechar": 2}, '"quotechar" must be string, not int'), + ], +) +def test_bad_quote_char(all_parsers, kwargs, msg): + data = "1,2,3" + parser = all_parsers - with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), quoting=quoting) + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), **kwargs) - def test_quote_char_basic(self, all_parsers): - parser = all_parsers - data = 'a,b,c\n1,2,"cat"' - expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) - result = parser.read_csv(StringIO(data), quotechar='"') - tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "quoting,msg", + [ + ("foo", '"quoting" must be an integer'), + (5, 'bad "quoting" value'), # quoting must be in the range [0, 3] + ], +) +def test_bad_quoting(all_parsers, quoting, msg): + data = "1,2,3" + parser = all_parsers - @pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) - def test_quote_char_various(self, all_parsers, quote_char): - parser = all_parsers - expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), quoting=quoting) - data = 'a,b,c\n1,2,"cat"' - new_data = data.replace('"', quote_char) - result = parser.read_csv(StringIO(new_data), quotechar=quote_char) - tm.assert_frame_equal(result, expected) +def test_quote_char_basic(all_parsers): + parser = all_parsers + data = 'a,b,c\n1,2,"cat"' + expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) - @pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) - @pytest.mark.parametrize("quote_char", ["", None]) - def test_null_quote_char(self, all_parsers, quoting, quote_char): - kwargs = {"quotechar": quote_char, "quoting": quoting} - data = "a,b,c\n1,2,3" - parser = all_parsers - - if quoting != csv.QUOTE_NONE: - # Sanity 
checking. - msg = "quotechar must be set if quoting enabled" - - with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - else: - expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "kwargs,exp_data", - [ - ({}, [[1, 2, "foo"]]), # Test default. - # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. - ({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]), - # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. - ({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]), - # QUOTE_NONE tells the reader to do no special handling - # of quote characters and leave them alone. - ({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]), - # QUOTE_NONNUMERIC tells the reader to cast - # all non-quoted fields to float - ({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]), - ], - ) - def test_quoting_various(self, all_parsers, kwargs, exp_data): - data = '1,2,"foo"' - parser = all_parsers - columns = ["a", "b", "c"] - - result = parser.read_csv(StringIO(data), names=columns, **kwargs) - expected = DataFrame(exp_data, columns=columns) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), quotechar='"') + tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] - ) - def test_double_quote(self, all_parsers, doublequote, exp_data): - parser = all_parsers - data = 'a,b\n3,"4 "" 5"' - result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote) - expected = DataFrame(exp_data, columns=["a", "b"]) - tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) +def test_quote_char_various(all_parsers, quote_char): + parser = all_parsers + expected 
= DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) + + data = 'a,b,c\n1,2,"cat"' + new_data = data.replace('"', quote_char) - @pytest.mark.parametrize("quotechar", ['"', "\u0001"]) - def test_quotechar_unicode(self, all_parsers, quotechar): - # see gh-14477 - data = "a\n1" - parser = all_parsers - expected = DataFrame({"a": [1]}) + result = parser.read_csv(StringIO(new_data), quotechar=quote_char) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), quotechar=quotechar) + +@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) +@pytest.mark.parametrize("quote_char", ["", None]) +def test_null_quote_char(all_parsers, quoting, quote_char): + kwargs = {"quotechar": quote_char, "quoting": quoting} + data = "a,b,c\n1,2,3" + parser = all_parsers + + if quoting != csv.QUOTE_NONE: + # Sanity checking. + msg = "quotechar must be set if quoting enabled" + + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + result = parser.read_csv(StringIO(data), **kwargs) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("balanced", [True, False]) - def test_unbalanced_quoting(self, all_parsers, balanced): - # see gh-22789. - parser = all_parsers - data = 'a,b,c\n1,2,"3' - - if balanced: - # Re-balance the quoting and read in without errors. - expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) - result = parser.read_csv(StringIO(data + '"')) - tm.assert_frame_equal(result, expected) - else: - msg = ( - "EOF inside string starting at row 1" - if parser.engine == "c" - else "unexpected end of data" - ) - - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data)) + +@pytest.mark.parametrize( + "kwargs,exp_data", + [ + ({}, [[1, 2, "foo"]]), # Test default. + # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. 
+ ({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]), + # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. + ({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]), + # QUOTE_NONE tells the reader to do no special handling + # of quote characters and leave them alone. + ({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]), + # QUOTE_NONNUMERIC tells the reader to cast + # all non-quoted fields to float + ({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]), + ], +) +def test_quoting_various(all_parsers, kwargs, exp_data): + data = '1,2,"foo"' + parser = all_parsers + columns = ["a", "b", "c"] + + result = parser.read_csv(StringIO(data), names=columns, **kwargs) + expected = DataFrame(exp_data, columns=columns) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] +) +def test_double_quote(all_parsers, doublequote, exp_data): + parser = all_parsers + data = 'a,b\n3,"4 "" 5"' + + result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote) + expected = DataFrame(exp_data, columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("quotechar", ['"', "\u0001"]) +def test_quotechar_unicode(all_parsers, quotechar): + # see gh-14477 + data = "a\n1" + parser = all_parsers + expected = DataFrame({"a": [1]}) + + result = parser.read_csv(StringIO(data), quotechar=quotechar) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("balanced", [True, False]) +def test_unbalanced_quoting(all_parsers, balanced): + # see gh-22789. + parser = all_parsers + data = 'a,b,c\n1,2,"3' + + if balanced: + # Re-balance the quoting and read in without errors. 
+ expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + result = parser.read_csv(StringIO(data + '"')) + tm.assert_frame_equal(result, expected) + else: + msg = ( + "EOF inside string starting at row 1" + if parser.engine == "c" + else "unexpected end of data" + ) + + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data)) From a1dfcb2d5ace8099cee4faed8e930bc6d8828759 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 02:14:03 -0500 Subject: [PATCH 72/95] test reorg --- pandas/tests/io/parser/test_skiprows.py | 447 ++++++++++++------------ 1 file changed, 221 insertions(+), 226 deletions(-) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index f043861b36e4a..ffd4f3aecb5d0 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -14,246 +14,241 @@ from pandas import DataFrame, Index import pandas._testing as tm -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") - - -@skip_pyarrow -class TestParserSkiprows: - @pytest.mark.parametrize("skiprows", [list(range(6)), 6]) - def test_skip_rows_bug(self, all_parsers, skiprows): - # see gh-505 - parser = all_parsers - text = """#foo,a,b,c - #foo,a,b,c - #foo,a,b,c - #foo,a,b,c - #foo,a,b,c - #foo,a,b,c - 1/1/2000,1.,2.,3. 
- 1/2/2000,4,5,6 - 1/3/2000,7,8,9 - """ - result = parser.read_csv( - StringIO(text), - skiprows=skiprows, - header=None, - index_col=0, - parse_dates=True, - ) - index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 - ) - - expected = DataFrame( - np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index - ) - tm.assert_frame_equal(result, expected) - - def test_deep_skip_rows(self, all_parsers): - # see gh-4382 - parser = all_parsers - data = "a,b,c\n" + "\n".join( - [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)] - ) - condensed_data = "a,b,c\n" + "\n".join( - [ - ",".join([str(i), str(i + 1), str(i + 2)]) - for i in [0, 1, 2, 3, 4, 6, 8, 9] - ] - ) - - result = parser.read_csv(StringIO(data), skiprows=[6, 8]) - condensed_result = parser.read_csv(StringIO(condensed_data)) - tm.assert_frame_equal(result, condensed_result) - - def test_skip_rows_blank(self, all_parsers): - # see gh-9832 - parser = all_parsers - text = """#foo,a,b,c - #foo,a,b,c - - #foo,a,b,c - #foo,a,b,c - - 1/1/2000,1.,2.,3. 
- 1/2/2000,4,5,6 - 1/3/2000,7,8,9 - """ - data = parser.read_csv( - StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True - ) - index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 - ) - - expected = DataFrame( - np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index - ) - tm.assert_frame_equal(data, expected) - - def test_skip_row_with_quote(self, all_parsers): - # see gh-12775 and gh-10911 - parser = all_parsers - data = """id,text,num_lines - 1,"line '11' line 12",2 - 2,"line '21' line 22",2 - 3,"line '31' line 32",1""" - - exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]] - expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) - - result = parser.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(result, expected) - - def test_skip_rows_skip_all(self, all_parsers): - parser = all_parsers - data = "a\n1\n2\n3\n4\n5" - msg = "No columns to parse from file" - - with pytest.raises(EmptyDataError, match=msg): - parser.read_csv(StringIO(data), skiprows=lambda x: True) - - def test_skip_rows_bad_callable(self, all_parsers): - msg = "by zero" - parser = all_parsers - data = "a\n1\n2\n3\n4\n5" - - with pytest.raises(ZeroDivisionError, match=msg): - parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0) - - def test_skiprows_infield_quote(self, all_parsers): - # see gh-14459 - parser = all_parsers - data = 'a"\nb"\na\n1' - expected = DataFrame({"a": [1]}) - - result = parser.read_csv(StringIO(data), skiprows=2) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "kwargs,expected", - [ - ({}, DataFrame({"1": [3, 5]})), - ({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})), - ], +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + + +@pytest.mark.parametrize("skiprows", [list(range(6)), 6]) +def test_skip_rows_bug(all_parsers, skiprows): + # see gh-505 + parser = all_parsers + text = """#foo,a,b,c +#foo,a,b,c +#foo,a,b,c 
+#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +1/1/2000,1.,2.,3. +1/2/2000,4,5,6 +1/3/2000,7,8,9 +""" + result = parser.read_csv( + StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True + ) + index = Index( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + ) + + expected = DataFrame( + np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index ) - def test_skip_rows_callable(self, all_parsers, kwargs, expected): - parser = all_parsers - data = "a\n1\n2\n3\n4\n5" + tm.assert_frame_equal(result, expected) - result = parser.read_csv( - StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs - ) - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "line_terminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" +def test_deep_skip_rows(all_parsers): + # see gh-4382 + parser = all_parsers + data = "a,b,c\n" + "\n".join( + [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)] ) - def test_skiprows_lineterminator(self, all_parsers, line_terminator): - # see gh-9079 - parser = all_parsers - data = "\n".join( - [ - "SMOSMANIA ThetaProbe-ML2X ", - "2007/01/01 01:00 0.2140 U M ", - "2007/01/01 02:00 0.2141 M O ", - "2007/01/01 04:00 0.2142 D M ", - ] - ) - expected = DataFrame( - [ - ["2007/01/01", "01:00", 0.2140, "U", "M"], - ["2007/01/01", "02:00", 0.2141, "M", "O"], - ["2007/01/01", "04:00", 0.2142, "D", "M"], - ], - columns=["date", "time", "var", "flag", "oflag"], - ) - - if parser.engine == "python" and line_terminator == "\r": - pytest.skip("'CR' not respect with the Python parser yet") - - data = data.replace("\n", line_terminator) - result = parser.read_csv( - StringIO(data), - skiprows=1, - delim_whitespace=True, - names=["date", "time", "var", "flag", "oflag"], - ) - tm.assert_frame_equal(result, expected) - - @skip_pyarrow - @pytest.mark.parametrize( - "data,exp_data", - [ - ( - """id,text,num_lines - 1,"line \n'11' line 12",2 - 2,"line \n'21' line 22",2 - 3,"line \n'31' line 
32",1""", - [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]], - ), - ( - """id,text,num_lines - 1,"line '11\n' line 12",2 - 2,"line '21\n' line 22",2 - 3,"line '31\n' line 32",1""", - [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]], - ), - ( - """id,text,num_lines - 1,"line '11\n' \r\tline 12",2 - 2,"line '21\n' \r\tline 22",2 - 3,"line '31\n' \r\tline 32",1""", - [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]], - ), - ], + condensed_data = "a,b,c\n" + "\n".join( + [",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]] ) - def test_skip_row_with_newline_and_quote(self, all_parsers, data, exp_data): - # see gh-12775 and gh-10911 - parser = all_parsers - result = parser.read_csv(StringIO(data), skiprows=[1]) - expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), skiprows=[6, 8]) + condensed_result = parser.read_csv(StringIO(condensed_data)) + tm.assert_frame_equal(result, condensed_result) - @skip_pyarrow - @pytest.mark.parametrize( - "data,kwargs,expected", - [ - ( - """id,text,num_lines + +def test_skip_rows_blank(all_parsers): + # see gh-9832 + parser = all_parsers + text = """#foo,a,b,c +#foo,a,b,c + +#foo,a,b,c +#foo,a,b,c + +1/1/2000,1.,2.,3. 
+1/2/2000,4,5,6 +1/3/2000,7,8,9 +""" + data = parser.read_csv( + StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True + ) + index = Index( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + ) + + expected = DataFrame( + np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index + ) + tm.assert_frame_equal(data, expected) + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + """id,text,num_lines 1,"line 11 line 12",2 2,"line 21 line 22",2 3,"line 31",1""", - {"skiprows": [1]}, - DataFrame( - [[2, "line 21\nline 22", 2], [3, "line 31", 1]], - columns=["id", "text", "num_lines"], - ), + {"skiprows": [1]}, + DataFrame( + [[2, "line 21\nline 22", 2], [3, "line 31", 1]], + columns=["id", "text", "num_lines"], ), + ), + ( + "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~", + {"quotechar": "~", "skiprows": [2]}, + DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]), + ), + ( ( - "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~", - {"quotechar": "~", "skiprows": [2]}, - DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]), - ), - ( - ( - "Text,url\n~example\n " - "sentence\n one~,url1\n~" - "example\n sentence\n two~,url2\n~" - "example\n sentence\n three~,url3" - ), - {"quotechar": "~", "skiprows": [1, 3]}, - DataFrame( - [["example\n sentence\n two", "url2"]], columns=["Text", "url"] - ), + "Text,url\n~example\n " + "sentence\n one~,url1\n~" + "example\n sentence\n two~,url2\n~" + "example\n sentence\n three~,url3" ), + {"quotechar": "~", "skiprows": [1, 3]}, + DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]), + ), + ], +) +def test_skip_row_with_newline(all_parsers, data, kwargs, expected): + # see gh-12775 and gh-10911 + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_skip_row_with_quote(all_parsers): + # see gh-12775 and gh-10911 + parser = all_parsers + data = 
"""id,text,num_lines +1,"line '11' line 12",2 +2,"line '21' line 22",2 +3,"line '31' line 32",1""" + + exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]] + expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) + + result = parser.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,exp_data", + [ + ( + """id,text,num_lines +1,"line \n'11' line 12",2 +2,"line \n'21' line 22",2 +3,"line \n'31' line 32",1""", + [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]], + ), + ( + """id,text,num_lines +1,"line '11\n' line 12",2 +2,"line '21\n' line 22",2 +3,"line '31\n' line 32",1""", + [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]], + ), + ( + """id,text,num_lines +1,"line '11\n' \r\tline 12",2 +2,"line '21\n' \r\tline 22",2 +3,"line '31\n' \r\tline 32",1""", + [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]], + ), + ], +) +def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data): + # see gh-12775 and gh-10911 + parser = all_parsers + result = parser.read_csv(StringIO(data), skiprows=[1]) + + expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "line_terminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" +) +def test_skiprows_lineterminator(all_parsers, line_terminator): + # see gh-9079 + parser = all_parsers + data = "\n".join( + [ + "SMOSMANIA ThetaProbe-ML2X ", + "2007/01/01 01:00 0.2140 U M ", + "2007/01/01 02:00 0.2141 M O ", + "2007/01/01 04:00 0.2142 D M ", + ] + ) + expected = DataFrame( + [ + ["2007/01/01", "01:00", 0.2140, "U", "M"], + ["2007/01/01", "02:00", 0.2141, "M", "O"], + ["2007/01/01", "04:00", 0.2142, "D", "M"], ], + columns=["date", "time", "var", "flag", "oflag"], + ) + + if parser.engine == "python" and line_terminator == "\r": + pytest.skip("'CR' not respect with the Python parser yet") + + data = 
data.replace("\n", line_terminator) + result = parser.read_csv( + StringIO(data), + skiprows=1, + delim_whitespace=True, + names=["date", "time", "var", "flag", "oflag"], ) - def test_skip_row_with_newline(self, all_parsers, data, kwargs, expected): - # see gh-12775 and gh-10911 - parser = all_parsers - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) + + +def test_skiprows_infield_quote(all_parsers): + # see gh-14459 + parser = all_parsers + data = 'a"\nb"\na\n1' + expected = DataFrame({"a": [1]}) + + result = parser.read_csv(StringIO(data), skiprows=2) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs,expected", + [ + ({}, DataFrame({"1": [3, 5]})), + ({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})), + ], +) +def test_skip_rows_callable(all_parsers, kwargs, expected): + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + + result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_skip_rows_skip_all(all_parsers): + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + msg = "No columns to parse from file" + + with pytest.raises(EmptyDataError, match=msg): + parser.read_csv(StringIO(data), skiprows=lambda x: True) + + +def test_skip_rows_bad_callable(all_parsers): + msg = "by zero" + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + + with pytest.raises(ZeroDivisionError, match=msg): + parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0) From 24331709f3d77b365a466e47b08ebe6ef626e657 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 02:36:11 -0500 Subject: [PATCH 73/95] test reorg --- pandas/tests/io/parser/test_dtypes.py | 686 +++++++++++++------------- 1 file changed, 330 insertions(+), 356 deletions(-) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 4ef609cb87980..739c49cb87b3f 100644 
--- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -20,461 +20,435 @@ @skip_pyarrow -@pytest.mark.parametrize("dtype", [str, object]) -@pytest.mark.parametrize("check_orig", [True, False]) -def test_dtype_all_columns(all_parsers, dtype, check_orig): - # see gh-3795, gh-6607 - parser = all_parsers - - df = DataFrame( - np.random.rand(5, 2).round(4), - columns=list("AB"), - index=["1A", "1B", "1C", "1D", "1E"], - ) - - with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: - df.to_csv(path) - - result = parser.read_csv(path, dtype=dtype, index_col=0) - - if check_orig: - expected = df.copy() - result = result.astype(float) - else: - expected = df.astype(str) - - tm.assert_frame_equal(result, expected) +class TestParserDtypesBasic: + @pytest.mark.parametrize("dtype", [str, object]) + @pytest.mark.parametrize("check_orig", [True, False]) + def test_dtype_all_columns(self, all_parsers, dtype, check_orig): + # see gh-3795, gh-6607 + parser = all_parsers + + df = DataFrame( + np.random.rand(5, 2).round(4), + columns=list("AB"), + index=["1A", "1B", "1C", "1D", "1E"], + ) + with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: + df.to_csv(path) -@skip_pyarrow -def test_dtype_all_columns_empty(all_parsers): - # see gh-12048 - parser = all_parsers - result = parser.read_csv(StringIO("A,B"), dtype=str) + result = parser.read_csv(path, dtype=dtype, index_col=0) - expected = DataFrame({"A": [], "B": []}, index=[], dtype=str) - tm.assert_frame_equal(result, expected) + if check_orig: + expected = df.copy() + result = result.astype(float) + else: + expected = df.astype(str) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -def test_dtype_per_column(all_parsers): - parser = all_parsers - data = """\ + def test_dtype_per_column(self, all_parsers): + parser = all_parsers + data = """\ one,two 1,2.5 2,3.5 3,4.5 4,5.5""" - expected = DataFrame( - [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] - ) 
- expected["one"] = expected["one"].astype(np.float64) - expected["two"] = expected["two"].astype(object) - - result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) - tm.assert_frame_equal(result, expected) + expected = DataFrame( + [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] + ) + expected["one"] = expected["one"].astype(np.float64) + expected["two"] = expected["two"].astype(object) + result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -def test_invalid_dtype_per_column(all_parsers): - parser = all_parsers - data = """\ + def test_invalid_dtype_per_column(self, all_parsers): + parser = all_parsers + data = """\ one,two 1,2.5 2,3.5 3,4.5 4,5.5""" - with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): - parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) + with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): + parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) @skip_pyarrow -@pytest.mark.parametrize( - "dtype", - [ - "category", - CategoricalDtype(), - {"a": "category", "b": "category", "c": CategoricalDtype()}, - ], -) -def test_categorical_dtype(all_parsers, dtype): - # see gh-10153 - parser = all_parsers - data = """a,b,c +class TestParserDtypesCategorical1: + @pytest.mark.parametrize( + "dtype", + [ + "category", + CategoricalDtype(), + {"a": "category", "b": "category", "c": CategoricalDtype()}, + ], + ) + def test_categorical_dtype(self, all_parsers, dtype): + # see gh-10153 + parser = all_parsers + data = """a,b,c 1,a,3.4 1,a,3.4 2,b,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["a", "a", "b"]), - "c": Categorical(["3.4", "3.4", "4.5"]), - } - ) - actual = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(actual, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("dtype", [{"b": 
"category"}, {1: "category"}]) -def test_categorical_dtype_single(all_parsers, dtype): - # see gh-10153 - parser = all_parsers - data = """a,b,c + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["a", "a", "b"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(actual, expected) + + @pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) + def test_categorical_dtype_single(self, all_parsers, dtype): + # see gh-10153 + parser = all_parsers + data = """a,b,c 1,a,3.4 1,a,3.4 2,b,4.5""" - expected = DataFrame( - {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} - ) - actual = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(actual, expected) - + expected = DataFrame( + {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} + ) + actual = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(actual, expected) -@skip_pyarrow -def test_categorical_dtype_unsorted(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b,c + def test_categorical_dtype_unsorted(self, all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b,c 1,b,3.4 1,b,3.4 2,a,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["b", "b", "a"]), - "c": Categorical(["3.4", "3.4", "4.5"]), - } - ) - actual = parser.read_csv(StringIO(data), dtype="category") - tm.assert_frame_equal(actual, expected) - + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", "b", "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype="category") + tm.assert_frame_equal(actual, expected) -@skip_pyarrow -def test_categorical_dtype_missing(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b,c + def test_categorical_dtype_missing(self, all_parsers): + # see 
gh-10153 + parser = all_parsers + data = """a,b,c 1,b,3.4 1,nan,3.4 2,a,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["b", np.nan, "a"]), - "c": Categorical(["3.4", "3.4", "4.5"]), - } - ) - actual = parser.read_csv(StringIO(data), dtype="category") - tm.assert_frame_equal(actual, expected) - - -@skip_pyarrow -@pytest.mark.slow -def test_categorical_dtype_high_cardinality_numeric(all_parsers): - # see gh-18186 - parser = all_parsers - data = np.sort([str(i) for i in range(524289)]) - expected = DataFrame({"a": Categorical(data, ordered=True)}) - - actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") - actual["a"] = actual["a"].cat.reorder_categories( - np.sort(actual.a.cat.categories), ordered=True - ) - tm.assert_frame_equal(actual, expected) - - -def test_categorical_dtype_latin1(all_parsers, csv_dir_path): - # see gh-10153 - pth = os.path.join(csv_dir_path, "unicode_series.csv") - parser = all_parsers - encoding = "latin-1" - - expected = parser.read_csv(pth, header=None, encoding=encoding) - expected[1] = Categorical(expected[1]) - - actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"}) - tm.assert_frame_equal(actual, expected) - - -@skip_pyarrow -def test_categorical_dtype_utf16(all_parsers, csv_dir_path): - # see gh-10153 - pth = os.path.join(csv_dir_path, "utf16_ex.txt") - parser = all_parsers - encoding = "utf-16" - sep = "\t" + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", np.nan, "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype="category") + tm.assert_frame_equal(actual, expected) + + @pytest.mark.slow + def test_categorical_dtype_high_cardinality_numeric(self, all_parsers): + # see gh-18186 + parser = all_parsers + data = np.sort([str(i) for i in range(524289)]) + expected = DataFrame({"a": Categorical(data, ordered=True)}) + + actual = 
parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") + actual["a"] = actual["a"].cat.reorder_categories( + np.sort(actual.a.cat.categories), ordered=True + ) + tm.assert_frame_equal(actual, expected) - expected = parser.read_csv(pth, sep=sep, encoding=encoding) - expected = expected.apply(Categorical) + def test_categorical_dtype_utf16(self, all_parsers, csv_dir_path): + # see gh-10153 + pth = os.path.join(csv_dir_path, "utf16_ex.txt") + parser = all_parsers + encoding = "utf-16" + sep = "\t" - actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category") - tm.assert_frame_equal(actual, expected) + expected = parser.read_csv(pth, sep=sep, encoding=encoding) + expected = expected.apply(Categorical) + actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category") + tm.assert_frame_equal(actual, expected) -@skip_pyarrow -def test_categorical_dtype_chunksize_infer_categories(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b + def test_categorical_dtype_chunksize_infer_categories(self, all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b 1,a 1,b 1,b 2,c""" - expecteds = [ - DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), - DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), - ] - with parser.read_csv( - StringIO(data), dtype={"b": "category"}, chunksize=2 - ) as actuals: - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - - -@skip_pyarrow -def test_categorical_dtype_chunksize_explicit_categories(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b + expecteds = [ + DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), + DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), + ] + with parser.read_csv( + StringIO(data), dtype={"b": "category"}, chunksize=2 + ) as actuals: + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + def 
test_categorical_dtype_chunksize_explicit_categories(self, all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b 1,a 1,b 1,b 2,c""" - cats = ["a", "b", "c"] - expecteds = [ - DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}), - DataFrame( - {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, index=[2, 3] - ), - ] - dtype = CategoricalDtype(cats) - with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals: - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - + cats = ["a", "b", "c"] + expecteds = [ + DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}), + DataFrame( + {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, + index=[2, 3], + ), + ] + dtype = CategoricalDtype(cats) + with parser.read_csv( + StringIO(data), dtype={"b": dtype}, chunksize=2 + ) as actuals: + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + +class TestParserDtypesCategorical2: + def test_categorical_dtype_latin1(self, all_parsers, csv_dir_path): + # see gh-10153 + pth = os.path.join(csv_dir_path, "unicode_series.csv") + parser = all_parsers + encoding = "latin-1" + + expected = parser.read_csv(pth, header=None, encoding=encoding) + expected[1] = Categorical(expected[1]) + + actual = parser.read_csv( + pth, header=None, encoding=encoding, dtype={1: "category"} + ) + tm.assert_frame_equal(actual, expected) -@pytest.mark.parametrize("ordered", [False, True]) -@pytest.mark.parametrize( - "categories", - [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]], -) -def test_categorical_category_dtype(all_parsers, categories, ordered): - parser = all_parsers - data = """a,b + @pytest.mark.parametrize("ordered", [False, True]) + @pytest.mark.parametrize( + "categories", + [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]], + ) + def test_categorical_category_dtype(self, 
all_parsers, categories, ordered): + parser = all_parsers + data = """a,b 1,a 1,b 1,b 2,c""" - expected = DataFrame( - { - "a": [1, 1, 1, 2], - "b": Categorical( - ["a", "b", "b", "c"], categories=categories, ordered=ordered - ), - } - ) - - dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)} - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical( + ["a", "b", "b", "c"], categories=categories, ordered=ordered + ), + } + ) + dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)} + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) -def test_categorical_category_dtype_unsorted(all_parsers): - parser = all_parsers - data = """a,b + def test_categorical_category_dtype_unsorted(self, all_parsers): + parser = all_parsers + data = """a,b 1,a 1,b 1,b 2,c""" - dtype = CategoricalDtype(["c", "b", "a"]) - expected = DataFrame( - { - "a": [1, 1, 1, 2], - "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]), - } - ) - - result = parser.read_csv(StringIO(data), dtype={"b": dtype}) - tm.assert_frame_equal(result, expected) - - -def test_categorical_coerces_numeric(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype([1, 2, 3])} - - data = "b\n1\n1\n2\n3" - expected = DataFrame({"b": Categorical([1, 1, 2, 3])}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -def test_categorical_coerces_datetime(all_parsers): - parser = all_parsers - dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None) - dtype = {"b": CategoricalDtype(dti)} - - data = "b\n2017-01-01\n2018-01-01\n2019-01-01" - expected = DataFrame({"b": Categorical(dtype["b"].categories)}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - + dtype = CategoricalDtype(["c", "b", 
"a"]) + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]), + } + ) -def test_categorical_coerces_timestamp(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype([Timestamp("2014")])} + result = parser.read_csv(StringIO(data), dtype={"b": dtype}) + tm.assert_frame_equal(result, expected) - data = "b\n2014-01-01\n2014-01-01T00:00:00" - expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) + def test_categorical_coerces_numeric(self, all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype([1, 2, 3])} - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) + data = "b\n1\n1\n2\n3" + expected = DataFrame({"b": Categorical([1, 1, 2, 3])}) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -def test_categorical_coerces_timedelta(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} + def test_categorical_coerces_datetime(self, all_parsers): + parser = all_parsers + dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None) + dtype = {"b": CategoricalDtype(dti)} - data = "b\n1H\n2H\n3H" - expected = DataFrame({"b": Categorical(dtype["b"].categories)}) + data = "b\n2017-01-01\n2018-01-01\n2019-01-01" + expected = DataFrame({"b": Categorical(dtype["b"].categories)}) - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + def test_categorical_coerces_timestamp(self, all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype([Timestamp("2014")])} -@pytest.mark.parametrize( - "data", - [ - "b\nTrue\nFalse\nNA\nFalse", - "b\ntrue\nfalse\nNA\nfalse", - "b\nTRUE\nFALSE\nNA\nFALSE", - "b\nTrue\nFalse\nNA\nFALSE", - ], -) -def 
test_categorical_dtype_coerces_boolean(all_parsers, data): - # see gh-20498 - parser = all_parsers - dtype = {"b": CategoricalDtype([False, True])} - expected = DataFrame({"b": Categorical([True, False, None, False])}) + data = "b\n2014-01-01\n2014-01-01T00:00:00" + expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + @skip_pyarrow + def test_categorical_coerces_timedelta(self, all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} -def test_categorical_unexpected_categories(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} + data = "b\n1H\n2H\n3H" + expected = DataFrame({"b": Categorical(dtype["b"].categories)}) - data = "b\nd\na\nc\nd" # Unexpected c - expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])}) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "data", + [ + "b\nTrue\nFalse\nNA\nFalse", + "b\ntrue\nfalse\nNA\nfalse", + "b\nTRUE\nFALSE\nNA\nFALSE", + "b\nTrue\nFalse\nNA\nFALSE", + ], + ) + def test_categorical_dtype_coerces_boolean(self, all_parsers, data): + # see gh-20498 + parser = all_parsers + dtype = {"b": CategoricalDtype([False, True])} + expected = DataFrame({"b": Categorical([True, False, None, False])}) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -def test_empty_pass_dtype(all_parsers): - parser = all_parsers + def test_categorical_unexpected_categories(self, all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} - data = "one,two" - result = 
parser.read_csv(StringIO(data), dtype={"one": "u1"}) + data = "b\nd\na\nc\nd" # Unexpected c + expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])}) - expected = DataFrame( - {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=object)}, - index=Index([], dtype=object), - ) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) @skip_pyarrow -def test_empty_with_index_pass_dtype(all_parsers): - parser = all_parsers +class TestParserDtypesEmpty: + def test_dtype_all_columns_empty(self, all_parsers): + # see gh-12048 + parser = all_parsers + result = parser.read_csv(StringIO("A,B"), dtype=str) - data = "one,two" - result = parser.read_csv( - StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"} - ) + expected = DataFrame({"A": [], "B": []}, index=[], dtype=str) + tm.assert_frame_equal(result, expected) - expected = DataFrame( - {"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one") - ) - tm.assert_frame_equal(result, expected) + def test_empty_pass_dtype(self, all_parsers): + parser = all_parsers + data = "one,two" + result = parser.read_csv(StringIO(data), dtype={"one": "u1"}) -@skip_pyarrow -def test_empty_with_multi_index_pass_dtype(all_parsers): - parser = all_parsers + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=object)}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, expected) - data = "one,two,three" - result = parser.read_csv( - StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"} - ) + def test_empty_with_index_pass_dtype(self, all_parsers): + parser = all_parsers - exp_idx = MultiIndex.from_arrays( - [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)], names=["one", "two"] - ) - expected = DataFrame({"three": np.empty(0, dtype=object)}, index=exp_idx) - tm.assert_frame_equal(result, expected) + data = "one,two" + result = parser.read_csv( + 
StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"} + ) + expected = DataFrame( + {"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one") + ) + tm.assert_frame_equal(result, expected) -def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers, pyarrow_xfail): - parser = all_parsers + def test_empty_with_multi_index_pass_dtype(self, all_parsers): + parser = all_parsers - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"}) + data = "one,two,three" + result = parser.read_csv( + StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"} + ) - expected = DataFrame( - {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, - index=Index([], dtype=object), - ) - tm.assert_frame_equal(result, expected) + exp_idx = MultiIndex.from_arrays( + [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)], + names=["one", "two"], + ) + expected = DataFrame({"three": np.empty(0, dtype=object)}, index=exp_idx) + tm.assert_frame_equal(result, expected) + def test_empty_with_mangled_column_pass_dtype_by_names(self, all_parsers): + parser = all_parsers -@skip_pyarrow -def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): - parser = all_parsers + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"}) - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, expected) - expected = DataFrame( - {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, - index=Index([], dtype=object), - ) - tm.assert_frame_equal(result, expected) + def test_empty_with_mangled_column_pass_dtype_by_indexes(self, all_parsers): + parser = all_parsers + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) -@skip_pyarrow -def 
test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): - # see gh-9424 - parser = all_parsers - expected = concat( - [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], - axis=1, - ) - expected.index = expected.index.astype(object) + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, expected) - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) - tm.assert_frame_equal(result, expected) + def test_empty_with_dup_column_pass_dtype_by_indexes(self, all_parsers): + # see gh-9424 + parser = all_parsers + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) + expected.index = expected.index.astype(object) + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) + tm.assert_frame_equal(result, expected) -def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): - # see gh-9424 - parser = all_parsers - expected = concat( - [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], - axis=1, - ) - expected.index = expected.index.astype(object) + def test_empty_with_dup_column_pass_dtype_by_indexes_raises(self, all_parsers): + # see gh-9424 + parser = all_parsers + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) + expected.index = expected.index.astype(object) - with pytest.raises(ValueError, match="Duplicate names"): - data = "" - parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) + with pytest.raises(ValueError, match="Duplicate names"): + data = "" + parser.read_csv( + StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"} + ) @skip_pyarrow From 1a9f185b8e1cac6a12b94343ab76882f871d247a Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 02:41:39 -0500 Subject: [PATCH 
74/95] test reorg --- pandas/tests/io/parser/test_dtypes.py | 110 +++++++++++++------------- 1 file changed, 57 insertions(+), 53 deletions(-) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 739c49cb87b3f..dcc58b1f2a484 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -164,6 +164,7 @@ def test_categorical_dtype_high_cardinality_numeric(self, all_parsers): ) tm.assert_frame_equal(actual, expected) + @pytest.mark.slow def test_categorical_dtype_utf16(self, all_parsers, csv_dir_path): # see gh-10153 pth = os.path.join(csv_dir_path, "utf16_ex.txt") @@ -220,6 +221,7 @@ def test_categorical_dtype_chunksize_explicit_categories(self, all_parsers): class TestParserDtypesCategorical2: + @pytest.mark.slow def test_categorical_dtype_latin1(self, all_parsers, csv_dir_path): # see gh-10153 pth = os.path.join(csv_dir_path, "unicode_series.csv") @@ -450,6 +452,61 @@ def test_empty_with_dup_column_pass_dtype_by_indexes_raises(self, all_parsers): StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"} ) + @skip_pyarrow + @pytest.mark.parametrize( + "dtype,expected", + [ + (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), + ( + "category", + DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), + ), + ( + {"a": "category", "b": "category"}, + DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), + ), + ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), + ( + "timedelta64[ns]", + DataFrame( + { + "a": Series([], dtype="timedelta64[ns]"), + "b": Series([], dtype="timedelta64[ns]"), + }, + index=[], + ), + ), + ( + {"a": np.int64, "b": np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ( + {0: np.int64, 1: np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ( + {"a": np.int64, 1: np.int32}, + 
DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ], + ) + def test_empty_dtype(self, all_parsers, dtype, expected): + # see gh-14712 + parser = all_parsers + data = "a,b" + + result = parser.read_csv(StringIO(data), header=0, dtype=dtype) + tm.assert_frame_equal(result, expected) + @skip_pyarrow def test_raise_on_passed_int_dtype_with_nas(all_parsers): @@ -485,59 +542,6 @@ def test_dtype_with_converters(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize( - "dtype,expected", - [ - (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), - ("category", DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[])), - ( - {"a": "category", "b": "category"}, - DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), - ), - ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), - ( - "timedelta64[ns]", - DataFrame( - { - "a": Series([], dtype="timedelta64[ns]"), - "b": Series([], dtype="timedelta64[ns]"), - }, - index=[], - ), - ), - ( - {"a": np.int64, "b": np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - index=[], - ), - ), - ( - {0: np.int64, 1: np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - index=[], - ), - ), - ( - {"a": np.int64, 1: np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - index=[], - ), - ), - ], -) -def test_empty_dtype(all_parsers, dtype, expected): - # see gh-14712 - parser = all_parsers - data = "a,b" - - result = parser.read_csv(StringIO(data), header=0, dtype=dtype) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) ) From 16d37dbe17a24d8d847fb5b35c5a6030035ceee4 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 02:46:36 -0500 Subject: [PATCH 75/95] test reorg --- 
pandas/tests/io/parser/test_dtypes.py | 193 +++++++++++++------------- 1 file changed, 94 insertions(+), 99 deletions(-) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index dcc58b1f2a484..452cbc635b470 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -75,6 +75,100 @@ def test_invalid_dtype_per_column(self, all_parsers): with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) + def test_raise_on_passed_int_dtype_with_nas(self, all_parsers): + # see gh-2631 + parser = all_parsers + data = """YEAR, DOY, a + 2001,106380451,10 + 2001,,11 + 2001,106380451,67""" + + msg = ( + "Integer column has NA values" + if parser.engine == "c" + else "Unable to convert column DOY" + ) + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True + ) + + def test_dtype_with_converters(self, all_parsers): + parser = all_parsers + data = """a,b +1.1,2.2 +1.2,2.3""" + + # Dtype spec ignored if converted specified. 
+ with tm.assert_produces_warning(ParserWarning): + result = parser.read_csv( + StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} + ) + expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) + ) + def test_numeric_dtype(self, all_parsers, dtype): + data = "0\n1" + parser = all_parsers + expected = DataFrame([0, 1], dtype=dtype) + + result = parser.read_csv(StringIO(data), header=None, dtype=dtype) + tm.assert_frame_equal(expected, result) + + def test_boolean_dtype(self, all_parsers): + parser = all_parsers + data = "\n".join( + [ + "a", + "True", + "TRUE", + "true", + "1", + "1.0", + "False", + "FALSE", + "false", + "0", + "0.0", + "NaN", + "nan", + "NA", + "null", + "NULL", + ] + ) + + result = parser.read_csv(StringIO(data), dtype="boolean") + expected = DataFrame( + { + "a": pd.array( + [ + True, + True, + True, + True, + True, + False, + False, + False, + False, + False, + None, + None, + None, + None, + None, + ], + dtype="boolean", + ) + } + ) + + tm.assert_frame_equal(result, expected) + @skip_pyarrow class TestParserDtypesCategorical1: @@ -506,102 +600,3 @@ def test_empty_dtype(self, all_parsers, dtype, expected): result = parser.read_csv(StringIO(data), header=0, dtype=dtype) tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_raise_on_passed_int_dtype_with_nas(all_parsers): - # see gh-2631 - parser = all_parsers - data = """YEAR, DOY, a -2001,106380451,10 -2001,,11 -2001,106380451,67""" - - msg = ( - "Integer column has NA values" - if parser.engine == "c" - else "Unable to convert column DOY" - ) - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) - - -@skip_pyarrow -def test_dtype_with_converters(all_parsers): - parser = all_parsers - data = """a,b -1.1,2.2 -1.2,2.3""" - - # Dtype spec ignored if 
converted specified. - with tm.assert_produces_warning(ParserWarning): - result = parser.read_csv( - StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} - ) - expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) -) -def test_numeric_dtype(all_parsers, dtype): - data = "0\n1" - parser = all_parsers - expected = DataFrame([0, 1], dtype=dtype) - - result = parser.read_csv(StringIO(data), header=None, dtype=dtype) - tm.assert_frame_equal(expected, result) - - -@skip_pyarrow -def test_boolean_dtype(all_parsers): - parser = all_parsers - data = "\n".join( - [ - "a", - "True", - "TRUE", - "true", - "1", - "1.0", - "False", - "FALSE", - "false", - "0", - "0.0", - "NaN", - "nan", - "NA", - "null", - "NULL", - ] - ) - - result = parser.read_csv(StringIO(data), dtype="boolean") - expected = DataFrame( - { - "a": pd.array( - [ - True, - True, - True, - True, - True, - False, - False, - False, - False, - False, - None, - None, - None, - None, - None, - ], - dtype="boolean", - ) - } - ) - - tm.assert_frame_equal(result, expected) From e124df0cd2f2c4f0000a3c0fb7a0d6cda9794a43 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Thu, 31 Dec 2020 00:21:25 -0500 Subject: [PATCH 76/95] pyarrow_xfail->pyarrow_skip --- pandas/tests/io/parser/test_common.py | 62 ++++++++++++++++++--------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 443af3a33be18..d9631c5657e33 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -74,7 +74,7 @@ def _set_noconvert_columns(self): @skip_pyarrow -def test_empty_decimal_marker(all_parsers, pyarrow_xfail): +def test_empty_decimal_marker(all_parsers): data = """A|B|C 1|2,334|5 10|13|10. 
@@ -176,7 +176,8 @@ def test_squeeze(all_parsers): assert not result._is_view -def test_malformed(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_malformed(all_parsers): # see gh-6607 parser = all_parsers data = """ignore @@ -190,8 +191,9 @@ def test_malformed(all_parsers, pyarrow_xfail): parser.read_csv(StringIO(data), header=1, comment="#") +@skip_pyarrow @pytest.mark.parametrize("nrows", [5, 3, None]) -def test_malformed_chunks(all_parsers, nrows, pyarrow_xfail): +def test_malformed_chunks(all_parsers, nrows): data = """ignore A,B,C skip @@ -209,7 +211,8 @@ def test_malformed_chunks(all_parsers, nrows, pyarrow_xfail): reader.read(nrows) -def test_unnamed_columns(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_unnamed_columns(all_parsers): data = """A,B,C,, 1,2,3,4,5 6,7,8,9,10 @@ -314,7 +317,8 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) -def test_read_csv_wrong_num_columns(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_read_csv_wrong_num_columns(all_parsers): # Too few columns. 
data = """A,B,C,D,E,F 1,2,3,4,5,6 @@ -430,8 +434,9 @@ def test_int_conversion(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("nrows", [3, 3.0]) -def test_read_nrows(all_parsers, nrows, pyarrow_xfail): +def test_read_nrows(all_parsers, nrows): # see gh-10476 data = """index,A,B,C,D foo,2,3,4,5 @@ -451,8 +456,9 @@ def test_read_nrows(all_parsers, nrows, pyarrow_xfail): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("nrows", [1.2, "foo", -1]) -def test_read_nrows_bad(all_parsers, nrows, pyarrow_xfail): +def test_read_nrows_bad(all_parsers, nrows): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -468,8 +474,9 @@ def test_read_nrows_bad(all_parsers, nrows, pyarrow_xfail): parser.read_csv(StringIO(data), nrows=nrows) +@skip_pyarrow @pytest.mark.parametrize("index_col", [0, "index"]) -def test_read_chunksize_with_index(all_parsers, index_col, pyarrow_xfail): +def test_read_chunksize_with_index(all_parsers, index_col): parser = all_parsers data = """index,A,B,C,D foo,2,3,4,5 @@ -500,8 +507,9 @@ def test_read_chunksize_with_index(all_parsers, index_col, pyarrow_xfail): tm.assert_frame_equal(chunks[2], expected[4:]) +@skip_pyarrow @pytest.mark.parametrize("chunksize", [1.3, "foo", 0]) -def test_read_chunksize_bad(all_parsers, chunksize, pyarrow_xfail): +def test_read_chunksize_bad(all_parsers, chunksize): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -518,8 +526,9 @@ def test_read_chunksize_bad(all_parsers, chunksize, pyarrow_xfail): pass +@skip_pyarrow @pytest.mark.parametrize("chunksize", [2, 8]) -def test_read_chunksize_and_nrows(all_parsers, chunksize, pyarrow_xfail): +def test_read_chunksize_and_nrows(all_parsers, chunksize): # see gh-15755 data = """index,A,B,C,D foo,2,3,4,5 @@ -537,7 +546,8 @@ def test_read_chunksize_and_nrows(all_parsers, chunksize, pyarrow_xfail): tm.assert_frame_equal(concat(reader), expected) -def test_read_chunksize_and_nrows_changing_size(all_parsers, 
pyarrow_xfail): +@skip_pyarrow +def test_read_chunksize_and_nrows_changing_size(all_parsers): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -558,7 +568,8 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers, pyarrow_xfail): reader.get_chunk(size=3) -def test_get_chunk_passed_chunksize(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_get_chunk_passed_chunksize(all_parsers): parser = all_parsers data = """A,B,C 1,2,3 @@ -573,8 +584,9 @@ def test_get_chunk_passed_chunksize(all_parsers, pyarrow_xfail): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}]) -def test_read_chunksize_compat(all_parsers, kwargs, pyarrow_xfail): +def test_read_chunksize_compat(all_parsers, kwargs): # see gh-12185 data = """index,A,B,C,D foo,2,3,4,5 @@ -590,7 +602,8 @@ def test_read_chunksize_compat(all_parsers, kwargs, pyarrow_xfail): tm.assert_frame_equal(concat(reader), result) -def test_read_chunksize_jagged_names(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_read_chunksize_jagged_names(all_parsers): # see gh-23509 parser = all_parsers data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) @@ -601,7 +614,8 @@ def test_read_chunksize_jagged_names(all_parsers, pyarrow_xfail): tm.assert_frame_equal(result, expected) -def test_read_data_list(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_read_data_list(all_parsers): parser = all_parsers kwargs = {"index_col": 0} data = "A,B,C\nfoo,1,2,3\nbar,4,5,6" @@ -615,7 +629,8 @@ def test_read_data_list(all_parsers, pyarrow_xfail): tm.assert_frame_equal(result, expected) -def test_iterator(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_iterator(all_parsers): # see gh-6607 data = """index,A,B,C,D foo,2,3,4,5 @@ -638,7 +653,8 @@ def test_iterator(all_parsers, pyarrow_xfail): tm.assert_frame_equal(last_chunk, expected[3:]) -def test_iterator2(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_iterator2(all_parsers): parser = all_parsers data = """A,B,C 
foo,1,2,3 @@ -701,7 +717,8 @@ def test_reader_list_skiprows(all_parsers): tm.assert_frame_equal(chunks[0], expected[1:3]) -def test_iterator_stop_on_chunksize(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_iterator_stop_on_chunksize(all_parsers): # gh-3967: stopping iteration when chunksize is specified parser = all_parsers data = """A,B,C @@ -722,10 +739,11 @@ def test_iterator_stop_on_chunksize(all_parsers, pyarrow_xfail): tm.assert_frame_equal(concat(result), expected) +@skip_pyarrow @pytest.mark.parametrize( "kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}] ) -def test_iterator_skipfooter_errors(all_parsers, kwargs, pyarrow_xfail): +def test_iterator_skipfooter_errors(all_parsers, kwargs): msg = "'skipfooter' not supported for iteration" parser = all_parsers data = "a\n1\n2" @@ -1534,6 +1552,7 @@ def test_uneven_lines_with_usecols(all_parsers, usecols): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -1553,7 +1572,7 @@ def test_uneven_lines_with_usecols(all_parsers, usecols): ), ], ) -def test_read_empty_with_usecols(all_parsers, data, kwargs, expected, pyarrow_xfail): +def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): # see gh-12493 parser = all_parsers @@ -2174,7 +2193,8 @@ def test_read_table_equivalency_to_read_csv(all_parsers): tm.assert_frame_equal(result, expected) -def test_first_row_bom(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_first_row_bom(all_parsers): # see gh-26545 parser = all_parsers data = '''\ufeff"Head1" "Head2" "Head3"''' From fe253bacf5e7c5eb682f183453a5719003cbeb1e Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Thu, 31 Dec 2020 00:26:53 -0500 Subject: [PATCH 77/95] pyarrow_xfail->pyarrow_skip --- pandas/tests/io/parser/test_converters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 
955f249cdf9ae..158f924882503 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -12,7 +12,7 @@ from pandas import DataFrame, Index import pandas._testing as tm -pytestmark = pytest.mark.usefixtures("pyarrow_xfail") +pytestmark = pytest.mark.usefixtures("pyarrow_skip") def test_converters_type_must_be_dict(all_parsers): From 72c7c448916d716186795c8063ed7fb3c05a4282 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Thu, 31 Dec 2020 00:28:59 -0500 Subject: [PATCH 78/95] pyarrow_xfail->pyarrow_skip --- pandas/tests/io/parser/test_compression.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 0af10c4124072..c01542d7d38c5 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -31,8 +31,9 @@ def parser_and_data(all_parsers, csv1): return parser, data, expected +@skip_pyarrow @pytest.mark.parametrize("compression", ["zip", "infer", "zip2"]) -def test_zip(parser_and_data, compression, pyarrow_xfail): +def test_zip(parser_and_data, compression): parser, data, expected = parser_and_data with tm.ensure_clean("test_file.zip") as path: @@ -48,8 +49,9 @@ def test_zip(parser_and_data, compression, pyarrow_xfail): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("compression", ["zip", "infer"]) -def test_zip_error_multiple_files(parser_and_data, compression, pyarrow_xfail): +def test_zip_error_multiple_files(parser_and_data, compression): parser, data, expected = parser_and_data with tm.ensure_clean("combined_zip.zip") as path: @@ -63,7 +65,8 @@ def test_zip_error_multiple_files(parser_and_data, compression, pyarrow_xfail): parser.read_csv(path, compression=compression) -def test_zip_error_no_files(parser_and_data, pyarrow_xfail): +@skip_pyarrow +def test_zip_error_no_files(parser_and_data): parser, _, _ = 
parser_and_data with tm.ensure_clean() as path: @@ -74,7 +77,8 @@ def test_zip_error_no_files(parser_and_data, pyarrow_xfail): parser.read_csv(path, compression="zip") -def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): +@skip_pyarrow +def test_zip_error_invalid_zip(parser_and_data): parser, _, _ = parser_and_data with tm.ensure_clean() as path: @@ -85,9 +89,7 @@ def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): @skip_pyarrow @pytest.mark.parametrize("filename", [None, "test.{ext}"]) -def test_compression( - parser_and_data, compression_only, buffer, filename, pyarrow_xfail -): +def test_compression(parser_and_data, compression_only, buffer, filename): parser, data, expected = parser_and_data compress_type = compression_only @@ -131,9 +133,8 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) -def test_compression_utf_encoding( - all_parsers, csv_dir_path, utf_value, encoding_fmt, pyarrow_xfail -): +@skip_pyarrow +def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt): # see gh-18071, gh-24130 parser = all_parsers encoding = encoding_fmt.format(utf_value) From 26710071f57ae90cf2285f6e5d473d268901d517 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Thu, 31 Dec 2020 02:48:32 -0500 Subject: [PATCH 79/95] xfail more tests --- pandas/tests/io/parser/test_common.py | 57 ++++++++++++------------ pandas/tests/io/parser/test_index_col.py | 5 ++- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 155799d512b8d..14241140c1b25 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -15,7 +15,6 @@ import pytest from pandas._libs.tslib import Timestamp -from pandas.compat import is_platform_linux from pandas.errors import DtypeWarning, EmptyDataError, ParserError import pandas.util._test_decorators as td @@ -1403,34 +1402,34 @@ 
def test_numeric_range_too_wide(all_parsers, exp_data): tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) -def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): - # GH#38753 - parser, precision = all_parsers_all_precisions - data = f"data\n10E{neg_exp}" - result = parser.read_csv(StringIO(data), float_precision=precision) - expected = DataFrame({"data": [0.0]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) -def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): - # GH#38753 - parser, precision = all_parsers_all_precisions - data = f"data\n10E{exp}" - result = parser.read_csv(StringIO(data), float_precision=precision) - if precision == "round_trip": - if exp == 999999999999999999 and is_platform_linux(): - mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") - request.node.add_marker(mark) - - value = np.inf if exp > 0 else 0.0 - expected = DataFrame({"data": [value]}) - else: - expected = DataFrame({"data": [f"10E{exp}"]}) - - tm.assert_frame_equal(result, expected) +# @skip_pyarrow +# @pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) +# def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): +# # GH#38753 +# parser, precision = all_parsers_all_precisions +# data = f"data\n10E{neg_exp}" +# result = parser.read_csv(StringIO(data), float_precision=precision) +# expected = DataFrame({"data": [0.0]}) +# tm.assert_frame_equal(result, expected) + +# @skip_pyarrow +# @pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) +# def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): +# # GH#38753 +# parser, precision = all_parsers_all_precisions +# data = f"data\n10E{exp}" +# result = parser.read_csv(StringIO(data), float_precision=precision) +# if precision == "round_trip": +# if exp == 
999999999999999999 and is_platform_linux(): +# mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") +# request.node.add_marker(mark) + +# value = np.inf if exp > 0 else 0.0 +# expected = DataFrame({"data": [value]}) +# else: +# expected = DataFrame({"data": [f"10E{exp}"]}) + +# tm.assert_frame_equal(result, expected) @skip_pyarrow diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 607fd021b0662..87abeaf18ac76 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -12,7 +12,6 @@ import pandas._testing as tm skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @skip_pyarrow @@ -240,6 +239,7 @@ def test_index_col_large_csv(all_parsers): tm.assert_frame_equal(result, df.set_index("a")) +@skip_pyarrow def test_index_col_multiindex_columns_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -255,6 +255,7 @@ def test_index_col_multiindex_columns_no_data(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_index_col_header_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -267,6 +268,7 @@ def test_index_col_header_no_data(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_multiindex_columns_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -277,6 +279,7 @@ def test_multiindex_columns_no_data(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_multiindex_columns_index_col_with_data(all_parsers): # GH#38292 parser = all_parsers From 73ca5d4c17d481aa303902ed965a372f8da647e4 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Thu, 31 Dec 2020 03:18:18 -0500 Subject: [PATCH 80/95] xfail more tests --- pandas/tests/io/parser/test_common.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 
14241140c1b25..a4d02b1bb7873 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1307,14 +1307,14 @@ def test_float_parser(all_parsers): tm.assert_frame_equal(result, expected) -def test_scientific_no_exponent(all_parsers_all_precisions): - # see gh-12215 - df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) - data = df.to_csv(index=False) - parser, precision = all_parsers_all_precisions - - df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) - tm.assert_frame_equal(df_roundtrip, df) +# def test_scientific_no_exponent(all_parsers_all_precisions): +# # see gh-12215 +# df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) +# data = df.to_csv(index=False) +# parser, precision = all_parsers_all_precisions + +# df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) +# tm.assert_frame_equal(df_roundtrip, df) @skip_pyarrow From 639ca283fa39837bafa901da07b4b17e9cb634e6 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 1 Jan 2021 14:41:25 -0500 Subject: [PATCH 81/95] update refactoredt tests --- pandas/tests/io/parser/dtypes/test_categorical.py | 12 ++++++++++++ pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 2 ++ pandas/tests/io/parser/dtypes/test_empty.py | 2 ++ pandas/tests/io/parser/usecols/test_parse_dates.py | 2 ++ pandas/tests/io/parser/usecols/test_usecols_basic.py | 2 ++ 5 files changed, 20 insertions(+) diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py index 2f569424a82f5..a4e59899f304e 100644 --- a/pandas/tests/io/parser/dtypes/test_categorical.py +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -14,7 +14,10 @@ from pandas import Categorical, DataFrame, Timestamp import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + +@skip_pyarrow @pytest.mark.parametrize( "dtype", [ @@ -41,6 +44,7 @@ def 
test_categorical_dtype(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@skip_pyarrow @pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) def test_categorical_dtype_single(all_parsers, dtype): # see gh-10153 @@ -56,6 +60,7 @@ def test_categorical_dtype_single(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@skip_pyarrow def test_categorical_dtype_unsorted(all_parsers): # see gh-10153 parser = all_parsers @@ -74,6 +79,7 @@ def test_categorical_dtype_unsorted(all_parsers): tm.assert_frame_equal(actual, expected) +@skip_pyarrow def test_categorical_dtype_missing(all_parsers): # see gh-10153 parser = all_parsers @@ -92,6 +98,7 @@ def test_categorical_dtype_missing(all_parsers): tm.assert_frame_equal(actual, expected) +@skip_pyarrow @pytest.mark.slow def test_categorical_dtype_high_cardinality_numeric(all_parsers): # see gh-18186 @@ -106,6 +113,7 @@ def test_categorical_dtype_high_cardinality_numeric(all_parsers): tm.assert_frame_equal(actual, expected) +@skip_pyarrow def test_categorical_dtype_utf16(all_parsers, csv_dir_path): # see gh-10153 pth = os.path.join(csv_dir_path, "utf16_ex.txt") @@ -120,6 +128,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) +@skip_pyarrow def test_categorical_dtype_chunksize_infer_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -139,6 +148,7 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers): tm.assert_frame_equal(actual, expected) +@skip_pyarrow def test_categorical_dtype_chunksize_explicit_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -242,6 +252,7 @@ def test_categorical_coerces_datetime(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_categorical_coerces_timestamp(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype([Timestamp("2014")])} @@ -253,6 +264,7 @@ def test_categorical_coerces_timestamp(all_parsers): tm.assert_frame_equal(result, 
expected) +@skip_pyarrow def test_categorical_coerces_timedelta(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index e416d8dcdd905..b6814e39241f0 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -13,6 +13,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) diff --git a/pandas/tests/io/parser/dtypes/test_empty.py b/pandas/tests/io/parser/dtypes/test_empty.py index 57d729fb4b7fc..618af246e4db3 100644 --- a/pandas/tests/io/parser/dtypes/test_empty.py +++ b/pandas/tests/io/parser/dtypes/test_empty.py @@ -10,6 +10,8 @@ from pandas import Categorical, DataFrame, Index, MultiIndex, Series, concat import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_dtype_all_columns_empty(all_parsers): # see gh-12048 diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index c6b700c0adfff..34d5b4b7d183b 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -20,6 +20,8 @@ "Usecols do not match columns, columns expected but not found: {0}" ) +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) def test_usecols_with_parse_dates(all_parsers, usecols): diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 7d81a88e09012..a163326124878 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -19,6 +19,8 @@ 
"Usecols do not match columns, columns expected but not found: {0}" ) +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_raise_on_mixed_dtype_usecols(all_parsers): # See gh-12678 From 1994fadebb4e8b04cf5c91e2386f90f1ee86ce62 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 01:40:13 -0500 Subject: [PATCH 82/95] float precision tests --- pandas/tests/io/parser/conftest.py | 4 ++ pandas/tests/io/parser/test_common.py | 74 ++++++++++++++------------- 2 files changed, 43 insertions(+), 35 deletions(-) diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 27c81aa435e57..bda4c771c6511 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -116,10 +116,14 @@ def _get_all_parser_float_precision_combinations(): params = [] ids = [] for parser, parser_id in zip(_all_parsers, _all_parser_ids): + if parser_id == "pyarrow": + # GH38370 + continue for precision in parser.float_precision_choices: params.append((parser, precision)) ids.append(f"{parser_id}-{precision}") + print(params) return {"params": params, "ids": ids} diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index a4d02b1bb7873..f06d1476c515a 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -15,6 +15,7 @@ import pytest from pandas._libs.tslib import Timestamp +from pandas.compat import is_platform_linux from pandas.errors import DtypeWarning, EmptyDataError, ParserError import pandas.util._test_decorators as td @@ -1307,14 +1308,16 @@ def test_float_parser(all_parsers): tm.assert_frame_equal(result, expected) -# def test_scientific_no_exponent(all_parsers_all_precisions): -# # see gh-12215 -# df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) -# data = df.to_csv(index=False) -# parser, precision = all_parsers_all_precisions +def test_scientific_no_exponent(all_parsers_all_precisions): + # see 
gh-12215 + df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) + data = df.to_csv(index=False) + parser, precision = all_parsers_all_precisions + if parser == "pyarrow": + pytest.skip() -# df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) -# tm.assert_frame_equal(df_roundtrip, df) + df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) + tm.assert_frame_equal(df_roundtrip, df) @skip_pyarrow @@ -1402,34 +1405,35 @@ def test_numeric_range_too_wide(all_parsers, exp_data): tm.assert_frame_equal(result, expected) -# @skip_pyarrow -# @pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) -# def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): -# # GH#38753 -# parser, precision = all_parsers_all_precisions -# data = f"data\n10E{neg_exp}" -# result = parser.read_csv(StringIO(data), float_precision=precision) -# expected = DataFrame({"data": [0.0]}) -# tm.assert_frame_equal(result, expected) - -# @skip_pyarrow -# @pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) -# def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): -# # GH#38753 -# parser, precision = all_parsers_all_precisions -# data = f"data\n10E{exp}" -# result = parser.read_csv(StringIO(data), float_precision=precision) -# if precision == "round_trip": -# if exp == 999999999999999999 and is_platform_linux(): -# mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") -# request.node.add_marker(mark) - -# value = np.inf if exp > 0 else 0.0 -# expected = DataFrame({"data": [value]}) -# else: -# expected = DataFrame({"data": [f"10E{exp}"]}) - -# tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) +def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): + # GH#38753 + parser, precision = all_parsers_all_precisions + if parser == "pyarrow": + pytest.skip() + data = 
f"data\n10E{neg_exp}" + result = parser.read_csv(StringIO(data), float_precision=precision) + expected = DataFrame({"data": [0.0]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) +def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): + # GH#38753 + parser, precision = all_parsers_all_precisions + data = f"data\n10E{exp}" + result = parser.read_csv(StringIO(data), float_precision=precision) + if precision == "round_trip": + if exp == 999999999999999999 and is_platform_linux(): + mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") + request.node.add_marker(mark) + + value = np.inf if exp > 0 else 0.0 + expected = DataFrame({"data": [value]}) + else: + expected = DataFrame({"data": [f"10E{exp}"]}) + + tm.assert_frame_equal(result, expected) @skip_pyarrow From 566f1b4c157fa5c5b4c53e22bd1344ccef91c625 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:13:24 -0500 Subject: [PATCH 83/95] TST/REF: io/parsers/test_common.py --- .../io/parser/common/test_common_basic.py | 752 ++++++++++++++++++ 1 file changed, 752 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_common_basic.py diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py new file mode 100644 index 0000000000000..4dd75dff16095 --- /dev/null +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -0,0 +1,752 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +from datetime import datetime +from inspect import signature +from io import StringIO +import os + +import numpy as np +import pytest + +from pandas._libs.tslib import Timestamp +from pandas.errors import EmptyDataError, ParserError + +from pandas import DataFrame, Index, Series, compat +import pandas._testing as tm + +from pandas.io.parsers import CParserWrapper, TextFileReader + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +def test_override_set_noconvert_columns(): + # see gh-17351 + # + # Usecols needs to be sorted in _set_noconvert_columns based + # on the test_usecols_with_parse_dates test from test_usecols.py + class MyTextFileReader(TextFileReader): + def __init__(self): + self._currow = 0 + self.squeeze = False + + class MyCParserWrapper(CParserWrapper): + def _set_noconvert_columns(self): + if self.usecols_dtype == "integer": + # self.usecols is a set, which is documented as unordered + # but in practice, a CPython set of integers is sorted. + # In other implementations this assumption does not hold. 
+ # The following code simulates a different order, which + # before GH 17351 would cause the wrong columns to be + # converted via the parse_dates parameter + self.usecols = list(self.usecols) + self.usecols.reverse() + return CParserWrapper._set_noconvert_columns(self) + + data = """a,b,c,d,e +0,1,20140101,0900,4 +0,1,20140102,1000,4""" + + parse_dates = [[1, 2]] + cols = { + "a": [0, 0], + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], + } + expected = DataFrame(cols, columns=["c_d", "a"]) + + parser = MyTextFileReader() + parser.options = { + "usecols": [0, 2, 3], + "parse_dates": parse_dates, + "delimiter": ",", + } + parser.engine = "c" + parser._engine = MyCParserWrapper(StringIO(data), **parser.options) + + result = parser.read() + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_read_csv_local(all_parsers, csv1): + prefix = "file:///" if compat.is_platform_windows() else "file://" + parser = all_parsers + + fname = prefix + str(os.path.abspath(csv1)) + result = parser.read_csv(fname, index_col=0, parse_dates=True) + + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738], + [1.047916, -0.041232, -0.16181208307, 0.212549], + [0.498581, 0.731168, -0.537677223318, 1.346270], + [1.120202, 1.567621, 0.00364077397681, 0.675253], + [-0.487094, 0.571455, -1.6116394093, 0.103469], + [0.836649, 0.246462, 0.588542635376, 1.062782], + [-0.157161, 1.340307, 1.1957779562, -1.097007], + ], + columns=["A", "B", "C", "D"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + datetime(2000, 1, 10), + datetime(2000, 1, 11), + ], + name="index", + ), + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_1000_sep(all_parsers): + parser = all_parsers + data = """A|B|C +1|2,334|5 +10|13|10. 
+""" + expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]}) + + result = parser.read_csv(StringIO(data), sep="|", thousands=",") + tm.assert_frame_equal(result, expected) + + +def test_squeeze(all_parsers): + data = """\ +a,1 +b,2 +c,3 +""" + parser = all_parsers + index = Index(["a", "b", "c"], name=0) + expected = Series([1, 2, 3], name=1, index=index) + + result = parser.read_csv(StringIO(data), index_col=0, header=None, squeeze=True) + tm.assert_series_equal(result, expected) + + # see gh-8217 + # + # Series should not be a view. + assert not result._is_view + + +@skip_pyarrow +def test_unnamed_columns(all_parsers): + data = """A,B,C,, +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + parser = all_parsers + expected = DataFrame( + [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], + dtype=np.int64, + columns=["A", "B", "C", "Unnamed: 3", "Unnamed: 4"], + ) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +def test_csv_mixed_type(all_parsers): + data = """A,B,C +a,1,2 +b,3,4 +c,4,5 +""" + parser = all_parsers + expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_read_csv_low_memory_no_rows_with_index(all_parsers): + # see gh-21141 + parser = all_parsers + + if not parser.low_memory: + pytest.skip("This is a low-memory specific test") + + data = """A,B,C +1,1,1,2 +2,2,3,4 +3,3,4,5 +""" + result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0) + expected = DataFrame(columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_dataframe(all_parsers, csv1): + parser = all_parsers + result = parser.read_csv(csv1, index_col=0, parse_dates=True) + + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738], + [1.047916, -0.041232, -0.16181208307, 0.212549], + [0.498581, 0.731168, -0.537677223318, 
1.346270], + [1.120202, 1.567621, 0.00364077397681, 0.675253], + [-0.487094, 0.571455, -1.6116394093, 0.103469], + [0.836649, 0.246462, 0.588542635376, 1.062782], + [-0.157161, 1.340307, 1.1957779562, -1.097007], + ], + columns=["A", "B", "C", "D"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + datetime(2000, 1, 10), + datetime(2000, 1, 11), + ], + name="index", + ), + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("nrows", [3, 3.0]) +def test_read_nrows(all_parsers, nrows): + # see gh-10476 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + expected = DataFrame( + [["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]], + columns=["index", "A", "B", "C", "D"], + ) + parser = all_parsers + + result = parser.read_csv(StringIO(data), nrows=nrows) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("nrows", [1.2, "foo", -1]) +def test_read_nrows_bad(all_parsers, nrows): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + msg = r"'nrows' must be an integer >=0" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), nrows=nrows) + + +def test_nrows_skipfooter_errors(all_parsers): + msg = "'skipfooter' not supported with 'nrows'" + data = "a\n1\n2\n3\n4\n5\n6" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), skipfooter=1, nrows=5) + + +@skip_pyarrow +def test_missing_trailing_delimiters(all_parsers): + parser = all_parsers + data = """A,B,C,D +1,2,3,4 +1,3,3, +1,4,5""" + + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]], + columns=["A", "B", "C", "D"], + ) + 
tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_skip_initial_space(all_parsers): + data = ( + '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' + "1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, " + "314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, " + "70.06056, 344.98370, 1, 1, -0.689265, -0.692787, " + "0.212036, 14.7674, 41.605, -9999.0, -9999.0, " + "-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128" + ) + parser = all_parsers + + result = parser.read_csv( + StringIO(data), + names=list(range(33)), + header=None, + na_values=["-9999.0"], + skipinitialspace=True, + ) + expected = DataFrame( + [ + [ + "09-Apr-2012", + "01:10:18.300", + 2456026.548822908, + 12849, + 1.00361, + 1.12551, + 330.65659, + 355626618.16711, + 73.48821, + 314.11625, + 1917.09447, + 179.71425, + 80.0, + 240.0, + -350, + 70.06056, + 344.9837, + 1, + 1, + -0.689265, + -0.692787, + 0.212036, + 14.7674, + 41.605, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 0, + 12, + 128, + ] + ] + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_trailing_delimiters(all_parsers): + # see gh-2442 + data = """A,B,C +1,2,3, +4,5,6, +7,8,9,""" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=False) + + expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]}) + tm.assert_frame_equal(result, expected) + + +def test_escapechar(all_parsers): + # https://stackoverflow.com/questions/13824840/feature-request-for- + # pandas-read-csv + data = '''SEARCH_TERM,ACTUAL_URL +"bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" +"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" +"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals 
series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa + + parser = all_parsers + result = parser.read_csv( + StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8" + ) + + assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals series' + + tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) + + +@skip_pyarrow +def test_ignore_leading_whitespace(all_parsers): + # see gh-3374, gh-6607 + parser = all_parsers + data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9" + result = parser.read_csv(StringIO(data), sep=r"\s+") + + expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]}) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) +def test_uneven_lines_with_usecols(all_parsers, usecols): + # see gh-12203 + parser = all_parsers + data = r"""a,b,c +0,1,2 +3,4,5,6,7 +8,9,10""" + + if usecols is None: + # Make sure that an error is still raised + # when the "usecols" parameter is not provided. + msg = r"Expected \d+ fields in line \d+, saw \d+" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data)) + else: + expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]}) + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + # First, check to see that the response of parser when faced with no + # provided columns raises the correct error, with or without usecols. 
+ ("", {}, None), + ("", {"usecols": ["X"]}, None), + ( + ",,", + {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]}, + DataFrame(columns=["X"], index=[0], dtype=np.float64), + ), + ( + "", + {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]}, + DataFrame(columns=["X"]), + ), + ], +) +def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): + # see gh-12493 + parser = all_parsers + + if expected is None: + msg = "No columns to parse from file" + with pytest.raises(EmptyDataError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize( + "kwargs,expected", + [ + # gh-8661, gh-8679: this should ignore six lines, including + # lines with trailing whitespace and blank lines. + ( + { + "header": None, + "delim_whitespace": True, + "skiprows": [0, 1, 2, 3, 5, 6], + "skip_blank_lines": True, + }, + DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]), + ), + # gh-8983: test skipping set of rows after a row with trailing spaces. 
+ ( + { + "delim_whitespace": True, + "skiprows": [1, 2, 3, 5, 6], + "skip_blank_lines": True, + }, + DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}), + ), + ], +) +def test_trailing_spaces(all_parsers, kwargs, expected): + data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa + parser = all_parsers + + result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_raise_on_sep_with_delim_whitespace(all_parsers): + # see gh-6607 + data = "a b c\n1 2 3" + parser = all_parsers + + with pytest.raises(ValueError, match="you can only specify one"): + parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) + + +@skip_pyarrow +@pytest.mark.parametrize("delim_whitespace", [True, False]) +def test_single_char_leading_whitespace(all_parsers, delim_whitespace): + # see gh-9710 + parser = all_parsers + data = """\ +MyColumn +a +b +a +b\n""" + + expected = DataFrame({"MyColumn": list("abab")}) + result = parser.read_csv( + StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize( + "sep,skip_blank_lines,exp_data", + [ + (",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]), + (r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]), + ( + ",", + False, + [ + [1.0, 2.0, 4.0], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [5.0, np.nan, 10.0], + [np.nan, np.nan, np.nan], + [-70.0, 0.4, 1.0], + ], + ), + ], +) +def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): + parser = all_parsers + data = """\ +A,B,C +1,2.,4. 
+ + +5.,NaN,10.0 + +-70,.4,1 +""" + + if sep == r"\s+": + data = data.replace(",", " ") + + result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines) + expected = DataFrame(exp_data, columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_whitespace_lines(all_parsers): + parser = all_parsers + data = """ + +\t \t\t +\t +A,B,C +\t 1,2.,4. +5.,NaN,10.0 +""" + expected = DataFrame([[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize( + "data,expected", + [ + ( + """ A B C D +a 1 2 3 4 +b 1 2 3 4 +c 1 2 3 4 +""", + DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], + columns=["A", "B", "C", "D"], + index=["a", "b", "c"], + ), + ), + ( + " a b c\n1 2 3 \n4 5 6\n 7 8 9", + DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]), + ), + ], +) +def test_whitespace_regex_separator(all_parsers, data, expected): + # see gh-6607 + parser = all_parsers + result = parser.read_csv(StringIO(data), sep=r"\s+") + tm.assert_frame_equal(result, expected) + + +def test_sub_character(all_parsers, csv_dir_path): + # see gh-16893 + filename = os.path.join(csv_dir_path, "sub_char.csv") + expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"]) + + parser = all_parsers + result = parser.read_csv(filename) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"]) +def test_filename_with_special_chars(all_parsers, filename): + # see gh-15086. 
+ parser = all_parsers + df = DataFrame({"a": [1, 2, 3]}) + + with tm.ensure_clean(filename) as path: + df.to_csv(path, index=False) + + result = parser.read_csv(path) + tm.assert_frame_equal(result, df) + + +def test_read_table_same_signature_as_read_csv(all_parsers): + # GH-34976 + parser = all_parsers + + table_sign = signature(parser.read_table) + csv_sign = signature(parser.read_csv) + + assert table_sign.parameters.keys() == csv_sign.parameters.keys() + assert table_sign.return_annotation == csv_sign.return_annotation + + for key, csv_param in csv_sign.parameters.items(): + table_param = table_sign.parameters[key] + if key == "sep": + assert csv_param.default == "," + assert table_param.default == "\t" + assert table_param.annotation == csv_param.annotation + assert table_param.kind == csv_param.kind + continue + else: + assert table_param == csv_param + + +def test_read_table_equivalency_to_read_csv(all_parsers): + # see gh-21948 + # As of 0.25.0, read_table is undeprecated + parser = all_parsers + data = "a\tb\n1\t2\n3\t4" + expected = parser.read_csv(StringIO(data), sep="\t") + result = parser.read_table(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_first_row_bom(all_parsers): + # see gh-26545 + parser = all_parsers + data = '''\ufeff"Head1" "Head2" "Head3"''' + + result = parser.read_csv(StringIO(data), delimiter="\t") + expected = DataFrame(columns=["Head1", "Head2", "Head3"]) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_first_row_bom_unquoted(all_parsers): + # see gh-36343 + parser = all_parsers + data = """\ufeffHead1 Head2 Head3""" + + result = parser.read_csv(StringIO(data), delimiter="\t") + expected = DataFrame(columns=["Head1", "Head2", "Head3"]) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("nrows", range(1, 6)) +def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): + # GH 28071 + ref = DataFrame( + [[np.nan, np.nan], 
[np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]], + columns=list("ab"), + ) + csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4" + parser = all_parsers + df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False) + tm.assert_frame_equal(df, ref[:nrows]) + + +@skip_pyarrow +def test_no_header_two_extra_columns(all_parsers): + # GH 26218 + column_names = ["one", "two", "three"] + ref = DataFrame([["foo", "bar", "baz"]], columns=column_names) + stream = StringIO("foo,bar,baz,bam,blah") + parser = all_parsers + df = parser.read_csv(stream, header=None, names=column_names, index_col=False) + tm.assert_frame_equal(df, ref) + + +def test_read_csv_names_not_accepting_sets(all_parsers): + # GH 34946 + data = """\ + 1,2,3 + 4,5,6\n""" + parser = all_parsers + with pytest.raises(ValueError, match="Names should be an ordered collection."): + parser.read_csv(StringIO(data), names=set("QAZ")) + + +@skip_pyarrow +def test_read_table_delim_whitespace_default_sep(all_parsers): + # GH: 35958 + f = StringIO("a b c\n1 -2 -3\n4 5 6") + parser = all_parsers + result = parser.read_table(f, delim_whitespace=True) + expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("delimiter", [",", "\t"]) +def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter): + # GH: 35958 + f = StringIO("a b c\n1 -2 -3\n4 5 6") + parser = all_parsers + msg = ( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." 
+ ) + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, delim_whitespace=True, sep=delimiter) + + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, delim_whitespace=True, delimiter=delimiter) + + +@pytest.mark.parametrize("delimiter", [",", "\t"]) +def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): + # GH: 35958 + f = StringIO("a b c\n1 -2 -3\n4 5 6") + parser = all_parsers + msg = ( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." + ) + with pytest.raises(ValueError, match=msg): + parser.read_table(f, delim_whitespace=True, sep=delimiter) + + with pytest.raises(ValueError, match=msg): + parser.read_table(f, delim_whitespace=True, delimiter=delimiter) + + +@skip_pyarrow +def test_dict_keys_as_names(all_parsers): + # GH: 36928 + data = "1,2" + + keys = {"a": int, "b": int}.keys() + parser = all_parsers + + result = parser.read_csv(StringIO(data), names=keys) + expected = DataFrame({"a": [1], "b": [2]}) + tm.assert_frame_equal(result, expected) From cd9b3004724ffb666c4d9897cdc13943f66ca3b7 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:17:31 -0500 Subject: [PATCH 84/95] TST/REF: io/parsers/test_common.py --- pandas/tests/io/parser/common/test_ints.py | 210 +++++++++++++++++++++ 1 file changed, 210 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_ints.py diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py new file mode 100644 index 0000000000000..4b31447b638f7 --- /dev/null +++ b/pandas/tests/io/parser/common/test_ints.py @@ -0,0 +1,210 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +from io import StringIO + +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +def test_int_conversion(all_parsers): + data = """A,B +1.0,1 +2.0,2 +3.0,3 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + + expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + "A,B\nTrue,1\nFalse,2\nTrue,3", + {}, + DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), + ), + ( + "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3", + {"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]}, + DataFrame( + [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]], + columns=["A", "B"], + ), + ), + ( + "A,B\nTRUE,1\nFALSE,2\nTRUE,3", + {}, + DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), + ), + ( + "A,B\nfoo,bar\nbar,foo", + {"true_values": ["foo"], "false_values": ["bar"]}, + DataFrame([[True, False], [False, True]], columns=["A", "B"]), + ), + ], +) +def test_parse_bool(all_parsers, data, kwargs, expected): + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_parse_integers_above_fp_precision(all_parsers): + data = """Numbers +17007000002000191 +17007000002000191 +17007000002000191 +17007000002000191 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000194""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + { + "Numbers": [ + 17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000194, + ] + } + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow 
+@pytest.mark.parametrize("sep", [" ", r"\s+"]) +def test_integer_overflow_bug(all_parsers, sep): + # see gh-2601 + data = "65248E10 11\n55555E55 22\n" + parser = all_parsers + + result = parser.read_csv(StringIO(data), header=None, sep=sep) + expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]]) + tm.assert_frame_equal(result, expected) + + +def test_int64_min_issues(all_parsers): + # see gh-2599 + parser = all_parsers + data = "A,B\n0,0\n0," + result = parser.read_csv(StringIO(data)) + + expected = DataFrame({"A": [0, 0], "B": [0, np.nan]}) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) +def test_int64_overflow(all_parsers, conv): + data = """ID +00013007854817840016671868 +00013007854817840016749251 +00013007854817840016754630 +00013007854817840016781876 +00013007854817840017028824 +00013007854817840017963235 +00013007854817840018860166""" + parser = all_parsers + + if conv is None: + # 13007854817840016671868 > UINT64_MAX, so this + # will overflow and return object as the dtype. + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [ + "00013007854817840016671868", + "00013007854817840016749251", + "00013007854817840016754630", + "00013007854817840016781876", + "00013007854817840017028824", + "00013007854817840017963235", + "00013007854817840018860166", + ], + columns=["ID"], + ) + tm.assert_frame_equal(result, expected) + else: + # 13007854817840016671868 > UINT64_MAX, so attempts + # to cast to either int64 or uint64 will result in + # an OverflowError being raised. 
+    msg = (
+        "(Python int too large to convert to C long)|"
+        "(long too big to convert)|"
+        "(int too big to convert)"
+    )
+
+    with pytest.raises(OverflowError, match=msg):
+        parser.read_csv(StringIO(data), converters={"ID": conv})
+
+
+@skip_pyarrow
+@pytest.mark.parametrize(
+    "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min]
+)
+def test_int64_uint64_range(all_parsers, val):
+    # These numbers fall right inside the int64-uint64
+    # range, so they should be parsed as integers.
+    parser = all_parsers
+    result = parser.read_csv(StringIO(str(val)), header=None)
+
+    expected = DataFrame([val])
+    tm.assert_frame_equal(result, expected)
+
+
+@skip_pyarrow
+@pytest.mark.parametrize(
+    "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
+)
+def test_outside_int64_uint64_range(all_parsers, val):
+    # These numbers fall just outside the int64-uint64
+    # range, so they should be parsed as string.
+    parser = all_parsers
+    result = parser.read_csv(StringIO(str(val)), header=None)
+
+    expected = DataFrame([str(val)])
+    tm.assert_frame_equal(result, expected)
+
+
+@skip_pyarrow
+@pytest.mark.parametrize("exp_data", [[str(-1), str(2 ** 63)], [str(2 ** 63), str(-1)]])
+def test_numeric_range_too_wide(all_parsers, exp_data):
+    # No numerical dtype can hold both negative and uint64
+    # values, so they should be cast as string.
+ parser = all_parsers + data = "\n".join(exp_data) + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), header=None) + tm.assert_frame_equal(result, expected) + + +def test_integer_precision(all_parsers): + # Gh 7072 + s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765 +5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389""" + parser = all_parsers + result = parser.read_csv(StringIO(s), header=None)[4] + expected = Series([4321583677327450765, 4321113141090630389], name=4) + tm.assert_series_equal(result, expected) From 4a7dc0f99152dc9a78f07f16f6cb7d2e295da66e Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:19:19 -0500 Subject: [PATCH 85/95] TST/REF: io/parsers/test_common.py --- .../tests/io/parser/common/test_chunksize.py | 232 ++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_chunksize.py diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py new file mode 100644 index 0000000000000..f7db9a5546d62 --- /dev/null +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -0,0 +1,232 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +from io import StringIO + +import numpy as np +import pytest + +from pandas.errors import DtypeWarning + +from pandas import DataFrame, concat +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +@pytest.mark.parametrize("index_col", [0, "index"]) +def test_read_chunksize_with_index(all_parsers, index_col): + parser = all_parsers + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + expected = DataFrame( + [ + ["foo", 2, 3, 4, 5], + ["bar", 7, 8, 9, 10], + ["baz", 12, 13, 14, 15], + ["qux", 12, 13, 14, 15], + ["foo2", 12, 13, 14, 15], + ["bar2", 12, 13, 14, 15], + ], + columns=["index", "A", "B", "C", "D"], + ) + expected = expected.set_index("index") + + with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader: + chunks = list(reader) + tm.assert_frame_equal(chunks[0], expected[:2]) + tm.assert_frame_equal(chunks[1], expected[2:4]) + tm.assert_frame_equal(chunks[2], expected[4:]) + + +@skip_pyarrow +@pytest.mark.parametrize("chunksize", [1.3, "foo", 0]) +def test_read_chunksize_bad(all_parsers, chunksize): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + msg = r"'chunksize' must be an integer >=1" + + with pytest.raises(ValueError, match=msg): + with parser.read_csv(StringIO(data), chunksize=chunksize) as _: + pass + + +@skip_pyarrow +@pytest.mark.parametrize("chunksize", [2, 8]) +def test_read_chunksize_and_nrows(all_parsers, chunksize): + # see gh-15755 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = {"index_col": 0, "nrows": 5} + + expected = parser.read_csv(StringIO(data), **kwargs) + with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader: + 
tm.assert_frame_equal(concat(reader), expected) + + +@skip_pyarrow +def test_read_chunksize_and_nrows_changing_size(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = {"index_col": 0, "nrows": 5} + + expected = parser.read_csv(StringIO(data), **kwargs) + with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader: + tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2]) + tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5]) + + with pytest.raises(StopIteration, match=""): + reader.get_chunk(size=3) + + +@skip_pyarrow +def test_get_chunk_passed_chunksize(all_parsers): + parser = all_parsers + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +1,2,3""" + + with parser.read_csv(StringIO(data), chunksize=2) as reader: + result = reader.get_chunk() + + expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}]) +def test_read_chunksize_compat(all_parsers, kwargs): + # see gh-12185 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader: + tm.assert_frame_equal(concat(reader), result) + + +@skip_pyarrow +def test_read_chunksize_jagged_names(all_parsers): + # see gh-23509 + parser = all_parsers + data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) + + expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10]) + with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader: + result = concat(reader) + tm.assert_frame_equal(result, expected) + + +def test_chunk_begins_with_newline_whitespace(all_parsers): + # see gh-10022 + parser = all_parsers + data = "\n 
hello\nworld\n" + + result = parser.read_csv(StringIO(data), header=None) + expected = DataFrame([" hello", "world"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail(reason="GH38630, sometimes gives ResourceWarning", strict=False) +def test_chunks_have_consistent_numerical_type(all_parsers): + parser = all_parsers + integers = [str(i) for i in range(499999)] + data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers) + + # Coercions should work without warnings. + with tm.assert_produces_warning(None): + result = parser.read_csv(StringIO(data)) + + assert type(result.a[0]) is np.float64 + assert result.a.dtype == float + + +def test_warn_if_chunks_have_mismatched_type(all_parsers): + warning_type = None + parser = all_parsers + integers = [str(i) for i in range(499999)] + data = "a\n" + "\n".join(integers + ["a", "b"] + integers) + + # see gh-3866: if chunks are different types and can't + # be coerced using numerical types, then issue warning. + if parser.engine == "c" and parser.low_memory: + warning_type = DtypeWarning + + with tm.assert_produces_warning(warning_type): + df = parser.read_csv(StringIO(data)) + assert df.a.dtype == object + + +@skip_pyarrow +@pytest.mark.parametrize("iterator", [True, False]) +def test_empty_with_nrows_chunksize(all_parsers, iterator): + # see gh-9535 + parser = all_parsers + expected = DataFrame(columns=["foo", "bar"]) + + nrows = 10 + data = StringIO("foo,bar\n") + + if iterator: + with parser.read_csv(data, chunksize=nrows) as reader: + result = next(iter(reader)) + else: + result = parser.read_csv(data, nrows=nrows) + + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_read_csv_memory_growth_chunksize(all_parsers): + # see gh-24805 + # + # Let's just make sure that we don't crash + # as we iteratively process all chunks. 
+ parser = all_parsers + + with tm.ensure_clean() as path: + with open(path, "w") as f: + for i in range(1000): + f.write(str(i) + "\n") + + with parser.read_csv(path, chunksize=20) as result: + for _ in result: + pass From 3b24fe74ea5e851ad8be2c6c96fa70e70c3e88b5 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:22:44 -0500 Subject: [PATCH 86/95] TST/REF: io/parsers/test_common.py --- pandas/tests/io/parser/common/test_decimal.py | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_decimal.py diff --git a/pandas/tests/io/parser/common/test_decimal.py b/pandas/tests/io/parser/common/test_decimal.py new file mode 100644 index 0000000000000..21eadc51d25b6 --- /dev/null +++ b/pandas/tests/io/parser/common/test_decimal.py @@ -0,0 +1,64 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. +""" +from io import StringIO + +import pytest + +from pandas import DataFrame +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +@pytest.mark.parametrize( + "data,thousands,decimal", + [ + ( + """A|B|C +1|2,334.01|5 +10|13|10. 
+""", + ",", + ".", + ), + ( + """A|B|C +1|2.334,01|5 +10|13|10, +""", + ".", + ",", + ), + ], +) +def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): + parser = all_parsers + expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) + + result = parser.read_csv( + StringIO(data), sep="|", thousands=thousands, decimal=decimal + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_euro_decimal_format(all_parsers): + parser = all_parsers + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,738797819 +2;121,12;14897,76;DEF;uyt;0,377320872 +3;878,158;108013,434;GHI;rez;2,735694704""" + + result = parser.read_csv(StringIO(data), sep=";", decimal=",") + expected = DataFrame( + [ + [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819], + [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872], + [3, 878.158, 108013.434, "GHI", "rez", 2.735694704], + ], + columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"], + ) + tm.assert_frame_equal(result, expected) From c33bf46783e24baee51a89961d0249c520329911 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:24:39 -0500 Subject: [PATCH 87/95] TST/REF: io/parsers/test_common.py --- .../tests/io/parser/common/test_iterator.py | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_iterator.py diff --git a/pandas/tests/io/parser/common/test_iterator.py b/pandas/tests/io/parser/common/test_iterator.py new file mode 100644 index 0000000000000..f19ae55ecb8ac --- /dev/null +++ b/pandas/tests/io/parser/common/test_iterator.py @@ -0,0 +1,110 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +from io import StringIO + +import pytest + +from pandas import DataFrame, Series, concat +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +def test_iterator(all_parsers): + # see gh-6607 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = {"index_col": 0} + + expected = parser.read_csv(StringIO(data), **kwargs) + with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader: + + first_chunk = reader.read(3) + tm.assert_frame_equal(first_chunk, expected[:3]) + + last_chunk = reader.read(5) + tm.assert_frame_equal(last_chunk, expected[3:]) + + +@skip_pyarrow +def test_iterator2(all_parsers): + parser = all_parsers + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + + with parser.read_csv(StringIO(data), iterator=True) as reader: + result = list(reader) + + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(result[0], expected) + + +@skip_pyarrow +def test_iterator_stop_on_chunksize(all_parsers): + # gh-3967: stopping iteration when chunksize is specified + parser = all_parsers + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + + with parser.read_csv(StringIO(data), chunksize=1) as reader: + result = list(reader) + + assert len(result) == 3 + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(concat(result), expected) + + +@skip_pyarrow +@pytest.mark.parametrize( + "kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}] +) +def test_iterator_skipfooter_errors(all_parsers, kwargs): + msg = "'skipfooter' not supported for iteration" + parser = all_parsers + data = "a\n1\n2" + + with pytest.raises(ValueError, match=msg): + with parser.read_csv(StringIO(data), 
skipfooter=1, **kwargs) as _: + pass + + +def test_iteration_open_handle(all_parsers): + parser = all_parsers + kwargs = {"squeeze": True, "header": None} + + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG") + + with open(path) as f: + for line in f: + if "CCC" in line: + break + + result = parser.read_csv(f, **kwargs) + expected = Series(["DDD", "EEE", "FFF", "GGG"], name=0) + tm.assert_series_equal(result, expected) From dc9530baa77b9a5e6d5e30f6f366d4794b271824 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:27:14 -0500 Subject: [PATCH 88/95] TST/REF: io/parsers/test_common.py --- pandas/tests/io/parser/common/test_index.py | 292 ++++++++++++++++++++ 1 file changed, 292 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_index.py diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py new file mode 100644 index 0000000000000..fd999fcdabac3 --- /dev/null +++ b/pandas/tests/io/parser/common/test_index.py @@ -0,0 +1,292 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +from datetime import datetime +from io import StringIO +import os + +import pytest + +from pandas import DataFrame, Index, MultiIndex +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + """foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""", + {"index_col": 0, "names": ["index", "A", "B", "C", "D"]}, + DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"), + columns=["A", "B", "C", "D"], + ), + ), + ( + """foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""", + {"index_col": [0, 1], "names": ["index1", "index2", "A", "B", "C", "D"]}, + DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + index=MultiIndex.from_tuples( + [ + ("foo", "one"), + ("foo", "two"), + ("foo", "three"), + ("bar", "one"), + ("bar", "two"), + ], + names=["index1", "index2"], + ), + columns=["A", "B", "C", "D"], + ), + ), + ], +) +def test_pass_names_with_index(all_parsers, data, kwargs, expected): + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) +def test_multi_index_no_level_names(all_parsers, index_col): + data = """index1,index2,A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + headless_data = "\n".join(data.split("\n")[1:]) + + names = ["A", "B", "C", "D"] + parser = all_parsers + + result = parser.read_csv( + StringIO(headless_data), index_col=index_col, header=None, names=names + ) + expected = parser.read_csv(StringIO(data), index_col=index_col) + + # No index names 
in headless data. + expected.index.names = [None] * 2 + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_multi_index_no_level_names_implicit(all_parsers): + parser = all_parsers + data = """A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=MultiIndex.from_tuples( + [ + ("foo", "one"), + ("foo", "two"), + ("foo", "three"), + ("bar", "one"), + ("bar", "two"), + ] + ), + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize( + "data,expected,header", + [ + ("a,b", DataFrame(columns=["a", "b"]), [0]), + ( + "a,b\nc,d", + DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])), + [0, 1], + ), + ], +) +@pytest.mark.parametrize("round_trip", [True, False]) +def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): + # see gh-14545 + parser = all_parsers + data = expected.to_csv(index=False) if round_trip else data + + result = parser.read_csv(StringIO(data), header=header) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_no_unnamed_index(all_parsers): + parser = all_parsers + data = """ id c0 c1 c2 +0 1 0 a b +1 2 0 c d +2 2 2 e f +""" + result = parser.read_csv(StringIO(data), sep=" ") + expected = DataFrame( + [[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]], + columns=["Unnamed: 0", "id", "c0", "c1", "c2"], + ) + tm.assert_frame_equal(result, expected) + + +def test_read_duplicate_index_explicit(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=0) + + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + 
[12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"), + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_read_duplicate_index_implicit(all_parsers): + data = """A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]), + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_read_csv_no_index_name(all_parsers, csv_dir_path): + parser = all_parsers + csv2 = os.path.join(csv_dir_path, "test2.csv") + result = parser.read_csv(csv2, index_col=0, parse_dates=True) + + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738, "foo"], + [1.047916, -0.041232, -0.16181208307, 0.212549, "bar"], + [0.498581, 0.731168, -0.537677223318, 1.346270, "baz"], + [1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"], + [-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"], + ], + columns=["A", "B", "C", "D", "E"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + ] + ), + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_empty_with_index(all_parsers): + # see gh-10184 + data = "x,y" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=0) + + expected = DataFrame(columns=["y"], index=Index([], name="x")) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_empty_with_multi_index(all_parsers): + # see gh-10467 + data = "x,y,z" + parser = all_parsers + result = parser.read_csv(StringIO(data), 
index_col=["x", "y"]) + + expected = DataFrame( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"]) + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_empty_with_reversed_multi_index(all_parsers): + data = "x,y,z" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=[1, 0]) + + expected = DataFrame( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"]) + ) + tm.assert_frame_equal(result, expected) From d83b2e0bbf341f79d741b41aa6f7853c3bb78f8d Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:28:54 -0500 Subject: [PATCH 89/95] TST/REF: io/parsers/test_common.py --- .../tests/io/parser/common/test_data_list.py | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_data_list.py diff --git a/pandas/tests/io/parser/common/test_data_list.py b/pandas/tests/io/parser/common/test_data_list.py new file mode 100644 index 0000000000000..d67f728ad87e5 --- /dev/null +++ b/pandas/tests/io/parser/common/test_data_list.py @@ -0,0 +1,87 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +import csv +from io import StringIO + +import pytest + +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.parsers import TextParser + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +def test_read_data_list(all_parsers): + parser = all_parsers + kwargs = {"index_col": 0} + data = "A,B,C\nfoo,1,2,3\nbar,4,5,6" + + data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]] + expected = parser.read_csv(StringIO(data), **kwargs) + + with TextParser(data_list, chunksize=2, **kwargs) as parser: + result = parser.read() + + tm.assert_frame_equal(result, expected) + + +def test_reader_list(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = {"index_col": 0} + + lines = list(csv.reader(StringIO(data))) + with TextParser(lines, chunksize=2, **kwargs) as reader: + chunks = list(reader) + + expected = parser.read_csv(StringIO(data), **kwargs) + + tm.assert_frame_equal(chunks[0], expected[:2]) + tm.assert_frame_equal(chunks[1], expected[2:4]) + tm.assert_frame_equal(chunks[2], expected[4:]) + + +def test_reader_list_skiprows(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = {"index_col": 0} + + lines = list(csv.reader(StringIO(data))) + with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader: + chunks = list(reader) + + expected = parser.read_csv(StringIO(data), **kwargs) + + tm.assert_frame_equal(chunks[0], expected[1:3]) + + +def test_read_csv_parse_simple_list(all_parsers): + parser = all_parsers + data = """foo +bar baz +qux foo +foo +bar""" + + result = parser.read_csv(StringIO(data), header=None) + expected = DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"]) + tm.assert_frame_equal(result, expected) From 
6205bedf0e4a4d9397ae781244043e58f87edfd9 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:30:45 -0500 Subject: [PATCH 90/95] TST/REF: io/parsers/test_common.py --- pandas/tests/io/parser/common/test_float.py | 69 +++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_float.py diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py new file mode 100644 index 0000000000000..c9dcc5189de06 --- /dev/null +++ b/pandas/tests/io/parser/common/test_float.py @@ -0,0 +1,69 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas.compat import is_platform_linux + +from pandas import DataFrame +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +def test_float_parser(all_parsers): + # see gh-9565 + parser = all_parsers + data = "45e-1,4.5,45.,inf,-inf" + result = parser.read_csv(StringIO(data), header=None) + + expected = DataFrame([[float(s) for s in data.split(",")]]) + tm.assert_frame_equal(result, expected) + + +def test_scientific_no_exponent(all_parsers_all_precisions): + # see gh-12215 + df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) + data = df.to_csv(index=False) + parser, precision = all_parsers_all_precisions + if parser == "pyarrow": + pytest.skip() + + df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) + tm.assert_frame_equal(df_roundtrip, df) + + +@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) +def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): + # GH#38753 + parser, precision = all_parsers_all_precisions + if parser == "pyarrow": + pytest.skip() + data = f"data\n10E{neg_exp}" + result = parser.read_csv(StringIO(data), 
float_precision=precision) + expected = DataFrame({"data": [0.0]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) +def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): + # GH#38753 + parser, precision = all_parsers_all_precisions + data = f"data\n10E{exp}" + result = parser.read_csv(StringIO(data), float_precision=precision) + if precision == "round_trip": + if exp == 999999999999999999 and is_platform_linux(): + mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") + request.node.add_marker(mark) + + value = np.inf if exp > 0 else 0.0 + expected = DataFrame({"data": [value]}) + else: + expected = DataFrame({"data": [f"10E{exp}"]}) + + tm.assert_frame_equal(result, expected) From c4b3bb72fa34c80012fe05089625cabef2b433a0 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:32:15 -0500 Subject: [PATCH 91/95] TST/REF: io/parsers/test_common.py --- pandas/tests/io/parser/common/test_inf.py | 65 +++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_inf.py diff --git a/pandas/tests/io/parser/common/test_inf.py b/pandas/tests/io/parser/common/test_inf.py new file mode 100644 index 0000000000000..9bc93171f9307 --- /dev/null +++ b/pandas/tests/io/parser/common/test_inf.py @@ -0,0 +1,65 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +from io import StringIO + +import numpy as np +import pytest + +from pandas import DataFrame, option_context +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +@pytest.mark.parametrize("na_filter", [True, False]) +def test_inf_parsing(all_parsers, na_filter): + parser = all_parsers + data = """\ +,A +a,inf +b,-inf +c,+Inf +d,-Inf +e,INF +f,-INF +g,+INf +h,-INf +i,inF +j,-inF""" + expected = DataFrame( + {"A": [float("inf"), float("-inf")] * 5}, + index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], + ) + result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("na_filter", [True, False]) +def test_infinity_parsing(all_parsers, na_filter): + parser = all_parsers + data = """\ +,A +a,Infinity +b,-Infinity +c,+Infinity +""" + expected = DataFrame( + {"A": [float("infinity"), float("-infinity"), float("+infinity")]}, + index=["a", "b", "c"], + ) + result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_with_use_inf_as_na(all_parsers): + # https://github.com/pandas-dev/pandas/issues/35493 + parser = all_parsers + data = "1.0\nNaN\n3.0" + with option_context("use_inf_as_na", True): + result = parser.read_csv(StringIO(data), header=None) + expected = DataFrame([1.0, np.nan, 3.0]) + tm.assert_frame_equal(result, expected) From a77b33eb6afd4f9e4c4fdc272ffe59869fe2c873 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:34:53 -0500 Subject: [PATCH 92/95] TST/REF: io/parsers/test_common.py --- pandas/tests/io/parser/common/test_verbose.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_verbose.py diff --git a/pandas/tests/io/parser/common/test_verbose.py b/pandas/tests/io/parser/common/test_verbose.py new file mode 100644 index 
0000000000000..e085d230d1acd --- /dev/null +++ b/pandas/tests/io/parser/common/test_verbose.py @@ -0,0 +1,57 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. +""" +from io import StringIO + +import pytest + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +def test_verbose_read(all_parsers, capsys): + parser = all_parsers + data = """a,b,c,d +one,1,2,3 +one,1,2,3 +,1,2,3 +one,1,2,3 +,1,2,3 +,1,2,3 +one,1,2,3 +two,1,2,3""" + + # Engines are verbose in different ways. + parser.read_csv(StringIO(data), verbose=True) + captured = capsys.readouterr() + + if parser.engine == "c": + assert "Tokenization took:" in captured.out + assert "Parser memory cleanup took:" in captured.out + else: # Python engine + assert captured.out == "Filled 3 NA values in column a\n" + + +@skip_pyarrow +def test_verbose_read2(all_parsers, capsys): + parser = all_parsers + data = """a,b,c,d +one,1,2,3 +two,1,2,3 +three,1,2,3 +four,1,2,3 +five,1,2,3 +,1,2,3 +seven,1,2,3 +eight,1,2,3""" + + parser.read_csv(StringIO(data), verbose=True, index_col=0) + captured = capsys.readouterr() + + # Engines are verbose in different ways. 
+ if parser.engine == "c": + assert "Tokenization took:" in captured.out + assert "Parser memory cleanup took:" in captured.out + else: # Python engine + assert captured.out == "Filled 1 NA values in column a\n" From 04c8d218a86880ca90ecee8b16158e865a8750ca Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:40:03 -0500 Subject: [PATCH 93/95] TST/REF: io/parsers/test_common.py --- .../io/parser/common/test_file_buffer_url.py | 452 ++++++++++++++++++ 1 file changed, 452 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_file_buffer_url.py diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py new file mode 100644 index 0000000000000..0a5bc4a135b9e --- /dev/null +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -0,0 +1,452 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. +""" +from io import BytesIO, StringIO +import os +import platform +from urllib.error import URLError + +import pytest + +from pandas.errors import EmptyDataError, ParserError +import pandas.util._test_decorators as td + +from pandas import DataFrame +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +@tm.network +def test_url(all_parsers, csv_dir_path): + # TODO: FTP testing + parser = all_parsers + kwargs = {"sep": "\t"} + + url = ( + "https://raw.github.com/pandas-dev/pandas/master/" + "pandas/tests/io/parser/data/salaries.csv" + ) + url_result = parser.read_csv(url, **kwargs) + + local_path = os.path.join(csv_dir_path, "salaries.csv") + local_result = parser.read_csv(local_path, **kwargs) + tm.assert_frame_equal(url_result, local_result) + + +@skip_pyarrow +@pytest.mark.slow +def test_local_file(all_parsers, csv_dir_path): + parser = all_parsers + kwargs = {"sep": "\t"} + + local_path = os.path.join(csv_dir_path, "salaries.csv") + local_result = 
parser.read_csv(local_path, **kwargs) + url = "file://localhost/" + local_path + + try: + url_result = parser.read_csv(url, **kwargs) + tm.assert_frame_equal(url_result, local_result) + except URLError: + # Fails on some systems. + pytest.skip("Failing on: " + " ".join(platform.uname())) + + +@skip_pyarrow +def test_path_path_lib(all_parsers): + parser = all_parsers + df = tm.makeDataFrame() + result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) + tm.assert_frame_equal(df, result) + + +@skip_pyarrow +def test_path_local_path(all_parsers): + parser = all_parsers + df = tm.makeDataFrame() + result = tm.round_trip_localpath( + df.to_csv, lambda p: parser.read_csv(p, index_col=0) + ) + tm.assert_frame_equal(df, result) + + +@skip_pyarrow +def test_nonexistent_path(all_parsers): + # gh-2428: pls no segfault + # gh-14086: raise more helpful FileNotFoundError + # GH#29233 "File foo" instead of "File b'foo'" + parser = all_parsers + path = f"{tm.rands(10)}.csv" + + msg = r"\[Errno 2\]" + with pytest.raises(FileNotFoundError, match=msg) as e: + parser.read_csv(path) + assert path == e.value.filename + + +@skip_pyarrow +@td.skip_if_windows # os.chmod does not work in windows +def test_no_permission(all_parsers): + # GH 23784 + parser = all_parsers + + msg = r"\[Errno 13\]" + with tm.ensure_clean() as path: + os.chmod(path, 0) # make file unreadable + + # verify that this process cannot open the file (not running as sudo) + try: + with open(path): + pass + pytest.skip("Running as sudo.") + except PermissionError: + pass + + with pytest.raises(PermissionError, match=msg) as e: + parser.read_csv(path) + assert path == e.value.filename + + +@skip_pyarrow +@pytest.mark.parametrize( + "data,kwargs,expected,msg", + [ + # gh-10728: WHITESPACE_LINE + ( + "a,b,c\n4,5,6\n ", + {}, + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # gh-10548: EAT_LINE_COMMENT + ( + "a,b,c\n4,5,6\n#comment", + {"comment": "#"}, + DataFrame([[4, 5, 6]], 
columns=["a", "b", "c"]), + None, + ), + # EAT_CRNL_NOP + ( + "a,b,c\n4,5,6\n\r", + {}, + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # EAT_COMMENT + ( + "a,b,c\n4,5,6#comment", + {"comment": "#"}, + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # SKIP_LINE + ( + "a,b,c\n4,5,6\nskipme", + {"skiprows": [2]}, + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # EAT_LINE_COMMENT + ( + "a,b,c\n4,5,6\n#comment", + {"comment": "#", "skip_blank_lines": False}, + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # IN_FIELD + ( + "a,b,c\n4,5,6\n ", + {"skip_blank_lines": False}, + DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]), + None, + ), + # EAT_CRNL + ( + "a,b,c\n4,5,6\n\r", + {"skip_blank_lines": False}, + DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]), + None, + ), + # ESCAPED_CHAR + ( + "a,b,c\n4,5,6\n\\", + {"escapechar": "\\"}, + None, + "(EOF following escape character)|(unexpected end of data)", + ), + # ESCAPE_IN_QUOTED_FIELD + ( + 'a,b,c\n4,5,6\n"\\', + {"escapechar": "\\"}, + None, + "(EOF inside string starting at row 2)|(unexpected end of data)", + ), + # IN_QUOTED_FIELD + ( + 'a,b,c\n4,5,6\n"', + {"escapechar": "\\"}, + None, + "(EOF inside string starting at row 2)|(unexpected end of data)", + ), + ], + ids=[ + "whitespace-line", + "eat-line-comment", + "eat-crnl-nop", + "eat-comment", + "skip-line", + "eat-line-comment", + "in-field", + "eat-crnl", + "escaped-char", + "escape-in-quoted-field", + "in-quoted-field", + ], +) +def test_eof_states(all_parsers, data, kwargs, expected, msg): + # see gh-10728, gh-10548 + parser = all_parsers + + if expected is None: + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_temporary_file(all_parsers): + # see gh-13398 + parser = all_parsers + 
data = "0 0" + + with tm.ensure_clean(mode="w+", return_filelike=True) as new_file: + new_file.write(data) + new_file.flush() + new_file.seek(0) + + result = parser.read_csv(new_file, sep=r"\s+", header=None) + + expected = DataFrame([[0, 0]]) + tm.assert_frame_equal(result, expected) + + +def test_internal_eof_byte(all_parsers): + # see gh-5500 + parser = all_parsers + data = "a,b\n1\x1a,2" + + expected = DataFrame([["1\x1a", 2]], columns=["a", "b"]) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +def test_internal_eof_byte_to_file(all_parsers): + # see gh-16559 + parser = all_parsers + data = b'c1,c2\r\n"test \x1a test", test\r\n' + expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"]) + path = f"__{tm.rands(10)}__.csv" + + with tm.ensure_clean(path) as path: + with open(path, "wb") as f: + f.write(data) + + result = parser.read_csv(path) + tm.assert_frame_equal(result, expected) + + +def test_file_handle_string_io(all_parsers): + # gh-14418 + # + # Don't close user provided file handles. + parser = all_parsers + data = "a,b\n1,2" + + fh = StringIO(data) + parser.read_csv(fh) + assert not fh.closed + + +def test_file_handles_with_open(all_parsers, csv1): + # gh-14418 + # + # Don't close user provided file handles. 
+ parser = all_parsers + + for mode in ["r", "rb"]: + with open(csv1, mode) as f: + parser.read_csv(f) + assert not f.closed + + +@skip_pyarrow +def test_invalid_file_buffer_class(all_parsers): + # see gh-15337 + class InvalidBuffer: + pass + + parser = all_parsers + msg = "Invalid file path or buffer object type" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(InvalidBuffer()) + + +@skip_pyarrow +def test_invalid_file_buffer_mock(all_parsers): + # see gh-15337 + parser = all_parsers + msg = "Invalid file path or buffer object type" + + class Foo: + pass + + with pytest.raises(ValueError, match=msg): + parser.read_csv(Foo()) + + +def test_valid_file_buffer_seems_invalid(all_parsers): + # gh-16135: we want to ensure that "tell" and "seek" + # aren't actually being used when we call `read_csv` + # + # Thus, while the object may look "invalid" (these + # methods are attributes of the `StringIO` class), + # it is still a valid file-object for our purposes. + class NoSeekTellBuffer(StringIO): + def tell(self): + raise AttributeError("No tell method") + + def seek(self, pos, whence=0): + raise AttributeError("No seek method") + + data = "a\n1" + parser = all_parsers + expected = DataFrame({"a": [1]}) + + result = parser.read_csv(NoSeekTellBuffer(data)) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("io_class", [StringIO, BytesIO]) +@pytest.mark.parametrize("encoding", [None, "utf-8"]) +def test_read_csv_file_handle(all_parsers, io_class, encoding): + """ + Test whether read_csv does not close user-provided file handles. 
+ + GH 36980 + """ + parser = all_parsers + expected = DataFrame({"a": [1], "b": [2]}) + + content = "a,b\n1,2" + if io_class == BytesIO: + content = content.encode("utf-8") + handle = io_class(content) + + tm.assert_frame_equal(parser.read_csv(handle, encoding=encoding), expected) + assert not handle.closed + + +@skip_pyarrow +def test_memory_map_file_handle_silent_fallback(all_parsers, compression): + """ + Do not fail for buffers with memory_map=True (cannot memory map BytesIO). + + GH 37621 + """ + parser = all_parsers + expected = DataFrame({"a": [1], "b": [2]}) + + handle = BytesIO() + expected.to_csv(handle, index=False, compression=compression, mode="wb") + handle.seek(0) + + tm.assert_frame_equal( + parser.read_csv(handle, memory_map=True, compression=compression), + expected, + ) + + +@skip_pyarrow +def test_memory_map_compression(all_parsers, compression): + """ + Support memory map for compressed files. + + GH 37621 + """ + parser = all_parsers + expected = DataFrame({"a": [1], "b": [2]}) + + with tm.ensure_clean() as path: + expected.to_csv(path, index=False, compression=compression) + + tm.assert_frame_equal( + parser.read_csv(path, memory_map=True, compression=compression), + expected, + ) + + +@skip_pyarrow +def test_context_manager(all_parsers, datapath): + # make sure that opened files are closed + parser = all_parsers + + path = datapath("io", "data", "csv", "iris.csv") + + reader = parser.read_csv(path, chunksize=1) + assert not reader._engine.handles.handle.closed + try: + with reader: + next(reader) + assert False + except AssertionError: + assert reader._engine.handles.handle.closed + + +@skip_pyarrow +def test_context_manageri_user_provided(all_parsers, datapath): + # make sure that user-provided handles are not closed + parser = all_parsers + + with open(datapath("io", "data", "csv", "iris.csv"), mode="r") as path: + + reader = parser.read_csv(path, chunksize=1) + assert not reader._engine.handles.handle.closed + try: + with reader: + 
next(reader) + assert False + except AssertionError: + assert not reader._engine.handles.handle.closed + + +@skip_pyarrow +def test_file_descriptor_leak(all_parsers): + # GH 31488 + + parser = all_parsers + with tm.ensure_clean() as path: + + def test(): + with pytest.raises(EmptyDataError, match="No columns to parse from file"): + parser.read_csv(path) + + td.check_file_leaks(test)() + + +@skip_pyarrow +@td.check_file_leaks +def test_memory_map(all_parsers, csv_dir_path): + mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") + parser = all_parsers + + expected = DataFrame( + {"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]} + ) + + result = parser.read_csv(mmap_file, memory_map=True) + tm.assert_frame_equal(result, expected) From 8bb69591dfc3ce64f8bc5e2ad95d8be081e3fcb1 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:42:46 -0500 Subject: [PATCH 94/95] TST/REF: io/parsers/test_common.py --- .../io/parser/common/test_read_errors.py | 223 ++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_read_errors.py diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py new file mode 100644 index 0000000000000..f68239bf5d48e --- /dev/null +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -0,0 +1,223 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. +""" +import codecs +from io import StringIO +import os + +import numpy as np +import pytest + +from pandas.errors import EmptyDataError, ParserError + +from pandas import DataFrame +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +def test_empty_decimal_marker(all_parsers): + data = """A|B|C +1|2,334|5 +10|13|10. 
+""" + # Parsers support only length-1 decimals + msg = "Only length-1 decimal markers supported" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), decimal="") + + +@skip_pyarrow +def test_bad_stream_exception(all_parsers, csv_dir_path): + # see gh-13652 + # + # This test validates that both the Python engine and C engine will + # raise UnicodeDecodeError instead of C engine raising ParserError + # and swallowing the exception that caused read to fail. + path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv") + codec = codecs.lookup("utf-8") + utf8 = codecs.lookup("utf-8") + parser = all_parsers + msg = "'utf-8' codec can't decode byte" + + # Stream must be binary UTF8. + with open(path, "rb") as handle, codecs.StreamRecoder( + handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter + ) as stream: + + with pytest.raises(UnicodeDecodeError, match=msg): + parser.read_csv(stream) + + +@skip_pyarrow +def test_malformed(all_parsers): + # see gh-6607 + parser = all_parsers + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +""" + msg = "Expected 3 fields in line 4, saw 5" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), header=1, comment="#") + + +@skip_pyarrow +@pytest.mark.parametrize("nrows", [5, 3, None]) +def test_malformed_chunks(all_parsers, nrows): + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + parser = all_parsers + msg = "Expected 3 fields in line 6, saw 5" + with parser.read_csv( + StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2] + ) as reader: + with pytest.raises(ParserError, match=msg): + reader.read(nrows) + + +@skip_pyarrow +def test_catch_too_many_names(all_parsers): + # see gh-5156 + data = """\ +1,2,3 +4,,6 +7,8,9 +10,11,12\n""" + parser = all_parsers + msg = ( + "Too many columns specified: expected 4 and found 3" + if parser.engine == "c" + else "Number of passed names did 
not match " + "number of header fields in the file" + ) + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) + + +@skip_pyarrow +@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) +def test_raise_on_no_columns(all_parsers, nrows): + parser = all_parsers + data = "\n" * nrows + + msg = "No columns to parse from file" + with pytest.raises(EmptyDataError, match=msg): + parser.read_csv(StringIO(data)) + + +def test_read_csv_raises_on_header_prefix(all_parsers): + # gh-27394 + parser = all_parsers + msg = "Argument prefix must be None if argument header is not None" + + s = StringIO("0,1\n2,3") + + with pytest.raises(ValueError, match=msg): + parser.read_csv(s, header=0, prefix="_X") + + +def test_unexpected_keyword_parameter_exception(all_parsers): + # GH-34976 + parser = all_parsers + + msg = "{}\\(\\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg.format("read_csv")): + parser.read_csv("foo.csv", foo=1) + with pytest.raises(TypeError, match=msg.format("read_table")): + parser.read_table("foo.tsv", foo=1) + + +@skip_pyarrow +def test_suppress_error_output(all_parsers, capsys): + # see gh-15925 + parser = all_parsers + data = "a\n1\n1,2,3\n4\n5,6,7" + expected = DataFrame({"a": [1, 4]}) + + result = parser.read_csv( + StringIO(data), error_bad_lines=False, warn_bad_lines=False + ) + tm.assert_frame_equal(result, expected) + + captured = capsys.readouterr() + assert captured.err == "" + + +@skip_pyarrow +@pytest.mark.parametrize( + "kwargs", + [{}, {"error_bad_lines": True}], # Default is True. # Explicitly pass in. 
+) +@pytest.mark.parametrize( + "warn_kwargs", [{}, {"warn_bad_lines": True}, {"warn_bad_lines": False}] +) +def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): + # see gh-15925 + parser = all_parsers + kwargs.update(**warn_kwargs) + data = "a\n1\n1,2,3\n4\n5,6,7" + + msg = "Expected 1 fields in line 3, saw 3" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + + +@skip_pyarrow +def test_warn_bad_lines(all_parsers, capsys): + # see gh-15925 + parser = all_parsers + data = "a\n1\n1,2,3\n4\n5,6,7" + expected = DataFrame({"a": [1, 4]}) + + result = parser.read_csv(StringIO(data), error_bad_lines=False, warn_bad_lines=True) + tm.assert_frame_equal(result, expected) + + captured = capsys.readouterr() + assert "Skipping line 3" in captured.err + assert "Skipping line 5" in captured.err + + +@skip_pyarrow +def test_read_csv_wrong_num_columns(all_parsers): + # Too few columns. + data = """A,B,C,D,E,F +1,2,3,4,5,6 +6,7,8,9,10,11,12 +11,12,13,14,15,16 +""" + parser = all_parsers + msg = "Expected 6 fields in line 3, saw 7" + + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data)) + + +@skip_pyarrow +def test_null_byte_char(all_parsers): + # see gh-2741 + data = "\x00,foo" + names = ["a", "b"] + parser = all_parsers + + if parser.engine == "c": + expected = DataFrame([[np.nan, "foo"]], columns=names) + out = parser.read_csv(StringIO(data), names=names) + tm.assert_frame_equal(out, expected) + else: + msg = "NULL byte detected" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), names=names) From d9478d6487167aee6033fa2b8918906b8ab47a70 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:43:57 -0500 Subject: [PATCH 95/95] TST/REF: remove test_common.py --- pandas/tests/io/parser/test_common.py | 2466 ------------------------- 1 file changed, 2466 deletions(-) delete mode 100644 pandas/tests/io/parser/test_common.py diff --git 
a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py deleted file mode 100644 index f06d1476c515a..0000000000000 --- a/pandas/tests/io/parser/test_common.py +++ /dev/null @@ -1,2466 +0,0 @@ -""" -Tests that work on both the Python and C engines but do not have a -specific classification into the other test modules. -""" -import codecs -import csv -from datetime import datetime -from inspect import signature -from io import BytesIO, StringIO -import os -import platform -from urllib.error import URLError - -import numpy as np -import pytest - -from pandas._libs.tslib import Timestamp -from pandas.compat import is_platform_linux -from pandas.errors import DtypeWarning, EmptyDataError, ParserError -import pandas.util._test_decorators as td - -from pandas import DataFrame, Index, MultiIndex, Series, compat, concat, option_context -import pandas._testing as tm - -from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser - -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") - - -def test_override_set_noconvert_columns(): - # see gh-17351 - # - # Usecols needs to be sorted in _set_noconvert_columns based - # on the test_usecols_with_parse_dates test from test_usecols.py - class MyTextFileReader(TextFileReader): - def __init__(self): - self._currow = 0 - self.squeeze = False - - class MyCParserWrapper(CParserWrapper): - def _set_noconvert_columns(self): - if self.usecols_dtype == "integer": - # self.usecols is a set, which is documented as unordered - # but in practice, a CPython set of integers is sorted. - # In other implementations this assumption does not hold. 
- # The following code simulates a different order, which - # before GH 17351 would cause the wrong columns to be - # converted via the parse_dates parameter - self.usecols = list(self.usecols) - self.usecols.reverse() - return CParserWrapper._set_noconvert_columns(self) - - data = """a,b,c,d,e -0,1,20140101,0900,4 -0,1,20140102,1000,4""" - - parse_dates = [[1, 2]] - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - - parser = MyTextFileReader() - parser.options = { - "usecols": [0, 2, 3], - "parse_dates": parse_dates, - "delimiter": ",", - } - parser.engine = "c" - parser._engine = MyCParserWrapper(StringIO(data), **parser.options) - - result = parser.read() - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_empty_decimal_marker(all_parsers): - data = """A|B|C -1|2,334|5 -10|13|10. -""" - # Parsers support only length-1 decimals - msg = "Only length-1 decimal markers supported" - parser = all_parsers - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), decimal="") - - -@skip_pyarrow -def test_bad_stream_exception(all_parsers, csv_dir_path): - # see gh-13652 - # - # This test validates that both the Python engine and C engine will - # raise UnicodeDecodeError instead of C engine raising ParserError - # and swallowing the exception that caused read to fail. - path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv") - codec = codecs.lookup("utf-8") - utf8 = codecs.lookup("utf-8") - parser = all_parsers - msg = "'utf-8' codec can't decode byte" - - # Stream must be binary UTF8. 
- with open(path, "rb") as handle, codecs.StreamRecoder( - handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter - ) as stream: - - with pytest.raises(UnicodeDecodeError, match=msg): - parser.read_csv(stream) - - -@skip_pyarrow -def test_read_csv_local(all_parsers, csv1): - prefix = "file:///" if compat.is_platform_windows() else "file://" - parser = all_parsers - - fname = prefix + str(os.path.abspath(csv1)) - result = parser.read_csv(fname, index_col=0, parse_dates=True) - - expected = DataFrame( - [ - [0.980269, 3.685731, -0.364216805298, -1.159738], - [1.047916, -0.041232, -0.16181208307, 0.212549], - [0.498581, 0.731168, -0.537677223318, 1.346270], - [1.120202, 1.567621, 0.00364077397681, 0.675253], - [-0.487094, 0.571455, -1.6116394093, 0.103469], - [0.836649, 0.246462, 0.588542635376, 1.062782], - [-0.157161, 1.340307, 1.1957779562, -1.097007], - ], - columns=["A", "B", "C", "D"], - index=Index( - [ - datetime(2000, 1, 3), - datetime(2000, 1, 4), - datetime(2000, 1, 5), - datetime(2000, 1, 6), - datetime(2000, 1, 7), - datetime(2000, 1, 10), - datetime(2000, 1, 11), - ], - name="index", - ), - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_1000_sep(all_parsers): - parser = all_parsers - data = """A|B|C -1|2,334|5 -10|13|10. -""" - expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]}) - - result = parser.read_csv(StringIO(data), sep="|", thousands=",") - tm.assert_frame_equal(result, expected) - - -def test_squeeze(all_parsers): - data = """\ -a,1 -b,2 -c,3 -""" - parser = all_parsers - index = Index(["a", "b", "c"], name=0) - expected = Series([1, 2, 3], name=1, index=index) - - result = parser.read_csv(StringIO(data), index_col=0, header=None, squeeze=True) - tm.assert_series_equal(result, expected) - - # see gh-8217 - # - # Series should not be a view. 
- assert not result._is_view - - -@skip_pyarrow -def test_malformed(all_parsers): - # see gh-6607 - parser = all_parsers - data = """ignore -A,B,C -1,2,3 # comment -1,2,3,4,5 -2,3,4 -""" - msg = "Expected 3 fields in line 4, saw 5" - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), header=1, comment="#") - - -@skip_pyarrow -@pytest.mark.parametrize("nrows", [5, 3, None]) -def test_malformed_chunks(all_parsers, nrows): - data = """ignore -A,B,C -skip -1,2,3 -3,5,10 # comment -1,2,3,4,5 -2,3,4 -""" - parser = all_parsers - msg = "Expected 3 fields in line 6, saw 5" - with parser.read_csv( - StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2] - ) as reader: - with pytest.raises(ParserError, match=msg): - reader.read(nrows) - - -@skip_pyarrow -def test_unnamed_columns(all_parsers): - data = """A,B,C,, -1,2,3,4,5 -6,7,8,9,10 -11,12,13,14,15 -""" - parser = all_parsers - expected = DataFrame( - [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], - dtype=np.int64, - columns=["A", "B", "C", "Unnamed: 3", "Unnamed: 4"], - ) - result = parser.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - -def test_csv_mixed_type(all_parsers): - data = """A,B,C -a,1,2 -b,3,4 -c,4,5 -""" - parser = all_parsers - expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}) - result = parser.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_read_csv_low_memory_no_rows_with_index(all_parsers): - # see gh-21141 - parser = all_parsers - - if not parser.low_memory: - pytest.skip("This is a low-memory specific test") - - data = """A,B,C -1,1,1,2 -2,2,3,4 -3,3,4,5 -""" - result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0) - expected = DataFrame(columns=["A", "B", "C"]) - tm.assert_frame_equal(result, expected) - - -def test_read_csv_dataframe(all_parsers, csv1): - parser = all_parsers - result = parser.read_csv(csv1, 
index_col=0, parse_dates=True) - - expected = DataFrame( - [ - [0.980269, 3.685731, -0.364216805298, -1.159738], - [1.047916, -0.041232, -0.16181208307, 0.212549], - [0.498581, 0.731168, -0.537677223318, 1.346270], - [1.120202, 1.567621, 0.00364077397681, 0.675253], - [-0.487094, 0.571455, -1.6116394093, 0.103469], - [0.836649, 0.246462, 0.588542635376, 1.062782], - [-0.157161, 1.340307, 1.1957779562, -1.097007], - ], - columns=["A", "B", "C", "D"], - index=Index( - [ - datetime(2000, 1, 3), - datetime(2000, 1, 4), - datetime(2000, 1, 5), - datetime(2000, 1, 6), - datetime(2000, 1, 7), - datetime(2000, 1, 10), - datetime(2000, 1, 11), - ], - name="index", - ), - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_read_csv_no_index_name(all_parsers, csv_dir_path): - parser = all_parsers - csv2 = os.path.join(csv_dir_path, "test2.csv") - result = parser.read_csv(csv2, index_col=0, parse_dates=True) - - expected = DataFrame( - [ - [0.980269, 3.685731, -0.364216805298, -1.159738, "foo"], - [1.047916, -0.041232, -0.16181208307, 0.212549, "bar"], - [0.498581, 0.731168, -0.537677223318, 1.346270, "baz"], - [1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"], - [-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"], - ], - columns=["A", "B", "C", "D", "E"], - index=Index( - [ - datetime(2000, 1, 3), - datetime(2000, 1, 4), - datetime(2000, 1, 5), - datetime(2000, 1, 6), - datetime(2000, 1, 7), - ] - ), - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_read_csv_wrong_num_columns(all_parsers): - # Too few columns. 
- data = """A,B,C,D,E,F -1,2,3,4,5,6 -6,7,8,9,10,11,12 -11,12,13,14,15,16 -""" - parser = all_parsers - msg = "Expected 6 fields in line 3, saw 7" - - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data)) - - -def test_read_duplicate_index_explicit(all_parsers): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo,12,13,14,15 -bar,12,13,14,15 -""" - parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=0) - - expected = DataFrame( - [ - [2, 3, 4, 5], - [7, 8, 9, 10], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - ], - columns=["A", "B", "C", "D"], - index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"), - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_read_duplicate_index_implicit(all_parsers): - data = """A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo,12,13,14,15 -bar,12,13,14,15 -""" - parser = all_parsers - result = parser.read_csv(StringIO(data)) - - expected = DataFrame( - [ - [2, 3, 4, 5], - [7, 8, 9, 10], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - ], - columns=["A", "B", "C", "D"], - index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]), - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - ( - "A,B\nTrue,1\nFalse,2\nTrue,3", - {}, - DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), - ), - ( - "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3", - {"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]}, - DataFrame( - [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]], - columns=["A", "B"], - ), - ), - ( - "A,B\nTRUE,1\nFALSE,2\nTRUE,3", - {}, - DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), - ), - ( - "A,B\nfoo,bar\nbar,foo", - {"true_values": ["foo"], "false_values": ["bar"]}, - DataFrame([[True, False], [False, True]], 
columns=["A", "B"]), - ), - ], -) -def test_parse_bool(all_parsers, data, kwargs, expected): - parser = all_parsers - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) - - -def test_int_conversion(all_parsers): - data = """A,B -1.0,1 -2.0,2 -3.0,3 -""" - parser = all_parsers - result = parser.read_csv(StringIO(data)) - - expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("nrows", [3, 3.0]) -def test_read_nrows(all_parsers, nrows): - # see gh-10476 - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - expected = DataFrame( - [["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]], - columns=["index", "A", "B", "C", "D"], - ) - parser = all_parsers - - result = parser.read_csv(StringIO(data), nrows=nrows) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("nrows", [1.2, "foo", -1]) -def test_read_nrows_bad(all_parsers, nrows): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - msg = r"'nrows' must be an integer >=0" - parser = all_parsers - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), nrows=nrows) - - -@skip_pyarrow -@pytest.mark.parametrize("index_col", [0, "index"]) -def test_read_chunksize_with_index(all_parsers, index_col): - parser = all_parsers - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - - expected = DataFrame( - [ - ["foo", 2, 3, 4, 5], - ["bar", 7, 8, 9, 10], - ["baz", 12, 13, 14, 15], - ["qux", 12, 13, 14, 15], - ["foo2", 12, 13, 14, 15], - ["bar2", 12, 13, 14, 15], - ], - columns=["index", "A", "B", "C", "D"], - ) - expected = expected.set_index("index") - - with 
parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader: - chunks = list(reader) - tm.assert_frame_equal(chunks[0], expected[:2]) - tm.assert_frame_equal(chunks[1], expected[2:4]) - tm.assert_frame_equal(chunks[2], expected[4:]) - - -@skip_pyarrow -@pytest.mark.parametrize("chunksize", [1.3, "foo", 0]) -def test_read_chunksize_bad(all_parsers, chunksize): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - parser = all_parsers - msg = r"'chunksize' must be an integer >=1" - - with pytest.raises(ValueError, match=msg): - with parser.read_csv(StringIO(data), chunksize=chunksize) as _: - pass - - -@skip_pyarrow -@pytest.mark.parametrize("chunksize", [2, 8]) -def test_read_chunksize_and_nrows(all_parsers, chunksize): - # see gh-15755 - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - parser = all_parsers - kwargs = {"index_col": 0, "nrows": 5} - - expected = parser.read_csv(StringIO(data), **kwargs) - with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader: - tm.assert_frame_equal(concat(reader), expected) - - -@skip_pyarrow -def test_read_chunksize_and_nrows_changing_size(all_parsers): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - parser = all_parsers - kwargs = {"index_col": 0, "nrows": 5} - - expected = parser.read_csv(StringIO(data), **kwargs) - with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader: - tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2]) - tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5]) - - with pytest.raises(StopIteration, match=""): - reader.get_chunk(size=3) - - -@skip_pyarrow -def test_get_chunk_passed_chunksize(all_parsers): - parser = all_parsers - data = """A,B,C -1,2,3 -4,5,6 -7,8,9 -1,2,3""" - - with 
parser.read_csv(StringIO(data), chunksize=2) as reader: - result = reader.get_chunk() - - expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}]) -def test_read_chunksize_compat(all_parsers, kwargs): - # see gh-12185 - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - parser = all_parsers - result = parser.read_csv(StringIO(data), **kwargs) - with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader: - tm.assert_frame_equal(concat(reader), result) - - -@skip_pyarrow -def test_read_chunksize_jagged_names(all_parsers): - # see gh-23509 - parser = all_parsers - data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) - - expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10]) - with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader: - result = concat(reader) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_read_data_list(all_parsers): - parser = all_parsers - kwargs = {"index_col": 0} - data = "A,B,C\nfoo,1,2,3\nbar,4,5,6" - - data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]] - expected = parser.read_csv(StringIO(data), **kwargs) - - with TextParser(data_list, chunksize=2, **kwargs) as parser: - result = parser.read() - - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_iterator(all_parsers): - # see gh-6607 - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - parser = all_parsers - kwargs = {"index_col": 0} - - expected = parser.read_csv(StringIO(data), **kwargs) - with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader: - - first_chunk = reader.read(3) - tm.assert_frame_equal(first_chunk, expected[:3]) - - last_chunk = reader.read(5) - 
tm.assert_frame_equal(last_chunk, expected[3:]) - - -@skip_pyarrow -def test_iterator2(all_parsers): - parser = all_parsers - data = """A,B,C -foo,1,2,3 -bar,4,5,6 -baz,7,8,9 -""" - - with parser.read_csv(StringIO(data), iterator=True) as reader: - result = list(reader) - - expected = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=["foo", "bar", "baz"], - columns=["A", "B", "C"], - ) - tm.assert_frame_equal(result[0], expected) - - -def test_reader_list(all_parsers): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - parser = all_parsers - kwargs = {"index_col": 0} - - lines = list(csv.reader(StringIO(data))) - with TextParser(lines, chunksize=2, **kwargs) as reader: - chunks = list(reader) - - expected = parser.read_csv(StringIO(data), **kwargs) - - tm.assert_frame_equal(chunks[0], expected[:2]) - tm.assert_frame_equal(chunks[1], expected[2:4]) - tm.assert_frame_equal(chunks[2], expected[4:]) - - -def test_reader_list_skiprows(all_parsers): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - parser = all_parsers - kwargs = {"index_col": 0} - - lines = list(csv.reader(StringIO(data))) - with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader: - chunks = list(reader) - - expected = parser.read_csv(StringIO(data), **kwargs) - - tm.assert_frame_equal(chunks[0], expected[1:3]) - - -@skip_pyarrow -def test_iterator_stop_on_chunksize(all_parsers): - # gh-3967: stopping iteration when chunksize is specified - parser = all_parsers - data = """A,B,C -foo,1,2,3 -bar,4,5,6 -baz,7,8,9 -""" - - with parser.read_csv(StringIO(data), chunksize=1) as reader: - result = list(reader) - - assert len(result) == 3 - expected = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=["foo", "bar", "baz"], - columns=["A", "B", "C"], - ) - tm.assert_frame_equal(concat(result), expected) - - -@skip_pyarrow 
-@pytest.mark.parametrize( - "kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}] -) -def test_iterator_skipfooter_errors(all_parsers, kwargs): - msg = "'skipfooter' not supported for iteration" - parser = all_parsers - data = "a\n1\n2" - - with pytest.raises(ValueError, match=msg): - with parser.read_csv(StringIO(data), skipfooter=1, **kwargs) as _: - pass - - -def test_nrows_skipfooter_errors(all_parsers): - msg = "'skipfooter' not supported with 'nrows'" - data = "a\n1\n2\n3\n4\n5\n6" - parser = all_parsers - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), skipfooter=1, nrows=5) - - -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - ( - """foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""", - {"index_col": 0, "names": ["index", "A", "B", "C", "D"]}, - DataFrame( - [ - [2, 3, 4, 5], - [7, 8, 9, 10], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - ], - index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"), - columns=["A", "B", "C", "D"], - ), - ), - ( - """foo,one,2,3,4,5 -foo,two,7,8,9,10 -foo,three,12,13,14,15 -bar,one,12,13,14,15 -bar,two,12,13,14,15 -""", - {"index_col": [0, 1], "names": ["index1", "index2", "A", "B", "C", "D"]}, - DataFrame( - [ - [2, 3, 4, 5], - [7, 8, 9, 10], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - ], - index=MultiIndex.from_tuples( - [ - ("foo", "one"), - ("foo", "two"), - ("foo", "three"), - ("bar", "one"), - ("bar", "two"), - ], - names=["index1", "index2"], - ), - columns=["A", "B", "C", "D"], - ), - ), - ], -) -def test_pass_names_with_index(all_parsers, data, kwargs, expected): - parser = all_parsers - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) -def test_multi_index_no_level_names(all_parsers, index_col): - data = 
"""index1,index2,A,B,C,D -foo,one,2,3,4,5 -foo,two,7,8,9,10 -foo,three,12,13,14,15 -bar,one,12,13,14,15 -bar,two,12,13,14,15 -""" - headless_data = "\n".join(data.split("\n")[1:]) - - names = ["A", "B", "C", "D"] - parser = all_parsers - - result = parser.read_csv( - StringIO(headless_data), index_col=index_col, header=None, names=names - ) - expected = parser.read_csv(StringIO(data), index_col=index_col) - - # No index names in headless data. - expected.index.names = [None] * 2 - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_multi_index_no_level_names_implicit(all_parsers): - parser = all_parsers - data = """A,B,C,D -foo,one,2,3,4,5 -foo,two,7,8,9,10 -foo,three,12,13,14,15 -bar,one,12,13,14,15 -bar,two,12,13,14,15 -""" - - result = parser.read_csv(StringIO(data)) - expected = DataFrame( - [ - [2, 3, 4, 5], - [7, 8, 9, 10], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - ], - columns=["A", "B", "C", "D"], - index=MultiIndex.from_tuples( - [ - ("foo", "one"), - ("foo", "two"), - ("foo", "three"), - ("bar", "one"), - ("bar", "two"), - ] - ), - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "data,expected,header", - [ - ("a,b", DataFrame(columns=["a", "b"]), [0]), - ( - "a,b\nc,d", - DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])), - [0, 1], - ), - ], -) -@pytest.mark.parametrize("round_trip", [True, False]) -def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): - # see gh-14545 - parser = all_parsers - data = expected.to_csv(index=False) if round_trip else data - - result = parser.read_csv(StringIO(data), header=header) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_no_unnamed_index(all_parsers): - parser = all_parsers - data = """ id c0 c1 c2 -0 1 0 a b -1 2 0 c d -2 2 2 e f -""" - result = parser.read_csv(StringIO(data), sep=" ") - expected = DataFrame( - [[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", 
"f"]], - columns=["Unnamed: 0", "id", "c0", "c1", "c2"], - ) - tm.assert_frame_equal(result, expected) - - -def test_read_csv_parse_simple_list(all_parsers): - parser = all_parsers - data = """foo -bar baz -qux foo -foo -bar""" - - result = parser.read_csv(StringIO(data), header=None) - expected = DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@tm.network -def test_url(all_parsers, csv_dir_path): - # TODO: FTP testing - parser = all_parsers - kwargs = {"sep": "\t"} - - url = ( - "https://raw.github.com/pandas-dev/pandas/master/" - "pandas/tests/io/parser/data/salaries.csv" - ) - url_result = parser.read_csv(url, **kwargs) - - local_path = os.path.join(csv_dir_path, "salaries.csv") - local_result = parser.read_csv(local_path, **kwargs) - tm.assert_frame_equal(url_result, local_result) - - -@skip_pyarrow -@pytest.mark.slow -def test_local_file(all_parsers, csv_dir_path): - parser = all_parsers - kwargs = {"sep": "\t"} - - local_path = os.path.join(csv_dir_path, "salaries.csv") - local_result = parser.read_csv(local_path, **kwargs) - url = "file://localhost/" + local_path - - try: - url_result = parser.read_csv(url, **kwargs) - tm.assert_frame_equal(url_result, local_result) - except URLError: - # Fails on some systems. 
- pytest.skip("Failing on: " + " ".join(platform.uname())) - - -@skip_pyarrow -def test_path_path_lib(all_parsers): - parser = all_parsers - df = tm.makeDataFrame() - result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) - tm.assert_frame_equal(df, result) - - -@skip_pyarrow -def test_path_local_path(all_parsers): - parser = all_parsers - df = tm.makeDataFrame() - result = tm.round_trip_localpath( - df.to_csv, lambda p: parser.read_csv(p, index_col=0) - ) - tm.assert_frame_equal(df, result) - - -@skip_pyarrow -def test_nonexistent_path(all_parsers): - # gh-2428: pls no segfault - # gh-14086: raise more helpful FileNotFoundError - # GH#29233 "File foo" instead of "File b'foo'" - parser = all_parsers - path = f"{tm.rands(10)}.csv" - - msg = r"\[Errno 2\]" - with pytest.raises(FileNotFoundError, match=msg) as e: - parser.read_csv(path) - assert path == e.value.filename - - -@skip_pyarrow -@td.skip_if_windows # os.chmod does not work in windows -def test_no_permission(all_parsers): - # GH 23784 - parser = all_parsers - - msg = r"\[Errno 13\]" - with tm.ensure_clean() as path: - os.chmod(path, 0) # make file unreadable - - # verify that this process cannot open the file (not running as sudo) - try: - with open(path): - pass - pytest.skip("Running as sudo.") - except PermissionError: - pass - - with pytest.raises(PermissionError, match=msg) as e: - parser.read_csv(path) - assert path == e.value.filename - - -@skip_pyarrow -def test_missing_trailing_delimiters(all_parsers): - parser = all_parsers - data = """A,B,C,D -1,2,3,4 -1,3,3, -1,4,5""" - - result = parser.read_csv(StringIO(data)) - expected = DataFrame( - [[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]], - columns=["A", "B", "C", "D"], - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_skip_initial_space(all_parsers): - data = ( - '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' - "1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, " - 
"314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, " - "70.06056, 344.98370, 1, 1, -0.689265, -0.692787, " - "0.212036, 14.7674, 41.605, -9999.0, -9999.0, " - "-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128" - ) - parser = all_parsers - - result = parser.read_csv( - StringIO(data), - names=list(range(33)), - header=None, - na_values=["-9999.0"], - skipinitialspace=True, - ) - expected = DataFrame( - [ - [ - "09-Apr-2012", - "01:10:18.300", - 2456026.548822908, - 12849, - 1.00361, - 1.12551, - 330.65659, - 355626618.16711, - 73.48821, - 314.11625, - 1917.09447, - 179.71425, - 80.0, - 240.0, - -350, - 70.06056, - 344.9837, - 1, - 1, - -0.689265, - -0.692787, - 0.212036, - 14.7674, - 41.605, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - 0, - 12, - 128, - ] - ] - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_trailing_delimiters(all_parsers): - # see gh-2442 - data = """A,B,C -1,2,3, -4,5,6, -7,8,9,""" - parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=False) - - expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]}) - tm.assert_frame_equal(result, expected) - - -def test_escapechar(all_parsers): - # https://stackoverflow.com/questions/13824840/feature-request-for- - # pandas-read-csv - data = '''SEARCH_TERM,ACTUAL_URL -"bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" -"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" -"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa - - parser = all_parsers - result = parser.read_csv( - StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8" - ) - - assert result["SEARCH_TERM"][2] == 'SLAGBORD, 
"Bergslagen", IKEA:s 1700-tals series' - - tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) - - -def test_int64_min_issues(all_parsers): - # see gh-2599 - parser = all_parsers - data = "A,B\n0,0\n0," - result = parser.read_csv(StringIO(data)) - - expected = DataFrame({"A": [0, 0], "B": [0, np.nan]}) - tm.assert_frame_equal(result, expected) - - -def test_parse_integers_above_fp_precision(all_parsers): - data = """Numbers -17007000002000191 -17007000002000191 -17007000002000191 -17007000002000191 -17007000002000192 -17007000002000192 -17007000002000192 -17007000002000192 -17007000002000192 -17007000002000194""" - parser = all_parsers - result = parser.read_csv(StringIO(data)) - expected = DataFrame( - { - "Numbers": [ - 17007000002000191, - 17007000002000191, - 17007000002000191, - 17007000002000191, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000194, - ] - } - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.xfail(reason="GH38630, sometimes gives ResourceWarning", strict=False) -def test_chunks_have_consistent_numerical_type(all_parsers): - parser = all_parsers - integers = [str(i) for i in range(499999)] - data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers) - - # Coercions should work without warnings. - with tm.assert_produces_warning(None): - result = parser.read_csv(StringIO(data)) - - assert type(result.a[0]) is np.float64 - assert result.a.dtype == float - - -def test_warn_if_chunks_have_mismatched_type(all_parsers): - warning_type = None - parser = all_parsers - integers = [str(i) for i in range(499999)] - data = "a\n" + "\n".join(integers + ["a", "b"] + integers) - - # see gh-3866: if chunks are different types and can't - # be coerced using numerical types, then issue warning. 
- if parser.engine == "c" and parser.low_memory: - warning_type = DtypeWarning - - with tm.assert_produces_warning(warning_type): - df = parser.read_csv(StringIO(data)) - assert df.a.dtype == object - - -@skip_pyarrow -@pytest.mark.parametrize("sep", [" ", r"\s+"]) -def test_integer_overflow_bug(all_parsers, sep): - # see gh-2601 - data = "65248E10 11\n55555E55 22\n" - parser = all_parsers - - result = parser.read_csv(StringIO(data), header=None, sep=sep) - expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_catch_too_many_names(all_parsers): - # see gh-5156 - data = """\ -1,2,3 -4,,6 -7,8,9 -10,11,12\n""" - parser = all_parsers - msg = ( - "Too many columns specified: expected 4 and found 3" - if parser.engine == "c" - else "Number of passed names did not match " - "number of header fields in the file" - ) - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) - - -@skip_pyarrow -def test_ignore_leading_whitespace(all_parsers): - # see gh-3374, gh-6607 - parser = all_parsers - data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9" - result = parser.read_csv(StringIO(data), sep=r"\s+") - - expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]}) - tm.assert_frame_equal(result, expected) - - -def test_chunk_begins_with_newline_whitespace(all_parsers): - # see gh-10022 - parser = all_parsers - data = "\n hello\nworld\n" - - result = parser.read_csv(StringIO(data), header=None) - expected = DataFrame([" hello", "world"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_empty_with_index(all_parsers): - # see gh-10184 - data = "x,y" - parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=0) - - expected = DataFrame(columns=["y"], index=Index([], name="x")) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_empty_with_multi_index(all_parsers): - # see gh-10467 - data = 
"x,y,z" - parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=["x", "y"]) - - expected = DataFrame( - columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"]) - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_empty_with_reversed_multi_index(all_parsers): - data = "x,y,z" - parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=[1, 0]) - - expected = DataFrame( - columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"]) - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_float_parser(all_parsers): - # see gh-9565 - parser = all_parsers - data = "45e-1,4.5,45.,inf,-inf" - result = parser.read_csv(StringIO(data), header=None) - - expected = DataFrame([[float(s) for s in data.split(",")]]) - tm.assert_frame_equal(result, expected) - - -def test_scientific_no_exponent(all_parsers_all_precisions): - # see gh-12215 - df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) - data = df.to_csv(index=False) - parser, precision = all_parsers_all_precisions - if parser == "pyarrow": - pytest.skip() - - df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) - tm.assert_frame_equal(df_roundtrip, df) - - -@skip_pyarrow -@pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) -def test_int64_overflow(all_parsers, conv): - data = """ID -00013007854817840016671868 -00013007854817840016749251 -00013007854817840016754630 -00013007854817840016781876 -00013007854817840017028824 -00013007854817840017963235 -00013007854817840018860166""" - parser = all_parsers - - if conv is None: - # 13007854817840016671868 > UINT64_MAX, so this - # will overflow and return object as the dtype. 
- result = parser.read_csv(StringIO(data)) - expected = DataFrame( - [ - "00013007854817840016671868", - "00013007854817840016749251", - "00013007854817840016754630", - "00013007854817840016781876", - "00013007854817840017028824", - "00013007854817840017963235", - "00013007854817840018860166", - ], - columns=["ID"], - ) - tm.assert_frame_equal(result, expected) - else: - # 13007854817840016671868 > UINT64_MAX, so attempts - # to cast to either int64 or uint64 will result in - # an OverflowError being raised. - msg = ( - "(Python int too large to convert to C long)|" - "(long too big to convert)|" - "(int too big to convert)" - ) - - with pytest.raises(OverflowError, match=msg): - parser.read_csv(StringIO(data), converters={"ID": conv}) - - -@skip_pyarrow -@pytest.mark.parametrize( - "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] -) -def test_int64_uint64_range(all_parsers, val): - # These numbers fall right inside the int64-uint64 - # range, so they should be parsed as string. - parser = all_parsers - result = parser.read_csv(StringIO(str(val)), header=None) - - expected = DataFrame([val]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] -) -def test_outside_int64_uint64_range(all_parsers, val): - # These numbers fall just outside the int64-uint64 - # range, so they should be parsed as string. - parser = all_parsers - result = parser.read_csv(StringIO(str(val)), header=None) - - expected = DataFrame([str(val)]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("exp_data", [[str(-1), str(2 ** 63)], [str(2 ** 63), str(-1)]]) -def test_numeric_range_too_wide(all_parsers, exp_data): - # No numerical dtype can hold both negative and uint64 - # values, so they should be cast as string. 
- parser = all_parsers - data = "\n".join(exp_data) - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), header=None) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) -def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): - # GH#38753 - parser, precision = all_parsers_all_precisions - if parser == "pyarrow": - pytest.skip() - data = f"data\n10E{neg_exp}" - result = parser.read_csv(StringIO(data), float_precision=precision) - expected = DataFrame({"data": [0.0]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) -def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): - # GH#38753 - parser, precision = all_parsers_all_precisions - data = f"data\n10E{exp}" - result = parser.read_csv(StringIO(data), float_precision=precision) - if precision == "round_trip": - if exp == 999999999999999999 and is_platform_linux(): - mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") - request.node.add_marker(mark) - - value = np.inf if exp > 0 else 0.0 - expected = DataFrame({"data": [value]}) - else: - expected = DataFrame({"data": [f"10E{exp}"]}) - - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("iterator", [True, False]) -def test_empty_with_nrows_chunksize(all_parsers, iterator): - # see gh-9535 - parser = all_parsers - expected = DataFrame(columns=["foo", "bar"]) - - nrows = 10 - data = StringIO("foo,bar\n") - - if iterator: - with parser.read_csv(data, chunksize=nrows) as reader: - result = next(iter(reader)) - else: - result = parser.read_csv(data, nrows=nrows) - - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "data,kwargs,expected,msg", - [ - # gh-10728: WHITESPACE_LINE - ( - "a,b,c\n4,5,6\n ", - {}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # gh-10548: 
EAT_LINE_COMMENT - ( - "a,b,c\n4,5,6\n#comment", - {"comment": "#"}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # EAT_CRNL_NOP - ( - "a,b,c\n4,5,6\n\r", - {}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # EAT_COMMENT - ( - "a,b,c\n4,5,6#comment", - {"comment": "#"}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # SKIP_LINE - ( - "a,b,c\n4,5,6\nskipme", - {"skiprows": [2]}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # EAT_LINE_COMMENT - ( - "a,b,c\n4,5,6\n#comment", - {"comment": "#", "skip_blank_lines": False}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # IN_FIELD - ( - "a,b,c\n4,5,6\n ", - {"skip_blank_lines": False}, - DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]), - None, - ), - # EAT_CRNL - ( - "a,b,c\n4,5,6\n\r", - {"skip_blank_lines": False}, - DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]), - None, - ), - # ESCAPED_CHAR - ( - "a,b,c\n4,5,6\n\\", - {"escapechar": "\\"}, - None, - "(EOF following escape character)|(unexpected end of data)", - ), - # ESCAPE_IN_QUOTED_FIELD - ( - 'a,b,c\n4,5,6\n"\\', - {"escapechar": "\\"}, - None, - "(EOF inside string starting at row 2)|(unexpected end of data)", - ), - # IN_QUOTED_FIELD - ( - 'a,b,c\n4,5,6\n"', - {"escapechar": "\\"}, - None, - "(EOF inside string starting at row 2)|(unexpected end of data)", - ), - ], - ids=[ - "whitespace-line", - "eat-line-comment", - "eat-crnl-nop", - "eat-comment", - "skip-line", - "eat-line-comment", - "in-field", - "eat-crnl", - "escaped-char", - "escape-in-quoted-field", - "in-quoted-field", - ], -) -def test_eof_states(all_parsers, data, kwargs, expected, msg): - # see gh-10728, gh-10548 - parser = all_parsers - - if expected is None: - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - else: - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) - - 
-@skip_pyarrow -@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) -def test_uneven_lines_with_usecols(all_parsers, usecols): - # see gh-12203 - parser = all_parsers - data = r"""a,b,c -0,1,2 -3,4,5,6,7 -8,9,10""" - - if usecols is None: - # Make sure that an error is still raised - # when the "usecols" parameter is not provided. - msg = r"Expected \d+ fields in line \d+, saw \d+" - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data)) - else: - expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]}) - - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - # First, check to see that the response of parser when faced with no - # provided columns raises the correct error, with or without usecols. - ("", {}, None), - ("", {"usecols": ["X"]}, None), - ( - ",,", - {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]}, - DataFrame(columns=["X"], index=[0], dtype=np.float64), - ), - ( - "", - {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]}, - DataFrame(columns=["X"]), - ), - ], -) -def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): - # see gh-12493 - parser = all_parsers - - if expected is None: - msg = "No columns to parse from file" - with pytest.raises(EmptyDataError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - else: - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "kwargs,expected", - [ - # gh-8661, gh-8679: this should ignore six lines, including - # lines with trailing whitespace and blank lines. - ( - { - "header": None, - "delim_whitespace": True, - "skiprows": [0, 1, 2, 3, 5, 6], - "skip_blank_lines": True, - }, - DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]), - ), - # gh-8983: test skipping set of rows after a row with trailing spaces. 
- ( - { - "delim_whitespace": True, - "skiprows": [1, 2, 3, 5, 6], - "skip_blank_lines": True, - }, - DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}), - ), - ], -) -def test_trailing_spaces(all_parsers, kwargs, expected): - data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa - parser = all_parsers - - result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) - tm.assert_frame_equal(result, expected) - - -def test_raise_on_sep_with_delim_whitespace(all_parsers): - # see gh-6607 - data = "a b c\n1 2 3" - parser = all_parsers - - with pytest.raises(ValueError, match="you can only specify one"): - parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) - - -@skip_pyarrow -@pytest.mark.parametrize("delim_whitespace", [True, False]) -def test_single_char_leading_whitespace(all_parsers, delim_whitespace): - # see gh-9710 - parser = all_parsers - data = """\ -MyColumn -a -b -a -b\n""" - - expected = DataFrame({"MyColumn": list("abab")}) - result = parser.read_csv( - StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "sep,skip_blank_lines,exp_data", - [ - (",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]), - (r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]), - ( - ",", - False, - [ - [1.0, 2.0, 4.0], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [5.0, np.nan, 10.0], - [np.nan, np.nan, np.nan], - [-70.0, 0.4, 1.0], - ], - ), - ], -) -def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): - parser = all_parsers - data = """\ -A,B,C -1,2.,4. 
- - -5.,NaN,10.0 - --70,.4,1 -""" - - if sep == r"\s+": - data = data.replace(",", " ") - - result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines) - expected = DataFrame(exp_data, columns=["A", "B", "C"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_whitespace_lines(all_parsers): - parser = all_parsers - data = """ - -\t \t\t -\t -A,B,C -\t 1,2.,4. -5.,NaN,10.0 -""" - expected = DataFrame([[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]) - result = parser.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "data,expected", - [ - ( - """ A B C D -a 1 2 3 4 -b 1 2 3 4 -c 1 2 3 4 -""", - DataFrame( - [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], - columns=["A", "B", "C", "D"], - index=["a", "b", "c"], - ), - ), - ( - " a b c\n1 2 3 \n4 5 6\n 7 8 9", - DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]), - ), - ], -) -def test_whitespace_regex_separator(all_parsers, data, expected): - # see gh-6607 - parser = all_parsers - result = parser.read_csv(StringIO(data), sep=r"\s+") - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_verbose_read(all_parsers, capsys): - parser = all_parsers - data = """a,b,c,d -one,1,2,3 -one,1,2,3 -,1,2,3 -one,1,2,3 -,1,2,3 -,1,2,3 -one,1,2,3 -two,1,2,3""" - - # Engines are verbose in different ways. 
- parser.read_csv(StringIO(data), verbose=True) - captured = capsys.readouterr() - - if parser.engine == "c": - assert "Tokenization took:" in captured.out - assert "Parser memory cleanup took:" in captured.out - else: # Python engine - assert captured.out == "Filled 3 NA values in column a\n" - - -@skip_pyarrow -def test_verbose_read2(all_parsers, capsys): - parser = all_parsers - data = """a,b,c,d -one,1,2,3 -two,1,2,3 -three,1,2,3 -four,1,2,3 -five,1,2,3 -,1,2,3 -seven,1,2,3 -eight,1,2,3""" - - parser.read_csv(StringIO(data), verbose=True, index_col=0) - captured = capsys.readouterr() - - # Engines are verbose in different ways. - if parser.engine == "c": - assert "Tokenization took:" in captured.out - assert "Parser memory cleanup took:" in captured.out - else: # Python engine - assert captured.out == "Filled 1 NA values in column a\n" - - -def test_iteration_open_handle(all_parsers): - parser = all_parsers - kwargs = {"squeeze": True, "header": None} - - with tm.ensure_clean() as path: - with open(path, "w") as f: - f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG") - - with open(path) as f: - for line in f: - if "CCC" in line: - break - - result = parser.read_csv(f, **kwargs) - expected = Series(["DDD", "EEE", "FFF", "GGG"], name=0) - tm.assert_series_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "data,thousands,decimal", - [ - ( - """A|B|C -1|2,334.01|5 -10|13|10. 
-""", - ",", - ".", - ), - ( - """A|B|C -1|2.334,01|5 -10|13|10, -""", - ".", - ",", - ), - ], -) -def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): - parser = all_parsers - expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) - - result = parser.read_csv( - StringIO(data), sep="|", thousands=thousands, decimal=decimal - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_euro_decimal_format(all_parsers): - parser = all_parsers - data = """Id;Number1;Number2;Text1;Text2;Number3 -1;1521,1541;187101,9543;ABC;poi;4,738797819 -2;121,12;14897,76;DEF;uyt;0,377320872 -3;878,158;108013,434;GHI;rez;2,735694704""" - - result = parser.read_csv(StringIO(data), sep=";", decimal=",") - expected = DataFrame( - [ - [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819], - [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872], - [3, 878.158, 108013.434, "GHI", "rez", 2.735694704], - ], - columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"], - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("na_filter", [True, False]) -def test_inf_parsing(all_parsers, na_filter): - parser = all_parsers - data = """\ -,A -a,inf -b,-inf -c,+Inf -d,-Inf -e,INF -f,-INF -g,+INf -h,-INf -i,inF -j,-inF""" - expected = DataFrame( - {"A": [float("inf"), float("-inf")] * 5}, - index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], - ) - result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("na_filter", [True, False]) -def test_infinity_parsing(all_parsers, na_filter): - parser = all_parsers - data = """\ -,A -a,Infinity -b,-Infinity -c,+Infinity -""" - expected = DataFrame( - {"A": [float("infinity"), float("-infinity"), float("+infinity")]}, - index=["a", "b", "c"], - ) - result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) - tm.assert_frame_equal(result, expected) - 
- -@skip_pyarrow -@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) -def test_raise_on_no_columns(all_parsers, nrows): - parser = all_parsers - data = "\n" * nrows - - msg = "No columns to parse from file" - with pytest.raises(EmptyDataError, match=msg): - parser.read_csv(StringIO(data)) - - -@skip_pyarrow -@td.check_file_leaks -def test_memory_map(all_parsers, csv_dir_path): - mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") - parser = all_parsers - - expected = DataFrame( - {"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]} - ) - - result = parser.read_csv(mmap_file, memory_map=True) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_null_byte_char(all_parsers): - # see gh-2741 - data = "\x00,foo" - names = ["a", "b"] - parser = all_parsers - - if parser.engine == "c": - expected = DataFrame([[np.nan, "foo"]], columns=names) - out = parser.read_csv(StringIO(data), names=names) - tm.assert_frame_equal(out, expected) - else: - msg = "NULL byte detected" - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), names=names) - - -@skip_pyarrow -def test_temporary_file(all_parsers): - # see gh-13398 - parser = all_parsers - data = "0 0" - - with tm.ensure_clean(mode="w+", return_filelike=True) as new_file: - new_file.write(data) - new_file.flush() - new_file.seek(0) - - result = parser.read_csv(new_file, sep=r"\s+", header=None) - - expected = DataFrame([[0, 0]]) - tm.assert_frame_equal(result, expected) - - -def test_internal_eof_byte(all_parsers): - # see gh-5500 - parser = all_parsers - data = "a,b\n1\x1a,2" - - expected = DataFrame([["1\x1a", 2]], columns=["a", "b"]) - result = parser.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - -def test_internal_eof_byte_to_file(all_parsers): - # see gh-16559 - parser = all_parsers - data = b'c1,c2\r\n"test \x1a test", test\r\n' - expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"]) - path = 
f"__{tm.rands(10)}__.csv" - - with tm.ensure_clean(path) as path: - with open(path, "wb") as f: - f.write(data) - - result = parser.read_csv(path) - tm.assert_frame_equal(result, expected) - - -def test_sub_character(all_parsers, csv_dir_path): - # see gh-16893 - filename = os.path.join(csv_dir_path, "sub_char.csv") - expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"]) - - parser = all_parsers - result = parser.read_csv(filename) - tm.assert_frame_equal(result, expected) - - -def test_file_handle_string_io(all_parsers): - # gh-14418 - # - # Don't close user provided file handles. - parser = all_parsers - data = "a,b\n1,2" - - fh = StringIO(data) - parser.read_csv(fh) - assert not fh.closed - - -def test_file_handles_with_open(all_parsers, csv1): - # gh-14418 - # - # Don't close user provided file handles. - parser = all_parsers - - for mode in ["r", "rb"]: - with open(csv1, mode) as f: - parser.read_csv(f) - assert not f.closed - - -@skip_pyarrow -def test_invalid_file_buffer_class(all_parsers): - # see gh-15337 - class InvalidBuffer: - pass - - parser = all_parsers - msg = "Invalid file path or buffer object type" - - with pytest.raises(ValueError, match=msg): - parser.read_csv(InvalidBuffer()) - - -@skip_pyarrow -def test_invalid_file_buffer_mock(all_parsers): - # see gh-15337 - parser = all_parsers - msg = "Invalid file path or buffer object type" - - class Foo: - pass - - with pytest.raises(ValueError, match=msg): - parser.read_csv(Foo()) - - -def test_valid_file_buffer_seems_invalid(all_parsers): - # gh-16135: we want to ensure that "tell" and "seek" - # aren't actually being used when we call `read_csv` - # - # Thus, while the object may look "invalid" (these - # methods are attributes of the `StringIO` class), - # it is still a valid file-object for our purposes. 
- class NoSeekTellBuffer(StringIO): - def tell(self): - raise AttributeError("No tell method") - - def seek(self, pos, whence=0): - raise AttributeError("No seek method") - - data = "a\n1" - parser = all_parsers - expected = DataFrame({"a": [1]}) - - result = parser.read_csv(NoSeekTellBuffer(data)) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "kwargs", - [{}, {"error_bad_lines": True}], # Default is True. # Explicitly pass in. -) -@pytest.mark.parametrize( - "warn_kwargs", [{}, {"warn_bad_lines": True}, {"warn_bad_lines": False}] -) -def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): - # see gh-15925 - parser = all_parsers - kwargs.update(**warn_kwargs) - data = "a\n1\n1,2,3\n4\n5,6,7" - - msg = "Expected 1 fields in line 3, saw 3" - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - - -@skip_pyarrow -def test_warn_bad_lines(all_parsers, capsys): - # see gh-15925 - parser = all_parsers - data = "a\n1\n1,2,3\n4\n5,6,7" - expected = DataFrame({"a": [1, 4]}) - - result = parser.read_csv(StringIO(data), error_bad_lines=False, warn_bad_lines=True) - tm.assert_frame_equal(result, expected) - - captured = capsys.readouterr() - assert "Skipping line 3" in captured.err - assert "Skipping line 5" in captured.err - - -@skip_pyarrow -def test_suppress_error_output(all_parsers, capsys): - # see gh-15925 - parser = all_parsers - data = "a\n1\n1,2,3\n4\n5,6,7" - expected = DataFrame({"a": [1, 4]}) - - result = parser.read_csv( - StringIO(data), error_bad_lines=False, warn_bad_lines=False - ) - tm.assert_frame_equal(result, expected) - - captured = capsys.readouterr() - assert captured.err == "" - - -@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"]) -def test_filename_with_special_chars(all_parsers, filename): - # see gh-15086. 
- parser = all_parsers - df = DataFrame({"a": [1, 2, 3]}) - - with tm.ensure_clean(filename) as path: - df.to_csv(path, index=False) - - result = parser.read_csv(path) - tm.assert_frame_equal(result, df) - - -@skip_pyarrow -def test_read_csv_memory_growth_chunksize(all_parsers): - # see gh-24805 - # - # Let's just make sure that we don't crash - # as we iteratively process all chunks. - parser = all_parsers - - with tm.ensure_clean() as path: - with open(path, "w") as f: - for i in range(1000): - f.write(str(i) + "\n") - - with parser.read_csv(path, chunksize=20) as result: - for _ in result: - pass - - -def test_read_csv_raises_on_header_prefix(all_parsers): - # gh-27394 - parser = all_parsers - msg = "Argument prefix must be None if argument header is not None" - - s = StringIO("0,1\n2,3") - - with pytest.raises(ValueError, match=msg): - parser.read_csv(s, header=0, prefix="_X") - - -def test_unexpected_keyword_parameter_exception(all_parsers): - # GH-34976 - parser = all_parsers - - msg = "{}\\(\\) got an unexpected keyword argument 'foo'" - with pytest.raises(TypeError, match=msg.format("read_csv")): - parser.read_csv("foo.csv", foo=1) - with pytest.raises(TypeError, match=msg.format("read_table")): - parser.read_table("foo.tsv", foo=1) - - -def test_read_table_same_signature_as_read_csv(all_parsers): - # GH-34976 - parser = all_parsers - - table_sign = signature(parser.read_table) - csv_sign = signature(parser.read_csv) - - assert table_sign.parameters.keys() == csv_sign.parameters.keys() - assert table_sign.return_annotation == csv_sign.return_annotation - - for key, csv_param in csv_sign.parameters.items(): - table_param = table_sign.parameters[key] - if key == "sep": - assert csv_param.default == "," - assert table_param.default == "\t" - assert table_param.annotation == csv_param.annotation - assert table_param.kind == csv_param.kind - continue - else: - assert table_param == csv_param - - -def test_read_table_equivalency_to_read_csv(all_parsers): - # see 
gh-21948 - # As of 0.25.0, read_table is undeprecated - parser = all_parsers - data = "a\tb\n1\t2\n3\t4" - expected = parser.read_csv(StringIO(data), sep="\t") - result = parser.read_table(StringIO(data)) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_first_row_bom(all_parsers): - # see gh-26545 - parser = all_parsers - data = '''\ufeff"Head1" "Head2" "Head3"''' - - result = parser.read_csv(StringIO(data), delimiter="\t") - expected = DataFrame(columns=["Head1", "Head2", "Head3"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_first_row_bom_unquoted(all_parsers): - # see gh-36343 - parser = all_parsers - data = """\ufeffHead1 Head2 Head3""" - - result = parser.read_csv(StringIO(data), delimiter="\t") - expected = DataFrame(columns=["Head1", "Head2", "Head3"]) - tm.assert_frame_equal(result, expected) - - -def test_integer_precision(all_parsers): - # Gh 7072 - s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765 -5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389""" - parser = all_parsers - result = parser.read_csv(StringIO(s), header=None)[4] - expected = Series([4321583677327450765, 4321113141090630389], name=4) - tm.assert_series_equal(result, expected) - - -@skip_pyarrow -def test_file_descriptor_leak(all_parsers): - # GH 31488 - - parser = all_parsers - with tm.ensure_clean() as path: - - def test(): - with pytest.raises(EmptyDataError, match="No columns to parse from file"): - parser.read_csv(path) - - td.check_file_leaks(test)() - - -@skip_pyarrow -@pytest.mark.parametrize("nrows", range(1, 6)) -def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): - # GH 28071 - ref = DataFrame( - [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]], - columns=list("ab"), - ) - csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4" - parser = all_parsers - df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False) - 
tm.assert_frame_equal(df, ref[:nrows]) - - -@skip_pyarrow -def test_no_header_two_extra_columns(all_parsers): - # GH 26218 - column_names = ["one", "two", "three"] - ref = DataFrame([["foo", "bar", "baz"]], columns=column_names) - stream = StringIO("foo,bar,baz,bam,blah") - parser = all_parsers - df = parser.read_csv(stream, header=None, names=column_names, index_col=False) - tm.assert_frame_equal(df, ref) - - -def test_read_csv_names_not_accepting_sets(all_parsers): - # GH 34946 - data = """\ - 1,2,3 - 4,5,6\n""" - parser = all_parsers - with pytest.raises(ValueError, match="Names should be an ordered collection."): - parser.read_csv(StringIO(data), names=set("QAZ")) - - -def test_read_csv_with_use_inf_as_na(all_parsers): - # https://github.com/pandas-dev/pandas/issues/35493 - parser = all_parsers - data = "1.0\nNaN\n3.0" - with option_context("use_inf_as_na", True): - result = parser.read_csv(StringIO(data), header=None) - expected = DataFrame([1.0, np.nan, 3.0]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_read_table_delim_whitespace_default_sep(all_parsers): - # GH: 35958 - f = StringIO("a b c\n1 -2 -3\n4 5 6") - parser = all_parsers - result = parser.read_table(f, delim_whitespace=True) - expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("delimiter", [",", "\t"]) -def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter): - # GH: 35958 - f = StringIO("a b c\n1 -2 -3\n4 5 6") - parser = all_parsers - msg = ( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." 
- ) - with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, sep=delimiter) - - with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, delimiter=delimiter) - - -@pytest.mark.parametrize("delimiter", [",", "\t"]) -def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): - # GH: 35958 - f = StringIO("a b c\n1 -2 -3\n4 5 6") - parser = all_parsers - msg = ( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, sep=delimiter) - - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, delimiter=delimiter) - - -@skip_pyarrow -def test_dict_keys_as_names(all_parsers): - # GH: 36928 - data = "1,2" - - keys = {"a": int, "b": int}.keys() - parser = all_parsers - - result = parser.read_csv(StringIO(data), names=keys) - expected = DataFrame({"a": [1], "b": [2]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("io_class", [StringIO, BytesIO]) -@pytest.mark.parametrize("encoding", [None, "utf-8"]) -def test_read_csv_file_handle(all_parsers, io_class, encoding): - """ - Test whether read_csv does not close user-provided file handles. - - GH 36980 - """ - parser = all_parsers - expected = DataFrame({"a": [1], "b": [2]}) - - content = "a,b\n1,2" - if io_class == BytesIO: - content = content.encode("utf-8") - handle = io_class(content) - - tm.assert_frame_equal(parser.read_csv(handle, encoding=encoding), expected) - assert not handle.closed - - -@skip_pyarrow -def test_memory_map_file_handle_silent_fallback(all_parsers, compression): - """ - Do not fail for buffers with memory_map=True (cannot memory map BytesIO). 
- - GH 37621 - """ - parser = all_parsers - expected = DataFrame({"a": [1], "b": [2]}) - - handle = BytesIO() - expected.to_csv(handle, index=False, compression=compression, mode="wb") - handle.seek(0) - - tm.assert_frame_equal( - parser.read_csv(handle, memory_map=True, compression=compression), - expected, - ) - - -@skip_pyarrow -def test_memory_map_compression(all_parsers, compression): - """ - Support memory map for compressed files. - - GH 37621 - """ - parser = all_parsers - expected = DataFrame({"a": [1], "b": [2]}) - - with tm.ensure_clean() as path: - expected.to_csv(path, index=False, compression=compression) - - tm.assert_frame_equal( - parser.read_csv(path, memory_map=True, compression=compression), - expected, - ) - - -@skip_pyarrow -def test_context_manager(all_parsers, datapath): - # make sure that opened files are closed - parser = all_parsers - - path = datapath("io", "data", "csv", "iris.csv") - - reader = parser.read_csv(path, chunksize=1) - assert not reader._engine.handles.handle.closed - try: - with reader: - next(reader) - assert False - except AssertionError: - assert reader._engine.handles.handle.closed - - -@skip_pyarrow -def test_context_manageri_user_provided(all_parsers, datapath): - # make sure that user-provided handles are not closed - parser = all_parsers - - with open(datapath("io", "data", "csv", "iris.csv"), mode="r") as path: - - reader = parser.read_csv(path, chunksize=1) - assert not reader._engine.handles.handle.closed - try: - with reader: - next(reader) - assert False - except AssertionError: - assert not reader._engine.handles.handle.closed