BUG: Fix Arrow CSV Parser erroring when specifying a dtype for index cols (pandas-dev#53360)

lithomas1 · mroeschke · im-vinicius · commit d6b915704dcf · 2023-07-08T12:28:06.000+02:00
* BUG: Fix Arrow CSV Parser erroring when specifying a dtype for index cols

* BUG: Fix Arrow CSV Parser erroring when specifying a dtype for index cols

* Update doc/source/whatsnew/v2.1.0.rst

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>

---------

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -396,6 +396,7 @@ I/O
 - :meth:`DataFrame.to_sql` now raising ``ValueError`` when the name param is left empty while using SQLAlchemy to connect (:issue:`52675`)
 - Bug in :func:`json_normalize`, fix json_normalize cannot parse metadata fields list type (:issue:`37782`)
 - Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`)
+- Bug in :func:`read_csv` with ``engine="pyarrow"`` raising an error when a ``dtype`` was specified for a column used as ``index_col`` (:issue:`53229`)
 - Bug in :func:`read_hdf` not properly closing store after a ``IndexError`` is raised (:issue:`52781`)
 - Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`)
 - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -142,14 +142,29 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
                 elif item not in frame.columns:
                     raise ValueError(f"Index {item} invalid")
 
+                # Process dtype for index_col and drop from dtypes
+                if self.dtype is not None:
+                    key, new_dtype = (
+                        (item, self.dtype.get(item))
+                        if self.dtype.get(item) is not None
+                        else (frame.columns[item], self.dtype.get(frame.columns[item]))
+                    )
+                    if new_dtype is not None:
+                        frame[key] = frame[key].astype(new_dtype)
+                        del self.dtype[key]
+
             frame.set_index(index_to_set, drop=True, inplace=True)
             # Clear names if headerless and no name given
             if self.header is None and not multi_index_named:
                 frame.index.names = [None] * len(frame.index.names)
 
-        if self.kwds.get("dtype") is not None:
+        if self.dtype is not None:
+            # Ignore non-existent columns from dtype mapping
+            # like other parsers do
+            if isinstance(self.dtype, dict):
+                self.dtype = {k: v for k, v in self.dtype.items() if k in frame.columns}
             try:
-                frame = frame.astype(self.kwds.get("dtype"))
+                frame = frame.astype(self.dtype)
             except TypeError as e:
                 # GH#44901 reraise to keep api consistent
                 raise ValueError(e)
diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py
@@ -324,12 +324,15 @@ def test_infer_types_boolean_sum(all_parsers):
     tm.assert_frame_equal(result, expected, check_index_type=False)
 
 
-@skip_pyarrow
 @pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
-def test_specify_dtype_for_index_col(all_parsers, dtype, val):
+def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
     # GH#9435
     data = "a,b\n01,2"
     parser = all_parsers
+    if dtype == object and parser.engine == "pyarrow":
+        request.node.add_marker(
+            pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
+        )
     result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
     expected = DataFrame({"b": [2]}, index=Index([val], name="a"))
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py
@@ -12,6 +12,7 @@
 from pandas import (
     DataFrame,
     Index,
+    array,
 )
 import pandas._testing as tm
 
@@ -24,8 +25,8 @@
     "Usecols do not match columns, columns expected but not found: {0}"
 )
 
-# TODO(1.4): Change to xfails at release time
-pytestmark = pytest.mark.usefixtures("pyarrow_skip")
+# TODO: Switch to xfails
+skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
 
 
 def test_raise_on_mixed_dtype_usecols(all_parsers):
@@ -41,6 +42,7 @@ def test_raise_on_mixed_dtype_usecols(all_parsers):
         parser.read_csv(StringIO(data), usecols=usecols)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
 def test_usecols(all_parsers, usecols):
     data = """\
@@ -56,6 +58,7 @@ def test_usecols(all_parsers, usecols):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 def test_usecols_with_names(all_parsers):
     data = """\
 a,b,c
@@ -71,6 +74,7 @@ def test_usecols_with_names(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize(
     "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])]
 )
@@ -87,6 +91,7 @@ def test_usecols_relative_to_names(all_parsers, names, usecols):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 def test_usecols_relative_to_names2(all_parsers):
     # see gh-5766
     data = """\
@@ -103,6 +108,7 @@ def test_usecols_relative_to_names2(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 def test_usecols_name_length_conflict(all_parsers):
     data = """\
 1,2,3
@@ -127,6 +133,7 @@ def test_usecols_single_string(all_parsers):
         parser.read_csv(StringIO(data), usecols="foo")
 
 
+@skip_pyarrow
 @pytest.mark.parametrize(
     "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"]
 )
@@ -140,6 +147,7 @@ def test_usecols_index_col_false(all_parsers, data):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize("index_col", ["b", 0])
 @pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]])
 def test_usecols_index_col_conflict(all_parsers, usecols, index_col):
@@ -166,6 +174,7 @@ def test_usecols_index_col_conflict2(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 def test_usecols_implicit_index_col(all_parsers):
     # see gh-2654
     parser = all_parsers
@@ -198,6 +207,7 @@ def test_usecols_index_col_end(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 def test_usecols_regex_sep(all_parsers):
     # see gh-2733
     parser = all_parsers
@@ -208,6 +218,7 @@ def test_usecols_regex_sep(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 def test_usecols_with_whitespace(all_parsers):
     parser = all_parsers
     data = "a  b  c\n4  apple  bat  5.7\n8  orange  cow  10"
@@ -217,6 +228,7 @@ def test_usecols_with_whitespace(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize(
     "usecols,expected",
     [
@@ -239,6 +251,7 @@ def test_usecols_with_integer_like_header(all_parsers, usecols, expected):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 def test_empty_usecols(all_parsers):
     data = "a,b,c\n1,2,3\n4,5,6"
     expected = DataFrame(columns=Index([]))
@@ -259,6 +272,7 @@ def test_np_array_usecols(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize(
     "usecols,expected",
     [
@@ -291,6 +305,7 @@ def test_callable_usecols(all_parsers, usecols, expected):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
 def test_incomplete_first_row(all_parsers, usecols):
     # see gh-6710
@@ -303,6 +318,7 @@ def test_incomplete_first_row(all_parsers, usecols):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize(
     "data,usecols,kwargs,expected",
     [
@@ -335,6 +351,7 @@ def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize(
     "usecols,kwargs,expected,msg",
     [
@@ -391,6 +408,7 @@ def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected
         tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
 def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols):
     data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
@@ -402,6 +420,7 @@ def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize("names", [None, ["a", "b"]])
 def test_usecols_indices_out_of_bounds(all_parsers, names):
     # GH#25623 & GH 41130; enforced in 2.0
@@ -414,6 +433,7 @@ def test_usecols_indices_out_of_bounds(all_parsers, names):
         parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0)
 
 
+@skip_pyarrow
 def test_usecols_additional_columns(all_parsers):
     # GH#46997
     parser = all_parsers
@@ -423,10 +443,29 @@ def test_usecols_additional_columns(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 def test_usecols_additional_columns_integer_columns(all_parsers):
     # GH#46997
     parser = all_parsers
     usecols = lambda header: header.strip() in ["0", "1"]
     result = parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)
     expected = DataFrame({"0": ["x"], "1": "y"})
     tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_dtype(all_parsers):
+    parser = all_parsers
+    data = """
+col1,col2,col3
+a,1,x
+b,2,y
+"""
+    result = parser.read_csv(
+        StringIO(data),
+        usecols=["col1", "col2"],
+        dtype={"col1": "string", "col2": "uint8", "col3": "string"},
+    )
+    expected = DataFrame(
+        {"col1": array(["a", "b"]), "col2": np.array([1, 2], dtype="uint8")}
+    )
+    tm.assert_frame_equal(result, expected)