From 052089aecce18ea784568ee70b0f91d38040e41a Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri, 10 Feb 2023 07:03:13 -0500
Subject: [PATCH 1/3] ENH: Enable more Arrow CSV tests/features

---
 doc/source/whatsnew/v2.0.0.rst                |  1 +
 pandas/io/parsers/arrow_parser_wrapper.py     | 10 +++++-
 pandas/io/parsers/readers.py                  |  1 -
 pandas/tests/io/parser/common/test_decimal.py |  3 +-
 pandas/tests/io/parser/test_encoding.py       | 32 ++++++++++++-------
 5 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index b006d3820889f..249d951bae04b 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -1283,6 +1283,7 @@ I/O
 - Bug in :meth:`DataFrame.to_dict` not converting ``NA`` to ``None`` (:issue:`50795`)
 - Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`)
 - Bug in :func:`read_xml` where file-like objects failed when iterparse is used (:issue:`50641`)
+- Fixed handling of ``encoding`` and ``decimal`` parameters when ``engine="pyarrow"`` in :func:`read_csv` (:issue:`51302`)
 
 Period
 ^^^^^^
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
index 420b6212f857a..63914fceff22a 100644
--- a/pandas/io/parsers/arrow_parser_wrapper.py
+++ b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -52,6 +52,7 @@ def _get_pyarrow_options(self) -> None:
             "na_values": "null_values",
             "escapechar": "escape_char",
             "skip_blank_lines": "ignore_empty_lines",
+            "decimal": "decimal_point",
         }
         for pandas_name, pyarrow_name in mapping.items():
             if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None:
@@ -69,13 +70,20 @@ def _get_pyarrow_options(self) -> None:
             for option_name, option_value in self.kwds.items()
             if option_value is not None
             and option_name
-            in ("include_columns", "null_values", "true_values", "false_values")
+            in (
+                "include_columns",
+                "null_values",
+                "true_values",
+                "false_values",
+                "decimal_point",
+            )
         }
         self.read_options = {
             "autogenerate_column_names": self.header is None,
             "skip_rows": self.header
             if self.header is not None
             else self.kwds["skiprows"],
+            "encoding": self.encoding,
         }
 
     def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 7230c675ee775..3fb8b98b01ad1 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -453,7 +453,6 @@
     "quoting",
     "lineterminator",
     "converters",
-    "decimal",
     "iterator",
     "dayfirst",
     "verbose",
diff --git a/pandas/tests/io/parser/common/test_decimal.py b/pandas/tests/io/parser/common/test_decimal.py
index ab58ddff9c06e..72d4eb2c69845 100644
--- a/pandas/tests/io/parser/common/test_decimal.py
+++ b/pandas/tests/io/parser/common/test_decimal.py
@@ -9,9 +9,10 @@
 from pandas import DataFrame
 import pandas._testing as tm
 
-pytestmark = pytest.mark.usefixtures("pyarrow_skip")
+xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "data,thousands,decimal",
     [
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
index 775d5571c7a3d..c4f7abe27a8bd 100644
--- a/pandas/tests/io/parser/test_encoding.py
+++ b/pandas/tests/io/parser/test_encoding.py
@@ -13,6 +13,8 @@
 import numpy as np
 import pytest
 
+from pandas.compat import pa_version_under8p0
+
 from pandas import (
     DataFrame,
     read_csv,
@@ -20,9 +22,9 @@
 import pandas._testing as tm
 
 skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
+xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
 
 
-@skip_pyarrow
 def test_bytes_io_input(all_parsers):
     encoding = "cp1255"
     parser = all_parsers
@@ -44,7 +46,7 @@ def test_read_csv_unicode(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@skip_pyarrow
+@xfail_pyarrow
 @pytest.mark.parametrize("sep", [",", "\t"])
 @pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
 def test_utf16_bom_skiprows(all_parsers, sep, encoding):
@@ -73,7 +75,6 @@ def test_utf16_bom_skiprows(all_parsers, sep, encoding):
         tm.assert_frame_equal(result, expected)
 
 
-@skip_pyarrow
 def test_utf16_example(all_parsers, csv_dir_path):
     path = os.path.join(csv_dir_path, "utf16_ex.txt")
     parser = all_parsers
@@ -81,7 +82,6 @@ def test_utf16_example(all_parsers, csv_dir_path):
     assert len(result) == 50
 
 
-@skip_pyarrow
 def test_unicode_encoding(all_parsers, csv_dir_path):
     path = os.path.join(csv_dir_path, "unicode_series.csv")
     parser = all_parsers
@@ -94,7 +94,7 @@ def test_unicode_encoding(all_parsers, csv_dir_path):
     assert got == expected
 
 
-@skip_pyarrow
+# @xfail_pyarrow
 @pytest.mark.parametrize(
     "data,kwargs,expected",
     [
@@ -114,7 +114,7 @@ def test_unicode_encoding(all_parsers, csv_dir_path):
         ),
     ],
 )
-def test_utf8_bom(all_parsers, data, kwargs, expected):
+def test_utf8_bom(all_parsers, data, kwargs, expected, request):
     # see gh-4793
     parser = all_parsers
     bom = "\ufeff"
@@ -124,11 +124,21 @@ def _encode_data_with_bom(_data):
         bom_data = (bom + _data).encode(utf8)
         return BytesIO(bom_data)
 
+    if (
+        parser.engine == "pyarrow"
+        and pa_version_under8p0
+        and data == "\n1"
+        and kwargs.get("skip_blank_lines", False)
+    ):
+        # Manually xfail, since we don't have mechanism to xfail specific version
+        request.node.add_marker(
+            pytest.mark.xfail(reason="This test fails on pyarrow < 8")
+        )
+
     result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
     tm.assert_frame_equal(result, expected)
 
 
-@skip_pyarrow
 def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
     # see gh-13549
     expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
@@ -141,7 +151,7 @@ def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
     tm.assert_frame_equal(result, expected)
 
 
-@skip_pyarrow
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "file_path,encoding",
     [
@@ -226,7 +236,7 @@ def test_parse_encoded_special_characters(encoding):
     tm.assert_frame_equal(result, expected)
 
 
-@skip_pyarrow
+@xfail_pyarrow
 @pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"])
 def test_encoding_memory_map(all_parsers, encoding):
     # GH40986
@@ -244,7 +254,7 @@ def test_encoding_memory_map(all_parsers, encoding):
     tm.assert_frame_equal(df, expected)
 
 
-@skip_pyarrow
+@xfail_pyarrow
 def test_chunk_splits_multibyte_char(all_parsers):
     """
     Chunk splits a multibyte character with memory_map=True
@@ -264,7 +274,7 @@ def test_chunk_splits_multibyte_char(all_parsers):
     tm.assert_frame_equal(dfr, df)
 
 
-@skip_pyarrow
+@xfail_pyarrow
 def test_readcsv_memmap_utf8(all_parsers):
     """
     GH 43787

From be188c25d8f38ef5adf7cdfb4d1faffd71f8e944 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Wed, 22 Feb 2023 18:06:41 -0500
Subject: [PATCH 2/3] fix tests and whatsnew

---
 doc/source/whatsnew/v2.0.0.rst                    | 4 ++--
 pandas/tests/io/parser/dtypes/test_categorical.py | 1 -
 pandas/tests/io/parser/test_encoding.py           | 9 ++-------
 3 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 249d951bae04b..67d7c1c4af2b2 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -299,7 +299,7 @@ Other enhancements
 - Added :meth:`Series.dt.unit` and :meth:`Series.dt.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`51223`)
 - Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`)
 - Added support for SQLAlchemy 2.0 (:issue:`40686`)
--
+- Support ``decimal`` parameter when ``engine="pyarrow"`` in :func:`read_csv` (:issue:`51302`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.notable_bug_fixes:
@@ -1283,7 +1283,7 @@ I/O
 - Bug in :meth:`DataFrame.to_dict` not converting ``NA`` to ``None`` (:issue:`50795`)
 - Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`)
 - Bug in :func:`read_xml` where file-like objects failed when iterparse is used (:issue:`50641`)
-- Fixed handling of ``encoding`` and ``decimal`` parameters when ``engine="pyarrow"`` in :func:`read_csv` (:issue:`51302`)
+- Fixed handling of ``encoding`` parameter when ``engine="pyarrow"`` in :func:`read_csv` (:issue:`51302`)
 
 Period
 ^^^^^^
diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py
index a0deebecdfff8..33422d41c2f93 100644
--- a/pandas/tests/io/parser/dtypes/test_categorical.py
+++ b/pandas/tests/io/parser/dtypes/test_categorical.py
@@ -118,7 +118,6 @@ def test_categorical_dtype_high_cardinality_numeric(all_parsers):
     tm.assert_frame_equal(actual, expected)
 
 
-@xfail_pyarrow
 def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
     # see gh-10153
     pth = os.path.join(csv_dir_path, "utf16_ex.txt")
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
index c4f7abe27a8bd..f537c2f0681d7 100644
--- a/pandas/tests/io/parser/test_encoding.py
+++ b/pandas/tests/io/parser/test_encoding.py
@@ -13,8 +13,6 @@
 import numpy as np
 import pytest
 
-from pandas.compat import pa_version_under8p0
-
 from pandas import (
     DataFrame,
     read_csv,
@@ -94,7 +92,6 @@ def test_unicode_encoding(all_parsers, csv_dir_path):
     assert got == expected
 
 
-# @xfail_pyarrow
 @pytest.mark.parametrize(
     "data,kwargs,expected",
     [
@@ -126,13 +123,12 @@ def _encode_data_with_bom(_data):
 
     if (
         parser.engine == "pyarrow"
-        and pa_version_under8p0
         and data == "\n1"
-        and kwargs.get("skip_blank_lines", False)
+        and kwargs.get("skip_blank_lines", True)
     ):
         # Manually xfail, since we don't have mechanism to xfail specific version
         request.node.add_marker(
-            pytest.mark.xfail(reason="This test fails on pyarrow < 8")
+            pytest.mark.xfail(reason="Pyarrow can't read blank lines")
         )
 
     result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
@@ -151,7 +147,6 @@ def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
 @pytest.mark.parametrize(
     "file_path,encoding",
     [

From b0a58dd3bc3ae6857b49f79ee4ba6c3019f907c0 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Wed, 15 Mar 2023 11:49:03 -0400
Subject: [PATCH 3/3] Update v2.0.0.rst

---
 doc/source/whatsnew/v2.0.0.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index b5b7b39c52e46..7a88d9ab8e3a2 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -280,7 +280,7 @@ Other enhancements
 - :func:`to_datetime` now accepts ``"mixed"`` as an argument to ``format``, which will infer the format for each element individually (:issue:`50972`)
 - Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`)
 - Added support for SQLAlchemy 2.0 (:issue:`40686`)
-- Support ``decimal`` parameter when ``engine="pyarrow"`` in :func:`read_csv` (:issue:`51302`)
+- Added support for the ``decimal`` parameter when ``engine="pyarrow"`` in :func:`read_csv` (:issue:`51302`)
 - :class:`Index` set operations :meth:`Index.union`, :meth:`Index.intersection`, :meth:`Index.difference`, and :meth:`Index.symmetric_difference` now support ``sort=True``, which will always return a sorted result, unlike the default ``sort=None`` which does not sort in some cases (:issue:`25151`)
 - Added new escape mode "latex-math" to avoid escaping "$" in formatter (:issue:`50040`)
 
@@ -1294,7 +1294,7 @@ I/O
 - Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`)
 - Bug in :meth:`DataFrame.to_html` with ``na_rep`` set when the :class:`DataFrame` contains non-scalar data (:issue:`47103`)
 - Bug in :func:`read_xml` where file-like objects failed when iterparse is used (:issue:`50641`)
-- Fixed handling of ``encoding`` parameter when ``engine="pyarrow"`` in :func:`read_csv` (:issue:`51302`)
+- Bug in :func:`read_csv` when ``engine="pyarrow"`` where the ``encoding`` parameter was not handled correctly (:issue:`51302`)
 - Bug in :func:`read_xml` ignored repeated elements when iterparse is used (:issue:`51183`)
 - Bug in :class:`ExcelWriter` leaving file handles open if an exception occurred during instantiation (:issue:`51443`)