diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index c927dc2ac4a96..200ba9af508b6 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -270,6 +270,7 @@ Other enhancements - :func:`to_datetime` now accepts ``"mixed"`` as an argument to ``format``, which will infer the format for each element individually (:issue:`50972`) - Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`) - Added support for SQLAlchemy 2.0 (:issue:`40686`) +- Added support for ``decimal`` parameter when ``engine="pyarrow"`` in :func:`read_csv` (:issue:`51302`) - :class:`Index` set operations :meth:`Index.union`, :meth:`Index.intersection`, :meth:`Index.difference`, and :meth:`Index.symmetric_difference` now support ``sort=True``, which will always return a sorted result, unlike the default ``sort=None`` which does not sort in some cases (:issue:`25151`) - Added new escape mode "latex-math" to avoid escaping "$" in formatter (:issue:`50040`) @@ -1287,6 +1288,7 @@ I/O - Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`) - Bug in :meth:`DataFrame.to_html` with ``na_rep`` set when the :class:`DataFrame` contains non-scalar data (:issue:`47103`) - Bug in :func:`read_xml` where file-like objects failed when iterparse is used (:issue:`50641`) +- Bug in :func:`read_csv` when ``engine="pyarrow"`` where ``encoding`` parameter was not handled correctly (:issue:`51302`) - Bug in :func:`read_xml` ignored repeated elements when iterparse is used (:issue:`51183`) - Bug in :class:`ExcelWriter` leaving file handles open if an exception occurred during instantiation (:issue:`51443`) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 30fc65dca7ca1..a741a11332e99 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -54,6 +54,7 @@ def _get_pyarrow_options(self) -> None: "na_values": "null_values", "escapechar": "escape_char", "skip_blank_lines": "ignore_empty_lines", + "decimal": "decimal_point", } for pandas_name, pyarrow_name in mapping.items(): if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None: @@ -71,13 +72,20 @@ def _get_pyarrow_options(self) -> None: for option_name, option_value in self.kwds.items() if option_value is not None and option_name - in ("include_columns", "null_values", "true_values", "false_values") + in ( + "include_columns", + "null_values", + "true_values", + "false_values", + "decimal_point", + ) } self.read_options = { "autogenerate_column_names": self.header is None, "skip_rows": self.header if self.header is not None else self.kwds["skiprows"], + "encoding": self.encoding, } def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 015f27ed4f2c4..df675a0a3a6cc 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -475,7 +475,6 @@ class _Fwf_Defaults(TypedDict): "quoting", "lineterminator", "converters", - "decimal", "iterator", "dayfirst", "verbose", diff --git a/pandas/tests/io/parser/common/test_decimal.py b/pandas/tests/io/parser/common/test_decimal.py index ab58ddff9c06e..72d4eb2c69845 100644 --- a/pandas/tests/io/parser/common/test_decimal.py +++ b/pandas/tests/io/parser/common/test_decimal.py @@ -9,9 +9,10 @@ from pandas import DataFrame import pandas._testing as tm -pytestmark = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +@xfail_pyarrow @pytest.mark.parametrize( "data,thousands,decimal", [ diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py index a0deebecdfff8..33422d41c2f93 100644 --- a/pandas/tests/io/parser/dtypes/test_categorical.py +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -118,7 +118,6 @@ def test_categorical_dtype_high_cardinality_numeric(all_parsers): tm.assert_frame_equal(actual, expected) -@xfail_pyarrow def test_categorical_dtype_utf16(all_parsers, csv_dir_path): # see gh-10153 pth = os.path.join(csv_dir_path, "utf16_ex.txt") diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 775d5571c7a3d..f537c2f0681d7 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -20,9 +20,9 @@ import pandas._testing as tm skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@skip_pyarrow def test_bytes_io_input(all_parsers): encoding = "cp1255" parser = all_parsers @@ -44,7 +44,7 @@ def test_read_csv_unicode(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow @pytest.mark.parametrize("sep", [",", "\t"]) @pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) def test_utf16_bom_skiprows(all_parsers, sep, encoding): @@ -73,7 +73,6 @@ def test_utf16_bom_skiprows(all_parsers, sep, encoding): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_utf16_example(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "utf16_ex.txt") parser = all_parsers @@ -81,7 +80,6 @@ def test_utf16_example(all_parsers, csv_dir_path): assert len(result) == 50 -@skip_pyarrow def test_unicode_encoding(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "unicode_series.csv") parser = all_parsers @@ -94,7 +92,6 @@ def test_unicode_encoding(all_parsers, csv_dir_path): assert got == expected -@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -114,7 +111,7 @@ def test_unicode_encoding(all_parsers, csv_dir_path): ), ], ) -def test_utf8_bom(all_parsers, data, kwargs, expected): +def test_utf8_bom(all_parsers, data, kwargs, expected, request): # see gh-4793 parser = all_parsers bom = "\ufeff" @@ -124,11 +121,20 @@ def _encode_data_with_bom(_data): bom_data = (bom + _data).encode(utf8) return BytesIO(bom_data) + if ( + parser.engine == "pyarrow" + and data == "\n1" + and kwargs.get("skip_blank_lines", True) + ): + # Manually xfail, since we don't have mechanism to xfail specific version + request.node.add_marker( + pytest.mark.xfail(reason="Pyarrow can't read blank lines") + ) + result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs) tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): # see gh-13549 expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) @@ -141,7 +147,6 @@ def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "file_path,encoding", [ @@ -226,7 +231,7 @@ def test_parse_encoded_special_characters(encoding): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow @pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"]) def test_encoding_memory_map(all_parsers, encoding): # GH40986 @@ -244,7 +249,7 @@ def test_encoding_memory_map(all_parsers, encoding): tm.assert_frame_equal(df, expected) -@skip_pyarrow +@xfail_pyarrow def test_chunk_splits_multibyte_char(all_parsers): """ Chunk splits a multibyte character with memory_map=True @@ -264,7 +269,7 @@ def test_chunk_splits_multibyte_char(all_parsers): tm.assert_frame_equal(dfr, df) -@skip_pyarrow +@xfail_pyarrow def test_readcsv_memmap_utf8(all_parsers): """ GH 43787