diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 852c1e0d139e5..35058ba03ade8 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -206,7 +206,7 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): fname = "__test__.csv" - params = ([None, 10000], ["c", "python"]) + params = ([None, 10000], ["c", "python", "pyarrow"]) param_names = ["skiprows", "engine"] def setup(self, skiprows, engine): @@ -320,7 +320,7 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): class ReadCSVEngine(StringIORewind): - params = ["c", "python"] + params = ["c", "python", "pyarrow"] param_names = ["engine"] def setup(self, engine): diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 939fd5b832cef..1f1556123db17 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -160,9 +160,15 @@ dtype : Type name or dict of column -> type, default ``None`` (unsupported with ``engine='python'``). Use ``str`` or ``object`` together with suitable ``na_values`` settings to preserve and not interpret dtype. -engine : {``'c'``, ``'python'``} - Parser engine to use. The C engine is faster while the Python engine is - currently more feature-complete. +engine : {``'c'``, ``'python'``, ``'pyarrow'``} + Parser engine to use. The C and pyarrow engines are faster, while the python engine + is currently more feature-complete. Multithreading is currently only supported by + the pyarrow engine. + + .. versionadded:: 1.4.0 + + The "pyarrow" engine was added as an *experimental* engine, and some features + are unsupported, or may not work correctly, with this engine. converters : dict, default ``None`` Dict of functions for converting values in certain columns. Keys can either be integers or column labels. 
@@ -1622,11 +1628,17 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object: Specifying the parser engine '''''''''''''''''''''''''''' -Under the hood pandas uses a fast and efficient parser implemented in C as well -as a Python implementation which is currently more feature-complete. Where -possible pandas uses the C parser (specified as ``engine='c'``), but may fall -back to Python if C-unsupported options are specified. Currently, C-unsupported -options include: +Pandas currently supports three engines: the C engine, the python engine, and an experimental +pyarrow engine (requires the ``pyarrow`` package). In general, the pyarrow engine is fastest +on larger workloads and is equivalent in speed to the C engine on most other workloads. +The python engine tends to be slower than the pyarrow and C engines on most workloads. However, +the pyarrow engine is much less robust than the C engine, which in turn lacks a few features compared to the +Python engine. + +Where possible, pandas uses the C parser (specified as ``engine='c'``), but it may fall +back to Python if C-unsupported options are specified. + +Currently, options unsupported by the C and pyarrow engines include: * ``sep`` other than a single character (e.g. regex separators) * ``skipfooter`` @@ -1635,6 +1647,32 @@ options include: Specifying any of the above options will produce a ``ParserWarning`` unless the python engine is selected explicitly using ``engine='python'``. 
+Options that are unsupported by the pyarrow engine that are not covered by the list above include: + +* ``float_precision`` +* ``chunksize`` +* ``comment`` +* ``nrows`` +* ``thousands`` +* ``memory_map`` +* ``dialect`` +* ``warn_bad_lines`` +* ``error_bad_lines`` +* ``on_bad_lines`` +* ``delim_whitespace`` +* ``quoting`` +* ``lineterminator`` +* ``converters`` +* ``decimal`` +* ``iterator`` +* ``dayfirst`` +* ``infer_datetime_format`` +* ``verbose`` +* ``skipinitialspace`` +* ``low_memory`` + +Specifying these options with ``engine='pyarrow'`` will raise a ``ValueError``. + .. _io.remote: Reading/writing remote files diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index fc488504f1fdf..450ecc85c725b 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -78,10 +78,13 @@ Styler There are also bug fixes and deprecations listed below. -.. _whatsnew_140.enhancements.enhancement2: +.. _whatsnew_140.enhancements.pyarrow_csv_engine: -enhancement2 -^^^^^^^^^^^^ +Multithreaded CSV reading with a new CSV Engine based on pyarrow +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`pandas.read_csv` now accepts ``engine="pyarrow"`` (requires at least ``pyarrow`` 0.17.0) as an argument, allowing for faster csv parsing on multicore machines +with pyarrow installed. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`) .. 
_whatsnew_140.enhancements.other: diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py new file mode 100644 index 0000000000000..033cd88da9687 --- /dev/null +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -0,0 +1,138 @@ +from __future__ import annotations + +from pandas._typing import FilePathOrBuffer +from pandas.compat._optional import import_optional_dependency + +from pandas.core.dtypes.inference import is_integer + +from pandas.core.frame import DataFrame + +from pandas.io.common import get_handle +from pandas.io.parsers.base_parser import ParserBase + + +class ArrowParserWrapper(ParserBase): + """ + Wrapper for the pyarrow engine for read_csv() + """ + + def __init__(self, src: FilePathOrBuffer, **kwds): + self.kwds = kwds + self.src = src + + ParserBase.__init__(self, kwds) + + self._parse_kwds() + + def _parse_kwds(self): + """ + Validates keywords before passing to pyarrow. + """ + encoding: str | None = self.kwds.get("encoding") + self.encoding = "utf-8" if encoding is None else encoding + + self.usecols, self.usecols_dtype = self._validate_usecols_arg( + self.kwds["usecols"] + ) + na_values = self.kwds["na_values"] + if isinstance(na_values, dict): + raise ValueError( + "The pyarrow engine doesn't support passing a dict for na_values" + ) + self.na_values = list(self.kwds["na_values"]) + + def _get_pyarrow_options(self): + """ + Rename some arguments to pass to pyarrow + """ + mapping = { + "usecols": "include_columns", + "na_values": "null_values", + "escapechar": "escape_char", + "skip_blank_lines": "ignore_empty_lines", + } + for pandas_name, pyarrow_name in mapping.items(): + if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None: + self.kwds[pyarrow_name] = self.kwds.pop(pandas_name) + + self.parse_options = { + option_name: option_value + for option_name, option_value in self.kwds.items() + if option_value is not None + and option_name + in ("delimiter", "quote_char", "escape_char", 
"ignore_empty_lines") + } + self.convert_options = { + option_name: option_value + for option_name, option_value in self.kwds.items() + if option_value is not None + and option_name + in ("include_columns", "null_values", "true_values", "false_values") + } + self.read_options = { + "autogenerate_column_names": self.header is None, + "skip_rows": self.header + if self.header is not None + else self.kwds["skiprows"], + } + + def _finalize_output(self, frame: DataFrame) -> DataFrame: + """ + Processes data read in based on kwargs. + + Parameters + ---------- + frame: DataFrame + The DataFrame to process. + + Returns + ------- + DataFrame + The processed DataFrame. + """ + num_cols = len(frame.columns) + if self.header is None: + if self.names is None: + if self.prefix is not None: + self.names = [f"{self.prefix}{i}" for i in range(num_cols)] + elif self.header is None: + self.names = range(num_cols) + frame.columns = self.names + # we only need the frame not the names + frame.columns, frame = self._do_date_conversions(frame.columns, frame) + if self.index_col is not None: + for i, item in enumerate(self.index_col): + if is_integer(item): + self.index_col[i] = frame.columns[item] + frame.set_index(self.index_col, drop=True, inplace=True) + + if self.kwds.get("dtype") is not None: + frame = frame.astype(self.kwds.get("dtype")) + return frame + + def read(self) -> DataFrame: + """ + Reads the contents of a CSV file into a DataFrame and + processes it according to the kwargs passed in the + constructor. + + Returns + ------- + DataFrame + The DataFrame created from the CSV file. 
+ """ + pyarrow_csv = import_optional_dependency("pyarrow.csv") + self._get_pyarrow_options() + + with get_handle( + self.src, "rb", encoding=self.encoding, is_text=False + ) as handles: + table = pyarrow_csv.read_csv( + handles.handle, + read_options=pyarrow_csv.ReadOptions(**self.read_options), + parse_options=pyarrow_csv.ParseOptions(**self.parse_options), + convert_options=pyarrow_csv.ConvertOptions(**self.convert_options), + ) + + frame = table.to_pandas() + return self._finalize_output(frame) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index c639a4a9d494e..b26d8c293cde9 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -42,6 +42,7 @@ from pandas.core.indexes.api import RangeIndex from pandas.io.common import validate_header_arg +from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper from pandas.io.parsers.base_parser import ( ParserBase, is_index_col, @@ -143,9 +144,15 @@ to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. -engine : {{'c', 'python'}}, optional - Parser engine to use. The C engine is faster while the python engine is - currently more feature-complete. +engine : {{'c', 'python', 'pyarrow'}}, optional + Parser engine to use. The C and pyarrow engines are faster, while the python engine + is currently more feature-complete. Multithreading is currently only supported by + the pyarrow engine. + + .. versionadded:: 1.4.0 + + The "pyarrow" engine was added as an *experimental* engine, and some features + are unsupported, or may not work correctly, with this engine. converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. 
@@ -406,6 +413,33 @@ _c_unsupported = {"skipfooter"} _python_unsupported = {"low_memory", "float_precision"} +_pyarrow_unsupported = { + "skipfooter", + "float_precision", + "chunksize", + "comment", + "nrows", + "thousands", + "memory_map", + "dialect", + "warn_bad_lines", + "error_bad_lines", + # TODO(1.4) + # This doesn't error properly ATM, fix for release + # but not blocker for initial PR + # "on_bad_lines", + "delim_whitespace", + "quoting", + "lineterminator", + "converters", + "decimal", + "iterator", + "dayfirst", + "infer_datetime_format", + "verbose", + "skipinitialspace", + "low_memory", +} _deprecated_defaults: dict[str, Any] = {"error_bad_lines": None, "warn_bad_lines": None} _deprecated_args: set[str] = {"error_bad_lines", "warn_bad_lines"} @@ -472,7 +506,20 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # Extract some of the arguments (pass chunksize on). iterator = kwds.get("iterator", False) - chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1) + chunksize = kwds.get("chunksize", None) + if kwds.get("engine") == "pyarrow": + if iterator: + raise ValueError( + "The 'iterator' option is not supported with the 'pyarrow' engine" + ) + + if chunksize is not None: + raise ValueError( + "The 'chunksize' option is not supported with the 'pyarrow' engine" + ) + else: + chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1) + nrows = kwds.get("nrows", None) # Check for duplicates in names. 
@@ -785,6 +832,10 @@ def __init__(self, f, engine=None, **kwds): dialect = _extract_dialect(kwds) if dialect is not None: + if engine == "pyarrow": + raise ValueError( + "The 'dialect' option is not supported with the 'pyarrow' engine" + ) kwds = _merge_with_dialect_properties(dialect, kwds) if kwds.get("header", "infer") == "infer": @@ -823,7 +874,17 @@ def _get_options_with_defaults(self, engine): value = kwds.get(argname, default) # see gh-12935 - if argname == "mangle_dupe_cols" and not value: + if ( + engine == "pyarrow" + and argname in _pyarrow_unsupported + and value != default + ): + raise ValueError( + f"The {repr(argname)} option is not supported with the " + f"'pyarrow' engine" + ) + elif argname == "mangle_dupe_cols" and value is False: + # GH12935 raise ValueError("Setting mangle_dupe_cols=False is not supported yet") else: options[argname] = value @@ -878,9 +939,9 @@ def _clean_options(self, options, engine): delim_whitespace = options["delim_whitespace"] if sep is None and not delim_whitespace: - if engine == "c": + if engine in ("c", "pyarrow"): fallback_reason = ( - "the 'c' engine does not support " + f"the '{engine}' engine does not support " "sep=None with delim_whitespace=False" ) engine = "python" @@ -891,7 +952,7 @@ def _clean_options(self, options, engine): elif engine not in ("python", "python-fwf"): # wait until regex engine integrated fallback_reason = ( - "the 'c' engine does not support " + f"the '{engine}' engine does not support " "regex separators (separators > 1 char and " r"different from '\s+' are interpreted as regex)" ) @@ -910,7 +971,7 @@ def _clean_options(self, options, engine): if not encodeable and engine not in ("python", "python-fwf"): fallback_reason = ( f"the separator encoded in {encoding} " - "is > 1 char long, and the 'c' engine " + f"is > 1 char long, and the '{engine}' engine " "does not support such separators" ) engine = "python" @@ -925,7 +986,7 @@ def _clean_options(self, options, engine): fallback_reason = ( 
"ord(quotechar) > 127, meaning the " "quotechar is larger than one byte, " - "and the 'c' engine does not support such quotechars" + f"and the '{engine}' engine does not support such quotechars" ) engine = "python" @@ -1001,8 +1062,15 @@ def _clean_options(self, options, engine): na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) # handle skiprows; this is internally handled by the - # c-engine, so only need for python parsers - if engine != "c": + # c-engine, so only need for python and pyarrow parsers + if engine == "pyarrow": + if not is_integer(skiprows) and skiprows is not None: + # pyarrow expects skiprows to be passed as an integer + raise ValueError( + "skiprows argument must be an integer when using " + "engine='pyarrow'" + ) + else: if is_integer(skiprows): skiprows = list(range(skiprows)) if skiprows is None: @@ -1030,6 +1098,7 @@ def _make_engine(self, engine="c"): mapping: dict[str, type[ParserBase]] = { "c": CParserWrapper, "python": PythonParser, + "pyarrow": ArrowParserWrapper, "python-fwf": FixedWidthFieldParser, } if engine not in mapping: @@ -1043,22 +1112,25 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - nrows = validate_integer("nrows", nrows) - index, columns, col_dict = self._engine.read(nrows) - - if index is None: - if col_dict: - # Any column is actually fine: - new_rows = len(next(iter(col_dict.values()))) - index = RangeIndex(self._currow, self._currow + new_rows) - else: - new_rows = 0 + if self.engine == "pyarrow": + df = self._engine.read() else: - new_rows = len(index) + nrows = validate_integer("nrows", nrows) + index, columns, col_dict = self._engine.read(nrows) + + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) + else: + new_rows = 0 + else: + new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + df = 
DataFrame(col_dict, columns=columns, index=index) - self._currow += new_rows + self._currow += new_rows if self.squeeze and len(df.columns) == 1: return df[df.columns[0]].copy() diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 86891367e9bd6..e8a8769bc6291 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -15,6 +15,8 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize("index_col", [0, "index"]) def test_read_chunksize_with_index(all_parsers, index_col): diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index b2e528aa5f8d5..841df0ea7e470 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -30,6 +30,9 @@ from pandas.io.parsers import TextFileReader from pandas.io.parsers.c_parser_wrapper import CParserWrapper +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + def test_override_set_noconvert_columns(): # see gh-17351 @@ -112,6 +115,7 @@ def test_read_csv_local(all_parsers, csv1): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_1000_sep(all_parsers): parser = all_parsers data = """A|B|C @@ -143,6 +147,7 @@ def test_squeeze(all_parsers): assert not result._is_view +@xfail_pyarrow def test_unnamed_columns(all_parsers): data = """A,B,C,, 1,2,3,4,5 @@ -171,6 +176,7 @@ def test_csv_mixed_type(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_low_memory_no_rows_with_index(all_parsers): # see gh-21141 parser = all_parsers @@ -219,6 +225,7 @@ def test_read_csv_dataframe(all_parsers, csv1): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("nrows", [3, 3.0]) def test_read_nrows(all_parsers, nrows): 
# see gh-10476 @@ -240,6 +247,7 @@ def test_read_nrows(all_parsers, nrows): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("nrows", [1.2, "foo", -1]) def test_read_nrows_bad(all_parsers, nrows): data = """index,A,B,C,D @@ -266,6 +274,7 @@ def test_nrows_skipfooter_errors(all_parsers): parser.read_csv(StringIO(data), skipfooter=1, nrows=5) +@xfail_pyarrow def test_missing_trailing_delimiters(all_parsers): parser = all_parsers data = """A,B,C,D @@ -281,6 +290,7 @@ def test_missing_trailing_delimiters(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_skip_initial_space(all_parsers): data = ( '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' @@ -341,6 +351,7 @@ def test_skip_initial_space(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_trailing_delimiters(all_parsers): # see gh-2442 data = """A,B,C @@ -372,6 +383,7 @@ def test_escapechar(all_parsers): tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) +@xfail_pyarrow def test_ignore_leading_whitespace(all_parsers): # see gh-3374, gh-6607 parser = all_parsers @@ -382,6 +394,7 @@ def test_ignore_leading_whitespace(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) def test_uneven_lines_with_usecols(all_parsers, usecols): # see gh-12203 @@ -404,6 +417,7 @@ def test_uneven_lines_with_usecols(all_parsers, usecols): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -436,6 +450,7 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -478,6 +493,7 @@ def test_raise_on_sep_with_delim_whitespace(all_parsers): parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) +@xfail_pyarrow @pytest.mark.parametrize("delim_whitespace", 
[True, False]) def test_single_char_leading_whitespace(all_parsers, delim_whitespace): # see gh-9710 @@ -496,6 +512,8 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace): tm.assert_frame_equal(result, expected) +# Skip for now, actually only one test fails though, but its tricky to xfail +@skip_pyarrow @pytest.mark.parametrize( "sep,skip_blank_lines,exp_data", [ @@ -535,6 +553,7 @@ def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_whitespace_lines(all_parsers): parser = all_parsers data = """ @@ -550,6 +569,7 @@ def test_whitespace_lines(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -647,6 +667,7 @@ def test_read_csv_and_table_sys_setprofile(all_parsers, read_func): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_first_row_bom(all_parsers): # see gh-26545 parser = all_parsers @@ -657,6 +678,7 @@ def test_first_row_bom(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_first_row_bom_unquoted(all_parsers): # see gh-36343 parser = all_parsers @@ -667,6 +689,7 @@ def test_first_row_bom_unquoted(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("nrows", range(1, 6)) def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): # GH 28071 @@ -680,6 +703,7 @@ def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): tm.assert_frame_equal(df, ref[:nrows]) +@xfail_pyarrow def test_no_header_two_extra_columns(all_parsers): # GH 26218 column_names = ["one", "two", "three"] @@ -701,6 +725,7 @@ def test_read_csv_names_not_accepting_sets(all_parsers): parser.read_csv(StringIO(data), names=set("QAZ")) +@xfail_pyarrow def test_read_table_delim_whitespace_default_sep(all_parsers): # GH: 35958 f = StringIO("a b c\n1 -2 -3\n4 5 6") @@ -784,6 +809,7 @@ def 
test_names_and_prefix_explicit_None(all_parsers, names, prefix, func): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_dict_keys_as_names(all_parsers): # GH: 36928 data = "1,2" @@ -796,6 +822,7 @@ def test_dict_keys_as_names(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_encoding_surrogatepass(all_parsers): # GH39017 parser = all_parsers @@ -814,6 +841,7 @@ def test_encoding_surrogatepass(all_parsers): parser.read_csv(path) +@xfail_pyarrow @pytest.mark.parametrize("on_bad_lines", ["error", "warn"]) def test_deprecated_bad_lines_warns(all_parsers, csv1, on_bad_lines): # GH 15122 diff --git a/pandas/tests/io/parser/common/test_data_list.py b/pandas/tests/io/parser/common/test_data_list.py index 92b8c864f1619..8d484bba1cb9d 100644 --- a/pandas/tests/io/parser/common/test_data_list.py +++ b/pandas/tests/io/parser/common/test_data_list.py @@ -5,12 +5,17 @@ import csv from io import StringIO +import pytest + from pandas import DataFrame import pandas._testing as tm from pandas.io.parsers import TextParser +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_read_data_list(all_parsers): parser = all_parsers kwargs = {"index_col": 0} diff --git a/pandas/tests/io/parser/common/test_decimal.py b/pandas/tests/io/parser/common/test_decimal.py index 7ca9f253bd501..ab58ddff9c06e 100644 --- a/pandas/tests/io/parser/common/test_decimal.py +++ b/pandas/tests/io/parser/common/test_decimal.py @@ -9,6 +9,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize( "data,thousands,decimal", diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index 2a3d7328aa662..11ef9d7d69122 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -21,6 +21,10 @@ from pandas import DataFrame import 
pandas._testing as tm +# TODO(1.4) Please xfail individual tests at release time +# instead of skip +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @tm.network def test_url(all_parsers, csv_dir_path): diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py index 29aa387e2b045..0fa85a1f5f8ec 100644 --- a/pandas/tests/io/parser/common/test_float.py +++ b/pandas/tests/io/parser/common/test_float.py @@ -12,6 +12,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_float_parser(all_parsers): # see gh-9565 diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 6e7022cd87875..a37bd010d0e1b 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -15,6 +15,8 @@ ) import pandas._testing as tm +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.mark.parametrize( "data,kwargs,expected", @@ -78,6 +80,7 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_no_level_names(all_parsers, index_col): data = """index1,index2,A,B,C,D @@ -102,6 +105,7 @@ def test_multi_index_no_level_names(all_parsers, index_col): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multi_index_no_level_names_implicit(all_parsers): parser = all_parsers data = """A,B,C,D @@ -135,6 +139,7 @@ def test_multi_index_no_level_names_implicit(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected,header", [ @@ -156,6 +161,7 @@ def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_no_unnamed_index(all_parsers): parser = all_parsers data = """ id c0 c1 c2 @@ 
-198,6 +204,7 @@ def test_read_duplicate_index_explicit(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_duplicate_index_implicit(all_parsers): data = """A,B,C,D foo,2,3,4,5 @@ -225,6 +232,7 @@ def test_read_duplicate_index_implicit(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_no_index_name(all_parsers, csv_dir_path): parser = all_parsers csv2 = os.path.join(csv_dir_path, "test2.csv") @@ -252,6 +260,7 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_index(all_parsers): # see gh-10184 data = "x,y" @@ -262,6 +271,7 @@ def test_empty_with_index(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_multi_index(all_parsers): # see gh-10467 data = "x,y,z" @@ -274,6 +284,7 @@ def test_empty_with_multi_index(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_reversed_multi_index(all_parsers): data = "x,y,z" parser = all_parsers diff --git a/pandas/tests/io/parser/common/test_inf.py b/pandas/tests/io/parser/common/test_inf.py index 52fbdedd138fb..d43fb2f5187e1 100644 --- a/pandas/tests/io/parser/common/test_inf.py +++ b/pandas/tests/io/parser/common/test_inf.py @@ -13,7 +13,10 @@ ) import pandas._testing as tm +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_inf_parsing(all_parsers, na_filter): parser = all_parsers @@ -37,6 +40,7 @@ def test_inf_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_infinity_parsing(all_parsers, na_filter): parser = all_parsers diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index febeef695aafb..0026bdc3c0ae3 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ 
b/pandas/tests/io/parser/common/test_ints.py @@ -13,6 +13,9 @@ ) import pandas._testing as tm +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + def test_int_conversion(all_parsers): data = """A,B @@ -94,6 +97,7 @@ def test_parse_integers_above_fp_precision(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow # Flaky @pytest.mark.parametrize("sep", [" ", r"\s+"]) def test_integer_overflow_bug(all_parsers, sep): # see gh-2601 @@ -115,6 +119,7 @@ def test_int64_min_issues(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) def test_int64_overflow(all_parsers, conv): data = """ID @@ -158,6 +163,7 @@ def test_int64_overflow(all_parsers, conv): parser.read_csv(StringIO(data), converters={"ID": conv}) +@xfail_pyarrow @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] ) @@ -171,6 +177,7 @@ def test_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] ) @@ -184,6 +191,7 @@ def test_outside_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("exp_data", [[str(-1), str(2 ** 63)], [str(2 ** 63), str(-1)]]) def test_numeric_range_too_wide(all_parsers, exp_data): # No numerical dtype can hold both negative and uint64 diff --git a/pandas/tests/io/parser/common/test_iterator.py b/pandas/tests/io/parser/common/test_iterator.py index 5ae1d80589df9..06ce0687e513a 100644 --- a/pandas/tests/io/parser/common/test_iterator.py +++ b/pandas/tests/io/parser/common/test_iterator.py @@ -13,6 +13,8 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_iterator(all_parsers): # see gh-6607 diff --git 
a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index f5438ea3f0296..90816a2955d5a 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -21,6 +21,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_empty_decimal_marker(all_parsers): data = """A|B|C diff --git a/pandas/tests/io/parser/common/test_verbose.py b/pandas/tests/io/parser/common/test_verbose.py index fdd905b48ea1e..335065db974dc 100644 --- a/pandas/tests/io/parser/common/test_verbose.py +++ b/pandas/tests/io/parser/common/test_verbose.py @@ -4,6 +4,10 @@ """ from io import StringIO +import pytest + +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_verbose_read(all_parsers, capsys): parser = all_parsers diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index e11746c118ff7..372034e552b25 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -4,6 +4,8 @@ import pytest +from pandas.compat._optional import VERSIONS + from pandas import ( read_csv, read_table, @@ -48,6 +50,11 @@ class PythonParser(BaseParser): float_precision_choices = [None] +class PyArrowParser(BaseParser): + engine = "pyarrow" + float_precision_choices = [None] + + @pytest.fixture def csv_dir_path(datapath): """ @@ -67,14 +74,19 @@ def csv1(datapath): _cParserHighMemory = CParserHighMemory() _cParserLowMemory = CParserLowMemory() _pythonParser = PythonParser() +_pyarrowParser = PyArrowParser() _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_all_parsers = [*_c_parsers_only, *_py_parsers_only] +_pyarrow_parsers_only = [_pyarrowParser] + +_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _py_parser_ids = ["python"] _c_parser_ids = ["c_high", "c_low"] -_all_parser_ids = [*_c_parser_ids, 
*_py_parser_ids] +_pyarrow_parsers_ids = ["pyarrow"] + +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parsers_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) @@ -82,6 +94,12 @@ def all_parsers(request): """ Fixture all of the CSV parsers. """ + if request.param.engine == "pyarrow": + pytest.importorskip("pyarrow", VERSIONS["pyarrow"]) + # Try setting num cpus to 1 to avoid hangs? + import pyarrow + + pyarrow.set_cpu_count(1) return request.param @@ -101,6 +119,14 @@ def python_parser_only(request): return request.param +@pytest.fixture(params=_pyarrow_parsers_only, ids=_pyarrow_parsers_ids) +def pyarrow_parser_only(request): + """ + Fixture all of the CSV parsers using the Pyarrow engine. + """ + return request.param + + def _get_all_parser_float_precision_combinations(): """ Return all allowable parser and float precision @@ -207,3 +233,36 @@ def numeric_decimal(request): represents the value to read while the second represents the expected result. """ return request.param + + +@pytest.fixture +def pyarrow_xfail(request): + """ + Fixture that xfails a test if the engine is pyarrow. + """ + if "all_parsers" in request.fixturenames: + parser = request.getfixturevalue("all_parsers") + elif "all_parsers_all_precisions" in request.fixturenames: + # Return value is tuple of (engine, precision) + parser = request.getfixturevalue("all_parsers_all_precisions")[0] + else: + return + if parser.engine == "pyarrow": + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.node.add_marker(mark) + + +@pytest.fixture +def pyarrow_skip(request): + """ + Fixture that skips a test if the engine is pyarrow. 
+ """ + if "all_parsers" in request.fixturenames: + parser = request.getfixturevalue("all_parsers") + elif "all_parsers_all_precisions" in request.fixturenames: + # Return value is tuple of (engine, precision) + parser = request.getfixturevalue("all_parsers_all_precisions")[0] + else: + return + if parser.engine == "pyarrow": + pytest.skip("pyarrow doesn't support this.") diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py index f956403197cf5..7e04dd570e812 100644 --- a/pandas/tests/io/parser/dtypes/test_categorical.py +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -18,7 +18,11 @@ ) import pandas._testing as tm +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + +@xfail_pyarrow @pytest.mark.parametrize( "dtype", [ @@ -45,6 +49,7 @@ def test_categorical_dtype(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@skip_pyarrow # Flaky @pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) def test_categorical_dtype_single(all_parsers, dtype): # see gh-10153 @@ -60,6 +65,7 @@ def test_categorical_dtype_single(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_unsorted(all_parsers): # see gh-10153 parser = all_parsers @@ -78,6 +84,7 @@ def test_categorical_dtype_unsorted(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_missing(all_parsers): # see gh-10153 parser = all_parsers @@ -96,6 +103,7 @@ def test_categorical_dtype_missing(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow @pytest.mark.slow def test_categorical_dtype_high_cardinality_numeric(all_parsers): # see gh-18186 @@ -110,6 +118,7 @@ def test_categorical_dtype_high_cardinality_numeric(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_utf16(all_parsers, csv_dir_path): # see gh-10153 
pth = os.path.join(csv_dir_path, "utf16_ex.txt") @@ -124,6 +133,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_chunksize_infer_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -143,6 +153,7 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_chunksize_explicit_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -234,6 +245,7 @@ def test_categorical_coerces_numeric(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow # Flaky def test_categorical_coerces_datetime(all_parsers): parser = all_parsers dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None) @@ -257,6 +269,7 @@ def test_categorical_coerces_timestamp(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_categorical_coerces_timedelta(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 32a7ac44c0b38..ae5ddb83f7052 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -16,6 +16,10 @@ ) import pandas._testing as tm +# TODO(1.4): Change me into xfail at release time +# and xfail individual tests +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) diff --git a/pandas/tests/io/parser/dtypes/test_empty.py b/pandas/tests/io/parser/dtypes/test_empty.py index 200d1b50bfced..ee02af773129a 100644 --- a/pandas/tests/io/parser/dtypes/test_empty.py +++ b/pandas/tests/io/parser/dtypes/test_empty.py @@ -17,6 +17,9 @@ ) import pandas._testing as tm +# TODO(1.4): Change me into individual 
xfails at release time +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_dtype_all_columns_empty(all_parsers): # see gh-12048 diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index d10d8e27a59a5..e2fd30cb9f2c6 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -10,6 +10,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize("na_values", [None, ["NaN"]]) def test_comment(all_parsers, na_values): diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 220d9474c6dbf..e0799df8d7a4c 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -12,6 +12,8 @@ from pandas import DataFrame import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + @pytest.fixture(params=[True, False]) def buffer(request): @@ -29,6 +31,7 @@ def parser_and_data(all_parsers, csv1): return parser, data, expected +@skip_pyarrow @pytest.mark.parametrize("compression", ["zip", "infer", "zip2"]) def test_zip(parser_and_data, compression): parser, data, expected = parser_and_data @@ -46,6 +49,7 @@ def test_zip(parser_and_data, compression): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("compression", ["zip", "infer"]) def test_zip_error_multiple_files(parser_and_data, compression): parser, data, expected = parser_and_data @@ -61,6 +65,7 @@ def test_zip_error_multiple_files(parser_and_data, compression): parser.read_csv(path, compression=compression) +@skip_pyarrow def test_zip_error_no_files(parser_and_data): parser, _, _ = parser_and_data @@ -72,6 +77,7 @@ def test_zip_error_no_files(parser_and_data): parser.read_csv(path, compression="zip") +@skip_pyarrow def test_zip_error_invalid_zip(parser_and_data): parser, _, _ = parser_and_data @@ -81,6 
+87,7 @@ def test_zip_error_invalid_zip(parser_and_data): parser.read_csv(f, compression="zip") +@skip_pyarrow @pytest.mark.parametrize("filename", [None, "test.{ext}"]) def test_compression(parser_and_data, compression_only, buffer, filename): parser, data, expected = parser_and_data @@ -96,6 +103,8 @@ def test_compression(parser_and_data, compression_only, buffer, filename): tm.write_to_compressed(compress_type, path, data) compression = "infer" if filename else compress_type + if ext == "bz2": + pytest.xfail("pyarrow wheels don't have bz2 codec support") if buffer: with open(path, "rb") as f: result = parser.read_csv(f, compression=compression) @@ -105,6 +114,7 @@ def test_compression(parser_and_data, compression_only, buffer, filename): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("ext", [None, "gz", "bz2"]) def test_infer_compression(all_parsers, csv1, buffer, ext): # see gh-9770 @@ -124,6 +134,7 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt): # see gh-18071, gh-24130 parser = all_parsers @@ -141,6 +152,7 @@ def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) def test_invalid_compression(all_parsers, invalid_compression): parser = all_parsers diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 78b64baab4dc0..21933d83ce3f4 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -15,6 +15,8 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_converters_type_must_be_dict(all_parsers): parser = all_parsers diff --git a/pandas/tests/io/parser/test_dialect.py 
b/pandas/tests/io/parser/test_dialect.py index d0ee6add9ca92..55b193903bce0 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -13,6 +13,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.fixture def custom_dialect(): diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 006438df2a5e0..b902a99cc4ea2 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -16,7 +16,10 @@ ) import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + +@skip_pyarrow def test_bytes_io_input(all_parsers): encoding = "cp1255" parser = all_parsers @@ -28,6 +31,7 @@ def test_bytes_io_input(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_read_csv_unicode(all_parsers): parser = all_parsers data = BytesIO("\u0141aski, Jan;1".encode()) @@ -37,6 +41,7 @@ def test_read_csv_unicode(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("sep", [",", "\t"]) @pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) def test_utf16_bom_skiprows(all_parsers, sep, encoding): @@ -71,6 +76,7 @@ def test_utf16_bom_skiprows(all_parsers, sep, encoding): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_utf16_example(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "utf16_ex.txt") parser = all_parsers @@ -78,6 +84,7 @@ def test_utf16_example(all_parsers, csv_dir_path): assert len(result) == 50 +@skip_pyarrow def test_unicode_encoding(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "unicode_series.csv") parser = all_parsers @@ -90,6 +97,7 @@ def test_unicode_encoding(all_parsers, csv_dir_path): assert got == expected +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -123,6 +131,7 @@ def _encode_data_with_bom(_data): 
tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): # see gh-13549 expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) @@ -135,6 +144,7 @@ def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "file_path,encoding", [ @@ -169,6 +179,7 @@ def test_binary_mode_file_buffers( tm.assert_frame_equal(expected, result) +@skip_pyarrow @pytest.mark.parametrize("pass_encoding", [True, False]) def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding): # see gh-24130 @@ -185,6 +196,7 @@ def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding) tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_encoding_named_temp_file(all_parsers): # see gh-31819 parser = all_parsers @@ -222,6 +234,7 @@ def test_parse_encoded_special_characters(encoding): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"]) def test_encoding_memory_map(all_parsers, encoding): # GH40986 diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 3b814360d3aa4..4016ce48be5f6 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -18,7 +18,11 @@ ) import pandas._testing as tm +# TODO(1.4): Change me to xfails at release time +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + +@skip_pyarrow def test_read_with_bad_header(all_parsers): parser = all_parsers msg = r"but only \d+ lines in file" @@ -86,6 +90,7 @@ def test_no_header_prefix(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_header_with_index_col(all_parsers): parser = all_parsers data = """foo,1,2,3 @@ -123,6 +128,7 @@ def test_header_not_first_line(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def 
test_header_multi_index(all_parsers): parser = all_parsers expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) @@ -188,6 +194,7 @@ def test_header_multi_index_invalid(all_parsers, kwargs, msg): _TestTuple = namedtuple("_TestTuple", ["first", "second"]) +@skip_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -235,6 +242,7 @@ def test_header_multi_index_common_format1(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -281,6 +289,7 @@ def test_header_multi_index_common_format2(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -328,6 +337,7 @@ def test_header_multi_index_common_format3(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_header_multi_index_common_format_malformed1(all_parsers): parser = all_parsers expected = DataFrame( @@ -348,6 +358,7 @@ def test_header_multi_index_common_format_malformed1(all_parsers): tm.assert_frame_equal(expected, result) +@skip_pyarrow def test_header_multi_index_common_format_malformed2(all_parsers): parser = all_parsers expected = DataFrame( @@ -369,6 +380,7 @@ def test_header_multi_index_common_format_malformed2(all_parsers): tm.assert_frame_equal(expected, result) +@skip_pyarrow def test_header_multi_index_common_format_malformed3(all_parsers): parser = all_parsers expected = DataFrame( @@ -389,6 +401,7 @@ def test_header_multi_index_common_format_malformed3(all_parsers): tm.assert_frame_equal(expected, result) +@skip_pyarrow def test_header_multi_index_blank_line(all_parsers): # GH 40442 parser = all_parsers @@ -400,6 +413,7 @@ def test_header_multi_index_blank_line(all_parsers): tm.assert_frame_equal(expected, result) +@skip_pyarrow @pytest.mark.parametrize( "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)] ) @@ -412,6 +426,7 @@ def test_header_names_backward_compat(all_parsers, data, header): 
tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("kwargs", [{}, {"index_col": False}]) def test_read_only_header_no_rows(all_parsers, kwargs): # See gh-7773 @@ -457,6 +472,7 @@ def test_non_int_header(all_parsers, header): parser.read_csv(StringIO(data), header=header) +@skip_pyarrow def test_singleton_header(all_parsers): # see gh-7757 data = """a,b,c\n0,1,2\n1,2,3""" @@ -467,6 +483,7 @@ def test_singleton_header(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -513,6 +530,7 @@ def test_mangles_multi_index(all_parsers, data, expected): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("index_col", [None, [0]]) @pytest.mark.parametrize( "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])] @@ -556,6 +574,7 @@ def test_multi_index_unnamed(all_parsers, index_col, columns): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_read_csv_multiindex_columns(all_parsers): # GH#6051 parser = all_parsers diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 2f876a28c56cd..6be82af5349ed 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -15,7 +15,11 @@ ) import pandas._testing as tm +# TODO(1.4): Change me to xfails at release time +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + +@skip_pyarrow @pytest.mark.parametrize("with_header", [True, False]) def test_index_col_named(all_parsers, with_header): parser = all_parsers @@ -70,6 +74,7 @@ def test_index_col_is_true(all_parsers): parser.read_csv(StringIO(data), index_col=True) +@skip_pyarrow def test_infer_index_col(all_parsers): data = """A,B,C foo,1,2,3 @@ -87,6 +92,7 @@ def test_infer_index_col(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "index_col,kwargs", [ @@ -135,6 +141,7 @@ def 
test_index_col_empty_data(all_parsers, index_col, kwargs): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_empty_with_index_col_false(all_parsers): # see gh-10413 data = "x,y" @@ -145,6 +152,7 @@ def test_empty_with_index_col_false(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "index_names", [ @@ -169,6 +177,7 @@ def test_multi_index_naming(all_parsers, index_names): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_multi_index_naming_not_all_at_beginning(all_parsers): parser = all_parsers data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" @@ -183,6 +192,7 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_no_multi_index_level_names_empty(all_parsers): # GH 10984 parser = all_parsers @@ -194,6 +204,7 @@ def test_no_multi_index_level_names_empty(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_header_with_index_col(all_parsers): # GH 33476 parser = all_parsers @@ -217,6 +228,7 @@ def test_header_with_index_col(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.slow def test_index_col_large_csv(all_parsers): # https://github.com/pandas-dev/pandas/issues/37094 @@ -232,6 +244,7 @@ def test_index_col_large_csv(all_parsers): tm.assert_frame_equal(result, df.set_index("a")) +@skip_pyarrow def test_index_col_multiindex_columns_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -247,6 +260,7 @@ def test_index_col_multiindex_columns_no_data(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_index_col_header_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -259,6 +273,7 @@ def test_index_col_header_no_data(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_multiindex_columns_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -269,6 +284,7 @@ def 
test_multiindex_columns_no_data(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_multiindex_columns_index_col_with_data(all_parsers): # GH#38292 parser = all_parsers diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 457a6567febab..6473e6c7670c8 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -10,7 +10,10 @@ from pandas import DataFrame import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + +@skip_pyarrow @pytest.mark.parametrize("kwargs", [{}, {"mangle_dupe_cols": True}]) def test_basic(all_parsers, kwargs): # TODO: add test for condition "mangle_dupe_cols=False" @@ -24,6 +27,7 @@ def test_basic(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_basic_names(all_parsers): # See gh-7160 parser = all_parsers @@ -44,6 +48,7 @@ def test_basic_names_raise(all_parsers): parser.read_csv(StringIO(data), names=["a", "b", "a"]) +@skip_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -72,6 +77,7 @@ def test_thorough_mangle_columns(all_parsers, data, expected): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,names,expected", [ @@ -111,6 +117,7 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) +@skip_pyarrow def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 orig_key = "0" diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index 123dce2048a44..ab278470934a5 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -13,6 +13,10 @@ from pandas import DataFrame import pandas._testing as tm +# We'll probably always skip these for pyarrow +# Maybe we'll add our own tests for pyarrow too +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + 
def _construct_dataframe(num_rows): """ diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 2880bf8690b46..101d3b565712d 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -16,7 +16,10 @@ ) import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + +@skip_pyarrow def test_string_nas(all_parsers): parser = all_parsers data = """A,B,C @@ -32,6 +35,7 @@ def test_string_nas(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_detect_string_na(all_parsers): parser = all_parsers data = """A,B @@ -46,6 +50,7 @@ def test_detect_string_na(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "na_values", [ @@ -83,6 +88,7 @@ def test_non_string_na_values(all_parsers, data, na_values): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_default_na_values(all_parsers): _NA_VALUES = { "-1.#IND", @@ -130,6 +136,7 @@ def f(i, v): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("na_values", ["baz", ["baz"]]) def test_custom_na_values(all_parsers, na_values): parser = all_parsers @@ -163,6 +170,7 @@ def test_bool_na_values(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_na_value_dict(all_parsers): data = """A,B,C foo,bar,NA @@ -181,6 +189,7 @@ def test_na_value_dict(all_parsers): tm.assert_frame_equal(df, expected) +@skip_pyarrow @pytest.mark.parametrize( "index_col,expected", [ @@ -214,6 +223,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -275,6 +285,7 @@ def test_na_values_keep_default(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_no_na_values_no_keep_default(all_parsers): # see gh-4318: passing na_values=None and # 
keep_default_na=False yields 'None" as a na_value @@ -301,6 +312,7 @@ def test_no_na_values_no_keep_default(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_no_keep_default_na_dict_na_values(all_parsers): # see gh-19227 data = "a,b\n,2" @@ -312,6 +324,7 @@ def test_no_keep_default_na_dict_na_values(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_no_keep_default_na_dict_na_scalar_values(all_parsers): # see gh-19227 # @@ -323,6 +336,7 @@ def test_no_keep_default_na_dict_na_scalar_values(all_parsers): tm.assert_frame_equal(df, expected) +@skip_pyarrow @pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): # see gh-19227 @@ -352,6 +366,7 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "na_filter,row_data", [ @@ -373,6 +388,7 @@ def test_na_values_na_filter_override(all_parsers, na_filter, row_data): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_na_trailing_columns(all_parsers): parser = all_parsers data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax @@ -400,6 +416,7 @@ def test_na_trailing_columns(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "na_values,row_data", [ @@ -418,6 +435,7 @@ def test_na_values_scalar(all_parsers, na_values, row_data): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_na_values_dict_aliasing(all_parsers): parser = all_parsers na_values = {"a": 2, "b": 1} @@ -433,6 +451,7 @@ def test_na_values_dict_aliasing(all_parsers): tm.assert_dict_equal(na_values, na_values_copy) +@skip_pyarrow def test_na_values_dict_col_index(all_parsers): # see gh-14203 data = "a\nfoo\n1" @@ -444,6 +463,7 @@ def test_na_values_dict_col_index(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow 
@pytest.mark.parametrize( "data,kwargs,expected", [ @@ -473,6 +493,7 @@ def test_empty_na_values_no_default_with_index(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])] ) @@ -501,6 +522,7 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) +@skip_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): # see gh-20377 @@ -516,6 +538,7 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data, na_values", [ @@ -544,6 +567,7 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): ) +@skip_pyarrow def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers @@ -572,6 +596,7 @@ def test_str_nan_dropped(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_nan_multi_index(all_parsers): # GH 42446 parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 41f0b661611a6..5fbd1db9df96d 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -42,6 +42,8 @@ import pandas.io.date_converters as conv from pandas.io.parsers import read_csv +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + # constant _DEFAULT_DATETIME = datetime(1, 1, 1) diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index 7a07632390eff..6995965467d05 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -13,6 +13,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize( "kwargs,msg", diff --git 
a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 0735f60fabbf6..9df6bf42c55d2 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -17,6 +17,9 @@ ) import pandas._testing as tm +# XFAIL ME PLS once hanging tests issues identified +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize("skiprows", [list(range(6)), 6]) def test_skip_rows_bug(all_parsers, skiprows): diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 2cf3d959acb48..1e5cf49ce24ae 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -121,3 +121,26 @@ def read(self): with pytest.raises(ValueError, match=msg): read_csv(NoNextBuffer(data), engine=python_engine) + + def test_pyarrow_engine(self): + from pandas.io.parsers.readers import _pyarrow_unsupported as pa_unsupported + + data = """1,2,3,, + 1,2,3,4, + 1,2,3,4,5 + 1,2,,, + 1,2,3,4,""" + + for default in pa_unsupported: + msg = ( + f"The {repr(default)} option is not " + f"supported with the 'pyarrow' engine" + ) + kwargs = {default: object()} + default_needs_bool = {"on_bad_lines", "error_bad_lines"} + if default == "dialect": + kwargs[default] = "excel" # test a random dialect + elif default in default_needs_bool: + kwargs[default] = True + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), engine="pyarrow", **kwargs) diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index 44ea3866dd793..50000dab8a7aa 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -22,6 +22,10 @@ "Usecols do not match columns, columns expected but not found: {0}" ) +# TODO(1.4): Change these to xfails whenever parse_dates support(which was +# intentionally disable to keep small PR sizes) is added back 
+pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) def test_usecols_with_parse_dates(all_parsers, usecols): diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 16649be5b8a58..d0080273537bb 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -22,6 +22,9 @@ "Usecols do not match columns, columns expected but not found: {0}" ) +# TODO(1.4): Change to xfails at release time +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_raise_on_mixed_dtype_usecols(all_parsers): # See gh-12678