diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 24d21ad6a633d..287f1d997d665 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -146,7 +146,7 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): fname = "__test__.csv" - params = ([None, 10000], ["c", "python"]) + params = ([None, 10000], ["c", "pyarrow", "python"]) param_names = ["skiprows", "engine"] def setup(self, skiprows, engine): @@ -257,9 +257,18 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): names=list("abc"), ) + def time_read_csv_pyarrow(self, sep, decimal, float_precision): + read_csv( + self.data(self.StringIO_input), + sep=sep, + header=None, + names=list("abc"), + engine="pyarrow", + ) + class ReadCSVEngine(StringIORewind): - params = ["c", "python"] + params = ["c", "python", "pyarrow"] param_names = ["engine"] def setup(self, engine): diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 9c9ad9538f488..a26ddd7c9b888 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -158,9 +158,11 @@ dtype : Type name or dict of column -> type, default ``None`` (unsupported with ``engine='python'``). Use ``str`` or ``object`` together with suitable ``na_values`` settings to preserve and not interpret dtype. -engine : {``'c'``, ``'python'``} - Parser engine to use. The C engine is faster while the Python engine is - currently more feature-complete. +engine : {``'c'``, ``'pyarrow'``, ``'python'``} + Parser engine to use. The pyarrow engine is the most performant, followed by + the C engine, which in turn is faster than the python engine. However, the + pyarrow and C engine are currently less feature complete than their Python + counterpart. converters : dict, default ``None`` Dict of functions for converting values in certain columns. Keys can either be integers or column labels. 
@@ -1602,11 +1604,17 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object: Specifying the parser engine '''''''''''''''''''''''''''' -Under the hood pandas uses a fast and efficient parser implemented in C as well -as a Python implementation which is currently more feature-complete. Where -possible pandas uses the C parser (specified as ``engine='c'``), but may fall -back to Python if C-unsupported options are specified. Currently, C-unsupported -options include: +Pandas currently supports three engines: the C engine, the Python engine, and an optional +pyarrow engine. The pyarrow engine is fastest, followed by the C and Python engines. However, +the pyarrow engine is much less robust than the C engine, and the C engine is less feature-rich +than the Python engine. + +Where possible pandas uses the C parser (specified as ``engine='c'``), but it may fall +back to Python if C-unsupported options are specified. If pyarrow-unsupported options are +specified while using ``engine='pyarrow'``, the parser will raise an error +(a full list of unsupported options is available at ``pandas.io.parsers._pyarrow_unsupported``). + +Currently, C-unsupported options include: * ``sep`` other than a single character (e.g. regex separators) * ``skipfooter`` diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index af11b6543a74b..14dae57f9a7ff 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -13,6 +13,11 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +read_csv() now accepts pyarrow as an engine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`pandas.read_csv` now accepts ``engine="pyarrow"`` as an argument, allowing for faster CSV parsing on multicore machines +with pyarrow installed. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`) .. 
_whatsnew_130.read_csv_json_http_headers: Custom HTTP(s) headers when reading csv or json files diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3058d1eed22b9..d03a7b88473a0 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,7 +5,7 @@ from collections import abc, defaultdict import csv import datetime -from io import StringIO +from io import StringIO, TextIOBase import itertools import re import sys @@ -32,6 +32,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import DtypeArg, FilePathOrBuffer, StorageOptions, Union +from pandas.compat._optional import import_optional_dependency from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -173,9 +174,13 @@ to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. -engine : {{'c', 'python'}}, optional - Parser engine to use. The C engine is faster while the python engine is - currently more feature-complete. +engine : {{'c', 'python', 'pyarrow'}}, optional + Parser engine to use. The C and pyarrow engines are faster, while the python engine + is currently more feature-complete. Multithreading is currently only supported by + the pyarrow engine. + + .. versionchanged:: 1.3 + The "pyarrow" engine was added. converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. @@ -446,8 +451,22 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): kwds["parse_dates"] = True # Extract some of the arguments (pass chunksize on). 
+ iterator = kwds.get("iterator", False) - chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1) + chunksize = kwds.get("chunksize", None) + if kwds.get("engine") == "pyarrow": + if iterator: + raise ValueError( + "The 'iterator' option is not supported with the 'pyarrow' engine" + ) + + if chunksize is not None: + raise ValueError( + "The 'chunksize' option is not supported with the 'pyarrow' engine" + ) + else: + chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1) + nrows = kwds.get("nrows", None) # Check for duplicates in names. @@ -519,6 +538,29 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} +_pyarrow_unsupported = { + "skipfooter", + "float_precision", + "chunksize", + "comment", + "nrows", + "thousands", + "memory_map", + "dialect", + "warn_bad_lines", + "error_bad_lines", + "delim_whitespace", + "quoting", + "lineterminator", + "converters", + "decimal", + "iterator", + "dayfirst", + "infer_datetime_format", + "verbose", + "skipinitialspace", + "low_memory", +} _python_unsupported = {"low_memory", "float_precision"} _deprecated_defaults: Dict[str, Any] = {} @@ -788,6 +830,10 @@ def __init__(self, f, engine=None, **kwds): dialect = _extract_dialect(kwds) if dialect is not None: + if engine == "pyarrow": + raise ValueError( + "The 'dialect' option is not supported with the 'pyarrow' engine" + ) kwds = _merge_with_dialect_properties(dialect, kwds) if kwds.get("header", "infer") == "infer": @@ -823,9 +869,17 @@ def _get_options_with_defaults(self, engine): for argname, default in _parser_defaults.items(): value = kwds.get(argname, default) - - # see gh-12935 - if argname == "mangle_dupe_cols" and not value: + if ( + engine == "pyarrow" + and argname in _pyarrow_unsupported + and value != default + ): + raise ValueError( + f"The {repr(argname)} option is not supported with the " + f"'pyarrow' engine" + ) + 
elif argname == "mangle_dupe_cols" and value is False: + # GH12935 raise ValueError("Setting mangle_dupe_cols=False is not supported yet") else: options[argname] = value @@ -883,9 +937,9 @@ def _clean_options(self, options, engine): delim_whitespace = options["delim_whitespace"] if sep is None and not delim_whitespace: - if engine == "c": + if engine in ("c", "pyarrow"): fallback_reason = ( - "the 'c' engine does not support " + f"the {engine} engine does not support " "sep=None with delim_whitespace=False" ) engine = "python" @@ -896,7 +950,7 @@ def _clean_options(self, options, engine): elif engine not in ("python", "python-fwf"): # wait until regex engine integrated fallback_reason = ( - "the 'c' engine does not support " + f"the '{engine}' engine does not support " "regex separators (separators > 1 char and " r"different from '\s+' are interpreted as regex)" ) @@ -1006,14 +1060,22 @@ def _clean_options(self, options, engine): na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) # handle skiprows; this is internally handled by the - # c-engine, so only need for python parsers + # c-engine, so only need for python and pyarrow parsers if engine != "c": - if is_integer(skiprows): - skiprows = list(range(skiprows)) - if skiprows is None: - skiprows = set() - elif not callable(skiprows): - skiprows = set(skiprows) + if engine == "pyarrow": + if not is_integer(skiprows) and skiprows is not None: + # pyarrow expects skiprows to be passed as an integer + raise ValueError( + "skiprows argument must be an integer when using " + "engine='pyarrow'" + ) + else: + if is_integer(skiprows): + skiprows = list(range(skiprows)) + if skiprows is None: + skiprows = set() + elif not callable(skiprows): + skiprows = set(skiprows) # put stuff back result["names"] = names @@ -1035,6 +1097,7 @@ def _make_engine(self, engine="c"): mapping: Dict[str, Type[ParserBase]] = { "c": CParserWrapper, "python": PythonParser, + "pyarrow": ArrowParserWrapper, "python-fwf": 
FixedWidthFieldParser, } if engine not in mapping: @@ -1048,22 +1111,25 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - nrows = validate_integer("nrows", nrows) - index, columns, col_dict = self._engine.read(nrows) - - if index is None: - if col_dict: - # Any column is actually fine: - new_rows = len(next(iter(col_dict.values()))) - index = RangeIndex(self._currow, self._currow + new_rows) - else: - new_rows = 0 + if self.engine == "pyarrow": + df = self._engine.read() else: - new_rows = len(index) + nrows = validate_integer("nrows", nrows) + index, columns, col_dict = self._engine.read(nrows) + + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) + else: + new_rows = 0 + else: + new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + df = DataFrame(col_dict, columns=columns, index=index) - self._currow += new_rows + self._currow += new_rows if self.squeeze and len(df.columns) == 1: return df[df.columns[0]].copy() @@ -2165,6 +2231,129 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): return values +class BytesIOWrapper: + """ + Allows the pyarrow engine for read_csv() to read from string buffers + """ + + def __init__( + self, + string_buffer: Union[StringIO, TextIOBase], + encoding: str = "utf-8", + ): + self.string_buffer = string_buffer + self.encoding = encoding + + def __getattr__(self, attr: str): + return getattr(self.string_buffer, attr) + + def read(self, size: int = -1): + content = self.string_buffer.read(size) + return content.encode(self.encoding) + + +class ArrowParserWrapper(ParserBase): + """ + Wrapper for the pyarrow engine for read_csv() + """ + + def __init__(self, src, **kwds): + self.kwds = kwds + self.src = src + + ParserBase.__init__(self, kwds) + + self._parse_kwds() + + def _parse_kwds(self): + encoding: Optional[str] = 
self.kwds.get("encoding") + self.encoding = "utf-8" if encoding is None else encoding + + self.usecols, self.usecols_dtype = _validate_usecols_arg(self.kwds["usecols"]) + na_values = self.kwds["na_values"] + if isinstance(na_values, dict): + raise ValueError( + "The pyarrow engine doesn't support passing a dict for na_values" + ) + self.na_values = list( + _clean_na_values( + self.kwds["na_values"], keep_default_na=self.kwds["keep_default_na"] + )[0] + ) + + if isinstance(self.src, TextIOBase): + self.src = BytesIOWrapper(self.src, encoding=self.encoding) + + def _get_pyarrow_options(self): + # rename some arguments to pass to pyarrow + mapping = { + "usecols": "include_columns", + "na_values": "null_values", + "escapechar": "escape_char", + "skip_blank_lines": "ignore_empty_lines", + } + for pandas_name, pyarrow_name in mapping.items(): + if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None: + self.kwds[pyarrow_name] = self.kwds.pop(pandas_name) + + self.parse_options = { + option_name: option_value + for option_name, option_value in self.kwds.items() + if option_value is not None + and option_name + in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines") + } + self.convert_options = { + option_name: option_value + for option_name, option_value in self.kwds.items() + if option_value is not None + and option_name + in ("include_columns", "null_values", "true_values", "false_values") + } + self.read_options = { + "autogenerate_column_names": self.header is None, + "skip_rows": self.header + if self.header is not None + else self.kwds["skiprows"], + } + + def _finalize_output(self, frame): + num_cols = len(frame.columns) + if self.header is None: + if self.names is None: + if self.prefix is not None: + self.names = [f"{self.prefix}{i}" for i in range(num_cols)] + elif self.header is None: + self.names = range(num_cols) + frame.columns = self.names + # we only need the frame not the names + frame.columns, frame = 
self._do_date_conversions(frame.columns, frame) + if self.index_col is not None: + for i, item in enumerate(self.index_col): + if is_integer(item): + self.index_col[i] = frame.columns[item] + frame.set_index(self.index_col, drop=True, inplace=True) + + if self.kwds.get("dtype") is not None: + frame = frame.astype(self.kwds.get("dtype")) + return frame + + def read(self): + pyarrow = import_optional_dependency("pyarrow.csv") + + self._get_pyarrow_options() + + table = pyarrow.read_csv( + self.src, + read_options=pyarrow.ReadOptions(**self.read_options), + parse_options=pyarrow.ParseOptions(**self.parse_options), + convert_options=pyarrow.ConvertOptions(**self.convert_options), + ) + + frame = table.to_pandas() + return self._finalize_output(frame) + + def TextParser(*args, **kwds): """ Converts lists of lists/tuples into DataFrames with proper type inference diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 8c1475025b442..cdc21dca0b494 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -12,6 +12,8 @@ from pandas import DataFrame, concat import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize("index_col", [0, "index"]) def test_read_chunksize_with_index(all_parsers, index_col): diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 4fd754bf79ba2..4dd75dff16095 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -18,6 +18,8 @@ from pandas.io.parsers import CParserWrapper, TextFileReader +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + def test_override_set_noconvert_columns(): # see gh-17351 @@ -66,6 +68,7 @@ def _set_noconvert_columns(self): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_read_csv_local(all_parsers, 
csv1): prefix = "file:///" if compat.is_platform_windows() else "file://" parser = all_parsers @@ -100,6 +103,7 @@ def test_read_csv_local(all_parsers, csv1): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_1000_sep(all_parsers): parser = all_parsers data = """A|B|C @@ -131,6 +135,7 @@ def test_squeeze(all_parsers): assert not result._is_view +@skip_pyarrow def test_unnamed_columns(all_parsers): data = """A,B,C,, 1,2,3,4,5 @@ -159,6 +164,7 @@ def test_csv_mixed_type(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_read_csv_low_memory_no_rows_with_index(all_parsers): # see gh-21141 parser = all_parsers @@ -207,6 +213,7 @@ def test_read_csv_dataframe(all_parsers, csv1): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("nrows", [3, 3.0]) def test_read_nrows(all_parsers, nrows): # see gh-10476 @@ -228,6 +235,7 @@ def test_read_nrows(all_parsers, nrows): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("nrows", [1.2, "foo", -1]) def test_read_nrows_bad(all_parsers, nrows): data = """index,A,B,C,D @@ -254,6 +262,7 @@ def test_nrows_skipfooter_errors(all_parsers): parser.read_csv(StringIO(data), skipfooter=1, nrows=5) +@skip_pyarrow def test_missing_trailing_delimiters(all_parsers): parser = all_parsers data = """A,B,C,D @@ -269,6 +278,7 @@ def test_missing_trailing_delimiters(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_skip_initial_space(all_parsers): data = ( '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' @@ -329,6 +339,7 @@ def test_skip_initial_space(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_trailing_delimiters(all_parsers): # see gh-2442 data = """A,B,C @@ -360,6 +371,7 @@ def test_escapechar(all_parsers): tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) +@skip_pyarrow def test_ignore_leading_whitespace(all_parsers): # see gh-3374, gh-6607 parser = 
all_parsers @@ -370,6 +382,7 @@ def test_ignore_leading_whitespace(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) def test_uneven_lines_with_usecols(all_parsers, usecols): # see gh-12203 @@ -392,6 +405,7 @@ def test_uneven_lines_with_usecols(all_parsers, usecols): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -424,6 +438,7 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -466,6 +481,7 @@ def test_raise_on_sep_with_delim_whitespace(all_parsers): parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) +@skip_pyarrow @pytest.mark.parametrize("delim_whitespace", [True, False]) def test_single_char_leading_whitespace(all_parsers, delim_whitespace): # see gh-9710 @@ -484,6 +500,7 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "sep,skip_blank_lines,exp_data", [ @@ -523,6 +540,7 @@ def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_whitespace_lines(all_parsers): parser = all_parsers data = """ @@ -538,6 +556,7 @@ def test_whitespace_lines(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -621,6 +640,7 @@ def test_read_table_equivalency_to_read_csv(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_first_row_bom(all_parsers): # see gh-26545 parser = all_parsers @@ -631,6 +651,7 @@ def test_first_row_bom(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_first_row_bom_unquoted(all_parsers): # see gh-36343 parser = all_parsers @@ -641,6 +662,7 @@ def 
test_first_row_bom_unquoted(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("nrows", range(1, 6)) def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): # GH 28071 @@ -654,6 +676,7 @@ def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): tm.assert_frame_equal(df, ref[:nrows]) +@skip_pyarrow def test_no_header_two_extra_columns(all_parsers): # GH 26218 column_names = ["one", "two", "three"] @@ -674,6 +697,7 @@ def test_read_csv_names_not_accepting_sets(all_parsers): parser.read_csv(StringIO(data), names=set("QAZ")) +@skip_pyarrow def test_read_table_delim_whitespace_default_sep(all_parsers): # GH: 35958 f = StringIO("a b c\n1 -2 -3\n4 5 6") @@ -715,6 +739,7 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): parser.read_table(f, delim_whitespace=True, delimiter=delimiter) +@skip_pyarrow def test_dict_keys_as_names(all_parsers): # GH: 36928 data = "1,2" diff --git a/pandas/tests/io/parser/common/test_data_list.py b/pandas/tests/io/parser/common/test_data_list.py index 92b8c864f1619..d67f728ad87e5 100644 --- a/pandas/tests/io/parser/common/test_data_list.py +++ b/pandas/tests/io/parser/common/test_data_list.py @@ -5,12 +5,17 @@ import csv from io import StringIO +import pytest + from pandas import DataFrame import pandas._testing as tm from pandas.io.parsers import TextParser +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + +@skip_pyarrow def test_read_data_list(all_parsers): parser = all_parsers kwargs = {"index_col": 0} diff --git a/pandas/tests/io/parser/common/test_decimal.py b/pandas/tests/io/parser/common/test_decimal.py index 7ca9f253bd501..ab58ddff9c06e 100644 --- a/pandas/tests/io/parser/common/test_decimal.py +++ b/pandas/tests/io/parser/common/test_decimal.py @@ -9,6 +9,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize( "data,thousands,decimal", diff 
--git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index d0f1d63f88b3e..0a5bc4a135b9e 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -15,7 +15,10 @@ from pandas import DataFrame import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + +@skip_pyarrow @tm.network def test_url(all_parsers, csv_dir_path): # TODO: FTP testing @@ -33,6 +36,7 @@ def test_url(all_parsers, csv_dir_path): tm.assert_frame_equal(url_result, local_result) +@skip_pyarrow @pytest.mark.slow def test_local_file(all_parsers, csv_dir_path): parser = all_parsers @@ -50,6 +54,7 @@ def test_local_file(all_parsers, csv_dir_path): pytest.skip("Failing on: " + " ".join(platform.uname())) +@skip_pyarrow def test_path_path_lib(all_parsers): parser = all_parsers df = tm.makeDataFrame() @@ -57,6 +62,7 @@ def test_path_path_lib(all_parsers): tm.assert_frame_equal(df, result) +@skip_pyarrow def test_path_local_path(all_parsers): parser = all_parsers df = tm.makeDataFrame() @@ -66,6 +72,7 @@ def test_path_local_path(all_parsers): tm.assert_frame_equal(df, result) +@skip_pyarrow def test_nonexistent_path(all_parsers): # gh-2428: pls no segfault # gh-14086: raise more helpful FileNotFoundError @@ -79,6 +86,7 @@ def test_nonexistent_path(all_parsers): assert path == e.value.filename +@skip_pyarrow @td.skip_if_windows # os.chmod does not work in windows def test_no_permission(all_parsers): # GH 23784 @@ -101,6 +109,7 @@ def test_no_permission(all_parsers): assert path == e.value.filename +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected,msg", [ @@ -208,6 +217,7 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_temporary_file(all_parsers): # see gh-13398 parser = all_parsers @@ -273,6 +283,7 @@ def test_file_handles_with_open(all_parsers, csv1): 
assert not f.closed +@skip_pyarrow def test_invalid_file_buffer_class(all_parsers): # see gh-15337 class InvalidBuffer: @@ -285,6 +296,7 @@ class InvalidBuffer: parser.read_csv(InvalidBuffer()) +@skip_pyarrow def test_invalid_file_buffer_mock(all_parsers): # see gh-15337 parser = all_parsers @@ -339,6 +351,7 @@ def test_read_csv_file_handle(all_parsers, io_class, encoding): assert not handle.closed +@skip_pyarrow def test_memory_map_file_handle_silent_fallback(all_parsers, compression): """ Do not fail for buffers with memory_map=True (cannot memory map BytesIO). @@ -358,6 +371,7 @@ def test_memory_map_file_handle_silent_fallback(all_parsers, compression): ) +@skip_pyarrow def test_memory_map_compression(all_parsers, compression): """ Support memory map for compressed files. @@ -376,6 +390,7 @@ def test_memory_map_compression(all_parsers, compression): ) +@skip_pyarrow def test_context_manager(all_parsers, datapath): # make sure that opened files are closed parser = all_parsers @@ -392,6 +407,7 @@ def test_context_manager(all_parsers, datapath): assert reader._engine.handles.handle.closed +@skip_pyarrow def test_context_manageri_user_provided(all_parsers, datapath): # make sure that user-provided handles are not closed parser = all_parsers @@ -408,6 +424,7 @@ def test_context_manageri_user_provided(all_parsers, datapath): assert not reader._engine.handles.handle.closed +@skip_pyarrow def test_file_descriptor_leak(all_parsers): # GH 31488 @@ -421,6 +438,7 @@ def test(): td.check_file_leaks(test)() +@skip_pyarrow @td.check_file_leaks def test_memory_map(all_parsers, csv_dir_path): mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py index 29aa387e2b045..0fa85a1f5f8ec 100644 --- a/pandas/tests/io/parser/common/test_float.py +++ b/pandas/tests/io/parser/common/test_float.py @@ -12,6 +12,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = 
pytest.mark.usefixtures("pyarrow_skip") + def test_float_parser(all_parsers): # see gh-9565 diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index a133e1be49946..fd999fcdabac3 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -11,6 +11,8 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize( "data,kwargs,expected", @@ -74,6 +76,7 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_no_level_names(all_parsers, index_col): data = """index1,index2,A,B,C,D @@ -98,6 +101,7 @@ def test_multi_index_no_level_names(all_parsers, index_col): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_multi_index_no_level_names_implicit(all_parsers): parser = all_parsers data = """A,B,C,D @@ -131,6 +135,7 @@ def test_multi_index_no_level_names_implicit(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,expected,header", [ @@ -152,6 +157,7 @@ def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_no_unnamed_index(all_parsers): parser = all_parsers data = """ id c0 c1 c2 @@ -194,6 +200,7 @@ def test_read_duplicate_index_explicit(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_read_duplicate_index_implicit(all_parsers): data = """A,B,C,D foo,2,3,4,5 @@ -221,6 +228,7 @@ def test_read_duplicate_index_implicit(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_read_csv_no_index_name(all_parsers, csv_dir_path): parser = all_parsers csv2 = os.path.join(csv_dir_path, "test2.csv") @@ -248,6 +256,7 @@ def 
test_read_csv_no_index_name(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_empty_with_index(all_parsers): # see gh-10184 data = "x,y" @@ -258,6 +267,7 @@ def test_empty_with_index(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_empty_with_multi_index(all_parsers): # see gh-10467 data = "x,y,z" @@ -270,6 +280,7 @@ def test_empty_with_multi_index(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_empty_with_reversed_multi_index(all_parsers): data = "x,y,z" parser = all_parsers diff --git a/pandas/tests/io/parser/common/test_inf.py b/pandas/tests/io/parser/common/test_inf.py index fca4aaaba6675..9bc93171f9307 100644 --- a/pandas/tests/io/parser/common/test_inf.py +++ b/pandas/tests/io/parser/common/test_inf.py @@ -10,7 +10,10 @@ from pandas import DataFrame, option_context import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + +@skip_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_inf_parsing(all_parsers, na_filter): parser = all_parsers @@ -34,6 +37,7 @@ def test_inf_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_infinity_parsing(all_parsers, na_filter): parser = all_parsers diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index a8f5c43ea15c7..4b31447b638f7 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -10,6 +10,8 @@ from pandas import DataFrame, Series import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + def test_int_conversion(all_parsers): data = """A,B @@ -91,6 +93,7 @@ def test_parse_integers_above_fp_precision(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("sep", [" ", r"\s+"]) def 
test_integer_overflow_bug(all_parsers, sep): # see gh-2601 @@ -112,6 +115,7 @@ def test_int64_min_issues(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) def test_int64_overflow(all_parsers, conv): data = """ID @@ -155,6 +159,7 @@ def test_int64_overflow(all_parsers, conv): parser.read_csv(StringIO(data), converters={"ID": conv}) +@skip_pyarrow @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] ) @@ -168,6 +173,7 @@ def test_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] ) @@ -181,6 +187,7 @@ def test_outside_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("exp_data", [[str(-1), str(2 ** 63)], [str(2 ** 63), str(-1)]]) def test_numeric_range_too_wide(all_parsers, exp_data): # No numerical dtype can hold both negative and uint64 diff --git a/pandas/tests/io/parser/common/test_iterator.py b/pandas/tests/io/parser/common/test_iterator.py index 3cc30b0ab4029..9f8793295344d 100644 --- a/pandas/tests/io/parser/common/test_iterator.py +++ b/pandas/tests/io/parser/common/test_iterator.py @@ -9,6 +9,8 @@ from pandas import DataFrame, Series, concat import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_iterator(all_parsers): # see gh-6607 diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index a2787ddad3683..62584a7299cad 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -14,6 +14,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_empty_decimal_marker(all_parsers): data = """A|B|C diff 
--git a/pandas/tests/io/parser/common/test_verbose.py b/pandas/tests/io/parser/common/test_verbose.py index fdd905b48ea1e..335065db974dc 100644 --- a/pandas/tests/io/parser/common/test_verbose.py +++ b/pandas/tests/io/parser/common/test_verbose.py @@ -4,6 +4,10 @@ """ from io import StringIO +import pytest + +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_verbose_read(all_parsers, capsys): parser = all_parsers diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index ec098353960d7..bda4c771c6511 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -44,6 +44,11 @@ class PythonParser(BaseParser): float_precision_choices = [None] +class PyArrowParser(BaseParser): + engine = "pyarrow" + float_precision_choices = [None] + + @pytest.fixture def csv_dir_path(datapath): """ @@ -63,14 +68,18 @@ def csv1(datapath): _cParserHighMemory = CParserHighMemory() _cParserLowMemory = CParserLowMemory() _pythonParser = PythonParser() +_pyarrowParser = PyArrowParser() _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_all_parsers = [*_c_parsers_only, *_py_parsers_only] +_pyarrow_parsers_only = [_pyarrowParser] _py_parser_ids = ["python"] _c_parser_ids = ["c_high", "c_low"] -_all_parser_ids = [*_c_parser_ids, *_py_parser_ids] +_pyarrow_parser_ids = ["pyarrow"] + +_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) @@ -78,6 +87,8 @@ def all_parsers(request): """ Fixture all of the CSV parsers. 
""" + if request.param.engine == "pyarrow": + pytest.importorskip("pyarrow", "0.15.0") return request.param @@ -105,10 +116,14 @@ def _get_all_parser_float_precision_combinations(): params = [] ids = [] for parser, parser_id in zip(_all_parsers, _all_parser_ids): + if parser_id == "pyarrow": + # GH38370 + continue for precision in parser.float_precision_choices: params.append((parser, precision)) ids.append(f"{parser_id}-{precision}") + print(params) return {"params": params, "ids": ids} @@ -148,3 +163,26 @@ def encoding_fmt(request): Fixture for all possible string formats of a UTF encoding. """ return request.param + + +@pytest.fixture +def pyarrow_xfail(request): + """ + Fixture that xfails a test if the engine is pyarrow. + """ + if "all_parsers" in request.fixturenames: + parser = request.getfixturevalue("all_parsers") + if parser.engine == "pyarrow": + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.node.add_marker(mark) + + +@pytest.fixture +def pyarrow_skip(request): + """ + Fixture that skips a test if the engine is pyarrow. 
+ """ + if "all_parsers" in request.fixturenames: + parser = request.getfixturevalue("all_parsers") + if parser.engine == "pyarrow": + pytest.skip("pyarrow doesn't support this.") diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py index 2f569424a82f5..a4e59899f304e 100644 --- a/pandas/tests/io/parser/dtypes/test_categorical.py +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -14,7 +14,10 @@ from pandas import Categorical, DataFrame, Timestamp import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + +@skip_pyarrow @pytest.mark.parametrize( "dtype", [ @@ -41,6 +44,7 @@ def test_categorical_dtype(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@skip_pyarrow @pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) def test_categorical_dtype_single(all_parsers, dtype): # see gh-10153 @@ -56,6 +60,7 @@ def test_categorical_dtype_single(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@skip_pyarrow def test_categorical_dtype_unsorted(all_parsers): # see gh-10153 parser = all_parsers @@ -74,6 +79,7 @@ def test_categorical_dtype_unsorted(all_parsers): tm.assert_frame_equal(actual, expected) +@skip_pyarrow def test_categorical_dtype_missing(all_parsers): # see gh-10153 parser = all_parsers @@ -92,6 +98,7 @@ def test_categorical_dtype_missing(all_parsers): tm.assert_frame_equal(actual, expected) +@skip_pyarrow @pytest.mark.slow def test_categorical_dtype_high_cardinality_numeric(all_parsers): # see gh-18186 @@ -106,6 +113,7 @@ def test_categorical_dtype_high_cardinality_numeric(all_parsers): tm.assert_frame_equal(actual, expected) +@skip_pyarrow def test_categorical_dtype_utf16(all_parsers, csv_dir_path): # see gh-10153 pth = os.path.join(csv_dir_path, "utf16_ex.txt") @@ -120,6 +128,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) +@skip_pyarrow def 
test_categorical_dtype_chunksize_infer_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -139,6 +148,7 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers): tm.assert_frame_equal(actual, expected) +@skip_pyarrow def test_categorical_dtype_chunksize_explicit_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -242,6 +252,7 @@ def test_categorical_coerces_datetime(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_categorical_coerces_timestamp(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype([Timestamp("2014")])} @@ -253,6 +264,7 @@ def test_categorical_coerces_timestamp(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_categorical_coerces_timedelta(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index fc34d65fdad52..087554487a337 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -13,6 +13,8 @@ from pandas import DataFrame, Timestamp import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) diff --git a/pandas/tests/io/parser/dtypes/test_empty.py b/pandas/tests/io/parser/dtypes/test_empty.py index 57d729fb4b7fc..618af246e4db3 100644 --- a/pandas/tests/io/parser/dtypes/test_empty.py +++ b/pandas/tests/io/parser/dtypes/test_empty.py @@ -10,6 +10,8 @@ from pandas import Categorical, DataFrame, Index, MultiIndex, Series, concat import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_dtype_all_columns_empty(all_parsers): # see gh-12048 diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index 
d10d8e27a59a5..420ffdc4c26eb 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -10,6 +10,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.mark.parametrize("na_values", [None, ["NaN"]]) def test_comment(all_parsers, na_values): diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 220d9474c6dbf..c01542d7d38c5 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -12,6 +12,8 @@ from pandas import DataFrame import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + @pytest.fixture(params=[True, False]) def buffer(request): @@ -29,6 +31,7 @@ def parser_and_data(all_parsers, csv1): return parser, data, expected +@skip_pyarrow @pytest.mark.parametrize("compression", ["zip", "infer", "zip2"]) def test_zip(parser_and_data, compression): parser, data, expected = parser_and_data @@ -46,6 +49,7 @@ def test_zip(parser_and_data, compression): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("compression", ["zip", "infer"]) def test_zip_error_multiple_files(parser_and_data, compression): parser, data, expected = parser_and_data @@ -61,6 +65,7 @@ def test_zip_error_multiple_files(parser_and_data, compression): parser.read_csv(path, compression=compression) +@skip_pyarrow def test_zip_error_no_files(parser_and_data): parser, _, _ = parser_and_data @@ -72,6 +77,7 @@ def test_zip_error_no_files(parser_and_data): parser.read_csv(path, compression="zip") +@skip_pyarrow def test_zip_error_invalid_zip(parser_and_data): parser, _, _ = parser_and_data @@ -81,6 +87,7 @@ def test_zip_error_invalid_zip(parser_and_data): parser.read_csv(f, compression="zip") +@skip_pyarrow @pytest.mark.parametrize("filename", [None, "test.{ext}"]) def test_compression(parser_and_data, compression_only, buffer, 
filename): parser, data, expected = parser_and_data @@ -114,6 +121,8 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): expected = parser.read_csv(csv1, **kwargs) kwargs["compression"] = "infer" + if ext == "bz2": + pytest.xfail("pyarrow wheels don't have bz2 codec support") if buffer: with open(csv1) as f: result = parser.read_csv(f, **kwargs) @@ -124,6 +133,7 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt): # see gh-18071, gh-24130 parser = all_parsers @@ -141,6 +151,7 @@ def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) def test_invalid_compression(all_parsers, invalid_compression): parser = all_parsers diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 1d2fb7fddc9dd..158f924882503 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -12,6 +12,8 @@ from pandas import DataFrame, Index import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_converters_type_must_be_dict(all_parsers): parser = all_parsers diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index c12eb5ec873b2..1b081b066c42c 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -13,6 +13,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.fixture def custom_dialect(): diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 10386cf87b9c2..dde8277f1732a 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ 
b/pandas/tests/io/parser/test_encoding.py @@ -13,7 +13,11 @@ from pandas import DataFrame, read_csv import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@skip_pyarrow def test_bytes_io_input(all_parsers): encoding = "cp1255" parser = all_parsers @@ -25,6 +29,7 @@ def test_bytes_io_input(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_read_csv_unicode(all_parsers): parser = all_parsers data = BytesIO("\u0141aski, Jan;1".encode()) @@ -34,6 +39,7 @@ def test_read_csv_unicode(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("sep", [",", "\t"]) @pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) def test_utf16_bom_skiprows(all_parsers, sep, encoding): @@ -68,6 +74,7 @@ def test_utf16_bom_skiprows(all_parsers, sep, encoding): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_utf16_example(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "utf16_ex.txt") parser = all_parsers @@ -75,6 +82,7 @@ def test_utf16_example(all_parsers, csv_dir_path): assert len(result) == 50 +@skip_pyarrow def test_unicode_encoding(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "unicode_series.csv") parser = all_parsers @@ -87,6 +95,7 @@ def test_unicode_encoding(all_parsers, csv_dir_path): assert got == expected +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -120,6 +129,7 @@ def _encode_data_with_bom(_data): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): # see gh-13549 expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) @@ -132,6 +142,7 @@ def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "file_path,encoding", [ @@ -166,6 +177,7 @@ def 
test_binary_mode_file_buffers( tm.assert_frame_equal(expected, result) +@skip_pyarrow @pytest.mark.parametrize("pass_encoding", [True, False]) def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding): # see gh-24130 @@ -182,6 +194,7 @@ def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding) tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_encoding_named_temp_file(all_parsers): # see gh-31819 parser = all_parsers diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index ae2808f494118..0de6e389dd09b 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -14,7 +14,11 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@skip_pyarrow def test_read_with_bad_header(all_parsers): parser = all_parsers msg = r"but only \d+ lines in file" @@ -82,6 +86,7 @@ def test_no_header_prefix(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_header_with_index_col(all_parsers): parser = all_parsers data = """foo,1,2,3 @@ -119,6 +124,7 @@ def test_header_not_first_line(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_header_multi_index(all_parsers): parser = all_parsers expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) @@ -184,6 +190,7 @@ def test_header_multi_index_invalid(all_parsers, kwargs, msg): _TestTuple = namedtuple("names", ["first", "second"]) +@skip_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -231,6 +238,7 @@ def test_header_multi_index_common_format1(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -277,6 +285,7 @@ def test_header_multi_index_common_format2(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@skip_pyarrow 
@pytest.mark.parametrize( "kwargs", [ @@ -324,6 +333,7 @@ def test_header_multi_index_common_format3(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_header_multi_index_common_format_malformed1(all_parsers): parser = all_parsers expected = DataFrame( @@ -344,6 +354,7 @@ def test_header_multi_index_common_format_malformed1(all_parsers): tm.assert_frame_equal(expected, result) +@skip_pyarrow def test_header_multi_index_common_format_malformed2(all_parsers): parser = all_parsers expected = DataFrame( @@ -365,6 +376,7 @@ def test_header_multi_index_common_format_malformed2(all_parsers): tm.assert_frame_equal(expected, result) +@skip_pyarrow def test_header_multi_index_common_format_malformed3(all_parsers): parser = all_parsers expected = DataFrame( @@ -385,6 +397,7 @@ def test_header_multi_index_common_format_malformed3(all_parsers): tm.assert_frame_equal(expected, result) +@skip_pyarrow @pytest.mark.parametrize( "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)] ) @@ -397,6 +410,7 @@ def test_header_names_backward_compat(all_parsers, data, header): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("kwargs", [{}, {"index_col": False}]) def test_read_only_header_no_rows(all_parsers, kwargs): # See gh-7773 @@ -442,6 +456,7 @@ def test_non_int_header(all_parsers, header): parser.read_csv(StringIO(data), header=header) +@skip_pyarrow def test_singleton_header(all_parsers): # see gh-7757 data = """a,b,c\n0,1,2\n1,2,3""" @@ -452,6 +467,7 @@ def test_singleton_header(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -498,6 +514,7 @@ def test_mangles_multi_index(all_parsers, data, expected): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("index_col", [None, [0]]) @pytest.mark.parametrize( "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])] @@ -541,6 +558,7 
@@ def test_multi_index_unnamed(all_parsers, index_col, columns): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_read_csv_multiindex_columns(all_parsers): # GH#6051 parser = all_parsers diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index a409751e261d6..87abeaf18ac76 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -11,7 +11,10 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + +@skip_pyarrow @pytest.mark.parametrize("with_header", [True, False]) def test_index_col_named(all_parsers, with_header): parser = all_parsers @@ -66,6 +69,7 @@ def test_index_col_is_true(all_parsers): parser.read_csv(StringIO(data), index_col=True) +@skip_pyarrow def test_infer_index_col(all_parsers): data = """A,B,C foo,1,2,3 @@ -83,6 +87,7 @@ def test_infer_index_col(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "index_col,kwargs", [ @@ -131,6 +136,7 @@ def test_index_col_empty_data(all_parsers, index_col, kwargs): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_empty_with_index_col_false(all_parsers): # see gh-10413 data = "x,y" @@ -141,6 +147,7 @@ def test_empty_with_index_col_false(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "index_names", [ @@ -165,6 +172,7 @@ def test_multi_index_naming(all_parsers, index_names): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_multi_index_naming_not_all_at_beginning(all_parsers): parser = all_parsers data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" @@ -179,6 +187,7 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_no_multi_index_level_names_empty(all_parsers): # GH 10984 parser = all_parsers @@ -190,6 +199,7 @@ def 
test_no_multi_index_level_names_empty(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_header_with_index_col(all_parsers): # GH 33476 parser = all_parsers @@ -213,6 +223,7 @@ def test_header_with_index_col(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.slow def test_index_col_large_csv(all_parsers): # https://github.com/pandas-dev/pandas/issues/37094 @@ -228,6 +239,7 @@ def test_index_col_large_csv(all_parsers): tm.assert_frame_equal(result, df.set_index("a")) +@skip_pyarrow def test_index_col_multiindex_columns_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -243,6 +255,7 @@ def test_index_col_multiindex_columns_no_data(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_index_col_header_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -255,6 +268,7 @@ def test_index_col_header_no_data(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_multiindex_columns_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -265,6 +279,7 @@ def test_multiindex_columns_no_data(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_multiindex_columns_index_col_with_data(all_parsers): # GH#38292 parser = all_parsers diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 457a6567febab..8fb7f3c093ae0 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -10,7 +10,10 @@ from pandas import DataFrame import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + +@skip_pyarrow @pytest.mark.parametrize("kwargs", [{}, {"mangle_dupe_cols": True}]) def test_basic(all_parsers, kwargs): # TODO: add test for condition "mangle_dupe_cols=False" @@ -24,6 +27,7 @@ def test_basic(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_basic_names(all_parsers): # See gh-7160 parser = 
all_parsers @@ -44,6 +48,7 @@ def test_basic_names_raise(all_parsers): parser.read_csv(StringIO(data), names=["a", "b", "a"]) +@skip_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -111,6 +116,7 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) +@skip_pyarrow def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 orig_key = "0" diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index 123dce2048a44..720a94b45adcc 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -13,6 +13,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + def _construct_dataframe(num_rows): """ diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 4237a774261ca..6e56d325efdad 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -12,7 +12,11 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@skip_pyarrow def test_string_nas(all_parsers): parser = all_parsers data = """A,B,C @@ -28,6 +32,7 @@ def test_string_nas(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_detect_string_na(all_parsers): parser = all_parsers data = """A,B @@ -42,6 +47,7 @@ def test_detect_string_na(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "na_values", [ @@ -79,6 +85,7 @@ def test_non_string_na_values(all_parsers, data, na_values): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_default_na_values(all_parsers): _NA_VALUES = { "-1.#IND", @@ -126,6 +133,7 @@ def f(i, v): tm.assert_frame_equal(result, expected) +@skip_pyarrow 
@pytest.mark.parametrize("na_values", ["baz", ["baz"]]) def test_custom_na_values(all_parsers, na_values): parser = all_parsers @@ -159,6 +167,7 @@ def test_bool_na_values(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_na_value_dict(all_parsers): data = """A,B,C foo,bar,NA @@ -177,6 +186,7 @@ def test_na_value_dict(all_parsers): tm.assert_frame_equal(df, expected) +@skip_pyarrow @pytest.mark.parametrize( "index_col,expected", [ @@ -210,6 +220,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -297,6 +308,7 @@ def test_no_na_values_no_keep_default(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_no_keep_default_na_dict_na_values(all_parsers): # see gh-19227 data = "a,b\n,2" @@ -308,6 +320,7 @@ def test_no_keep_default_na_dict_na_values(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_no_keep_default_na_dict_na_scalar_values(all_parsers): # see gh-19227 # @@ -319,6 +332,7 @@ def test_no_keep_default_na_dict_na_scalar_values(all_parsers): tm.assert_frame_equal(df, expected) +@skip_pyarrow @pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): # see gh-19227 @@ -348,6 +362,7 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "na_filter,row_data", [ @@ -369,6 +384,7 @@ def test_na_values_na_filter_override(all_parsers, na_filter, row_data): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_na_trailing_columns(all_parsers): parser = all_parsers data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax @@ -396,6 +412,7 @@ def test_na_trailing_columns(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow 
@pytest.mark.parametrize( "na_values,row_data", [ @@ -414,6 +431,7 @@ def test_na_values_scalar(all_parsers, na_values, row_data): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_na_values_dict_aliasing(all_parsers): parser = all_parsers na_values = {"a": 2, "b": 1} @@ -429,6 +447,7 @@ def test_na_values_dict_aliasing(all_parsers): tm.assert_dict_equal(na_values, na_values_copy) +@skip_pyarrow def test_na_values_dict_col_index(all_parsers): # see gh-14203 data = "a\nfoo\n1" @@ -440,6 +459,7 @@ def test_na_values_dict_col_index(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -469,6 +489,7 @@ def test_empty_na_values_no_default_with_index(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])] ) @@ -497,6 +518,7 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) +@skip_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): # see gh-20377 @@ -512,6 +534,7 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data, na_values", [ @@ -540,6 +563,7 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): ) +@skip_pyarrow def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index c0b29d5019675..2dce5bd501947 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -34,6 +34,8 @@ else: date_strategy = st.datetimes() +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_separator_date_conflict(all_parsers): # Regression test for gh-4678 diff --git 
a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index 7a07632390eff..6995965467d05 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -13,6 +13,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize( "kwargs,msg", diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 35b155705ccee..ffd4f3aecb5d0 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -14,6 +14,8 @@ from pandas import DataFrame, Index import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize("skiprows", [list(range(6)), 6]) def test_skip_rows_bug(all_parsers, skiprows): diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 267fae760398a..6e9cdacd40586 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -121,3 +121,23 @@ def read(self): with pytest.raises(ValueError, match=msg): read_csv(NoNextBuffer(data), engine=python_engine) + + def test_pyarrow_engine(self): + from pandas.io.parsers import _pyarrow_unsupported as pa_unsupported + + data = """1,2,3,, + 1,2,3,4, + 1,2,3,4,5 + 1,2,,, + 1,2,3,4,""" + + for default in pa_unsupported: + msg = ( + f"The {repr(default)} option is not " + f"supported with the 'pyarrow' engine" + ) + kwargs = {default: object()} + if default == "dialect": + kwargs[default] = "excel" # test a random dialect + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), engine="pyarrow", **kwargs) diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index c6b700c0adfff..34d5b4b7d183b 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ 
b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -20,6 +20,8 @@ "Usecols do not match columns, columns expected but not found: {0}" ) +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) def test_usecols_with_parse_dates(all_parsers, usecols): diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 7d81a88e09012..a163326124878 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -19,6 +19,8 @@ "Usecols do not match columns, columns expected but not found: {0}" ) +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_raise_on_mixed_dtype_usecols(all_parsers): # See gh-12678