From f22ff466b510d13b323c5e483cdeecbbf739dd4e Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 8 Feb 2020 20:24:35 -0800 Subject: [PATCH 01/95] add arrow engine to read_csv --- pandas/io/parsers.py | 132 +++++++++++++++++++++++++++++++------------ 1 file changed, 97 insertions(+), 35 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 84a8b5b2a94fe..f5c00f3f7d137 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -20,6 +20,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import FilePathOrBuffer +from pandas.compat._optional import import_optional_dependency from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -165,9 +166,10 @@ to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. -engine : {{'c', 'python'}}, optional - Parser engine to use. The C engine is faster while the python engine is - currently more feature-complete. +engine : {{'c', 'python', 'arrow'}}, optional + Parser engine to use. The C and arrow engines are faster while the python engine is + currently more feature-complete. The arrow engine requires ``pyarrow`` + as a dependency however. converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. 
@@ -506,7 +508,6 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skip_blank_lines": True, } - _c_parser_defaults = { "delim_whitespace": False, "na_filter": True, @@ -520,6 +521,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} +_arrow_unsupported = {"skipfooter", "low_memory", "float_precision"} _python_unsupported = {"low_memory", "float_precision"} _deprecated_defaults: Dict[str, Any] = {} @@ -705,7 +707,6 @@ def read_fwf( infer_nrows=100, **kwds, ): - r""" Read a table of fixed-width formatted lines into DataFrame. @@ -879,7 +880,8 @@ def __init__(self, f, engine=None, **kwds): self._make_engine(self.engine) def close(self): - self._engine.close() + if self.engine != "arrow": + self._engine.close() def _get_options_with_defaults(self, engine): kwds = self.orig_options @@ -945,16 +947,16 @@ def _clean_options(self, options, engine): delim_whitespace = options["delim_whitespace"] # C engine not supported yet - if engine == "c": + if engine == "c" or engine == "arrow": if options["skipfooter"] > 0: - fallback_reason = "the 'c' engine does not support skipfooter" + fallback_reason = f"the {engine} engine does not support skipfooter" engine = "python" encoding = sys.getfilesystemencoding() or "utf-8" if sep is None and not delim_whitespace: - if engine == "c": + if engine == "c" or engine == "arrow": fallback_reason = ( - "the 'c' engine does not support " + f"the {engine} engine does not support " "sep=None with delim_whitespace=False" ) engine = "python" @@ -1081,14 +1083,20 @@ def _clean_options(self, options, engine): na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) # handle skiprows; this is internally handled by the - # c-engine, so only need for python parsers + # c-engine, so only need for python parser if engine != "c": - if is_integer(skiprows): - skiprows = list(range(skiprows)) - if skiprows is None: - 
skiprows = set() - elif not callable(skiprows): - skiprows = set(skiprows) + if engine == "arrow": + if not is_integer(skiprows) and skiprows is not None: + raise ValueError( + "skiprows argument must be integer when using arrow engine" + ) + else: + if is_integer(skiprows): + skiprows = list(range(skiprows)) + if skiprows is None: + skiprows = set() + elif not callable(skiprows): + skiprows = set(skiprows) # put stuff back result["names"] = names @@ -1109,6 +1117,8 @@ def __next__(self): def _make_engine(self, engine="c"): if engine == "c": self._engine = CParserWrapper(self.f, **self.options) + elif engine == "arrow": + self._engine = ArrowParserWrapper(self.f, **self.options) else: if engine == "python": klass = PythonParser @@ -1125,29 +1135,32 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - nrows = _validate_integer("nrows", nrows) - ret = self._engine.read(nrows) + if self.engine == "arrow": + return self._engine.read(nrows) + else: + nrows = _validate_integer("nrows", nrows) + ret = self._engine.read(nrows) - # May alter columns / col_dict - index, columns, col_dict = self._create_index(ret) + # May alter columns / col_dict + index, columns, col_dict = self._create_index(ret) - if index is None: - if col_dict: - # Any column is actually fine: - new_rows = len(next(iter(col_dict.values()))) - index = RangeIndex(self._currow, self._currow + new_rows) + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) + else: + new_rows = 0 else: - new_rows = 0 - else: - new_rows = len(index) + new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + df = DataFrame(col_dict, columns=columns, index=index) - self._currow += new_rows + self._currow += new_rows - if self.squeeze and len(df.columns) == 1: - return df[df.columns[0]].copy() - return df + if self.squeeze and len(df.columns) 
== 1: + return df[df.columns[0]].copy() + return df def _create_index(self, ret): index, columns, col_dict = ret @@ -2135,6 +2148,56 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): return values +class ArrowParserWrapper(ParserBase): + """ + + """ + + def __init__(self, src, **kwds): + self.kwds = kwds + self.src = src + kwds = kwds.copy() + + ParserBase.__init__(self, kwds) + + # #2442 + kwds["allow_leading_cols"] = self.index_col is not False + + # GH20529, validate usecol arg before TextReader + self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) + kwds["usecols"] = self.usecols + + self.names = kwds["names"] + + def read(self, nrows=None): + pyarrow = import_optional_dependency( + "pyarrow.csv", extra="pyarrow is required to use arrow engine" + ) + nrows = _validate_integer("nrows", nrows) + table = pyarrow.read_csv( + self.src, + read_options=pyarrow.ReadOptions( + skip_rows=self.kwds.get("skiprows"), column_names=self.names + ), + parse_options=pyarrow.ParseOptions( + delimiter=self.kwds.get("delimiter"), + quote_char=self.kwds.get("quotechar"), + ), + convert_options=pyarrow.ConvertOptions( + include_columns=self.usecols, column_types=self.kwds.get("dtype") + ), + ) + if nrows: + table = table[:nrows] + table_width = len(table.column_names) + if self.names is None: + if self.prefix: + self.names = [f"{self.prefix}{i}" for i in range(table_width)] + if self.names: + table = table.rename_columns(self.names) + return table.to_pandas() + + def TextParser(*args, **kwds): """ Converts lists of lists/tuples into DataFrames with proper type inference @@ -3336,7 +3399,6 @@ def _try_convert_dates(parser, colspec, data_dict, columns): def _clean_na_values(na_values, keep_default_na=True): - if na_values is None: if keep_default_na: na_values = STR_NA_VALUES From 8ae43e44cdbec134771173b69a5d4c1a2400504f Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 8 Feb 2020 21:01:26 -0800 Subject: [PATCH 02/95] fix failing test --- 
pandas/io/parsers.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f5c00f3f7d137..75da1d991dc9b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1135,7 +1135,7 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - if self.engine == "arrow": + if isinstance(self._engine, ArrowParserWrapper): return self._engine.read(nrows) else: nrows = _validate_integer("nrows", nrows) @@ -2165,9 +2165,6 @@ def __init__(self, src, **kwds): # GH20529, validate usecol arg before TextReader self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) - kwds["usecols"] = self.usecols - - self.names = kwds["names"] def read(self, nrows=None): pyarrow = import_optional_dependency( From 09074df84e42eec3e7f7dd1ae7c710af53b386cc Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 9 Feb 2020 10:01:55 -0800 Subject: [PATCH 03/95] formatting and revert unnecessary change --- pandas/io/parsers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 75da1d991dc9b..ad60b223daa06 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -167,7 +167,7 @@ If converters are specified, they will be applied INSTEAD of dtype conversion. engine : {{'c', 'python', 'arrow'}}, optional - Parser engine to use. The C and arrow engines are faster while the python engine is + Parser engine to use. The C and arrow engines are faster, while the python engine is currently more feature-complete. The arrow engine requires ``pyarrow`` as a dependency however. 
converters : dict, optional @@ -508,6 +508,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skip_blank_lines": True, } + _c_parser_defaults = { "delim_whitespace": False, "na_filter": True, From 6be276db8c7c5e1384bfb45591534176d2f6bfe5 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 9 Feb 2020 10:07:03 -0800 Subject: [PATCH 04/95] remove bloat and more formatting changes --- pandas/io/parsers.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index ad60b223daa06..6d8764fef385c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -881,8 +881,7 @@ def __init__(self, f, engine=None, **kwds): self._make_engine(self.engine) def close(self): - if self.engine != "arrow": - self._engine.close() + self._engine.close() def _get_options_with_defaults(self, engine): kwds = self.orig_options @@ -1089,7 +1088,7 @@ def _clean_options(self, options, engine): if engine == "arrow": if not is_integer(skiprows) and skiprows is not None: raise ValueError( - "skiprows argument must be integer when using arrow engine" + "skiprows argument must be an integer when using engine='arrow'" ) else: if is_integer(skiprows): From df4fa7e2ac359f7e25031f8f92d312049972d1ec Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 9 Feb 2020 10:25:25 -0800 Subject: [PATCH 05/95] Whatsnew --- doc/source/whatsnew/v1.1.0.rst | 4 +++- pandas/io/parsers.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 920919755dc23..2c4f5dcfbcde8 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -42,7 +42,9 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) -- +- :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing + if pyarrow>0.11 is installed. 
However, the pyarrow engine is less feature-complete than its "c" or + "python" counterparts. - .. --------------------------------------------------------------------------- diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 6d8764fef385c..938bafa780d89 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -708,6 +708,7 @@ def read_fwf( infer_nrows=100, **kwds, ): + r""" Read a table of fixed-width formatted lines into DataFrame. @@ -3396,6 +3397,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns): def _clean_na_values(na_values, keep_default_na=True): + if na_values is None: if keep_default_na: na_values = STR_NA_VALUES From ecaf3fd036d38dfd34e5d9a5de45304dbdfacca4 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 9 Feb 2020 16:35:32 -0800 Subject: [PATCH 06/95] Get tests up and running --- pandas/io/parsers.py | 12 +++++++----- pandas/tests/io/parser/conftest.py | 12 ++++++++++-- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 371660b19b171..43272ef2cf600 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -508,7 +508,6 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skip_blank_lines": True, } - _c_parser_defaults = { "delim_whitespace": False, "na_filter": True, @@ -522,7 +521,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} -_arrow_unsupported = {"skipfooter", "low_memory", "float_precision"} +_arrow_unsupported = {"skipfooter", "low_memory", "float_precision", "chunksize"} _python_unsupported = {"low_memory", "float_precision"} _deprecated_defaults: Dict[str, Any] = {} @@ -708,7 +707,6 @@ def read_fwf( infer_nrows=100, **kwds, ): - r""" Read a table of fixed-width formatted lines into DataFrame. 
@@ -947,7 +945,12 @@ def _clean_options(self, options, engine): sep = options["delimiter"] delim_whitespace = options["delim_whitespace"] - # C engine not supported yet + # arrow engine not supported yet + if engine == "arrow": + if options["chunksize"] is not None: + fallback_reason = f"the arrow engine does not support chunksize" + engine = "python" + # C and arrow engine not supported yet if engine == "c" or engine == "arrow": if options["skipfooter"] > 0: fallback_reason = f"the {engine} engine does not support skipfooter" @@ -3401,7 +3404,6 @@ def _try_convert_dates(parser, colspec, data_dict, columns): def _clean_na_values(na_values, keep_default_na=True): - if na_values is None: if keep_default_na: na_values = STR_NA_VALUES diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 15967e3be176a..751db1d22e8ae 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -44,6 +44,11 @@ class PythonParser(BaseParser): float_precision_choices = [None] +class ArrowParser(BaseParser): + engine = "arrow" + float_precision_choices = [None] + + @pytest.fixture def csv_dir_path(datapath): """ @@ -63,14 +68,17 @@ def csv1(csv_dir_path): _cParserHighMemory = CParserHighMemory() _cParserLowMemory = CParserLowMemory() _pythonParser = PythonParser() +_arrowParser = ArrowParser() _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_all_parsers = [*_c_parsers_only, *_py_parsers_only] +_arrow_parsers_only = [_arrowParser] +_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_arrow_parsers_only] _py_parser_ids = ["python"] _c_parser_ids = ["c_high", "c_low"] -_all_parser_ids = [*_c_parser_ids, *_py_parser_ids] +_arrow_parser_ids = ["arrow"] +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_arrow_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) From b3c328723bb997a675e31cd8db84d77d75afa4f7 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 10 
Feb 2020 07:26:58 -0800 Subject: [PATCH 07/95] Some fixes --- pandas/io/parsers.py | 45 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 43272ef2cf600..d3f40a6b9df2b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -947,7 +947,7 @@ def _clean_options(self, options, engine): # arrow engine not supported yet if engine == "arrow": - if options["chunksize"] is not None: + if self.chunksize is not None: fallback_reason = f"the arrow engine does not support chunksize" engine = "python" # C and arrow engine not supported yet @@ -1087,10 +1087,11 @@ def _clean_options(self, options, engine): na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) # handle skiprows; this is internally handled by the - # c-engine, so only need for python parser + # c-engine, so only need for python and arrow parsers if engine != "c": if engine == "arrow": if not is_integer(skiprows) and skiprows is not None: + # pyarrow expects skiprows to be passed as an integer raise ValueError( "skiprows argument must be an integer when using engine='arrow'" ) @@ -1131,7 +1132,7 @@ def _make_engine(self, engine="c"): else: raise ValueError( f"Unknown engine: {engine} (valid options " - 'are "c", "python", or "python-fwf")' + 'are "c", "python", "arrow", or "python-fwf")' ) self._engine = klass(self.f, **self.options) @@ -1139,32 +1140,31 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - if isinstance(self._engine, ArrowParserWrapper): + nrows = _validate_integer("nrows", nrows) + if self.engine == "arrow": return self._engine.read(nrows) - else: - nrows = _validate_integer("nrows", nrows) - ret = self._engine.read(nrows) + ret = self._engine.read(nrows) - # May alter columns / col_dict - index, columns, col_dict = self._create_index(ret) + # May alter columns / col_dict + index, columns, col_dict = self._create_index(ret) - 
if index is None: - if col_dict: - # Any column is actually fine: - new_rows = len(next(iter(col_dict.values()))) - index = RangeIndex(self._currow, self._currow + new_rows) - else: - new_rows = 0 + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) else: - new_rows = len(index) + new_rows = 0 + else: + new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + df = DataFrame(col_dict, columns=columns, index=index) - self._currow += new_rows + self._currow += new_rows - if self.squeeze and len(df.columns) == 1: - return df[df.columns[0]].copy() - return df + if self.squeeze and len(df.columns) == 1: + return df[df.columns[0]].copy() + return df def _create_index(self, ret): index, columns, col_dict = ret @@ -2178,7 +2178,6 @@ def read(self, nrows=None): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use arrow engine" ) - nrows = _validate_integer("nrows", nrows) table = pyarrow.read_csv( self.src, read_options=pyarrow.ReadOptions( From 474baf4c83ee28330ef38b426f09617d2f8cfc9e Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 10 Feb 2020 20:35:38 -0800 Subject: [PATCH 08/95] Add asvs and xfail some tests --- asv_bench/benchmarks/io/csv.py | 10 ++++++++++ pandas/io/parsers.py | 8 +++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 9bcd125f56bbb..89c81a937090b 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -254,6 +254,16 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): names=list("abc"), ) + def time_read_csv_arrow_engine(self, sep, decimal, float_precision): + read_csv( + self.data(self.StringIO_input), + sep=sep, + header=None, + engine="arrow", + float_precision=None, + names=list("abc"), + ) + class ReadCSVCategorical(BaseIO): diff --git 
a/pandas/io/parsers.py b/pandas/io/parsers.py index d3f40a6b9df2b..dd2155d2d735b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -521,7 +521,13 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} -_arrow_unsupported = {"skipfooter", "low_memory", "float_precision", "chunksize"} +_arrow_unsupported = { + "skipfooter", + "low_memory", + "float_precision", + "chunksize", + "comment", +} _python_unsupported = {"low_memory", "float_precision"} _deprecated_defaults: Dict[str, Any] = {} From 2cd993771b6c07a8144c8472c710e164410c8e37 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 19 Feb 2020 16:57:52 -0800 Subject: [PATCH 09/95] address comments --- asv_bench/benchmarks/io/csv.py | 4 +- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/io/parsers.py | 63 +++++++++++++++++++----------- pandas/tests/io/parser/conftest.py | 14 +++---- 4 files changed, 50 insertions(+), 33 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 89c81a937090b..a4e6f94f326ba 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -254,12 +254,12 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): names=list("abc"), ) - def time_read_csv_arrow_engine(self, sep, decimal, float_precision): + def time_read_csv_pyarrow_engine(self, sep, decimal, float_precision): read_csv( self.data(self.StringIO_input), sep=sep, header=None, - engine="arrow", + engine="pyarrow", float_precision=None, names=list("abc"), ) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index fc0e486978ffb..297c561557053 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -43,7 +43,7 @@ Other enhancements - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) - :func:`pandas.read_csv` now accepts 
engine="arrow" as an argument, allowing for faster csv parsing - if pyarrow>0.11 is installed. However, the pyarrow engine is less feature-complete than its "c" or + if pyarrow>0.13 is installed. However, the pyarrow engine is less feature-complete than its "c" or "python" counterparts. - diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dd2155d2d735b..59678d675b0b1 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,7 +5,7 @@ from collections import abc, defaultdict import csv import datetime -from io import BufferedIOBase, RawIOBase, StringIO, TextIOWrapper +from io import BufferedIOBase, BytesIO, RawIOBase, StringIO, TextIOWrapper import re import sys from textwrap import fill @@ -166,10 +166,11 @@ to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. -engine : {{'c', 'python', 'arrow'}}, optional - Parser engine to use. The C and arrow engines are faster, while the python engine is - currently more feature-complete. The arrow engine requires ``pyarrow`` +engine : {{'c', 'python', 'pyarrow'}}, optional + Parser engine to use. The C and pyarrow engines are faster, while the python engine + is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 as a dependency however. + .. versionchanged(1.1) converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. 
@@ -521,9 +522,8 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} -_arrow_unsupported = { +_pyarrow_unsupported = { "skipfooter", - "low_memory", "float_precision", "chunksize", "comment", @@ -951,20 +951,29 @@ def _clean_options(self, options, engine): sep = options["delimiter"] delim_whitespace = options["delim_whitespace"] - # arrow engine not supported yet - if engine == "arrow": - if self.chunksize is not None: - fallback_reason = f"the arrow engine does not support chunksize" - engine = "python" - # C and arrow engine not supported yet - if engine == "c" or engine == "arrow": + # pyarrow engine not supported yet + if engine == "pyarrow": + for option in _pyarrow_unsupported: + if option != "chunksize" and option != "skipfooter": + if options[option] is not None: + fallback_reason = ( + f"the pyarrow engine does not support the {option} argumnet" + ) + engine = "python" + else: + if self.chunksize is not None: + fallback_reason = ( + "the pyarrow engine does not support using chunksize" + ) + # C and pyarrow engine not supported yet + if engine == "c" or "pyarrow": if options["skipfooter"] > 0: fallback_reason = f"the {engine} engine does not support skipfooter" engine = "python" encoding = sys.getfilesystemencoding() or "utf-8" if sep is None and not delim_whitespace: - if engine == "c" or engine == "arrow": + if engine == "c" or engine == "pyarrow": fallback_reason = ( f"the {engine} engine does not support " "sep=None with delim_whitespace=False" @@ -1093,13 +1102,14 @@ def _clean_options(self, options, engine): na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) # handle skiprows; this is internally handled by the - # c-engine, so only need for python and arrow parsers + # c-engine, so only need for python and pyarrow parsers if engine != "c": - if engine == "arrow": + if engine == "pyarrow": if not is_integer(skiprows) and 
skiprows is not None: # pyarrow expects skiprows to be passed as an integer raise ValueError( - "skiprows argument must be an integer when using engine='arrow'" + "skiprows argument must be an integer when using " + "engine='pyarrow'" ) else: if is_integer(skiprows): @@ -2164,7 +2174,7 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): class ArrowParserWrapper(ParserBase): """ - + Wrapper for the pyarrow engine for pd.read_csv() """ def __init__(self, src, **kwds): @@ -2174,12 +2184,13 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - # #2442 - kwds["allow_leading_cols"] = self.index_col is not False + encoding = kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8" - # GH20529, validate usecol arg before TextReader self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) + if isinstance(self.src, StringIO): + self.src = BytesIO(self.src.getvalue().encode(encoding)) + def read(self, nrows=None): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use arrow engine" @@ -2197,12 +2208,18 @@ def read(self, nrows=None): include_columns=self.usecols, column_types=self.kwds.get("dtype") ), ) - if nrows: - table = table[:nrows] + table_width = len(table.column_names) if self.names is None: if self.prefix: self.names = [f"{self.prefix}{i}" for i in range(table_width)] + elif self.header is not None: + if self.header == "infer": + header = 0 + else: + header = self.header + self.names = table[header] + del table[header] if self.names: table = table.rename_columns(self.names) return table.to_pandas() diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 751db1d22e8ae..327f87303aeb0 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -44,8 +44,8 @@ class PythonParser(BaseParser): float_precision_choices = [None] -class ArrowParser(BaseParser): - engine = "arrow" +class PyArrowParser(BaseParser): + 
engine = "pyarrow" float_precision_choices = [None] @@ -68,17 +68,17 @@ def csv1(csv_dir_path): _cParserHighMemory = CParserHighMemory() _cParserLowMemory = CParserLowMemory() _pythonParser = PythonParser() -_arrowParser = ArrowParser() +_pyarrowParser = PyArrowParser() _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_arrow_parsers_only = [_arrowParser] -_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_arrow_parsers_only] +_pyarrow_parsers_only = [_pyarrowParser] +_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _py_parser_ids = ["python"] _c_parser_ids = ["c_high", "c_low"] -_arrow_parser_ids = ["arrow"] -_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_arrow_parser_ids] +_pyarrow_parser_ids = ["pyarrow"] +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) From 3d15a5660d7779eb7638875a33882b3e9103b190 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 20 Feb 2020 10:57:11 -0800 Subject: [PATCH 10/95] fix typo --- pandas/io/parsers.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 59678d675b0b1..4d31ca3230df6 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1138,7 +1138,7 @@ def __next__(self): def _make_engine(self, engine="c"): if engine == "c": self._engine = CParserWrapper(self.f, **self.options) - elif engine == "arrow": + elif engine == "pyarrow": self._engine = ArrowParserWrapper(self.f, **self.options) else: if engine == "python": @@ -1157,7 +1157,7 @@ def _failover_to_python(self): def read(self, nrows=None): nrows = _validate_integer("nrows", nrows) - if self.engine == "arrow": + if self.engine == "pyarrow": return self._engine.read(nrows) ret = self._engine.read(nrows) @@ -2208,21 +2208,19 @@ def read(self, nrows=None): include_columns=self.usecols, 
column_types=self.kwds.get("dtype") ), ) - + frame = table.to_pandas() table_width = len(table.column_names) if self.names is None: if self.prefix: self.names = [f"{self.prefix}{i}" for i in range(table_width)] - elif self.header is not None: - if self.header == "infer": - header = 0 - else: - header = self.header - self.names = table[header] - del table[header] + elif self.header is not None and self.header != "infer": + header = self.header + self.names = frame.iloc[header] + frame = frame.drop(header, axis=0) + if self.names: - table = table.rename_columns(self.names) - return table.to_pandas() + frame = frame.rename(self.names, axis="columns") + return frame def TextParser(*args, **kwds): From 98aa134d85044ab84adade39f66639777d971eed Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 29 Feb 2020 08:59:43 -0800 Subject: [PATCH 11/95] some fixes --- pandas/io/parsers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3ab847ebd7e04..dbd55f2015d1c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2195,7 +2195,9 @@ def read(self, nrows=None): table = pyarrow.read_csv( self.src, read_options=pyarrow.ReadOptions( - skip_rows=self.kwds.get("skiprows"), column_names=self.names + skip_rows=self.kwds.get("skiprows"), + column_names=self.names, + autogenerate_column_names=True if self.header != 0 else False, ), parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), @@ -2215,8 +2217,7 @@ def read(self, nrows=None): self.names = frame.iloc[header] frame = frame.drop(header, axis=0) - if self.names: - frame = frame.rename(self.names, axis="columns") + frame = frame.rename(zip(frame.names, self.names), axis="columns") return frame From b9c6d2c0a2b177c12c94b30f7c1395d77d1d0242 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 4 Apr 2020 19:42:14 -0700 Subject: [PATCH 12/95] Fix bug --- pandas/io/parsers.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 
insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dbd55f2015d1c..ac7658d5b3772 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -526,6 +526,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "float_precision", "chunksize", "comment", + "nrows", } _python_unsupported = {"low_memory", "float_precision"} @@ -952,7 +953,11 @@ def _clean_options(self, options, engine): # pyarrow engine not supported yet if engine == "pyarrow": for option in _pyarrow_unsupported: - if option != "chunksize" and option != "skipfooter": + if ( + option != "chunksize" + and option != "skipfooter" + and option != "nrows" + ): if options[option] is not None: fallback_reason = ( f"the pyarrow engine does not support the {option} argumnet" @@ -963,6 +968,10 @@ def _clean_options(self, options, engine): fallback_reason = ( "the pyarrow engine does not support using chunksize" ) + if self.nrows is not None: + fallback_reason = ( + "the pyarrow engine does not support using skipfooter" + ) # C and pyarrow engine not supported yet if engine == "c" or "pyarrow": if options["skipfooter"] > 0: @@ -2171,7 +2180,7 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): class ArrowParserWrapper(ParserBase): """ - Wrapper for the pyarrow engine for pd.read_csv() + Wrapper for the pyarrow engine for read_csv() """ def __init__(self, src, **kwds): @@ -2208,16 +2217,22 @@ def read(self, nrows=None): ), ) frame = table.to_pandas() - table_width = len(table.column_names) + num_cols = len(frame.columns) if self.names is None: if self.prefix: - self.names = [f"{self.prefix}{i}" for i in range(table_width)] - elif self.header is not None and self.header != "infer": + self.names = [f"{self.prefix}{i}" for i in range(num_cols)] + frame = frame.rename( + dict(zip(frame.columns, self.names), axis="columns") + ) + elif self.header != 0: header = self.header self.names = frame.iloc[header] frame = frame.drop(header, axis=0) - - frame = 
frame.rename(zip(frame.names, self.names), axis="columns") + frame = frame.rename( + dict(zip(frame.columns, self.names), axis="columns") + ) + if self.kwds.get("squeeze"): + frame = frame.squeeze() return frame From 7f891a64d8887d69ca435d6b7093a81239ca95f3 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 10 Apr 2020 11:02:05 -0700 Subject: [PATCH 13/95] New benchmark and fix more tests --- asv_bench/benchmarks/io/csv.py | 37 ++++++++++------- pandas/io/parsers.py | 73 ++++++++++++++++++++++------------ 2 files changed, 71 insertions(+), 39 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index a4e6f94f326ba..047fc1fe5f7f7 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -10,7 +10,6 @@ class ToCSV(BaseIO): - fname = "__test__.csv" params = ["wide", "long", "mixed"] param_names = ["kind"] @@ -43,7 +42,6 @@ def time_frame(self, kind): class ToCSVDatetime(BaseIO): - fname = "__test__.csv" def setup(self): @@ -55,7 +53,6 @@ def time_frame_date_formatting(self): class ToCSVDatetimeBig(BaseIO): - fname = "__test__.csv" timeout = 1500 params = [1000, 10000, 100000] @@ -83,7 +80,6 @@ def data(self, stringio_object): class ReadCSVDInferDatetimeFormat(StringIORewind): - params = ([True, False], ["custom", "iso8601", "ymd"]) param_names = ["infer_datetime_format", "format"] @@ -108,7 +104,6 @@ def time_read_csv(self, infer_datetime_format, format): class ReadCSVConcatDatetime(StringIORewind): - iso8601 = "%Y-%m-%d %H:%M:%S" def setup(self): @@ -126,7 +121,6 @@ def time_read_csv(self): class ReadCSVConcatDatetimeBadDateValue(StringIORewind): - params = (["nan", "0", ""],) param_names = ["bad_date_value"] @@ -144,7 +138,6 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): - fname = "__test__.csv" params = [None, 10000] param_names = ["skiprows"] @@ -190,7 +183,6 @@ def time_read_uint64_na_values(self): class ReadCSVThousands(BaseIO): - fname = "__test__.csv" params = ([",", 
"|"], [None, ","]) param_names = ["sep", "thousands"] @@ -222,7 +214,6 @@ def time_comment(self): class ReadCSVFloatPrecision(StringIORewind): - params = ([",", ";"], [".", "_"], [None, "high", "round_trip"]) param_names = ["sep", "decimal", "float_precision"] @@ -254,19 +245,38 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): names=list("abc"), ) - def time_read_csv_pyarrow_engine(self, sep, decimal, float_precision): + def time_read_csv_arrow(self, sep): + read_csv( + self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), + ) + + +class ReadCSVEngine(StringIORewind): + def setup(self): + data = ["A,B,C"] + (["1,2,3"] * 100000) + self.StringIO_input = StringIO("\n".join(data)) + + def time_read_csv_c(self, sep): + read_csv( + self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), + ) + + def time_read_csv_arrow(self, sep): + read_csv( + self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), + ) + + def time_read_csv_python_engine(self, sep): read_csv( self.data(self.StringIO_input), sep=sep, header=None, - engine="pyarrow", - float_precision=None, + engine="python", names=list("abc"), ) class ReadCSVCategorical(BaseIO): - fname = "__test__.csv" def setup(self): @@ -335,7 +345,6 @@ def time_read_csv_cached(self, do_cache): class ReadCSVMemoryGrowth(BaseIO): - chunksize = 20 num_rows = 1000 fname = "__test__.csv" diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f17c1008e29a5..175dccf0633df 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,7 +5,7 @@ from collections import abc, defaultdict import csv import datetime -from io import BytesIO, StringIO, TextIOWrapper +from io import StringIO, TextIOBase, TextIOWrapper import itertools import re import sys @@ -172,7 +172,7 @@ Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 as a dependency however. 
- .. versionchanged(1.1) + .. versionchanged:: (1.1) converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. @@ -1167,27 +1167,28 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - nrows = _validate_integer("nrows", nrows) if self.engine == "pyarrow": - return self._engine.read(nrows) - ret = self._engine.read(nrows) + df = self._engine.read() + else: + nrows = _validate_integer("nrows", nrows) + ret = self._engine.read(nrows) - # May alter columns / col_dict - index, columns, col_dict = self._create_index(ret) + # May alter columns / col_dict + index, columns, col_dict = self._create_index(ret) - if index is None: - if col_dict: - # Any column is actually fine: - new_rows = len(next(iter(col_dict.values()))) - index = RangeIndex(self._currow, self._currow + new_rows) + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) + else: + new_rows = 0 else: - new_rows = 0 - else: - new_rows = len(index) + new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + df = DataFrame(col_dict, columns=columns, index=index) - self._currow += new_rows + self._currow += new_rows if self.squeeze and len(df.columns) == 1: return df[df.columns[0]].copy() @@ -2231,6 +2232,19 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): return values +class BytesIOWrapper: + def __init__(self, string_buffer, encoding="utf-8"): + self.string_buffer = string_buffer + self.encoding = encoding + + def __getattr__(self, attr): + return getattr(self.string_buffer, attr) + + def read(self, size=-1): + content = self.string_buffer.read(size) + return content.encode(self.encoding) + + class ArrowParserWrapper(ParserBase): """ Wrapper for the pyarrow engine for read_csv() @@ -2247,10 +2261,10 @@ def __init__(self, src, 
**kwds): self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) - if isinstance(self.src, StringIO): - self.src = BytesIO(self.src.getvalue().encode(encoding)) + if isinstance(self.src, TextIOBase): + self.src = BytesIOWrapper(self.src, encoding=encoding) - def read(self, nrows=None): + def read(self): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use arrow engine" ) @@ -2259,7 +2273,9 @@ def read(self, nrows=None): read_options=pyarrow.ReadOptions( skip_rows=self.kwds.get("skiprows"), column_names=self.names, - autogenerate_column_names=True if self.header != 0 else False, + autogenerate_column_names=True + if self.header != 0 or self.kwds.get("skiprows") != set() + else False, ), parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), @@ -2277,15 +2293,22 @@ def read(self, nrows=None): frame = frame.rename( dict(zip(frame.columns, self.names), axis="columns") ) - elif self.header != 0: + elif self.header is not None and self.header != 0: header = self.header self.names = frame.iloc[header] frame = frame.drop(header, axis=0) frame = frame.rename( - dict(zip(frame.columns, self.names), axis="columns") + columns=dict(zip(frame.columns, self.names), axis="columns") ) - if self.kwds.get("squeeze"): - frame = frame.squeeze() + elif self.header is None: + self.names = range(len(frame.columns)) + frame = frame.rename( + columns=dict(zip(frame.columns, self.names), axis="columns") + ) + + index_col = self.kwds.get("index_col")[0] # flatten list w/ 1 elem + if index_col is not None: + frame.set_index(frame.columns[index_col], drop=True, inplace=True) return frame From 23425f7be4840ac48ff35058ae9a64d064628537 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 10 Apr 2020 15:27:33 -0700 Subject: [PATCH 14/95] More cleanups --- asv_bench/benchmarks/io/csv.py | 22 +++++++--------------- doc/source/whatsnew/v1.1.0.rst | 6 +++--- pandas/io/parsers.py | 7 +++---- 3 files changed, 13 insertions(+), 22 
deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 047fc1fe5f7f7..b7d7c4e8c120a 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -245,7 +245,7 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): names=list("abc"), ) - def time_read_csv_arrow(self, sep): + def time_read_csv_arrow(self, sep, decimal, float_precision): read_csv( self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), ) @@ -256,23 +256,15 @@ def setup(self): data = ["A,B,C"] + (["1,2,3"] * 100000) self.StringIO_input = StringIO("\n".join(data)) - def time_read_csv_c(self, sep): - read_csv( - self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), - ) + def time_read_csv_c(self): + read_csv(self.data(self.StringIO_input)) - def time_read_csv_arrow(self, sep): - read_csv( - self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), - ) + def time_read_csv_arrow(self): + read_csv(self.data(self.StringIO_input), engine="pyarrow") - def time_read_csv_python_engine(self, sep): + def time_read_csv_python_engine(self): read_csv( - self.data(self.StringIO_input), - sep=sep, - header=None, - engine="python", - names=list("abc"), + self.data(self.StringIO_input), engine="python", ) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 4c44e35169ba7..b60a79a239628 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -88,9 +88,6 @@ Other enhancements - :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`). 
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the object inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`) -- :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing - if pyarrow>0.13 is installed. However, the pyarrow engine is less feature-complete than its "c" or - "python" counterparts. (:issue:`23697`) .. --------------------------------------------------------------------------- @@ -412,6 +409,9 @@ I/O - Bug in :meth:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`) - Bug in :meth:`read_csv` was raising an ``IndexError`` when header=None and 2 extra data columns - Bug in :meth:`DataFrame.to_sql` where an ``AttributeError`` was raised when saving an out of bounds date (:issue:`26761`) +- :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing + if pyarrow>0.13 is installed. However, the pyarrow engine is less feature-complete than its "c" or + "python" counterparts. (:issue:`23697`) Plotting ^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 175dccf0633df..455b7f748102d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -172,6 +172,7 @@ Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 as a dependency however. + .. versionchanged:: (1.1) converters : dict, optional Dict of functions for converting values in certain columns. 
Keys can either @@ -2266,16 +2267,14 @@ def __init__(self, src, **kwds): def read(self): pyarrow = import_optional_dependency( - "pyarrow.csv", extra="pyarrow is required to use arrow engine" + "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" ) table = pyarrow.read_csv( self.src, read_options=pyarrow.ReadOptions( skip_rows=self.kwds.get("skiprows"), column_names=self.names, - autogenerate_column_names=True - if self.header != 0 or self.kwds.get("skiprows") != set() - else False, + autogenerate_column_names=True if self.header != 0 else False, ), parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), From 01c03942b61f4ab38cf4712c4d078a52c4f27939 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 10 Apr 2020 19:46:34 -0700 Subject: [PATCH 15/95] Formatting fixes and typo correction --- asv_bench/benchmarks/io/csv.py | 9 +++++++++ doc/source/whatsnew/v1.1.0.rst | 2 ++ pandas/io/parsers.py | 6 +++--- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index b7d7c4e8c120a..8dec39091e322 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -10,6 +10,7 @@ class ToCSV(BaseIO): + fname = "__test__.csv" params = ["wide", "long", "mixed"] param_names = ["kind"] @@ -42,6 +43,7 @@ def time_frame(self, kind): class ToCSVDatetime(BaseIO): + fname = "__test__.csv" def setup(self): @@ -53,6 +55,7 @@ def time_frame_date_formatting(self): class ToCSVDatetimeBig(BaseIO): + fname = "__test__.csv" timeout = 1500 params = [1000, 10000, 100000] @@ -80,6 +83,7 @@ def data(self, stringio_object): class ReadCSVDInferDatetimeFormat(StringIORewind): + params = ([True, False], ["custom", "iso8601", "ymd"]) param_names = ["infer_datetime_format", "format"] @@ -104,6 +108,7 @@ def time_read_csv(self, infer_datetime_format, format): class ReadCSVConcatDatetime(StringIORewind): + iso8601 = "%Y-%m-%d %H:%M:%S" def setup(self): @@ -121,6 +126,7 @@ def 
time_read_csv(self): class ReadCSVConcatDatetimeBadDateValue(StringIORewind): + params = (["nan", "0", ""],) param_names = ["bad_date_value"] @@ -138,6 +144,7 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): + fname = "__test__.csv" params = [None, 10000] param_names = ["skiprows"] @@ -183,6 +190,7 @@ def time_read_uint64_na_values(self): class ReadCSVThousands(BaseIO): + fname = "__test__.csv" params = ([",", "|"], [None, ","]) param_names = ["sep", "thousands"] @@ -214,6 +222,7 @@ def time_comment(self): class ReadCSVFloatPrecision(StringIORewind): + params = ([",", ";"], [".", "_"], [None, "high", "round_trip"]) param_names = ["sep", "decimal", "float_precision"] diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 690df648ceada..1704f3c096801 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -527,6 +527,8 @@ I/O - :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing if pyarrow>0.13 is installed. However, the pyarrow engine is less feature-complete than its "c" or "python" counterparts. 
(:issue:`23697`) + + Plotting ^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 455b7f748102d..0cf148366cc1c 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -975,7 +975,7 @@ def _clean_options(self, options, engine): ) if self.nrows is not None: fallback_reason = ( - "the pyarrow engine does not support using skipfooter" + "the pyarrow engine does not support using nrows" ) # C and pyarrow engine not supported yet if engine == "c" or "pyarrow": @@ -2305,9 +2305,9 @@ def read(self): columns=dict(zip(frame.columns, self.names), axis="columns") ) - index_col = self.kwds.get("index_col")[0] # flatten list w/ 1 elem + index_col = self.kwds.get("index_col") # need to flatten since returns list if index_col is not None: - frame.set_index(frame.columns[index_col], drop=True, inplace=True) + frame.set_index(frame.columns[index_col[0]], drop=True, inplace=True) return frame From ba5620ff84a14baa0814f96d2499b652a30afdd8 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 11 Apr 2020 17:22:45 -0700 Subject: [PATCH 16/95] skip pyarrow tests if not installed --- asv_bench/benchmarks/io/csv.py | 1 + pandas/tests/io/parser/conftest.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 8dec39091e322..fef4fee047862 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -346,6 +346,7 @@ def time_read_csv_cached(self, do_cache): class ReadCSVMemoryGrowth(BaseIO): + chunksize = 20 num_rows = 1000 fname = "__test__.csv" diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 327f87303aeb0..87a34d728bc60 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -1,4 +1,5 @@ import os +import pkgutil from typing import List, Optional import pytest @@ -73,12 +74,17 @@ def csv1(csv_dir_path): _py_parsers_only = [_pythonParser] _c_parsers_only = 
[_cParserHighMemory, _cParserLowMemory] _pyarrow_parsers_only = [_pyarrowParser] -_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _py_parser_ids = ["python"] _c_parser_ids = ["c_high", "c_low"] _pyarrow_parser_ids = ["pyarrow"] -_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] + +if pkgutil.find_loader("pyarrow"): + _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] + _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] +else: + _all_parsers = [*_c_parsers_only, *_py_parsers_only] + _all_parser_ids = [*_c_parser_ids, *_py_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) From 2570c823f28eb722435929dd86ccfdfb2ff1a37b Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 11 Apr 2020 17:31:51 -0700 Subject: [PATCH 17/95] Address comments --- pandas/io/parsers.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 0cf148366cc1c..235cefd82f2d5 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -173,7 +173,8 @@ is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 as a dependency however. - .. versionchanged:: (1.1) + .. versionchanged:: 1.1 + The "pyarrow" engine was added. converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. 
@@ -958,11 +959,7 @@ def _clean_options(self, options, engine): # pyarrow engine not supported yet if engine == "pyarrow": for option in _pyarrow_unsupported: - if ( - option != "chunksize" - and option != "skipfooter" - and option != "nrows" - ): + if option not in ["chunksize", "skipfooter", "nrows"]: if options[option] is not None: fallback_reason = ( f"the pyarrow engine does not support the {option} argumnet" @@ -2274,11 +2271,12 @@ def read(self): read_options=pyarrow.ReadOptions( skip_rows=self.kwds.get("skiprows"), column_names=self.names, - autogenerate_column_names=True if self.header != 0 else False, + autogenerate_column_names=False if self.header == 0 else True, ), parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), quote_char=self.kwds.get("quotechar"), + ignore_empty_lines=self.kwds.get("skip_blank_lines"), ), convert_options=pyarrow.ConvertOptions( include_columns=self.usecols, column_types=self.kwds.get("dtype") @@ -2289,21 +2287,15 @@ def read(self): if self.names is None: if self.prefix: self.names = [f"{self.prefix}{i}" for i in range(num_cols)] - frame = frame.rename( - dict(zip(frame.columns, self.names), axis="columns") - ) + frame.columns = self.names elif self.header is not None and self.header != 0: header = self.header self.names = frame.iloc[header] frame = frame.drop(header, axis=0) - frame = frame.rename( - columns=dict(zip(frame.columns, self.names), axis="columns") - ) + frame.columns = self.names elif self.header is None: - self.names = range(len(frame.columns)) - frame = frame.rename( - columns=dict(zip(frame.columns, self.names), axis="columns") - ) + self.names = range(num_cols) + frame.columns = self.names index_col = self.kwds.get("index_col") # need to flatten since returns list if index_col is not None: From b3a1f6628879b8df819c82bc75686d6fd89f42d2 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 14 Apr 2020 14:24:28 -0700 Subject: [PATCH 18/95] Get some more tests to pass --- 
asv_bench/benchmarks/io/csv.py | 2 +- pandas/io/parsers.py | 45 ++++++++++++++++----------- pandas/tests/io/parser/test_common.py | 1 + 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index fef4fee047862..55bc8d35af432 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -262,7 +262,7 @@ def time_read_csv_arrow(self, sep, decimal, float_precision): class ReadCSVEngine(StringIORewind): def setup(self): - data = ["A,B,C"] + (["1,2,3"] * 100000) + data = ["A,B,C"] + (["1,2,3"] * 1000000) self.StringIO_input = StringIO("\n".join(data)) def time_read_csv_c(self): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 235cefd82f2d5..444582cbe723c 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -531,6 +531,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "chunksize", "comment", "nrows", + "thousands", } _python_unsupported = {"low_memory", "float_precision"} @@ -959,12 +960,11 @@ def _clean_options(self, options, engine): # pyarrow engine not supported yet if engine == "pyarrow": for option in _pyarrow_unsupported: - if option not in ["chunksize", "skipfooter", "nrows"]: + if option not in ["chunksize", "nrows"]: if options[option] is not None: fallback_reason = ( f"the pyarrow engine does not support the {option} argumnet" ) - engine = "python" else: if self.chunksize is not None: fallback_reason = ( @@ -974,10 +974,10 @@ def _clean_options(self, options, engine): fallback_reason = ( "the pyarrow engine does not support using nrows" ) - # C and pyarrow engine not supported yet - if engine == "c" or "pyarrow": + # C engine not supported yet + if engine == "c": if options["skipfooter"] > 0: - fallback_reason = f"the {engine} engine does not support skipfooter" + fallback_reason = f"the 'c' engine does not support skipfooter" engine = "python" encoding = sys.getfilesystemencoding() or "utf-8" @@ -1157,7 +1157,7 @@ def 
_make_engine(self, engine="c"): else: raise ValueError( f"Unknown engine: {engine} (valid options " - 'are "c", "python", "arrow", or "python-fwf")' + 'are "c", "python", "pyarrow", or "python-fwf")' ) self._engine = klass(self.f, **self.options) @@ -2266,13 +2266,24 @@ def read(self): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" ) - table = pyarrow.read_csv( - self.src, - read_options=pyarrow.ReadOptions( + try: + read_options = pyarrow.ReadOptions( skip_rows=self.kwds.get("skiprows"), - column_names=self.names, autogenerate_column_names=False if self.header == 0 else True, - ), + ) + except TypeError as e: + msg = "__init__() got an unexpected keyword argument" + if msg in str(e): + raise ImportError( + "Pyarrow version >= 0.15.0 is needed in order " + "to use skiprows kwarg with engine=pyarrow. " + "Please upgrade Pyarrow or switch engines." + ) + else: + raise e + table = pyarrow.read_csv( + self.src, + read_options=read_options, parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), quote_char=self.kwds.get("quotechar"), @@ -2287,17 +2298,13 @@ def read(self): if self.names is None: if self.prefix: self.names = [f"{self.prefix}{i}" for i in range(num_cols)] - frame.columns = self.names elif self.header is not None and self.header != 0: - header = self.header - self.names = frame.iloc[header] - frame = frame.drop(header, axis=0) - frame.columns = self.names + self.names = frame.iloc[self.header] + frame = frame.drop(self.header, axis=0) elif self.header is None: self.names = range(num_cols) - frame.columns = self.names - - index_col = self.kwds.get("index_col") # need to flatten since returns list + frame.columns = self.names + index_col = self.index_col # need to flatten since returns list if index_col is not None: frame.set_index(frame.columns[index_col[0]], drop=True, inplace=True) return frame diff --git a/pandas/tests/io/parser/test_common.py 
b/pandas/tests/io/parser/test_common.py index 5bf9587a6ca22..f27178cdc429f 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -63,6 +63,7 @@ def _set_noconvert_columns(self): "parse_dates": parse_dates, "delimiter": ",", } + parser.engine = "c" parser._engine = MyCParserWrapper(StringIO(data), **parser.options) result = parser.read() From d46ceed07a5197cc24748e09a92c3b8199ce7fa3 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 16 Apr 2020 20:20:22 -0700 Subject: [PATCH 19/95] Fix some bugs and cleanups --- pandas/io/parsers.py | 113 ++++++++++++++++++++++++++++++++----------- 1 file changed, 85 insertions(+), 28 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 444582cbe723c..39ee43f905950 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -532,6 +532,24 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "comment", "nrows", "thousands", + "memory_map", + "dialect", + "warn_bad_lines", + "error_bad_lines", + "delim_whitespace", + "quoting", + "lineterminator", + "converters", + "decimal", + "iterator", + "cache_dates", + "dayfirst", + "keep_date_col", + "infer_datetime_format", + "verbose", + "skipinitialspace", + "date_parser", + "cache_dates", } _python_unsupported = {"low_memory", "float_precision"} @@ -902,6 +920,16 @@ def _get_options_with_defaults(self, engine): for argname, default in _parser_defaults.items(): value = kwds.get(argname, default) + if argname in _pyarrow_unsupported: + if engine == "pyarrow" and value != default: + raise ValueError( + f"The {repr(argname)} option is not supported with the " + f"{repr(engine)} engine" + ) + if argname == "iterator" and engine == "pyarrow": + raise ValueError( + "The iterator option is not supported with the" "pyarrow engine" + ) # see gh-12935 if argname == "mangle_dupe_cols" and not value: raise ValueError("Setting mangle_dupe_cols=False is not supported yet") @@ -957,27 +985,10 @@ def _clean_options(self, options, 
engine): sep = options["delimiter"] delim_whitespace = options["delim_whitespace"] - # pyarrow engine not supported yet - if engine == "pyarrow": - for option in _pyarrow_unsupported: - if option not in ["chunksize", "nrows"]: - if options[option] is not None: - fallback_reason = ( - f"the pyarrow engine does not support the {option} argumnet" - ) - else: - if self.chunksize is not None: - fallback_reason = ( - "the pyarrow engine does not support using chunksize" - ) - if self.nrows is not None: - fallback_reason = ( - "the pyarrow engine does not support using nrows" - ) # C engine not supported yet if engine == "c": if options["skipfooter"] > 0: - fallback_reason = f"the 'c' engine does not support skipfooter" + fallback_reason = "the 'c' engine does not support skipfooter" engine = "python" encoding = sys.getfilesystemencoding() or "utf-8" @@ -2251,13 +2262,16 @@ class ArrowParserWrapper(ParserBase): def __init__(self, src, **kwds): self.kwds = kwds self.src = src - kwds = kwds.copy() + # kwds = kwds.copy() ParserBase.__init__(self, kwds) encoding = kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8" self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) + self.na_values = _clean_na_values( + kwds["na_values"], keep_default_na=kwds["keep_default_na"] + ) if isinstance(self.src, TextIOBase): self.src = BytesIOWrapper(self.src, encoding=encoding) @@ -2268,8 +2282,7 @@ def read(self): ) try: read_options = pyarrow.ReadOptions( - skip_rows=self.kwds.get("skiprows"), - autogenerate_column_names=False if self.header == 0 else True, + skip_rows=self.kwds.get("skiprows"), autogenerate_column_names=True, ) except TypeError as e: msg = "__init__() got an unexpected keyword argument" @@ -2287,10 +2300,14 @@ def read(self): parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), quote_char=self.kwds.get("quotechar"), + escape_char=self.kwds.get("escapechar"), ignore_empty_lines=self.kwds.get("skip_blank_lines"), ), 
convert_options=pyarrow.ConvertOptions( - include_columns=self.usecols, column_types=self.kwds.get("dtype") + include_columns=self.usecols, + null_values=self.kwds.get("na_values"), + true_values=self.kwds.get("true_values"), + false_values=self.kwds.get("false_values"), ), ) frame = table.to_pandas() @@ -2298,17 +2315,57 @@ def read(self): if self.names is None: if self.prefix: self.names = [f"{self.prefix}{i}" for i in range(num_cols)] - elif self.header is not None and self.header != 0: - self.names = frame.iloc[self.header] - frame = frame.drop(self.header, axis=0) + elif self.header is not None: + self.names = frame.iloc[self.header].tolist() + frame.drop(range(self.header + 1), axis=0, inplace=True) + frame.reset_index(drop=True, inplace=True) elif self.header is None: self.names = range(num_cols) frame.columns = self.names - index_col = self.index_col # need to flatten since returns list - if index_col is not None: - frame.set_index(frame.columns[index_col[0]], drop=True, inplace=True) + if self.index_col is not None: + index_col = [frame.columns[i] for i in self.index_col] + frame.set_index(index_col, drop=True, inplace=True) + if self.kwds.get("dtype") is not None: + frame = frame.astype(self.kwds.get("dtype")) + else: + frame = frame.infer_objects() return frame + def _clean_na_values(na_values, keep_default_na=True): + if na_values is None: + if keep_default_na: + na_values = STR_NA_VALUES + else: + na_values = set() + na_fvalues = set() + elif isinstance(na_values, dict): + old_na_values = na_values.copy() + na_values = {} # Prevent aliasing. + + # Convert the values in the na_values dictionary + # into array-likes for further use. This is also + # where we append the default NaN values, provided + # that `keep_default_na=True`. 
+ for k, v in old_na_values.items(): + if not is_list_like(v): + v = [v] + + if keep_default_na: + v = set(v) | STR_NA_VALUES + + na_values[k] = v + na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} + else: + if not is_list_like(na_values): + na_values = [na_values] + na_values = _stringify_na_values(na_values) + if keep_default_na: + na_values = na_values | STR_NA_VALUES + + na_fvalues = _floatify_na_values(na_values) + + return na_values, na_fvalues + def TextParser(*args, **kwds): """ From 637845922e829e9a6bc97c577b064935591f99ac Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 19 May 2020 20:40:57 -0700 Subject: [PATCH 20/95] Perform version checks for submodule imports too --- pandas/compat/_optional.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 7e253a52a9c00..139641f300980 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -1,5 +1,6 @@ import distutils.version import importlib +import sys import types import warnings @@ -92,10 +93,16 @@ def import_optional_dependency( raise ImportError(msg) from None else: return None - + # Grab parent module if submodule being imported + parent = name.split(".")[0] + if parent != name: + name = parent + module_to_get = sys.modules[name] + else: + module_to_get = module minimum_version = VERSIONS.get(name) if minimum_version: - version = _get_version(module) + version = _get_version(module_to_get) if distutils.version.LooseVersion(version) < minimum_version: assert on_version in {"warn", "raise", "ignore"} msg = ( From 9d648821b047419b9541381ad50c419f9f571847 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 19 May 2020 20:44:52 -0700 Subject: [PATCH 21/95] Refresh with newer pyarrow --- asv_bench/benchmarks/io/csv.py | 19 ++++-- pandas/io/parsers.py | 116 +++++++++++++-------------------- 2 files changed, 59 insertions(+), 76 deletions(-) diff --git 
a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 55bc8d35af432..52d88d20b6d52 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -262,20 +262,31 @@ def time_read_csv_arrow(self, sep, decimal, float_precision): class ReadCSVEngine(StringIORewind): def setup(self): - data = ["A,B,C"] + (["1,2,3"] * 1000000) + data = ["A,B,C"] + (["1,2,3"] * 100000) self.StringIO_input = StringIO("\n".join(data)) + # simulate reading from file + self.BytesIO_input = self.StringIO_input.read().encode("utf-8") - def time_read_csv_c(self): + def time_read_stringcsv_c(self): read_csv(self.data(self.StringIO_input)) - def time_read_csv_arrow(self): + def time_read_stringcsv_arrow(self): read_csv(self.data(self.StringIO_input), engine="pyarrow") - def time_read_csv_python_engine(self): + def time_read_stringcsv_python_engine(self): read_csv( self.data(self.StringIO_input), engine="python", ) + def time_read_bytescsv_c(self): + read_csv(self.BytesIO_input) + + def time_read_bytescsv_arrow(self): + read_csv(self.BytesIO_input, engine="pyarrow") + + def time_read_bytescsv_python_engine(self): + read_csv(self.BytesIO_input, engine="python") + class ReadCSVCategorical(BaseIO): fname = "__test__.csv" diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 39ee43f905950..40dbfc4c4956d 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -170,7 +170,7 @@ of dtype conversion. engine : {{'c', 'python', 'pyarrow'}}, optional Parser engine to use. The C and pyarrow engines are faster, while the python engine - is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 + is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.15 as a dependency however. .. 
versionchanged:: 1.1 @@ -919,7 +919,6 @@ def _get_options_with_defaults(self, engine): for argname, default in _parser_defaults.items(): value = kwds.get(argname, default) - if argname in _pyarrow_unsupported: if engine == "pyarrow" and value != default: raise ValueError( @@ -928,7 +927,7 @@ def _get_options_with_defaults(self, engine): ) if argname == "iterator" and engine == "pyarrow": raise ValueError( - "The iterator option is not supported with the" "pyarrow engine" + "The iterator option is not supported with the pyarrow engine" ) # see gh-12935 if argname == "mangle_dupe_cols" and not value: @@ -2262,17 +2261,22 @@ class ArrowParserWrapper(ParserBase): def __init__(self, src, **kwds): self.kwds = kwds self.src = src - # kwds = kwds.copy() ParserBase.__init__(self, kwds) encoding = kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8" self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) - self.na_values = _clean_na_values( - kwds["na_values"], keep_default_na=kwds["keep_default_na"] + na_values = kwds["na_values"] + if isinstance(na_values, dict): + raise ValueError( + "The pyarrow engine doesn't support passing a dict for na_values" + ) + self.na_values = list( + _clean_na_values( + kwds["na_values"], keep_default_na=kwds["keep_default_na"] + )[0] ) - if isinstance(self.src, TextIOBase): self.src = BytesIOWrapper(self.src, encoding=encoding) @@ -2280,48 +2284,51 @@ def read(self): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" ) + kwdscopy = {k: v for k, v in self.kwds.items() if v is not None} + # these are kwargs passed to pyarrow + parseoptions = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"} + convertoptions = { + "include_columns", + "null_values", + "true_values", + "false_values", + } + parse_options = {k: v for k, v in kwdscopy.items() if k in parseoptions} + convert_options = {k: v for k, v in kwdscopy.items() if k in convertoptions} + 
read_options = pyarrow.ReadOptions(autogenerate_column_names=True) + headerexists = True if self.header is not None and self.header >= 0 else False try: - read_options = pyarrow.ReadOptions( - skip_rows=self.kwds.get("skiprows"), autogenerate_column_names=True, - ) + skiprows = self.kwds.get("skiprows") + if skiprows is not None: + read_options = pyarrow.ReadOptions(skip_rows=skiprows) + elif self.header >= 0: + read_options = pyarrow.ReadOptions(skip_rows=self.header) except TypeError as e: msg = "__init__() got an unexpected keyword argument" if msg in str(e): raise ImportError( - "Pyarrow version >= 0.15.0 is needed in order " - "to use skiprows kwarg with engine=pyarrow. " - "Please upgrade Pyarrow or switch engines." + "pyarrow version >= 0.15.0 is required to use " + "read_csv with engine='pyarrow'" ) - else: - raise e table = pyarrow.read_csv( self.src, read_options=read_options, - parse_options=pyarrow.ParseOptions( - delimiter=self.kwds.get("delimiter"), - quote_char=self.kwds.get("quotechar"), - escape_char=self.kwds.get("escapechar"), - ignore_empty_lines=self.kwds.get("skip_blank_lines"), - ), - convert_options=pyarrow.ConvertOptions( - include_columns=self.usecols, - null_values=self.kwds.get("na_values"), - true_values=self.kwds.get("true_values"), - false_values=self.kwds.get("false_values"), - ), + parse_options=pyarrow.ParseOptions(**parse_options), + convert_options=pyarrow.ConvertOptions(**convert_options), ) frame = table.to_pandas() num_cols = len(frame.columns) - if self.names is None: - if self.prefix: - self.names = [f"{self.prefix}{i}" for i in range(num_cols)] - elif self.header is not None: - self.names = frame.iloc[self.header].tolist() - frame.drop(range(self.header + 1), axis=0, inplace=True) - frame.reset_index(drop=True, inplace=True) - elif self.header is None: - self.names = range(num_cols) - frame.columns = self.names + if not headerexists: + if self.names is None: + if self.prefix is not None: + self.names = [f"{self.prefix}{i}" 
for i in range(num_cols)] + # elif self.header is not None: + # self.names = frame.iloc[self.header].tolist() + # frame.drop(range(self.header + 1), axis=0, inplace=True) + # frame.reset_index(drop=True, inplace=True) + elif self.header is None: + self.names = range(num_cols) + frame.columns = self.names if self.index_col is not None: index_col = [frame.columns[i] for i in self.index_col] frame.set_index(index_col, drop=True, inplace=True) @@ -2331,41 +2338,6 @@ def read(self): frame = frame.infer_objects() return frame - def _clean_na_values(na_values, keep_default_na=True): - if na_values is None: - if keep_default_na: - na_values = STR_NA_VALUES - else: - na_values = set() - na_fvalues = set() - elif isinstance(na_values, dict): - old_na_values = na_values.copy() - na_values = {} # Prevent aliasing. - - # Convert the values in the na_values dictionary - # into array-likes for further use. This is also - # where we append the default NaN values, provided - # that `keep_default_na=True`. 
- for k, v in old_na_values.items(): - if not is_list_like(v): - v = [v] - - if keep_default_na: - v = set(v) | STR_NA_VALUES - - na_values[k] = v - na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} - else: - if not is_list_like(na_values): - na_values = [na_values] - na_values = _stringify_na_values(na_values) - if keep_default_na: - na_values = na_values | STR_NA_VALUES - - na_fvalues = _floatify_na_values(na_values) - - return na_values, na_fvalues - def TextParser(*args, **kwds): """ From 93382b421cf62c2ad2f1ede65bd702e2912e8db6 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 21 May 2020 11:55:20 -0700 Subject: [PATCH 22/95] Start xfailing tests --- asv_bench/benchmarks/io/csv.py | 4 +-- pandas/io/parsers.py | 4 --- pandas/tests/io/parser/conftest.py | 19 ++++++++-- pandas/tests/io/parser/test_common.py | 42 +++++++++++----------- pandas/tests/io/parser/test_compression.py | 15 +++++--- 5 files changed, 50 insertions(+), 34 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 52d88d20b6d52..6e166ec315df6 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -1,4 +1,4 @@ -from io import StringIO +from io import BytesIO, StringIO import random import string @@ -265,7 +265,7 @@ def setup(self): data = ["A,B,C"] + (["1,2,3"] * 100000) self.StringIO_input = StringIO("\n".join(data)) # simulate reading from file - self.BytesIO_input = self.StringIO_input.read().encode("utf-8") + self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8")) def time_read_stringcsv_c(self): read_csv(self.data(self.StringIO_input)) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5bcd9253abb72..e64ca0651e7c7 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2322,10 +2322,6 @@ def read(self): if self.names is None: if self.prefix is not None: self.names = [f"{self.prefix}{i}" for i in range(num_cols)] - # elif self.header is not None: - # self.names = 
frame.iloc[self.header].tolist() - # frame.drop(range(self.header + 1), axis=0, inplace=True) - # frame.reset_index(drop=True, inplace=True) elif self.header is None: self.names = range(num_cols) frame.columns = self.names diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 87a34d728bc60..8f473bded9225 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -1,7 +1,8 @@ +import distutils.version import os -import pkgutil from typing import List, Optional +import pkg_resources import pytest from pandas import read_csv, read_table @@ -79,7 +80,10 @@ def csv1(csv_dir_path): _c_parser_ids = ["c_high", "c_low"] _pyarrow_parser_ids = ["pyarrow"] -if pkgutil.find_loader("pyarrow"): +pyarrow_version = pkg_resources.get_distribution("pyarrow").version +if ( + distutils.version.LooseVersion(pyarrow_version) > "0.15.0" +): # TODO remove this if block once required pyarrow>0.15.0 _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] else: @@ -135,3 +139,14 @@ def encoding_fmt(request): Fixture for all possible string formats of a UTF encoding. """ return request.param + + +@pytest.fixture +def pyarrow_xfail(request): + """ + Fixture that xfails a test if the engine is pyarrow. + """ + if "all_parsers" in request.fixturenames: + parser = request.getfixturevalue("all_parsers") + if parser.engine == "pyarrow": + pytest.xfail("pyarrow doesn't support this.") diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index b6987dae5ed2b..e0b6d70b607d6 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -70,7 +70,7 @@ def _set_noconvert_columns(self): tm.assert_frame_equal(result, expected) -def test_empty_decimal_marker(all_parsers): +def test_empty_decimal_marker(all_parsers, pyarrow_xfail): data = """A|B|C 1|2,334|5 10|13|10. 
@@ -83,7 +83,7 @@ def test_empty_decimal_marker(all_parsers): parser.read_csv(StringIO(data), decimal="") -def test_bad_stream_exception(all_parsers, csv_dir_path): +def test_bad_stream_exception(all_parsers, csv_dir_path, pyarrow_xfail): # see gh-13652 # # This test validates that both the Python engine and C engine will @@ -169,7 +169,7 @@ def test_squeeze(all_parsers): assert not result._is_view -def test_malformed(all_parsers): +def test_malformed(all_parsers, pyarrow_xfail): # see gh-6607 parser = all_parsers data = """ignore @@ -184,7 +184,7 @@ def test_malformed(all_parsers): @pytest.mark.parametrize("nrows", [5, 3, None]) -def test_malformed_chunks(all_parsers, nrows): +def test_malformed_chunks(all_parsers, nrows, pyarrow_xfail): data = """ignore A,B,C skip @@ -203,7 +203,7 @@ def test_malformed_chunks(all_parsers, nrows): reader.read(nrows) -def test_unnamed_columns(all_parsers): +def test_unnamed_columns(all_parsers, pyarrow_xfail): data = """A,B,C,, 1,2,3,4,5 6,7,8,9,10 @@ -306,7 +306,7 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) -def test_read_csv_wrong_num_columns(all_parsers): +def test_read_csv_wrong_num_columns(all_parsers, pyarrow_xfail): # Too few columns. 
data = """A,B,C,D,E,F 1,2,3,4,5,6 @@ -422,7 +422,7 @@ def test_int_conversion(all_parsers): @pytest.mark.parametrize("nrows", [3, 3.0]) -def test_read_nrows(all_parsers, nrows): +def test_read_nrows(all_parsers, nrows, pyarrow_xfail): # see gh-10476 data = """index,A,B,C,D foo,2,3,4,5 @@ -443,7 +443,7 @@ def test_read_nrows(all_parsers, nrows): @pytest.mark.parametrize("nrows", [1.2, "foo", -1]) -def test_read_nrows_bad(all_parsers, nrows): +def test_read_nrows_bad(all_parsers, nrows, pyarrow_xfail): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -460,7 +460,7 @@ def test_read_nrows_bad(all_parsers, nrows): @pytest.mark.parametrize("index_col", [0, "index"]) -def test_read_chunksize_with_index(all_parsers, index_col): +def test_read_chunksize_with_index(all_parsers, index_col, pyarrow_xfail): parser = all_parsers data = """index,A,B,C,D foo,2,3,4,5 @@ -492,7 +492,7 @@ def test_read_chunksize_with_index(all_parsers, index_col): @pytest.mark.parametrize("chunksize", [1.3, "foo", 0]) -def test_read_chunksize_bad(all_parsers, chunksize): +def test_read_chunksize_bad(all_parsers, chunksize, pyarrow_xfail): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -509,7 +509,7 @@ def test_read_chunksize_bad(all_parsers, chunksize): @pytest.mark.parametrize("chunksize", [2, 8]) -def test_read_chunksize_and_nrows(all_parsers, chunksize): +def test_read_chunksize_and_nrows(all_parsers, chunksize, pyarrow_xfail): # see gh-15755 data = """index,A,B,C,D foo,2,3,4,5 @@ -527,7 +527,7 @@ def test_read_chunksize_and_nrows(all_parsers, chunksize): tm.assert_frame_equal(concat(reader), expected) -def test_read_chunksize_and_nrows_changing_size(all_parsers): +def test_read_chunksize_and_nrows_changing_size(all_parsers, pyarrow_xfail): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -549,7 +549,7 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers): reader.get_chunk(size=3) -def test_get_chunk_passed_chunksize(all_parsers): +def 
test_get_chunk_passed_chunksize(all_parsers, pyarrow_xfail): parser = all_parsers data = """A,B,C 1,2,3 @@ -565,7 +565,7 @@ def test_get_chunk_passed_chunksize(all_parsers): @pytest.mark.parametrize("kwargs", [dict(), dict(index_col=0)]) -def test_read_chunksize_compat(all_parsers, kwargs): +def test_read_chunksize_compat(all_parsers, kwargs, pyarrow_xfail): # see gh-12185 data = """index,A,B,C,D foo,2,3,4,5 @@ -582,7 +582,7 @@ def test_read_chunksize_compat(all_parsers, kwargs): tm.assert_frame_equal(concat(reader), result) -def test_read_chunksize_jagged_names(all_parsers): +def test_read_chunksize_jagged_names(all_parsers, pyarrow_xfail): # see gh-23509 parser = all_parsers data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) @@ -594,7 +594,7 @@ def test_read_chunksize_jagged_names(all_parsers): tm.assert_frame_equal(result, expected) -def test_read_data_list(all_parsers): +def test_read_data_list(all_parsers, pyarrow_xfail): parser = all_parsers kwargs = dict(index_col=0) data = "A,B,C\nfoo,1,2,3\nbar,4,5,6" @@ -608,7 +608,7 @@ def test_read_data_list(all_parsers): tm.assert_frame_equal(result, expected) -def test_iterator(all_parsers): +def test_iterator(all_parsers, pyarrow_xfail): # see gh-6607 data = """index,A,B,C,D foo,2,3,4,5 @@ -631,7 +631,7 @@ def test_iterator(all_parsers): tm.assert_frame_equal(last_chunk, expected[3:]) -def test_iterator2(all_parsers): +def test_iterator2(all_parsers, pyarrow_xfail): parser = all_parsers data = """A,B,C foo,1,2,3 @@ -694,7 +694,7 @@ def test_reader_list_skiprows(all_parsers): tm.assert_frame_equal(chunks[0], expected[1:3]) -def test_iterator_stop_on_chunksize(all_parsers): +def test_iterator_stop_on_chunksize(all_parsers, pyarrow_xfail): # gh-3967: stopping iteration when chunksize is specified parser = all_parsers data = """A,B,C @@ -718,7 +718,7 @@ def test_iterator_stop_on_chunksize(all_parsers): @pytest.mark.parametrize( "kwargs", [dict(iterator=True, chunksize=1), dict(iterator=True), dict(chunksize=1)] ) -def 
test_iterator_skipfooter_errors(all_parsers, kwargs): +def test_iterator_skipfooter_errors(all_parsers, kwargs, pyarrow_xfail): msg = "'skipfooter' not supported for 'iteration'" parser = all_parsers data = "a\n1\n2" @@ -727,7 +727,7 @@ def test_iterator_skipfooter_errors(all_parsers, kwargs): parser.read_csv(StringIO(data), skipfooter=1, **kwargs) -def test_nrows_skipfooter_errors(all_parsers): +def test_nrows_skipfooter_errors(all_parsers, pyarrow_xfail): msg = "'skipfooter' not supported with 'nrows'" data = "a\n1\n2\n3\n4\n5\n6" parser = all_parsers diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index b773664adda72..22bba9bd3f98a 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -29,7 +29,7 @@ def parser_and_data(all_parsers, csv1): @pytest.mark.parametrize("compression", ["zip", "infer", "zip2"]) -def test_zip(parser_and_data, compression): +def test_zip(parser_and_data, compression, pyarrow_xfail): parser, data, expected = parser_and_data with tm.ensure_clean("test_file.zip") as path: @@ -46,7 +46,7 @@ def test_zip(parser_and_data, compression): @pytest.mark.parametrize("compression", ["zip", "infer"]) -def test_zip_error_multiple_files(parser_and_data, compression): +def test_zip_error_multiple_files(parser_and_data, compression, pyarrow_xfail): parser, data, expected = parser_and_data with tm.ensure_clean("combined_zip.zip") as path: @@ -60,7 +60,7 @@ def test_zip_error_multiple_files(parser_and_data, compression): parser.read_csv(path, compression=compression) -def test_zip_error_no_files(parser_and_data): +def test_zip_error_no_files(parser_and_data, pyarrow_xfail): parser, _, _ = parser_and_data with tm.ensure_clean() as path: @@ -71,7 +71,7 @@ def test_zip_error_no_files(parser_and_data): parser.read_csv(path, compression="zip") -def test_zip_error_invalid_zip(parser_and_data): +def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): 
parser, _, _ = parser_and_data with tm.ensure_clean() as path: @@ -86,6 +86,11 @@ def test_compression(parser_and_data, compression_only, buffer, filename): compress_type = compression_only ext = "gz" if compress_type == "gzip" else compress_type + pyarrow_unsupported_exts = {"bz2", "zip", "xz"} + if ext in pyarrow_unsupported_exts and parser.engine == "pyarrow": + # need to skip since this test will hang forever and not fail + pytest.skip(f"The pyarrow package doesn't come with {ext} support") + filename = filename if filename is None else filename.format(ext=ext) if filename and buffer: @@ -141,7 +146,7 @@ def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding @pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) -def test_invalid_compression(all_parsers, invalid_compression): +def test_invalid_compression(all_parsers, invalid_compression, pyarrow_xfail): parser = all_parsers compress_kwargs = dict(compression=invalid_compression) From f1bb4e25c77f4b672ddd5dfc7afc2af51abc9e32 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 27 May 2020 10:57:57 -0700 Subject: [PATCH 23/95] Get all tests to run & some fixes --- pandas/io/parsers.py | 37 ++++++++++++---------- pandas/tests/io/parser/conftest.py | 7 ++-- pandas/tests/io/parser/test_common.py | 4 +-- pandas/tests/io/parser/test_compression.py | 11 +++---- pandas/tests/io/parser/test_unsupported.py | 19 +++++++++++ 5 files changed, 50 insertions(+), 28 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e64ca0651e7c7..2f9e4ec11187e 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -21,7 +21,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import FilePathOrBuffer -from pandas.compat._optional import import_optional_dependency +from pandas.compat._optional import import_optional_dependency, VERSIONS from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ 
-444,7 +444,14 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # Extract some of the arguments (pass chunksize on). iterator = kwds.get("iterator", False) - chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1) + chunksize = kwds.get("chunksize", None) + if kwds.get("engine") == "pyarrow": # chunksize not supported for pyarrow + if iterator: + raise ValueError("The 'iterator' option is not supported with the 'pyarrow' engine") + if chunksize is not None: + raise ValueError("The 'chunksize' option is not supported with the 'pyarrow' engine") + else: + chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1) nrows = kwds.get("nrows", None) # Check for duplicates in names. @@ -830,6 +837,9 @@ def __init__(self, f, engine=None, **kwds): self._engine_specified = kwds.get("engine_specified", engine_specified) if kwds.get("dialect") is not None: + if engine == "pyarrow": + raise ValueError("The 'dialect' option is not supported with the 'pyarrow' engine") + dialect = kwds["dialect"] if dialect in csv.list_dialects(): dialect = csv.get_dialect(dialect) @@ -923,11 +933,11 @@ def _get_options_with_defaults(self, engine): if engine == "pyarrow" and value != default: raise ValueError( f"The {repr(argname)} option is not supported with the " - f"{repr(engine)} engine" + f"'pyarrow' engine" ) if argname == "iterator" and engine == "pyarrow": raise ValueError( - "The iterator option is not supported with the pyarrow engine" + "The iterator option is not supported with the 'pyarrow' engine" ) # see gh-12935 if argname == "mangle_dupe_cols" and not value: @@ -2281,6 +2291,7 @@ def __init__(self, src, **kwds): self.src = BytesIOWrapper(self.src, encoding=encoding) def read(self): + VERSIONS["pyarrow"] = "0.15.0" pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" ) @@ -2297,19 +2308,11 @@ def read(self): convert_options = {k: v for k, v in kwdscopy.items() if k in convertoptions} 
read_options = pyarrow.ReadOptions(autogenerate_column_names=True) headerexists = True if self.header is not None and self.header >= 0 else False - try: - skiprows = self.kwds.get("skiprows") - if skiprows is not None: - read_options = pyarrow.ReadOptions(skip_rows=skiprows) - elif self.header >= 0: - read_options = pyarrow.ReadOptions(skip_rows=self.header) - except TypeError as e: - msg = "__init__() got an unexpected keyword argument" - if msg in str(e): - raise ImportError( - "pyarrow version >= 0.15.0 is required to use " - "read_csv with engine='pyarrow'" - ) + skiprows = self.kwds.get("skiprows") + if skiprows is not None: + read_options = pyarrow.ReadOptions(skip_rows=skiprows) + elif headerexists: + read_options = pyarrow.ReadOptions(skip_rows=self.header) table = pyarrow.read_csv( self.src, read_options=read_options, diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 8f473bded9225..09379ac1b6922 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -80,10 +80,13 @@ def csv1(csv_dir_path): _c_parser_ids = ["c_high", "c_low"] _pyarrow_parser_ids = ["pyarrow"] -pyarrow_version = pkg_resources.get_distribution("pyarrow").version +try: + pyarrow_version = pkg_resources.get_distribution("pyarrow").version +except pkg_resources.DistributionNotFound: + pyarrow_version = None if ( distutils.version.LooseVersion(pyarrow_version) > "0.15.0" -): # TODO remove this if block once required pyarrow>0.15.0 +): _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] else: diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index e0b6d70b607d6..f35da606110fe 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1517,7 +1517,7 @@ def test_uneven_lines_with_usecols(all_parsers, usecols): ), ], ) -def 
test_read_empty_with_usecols(all_parsers, data, kwargs, expected): +def test_read_empty_with_usecols(all_parsers, data, kwargs, expected, pyarrow_xfail): # see gh-12493 parser = all_parsers @@ -2082,7 +2082,7 @@ def test_read_table_equivalency_to_read_csv(all_parsers): tm.assert_frame_equal(result, expected) -def test_first_row_bom(all_parsers): +def test_first_row_bom(all_parsers, pyarrow_xfail): # see gh-26545 parser = all_parsers data = '''\ufeff"Head1" "Head2" "Head3"''' diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 22bba9bd3f98a..2c5f1b61370a5 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -81,16 +81,11 @@ def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): @pytest.mark.parametrize("filename", [None, "test.{ext}"]) -def test_compression(parser_and_data, compression_only, buffer, filename): +def test_compression(parser_and_data, compression_only, buffer, filename, pyarrow_xfail): parser, data, expected = parser_and_data compress_type = compression_only ext = "gz" if compress_type == "gzip" else compress_type - pyarrow_unsupported_exts = {"bz2", "zip", "xz"} - if ext in pyarrow_unsupported_exts and parser.engine == "pyarrow": - # need to skip since this test will hang forever and not fail - pytest.skip(f"The pyarrow package doesn't come with {ext} support") - filename = filename if filename is None else filename.format(ext=ext) if filename and buffer: @@ -118,6 +113,8 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): expected = parser.read_csv(csv1, **kwargs) kwargs["compression"] = "infer" + if ext == "bz2": + pytest.xfail("pyarrow wheels don't have bz2 codec support") if buffer: with open(csv1) as f: result = parser.read_csv(f, **kwargs) @@ -128,7 +125,7 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) -def test_compression_utf_encoding(all_parsers, csv_dir_path, 
utf_value, encoding_fmt): +def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt, pyarrow_xfail): # see gh-18071, gh-24130 parser = all_parsers encoding = encoding_fmt.format(utf_value) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 267fae760398a..44865d61d1b05 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -121,3 +121,22 @@ def read(self): with pytest.raises(ValueError, match=msg): read_csv(NoNextBuffer(data), engine=python_engine) + + def test_pyarrow_engine(self): + from pandas.io.parsers import _pyarrow_unsupported as pa_unsupported + + data = """1,2,3,, + 1,2,3,4, + 1,2,3,4,5 + 1,2,,, + 1,2,3,4,""" + + for default in pa_unsupported: + msg = ( + f"The {repr(default)} option is not " + f"supported with the 'pyarrow' engine" + ) + print(default) + kwargs = {default: object()} + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), engine="pyarrow", **kwargs) From 7876b4ef795150510837f74538fdc10b1c38333e Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 29 May 2020 15:57:58 -0700 Subject: [PATCH 24/95] Lint and CI --- pandas/io/parsers.py | 15 +++++++++++---- pandas/tests/io/parser/conftest.py | 6 ++---- pandas/tests/io/parser/test_common.py | 2 +- pandas/tests/io/parser/test_compression.py | 8 ++++++-- pandas/tests/io/parser/test_dtypes.py | 2 +- pandas/tests/io/parser/test_unsupported.py | 1 - 6 files changed, 21 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2f9e4ec11187e..f1a89da794849 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -21,7 +21,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import FilePathOrBuffer -from pandas.compat._optional import import_optional_dependency, VERSIONS +from pandas.compat._optional import VERSIONS, import_optional_dependency from 
pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -447,9 +447,13 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): chunksize = kwds.get("chunksize", None) if kwds.get("engine") == "pyarrow": # chunksize not supported for pyarrow if iterator: - raise ValueError("The 'iterator' option is not supported with the 'pyarrow' engine") + raise ValueError( + "The 'iterator' option is not supported with the 'pyarrow' engine" + ) if chunksize is not None: - raise ValueError("The 'chunksize' option is not supported with the 'pyarrow' engine") + raise ValueError( + "The 'chunksize' option is not supported with the 'pyarrow' engine" + ) else: chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1) nrows = kwds.get("nrows", None) @@ -557,6 +561,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skipinitialspace", "date_parser", "cache_dates", + "parse_dates", } _python_unsupported = {"low_memory", "float_precision"} @@ -838,7 +843,9 @@ def __init__(self, f, engine=None, **kwds): if kwds.get("dialect") is not None: if engine == "pyarrow": - raise ValueError("The 'dialect' option is not supported with the 'pyarrow' engine") + raise ValueError( + "The 'dialect' option is not supported with the 'pyarrow' engine" + ) dialect = kwds["dialect"] if dialect in csv.list_dialects(): diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 09379ac1b6922..9aa23bd739d24 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -83,10 +83,8 @@ def csv1(csv_dir_path): try: pyarrow_version = pkg_resources.get_distribution("pyarrow").version except pkg_resources.DistributionNotFound: - pyarrow_version = None -if ( - distutils.version.LooseVersion(pyarrow_version) > "0.15.0" -): + pyarrow_version = "0" # represents pyarrow not found +if distutils.version.LooseVersion(pyarrow_version) > "0.15.0": _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] 
_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] else: diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index f35da606110fe..96410f626952b 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1561,7 +1561,7 @@ def test_trailing_spaces(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) -def test_raise_on_sep_with_delim_whitespace(all_parsers): +def test_raise_on_sep_with_delim_whitespace(all_parsers, pyarrow_xfail): # see gh-6607 data = "a b c\n1 2 3" parser = all_parsers diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 2c5f1b61370a5..ecc35dd6644c8 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -81,7 +81,9 @@ def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): @pytest.mark.parametrize("filename", [None, "test.{ext}"]) -def test_compression(parser_and_data, compression_only, buffer, filename, pyarrow_xfail): +def test_compression( + parser_and_data, compression_only, buffer, filename, pyarrow_xfail +): parser, data, expected = parser_and_data compress_type = compression_only @@ -125,7 +127,9 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) -def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt, pyarrow_xfail): +def test_compression_utf_encoding( + all_parsers, csv_dir_path, utf_value, encoding_fmt, pyarrow_xfail +): # see gh-18071, gh-24130 parser = all_parsers encoding = encoding_fmt.format(utf_value) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index d1ed85cc6f466..626d4febd7ddf 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -403,7 +403,7 @@ def test_empty_with_multi_index_pass_dtype(all_parsers): tm.assert_frame_equal(result, 
expected) -def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): +def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers, pyarrow_xfail): parser = all_parsers data = "one,one" diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 44865d61d1b05..2e6165619f318 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -136,7 +136,6 @@ def test_pyarrow_engine(self): f"The {repr(default)} option is not " f"supported with the 'pyarrow' engine" ) - print(default) kwargs = {default: object()} with pytest.raises(ValueError, match=msg): read_csv(StringIO(data), engine="pyarrow", **kwargs) From 008acab51559e76c1646bd659146d6b79081b99d Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 3 Jun 2020 14:20:56 -0700 Subject: [PATCH 25/95] parse_dates support and fixups of some tests --- asv_bench/benchmarks/io/csv.py | 2 +- pandas/io/parsers.py | 8 +++----- pandas/tests/io/parser/test_unsupported.py | 1 + 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 6e166ec315df6..f2462184abb37 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -262,7 +262,7 @@ def time_read_csv_arrow(self, sep, decimal, float_precision): class ReadCSVEngine(StringIORewind): def setup(self): - data = ["A,B,C"] + (["1,2,3"] * 100000) + data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 1000000) self.StringIO_input = StringIO("\n".join(data)) # simulate reading from file self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8")) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f1a89da794849..24aff9ddba376 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -555,13 +555,10 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "iterator", "cache_dates", "dayfirst", - "keep_date_col", "infer_datetime_format", "verbose", "skipinitialspace", - 
"date_parser", "cache_dates", - "parse_dates", } _python_unsupported = {"low_memory", "float_precision"} @@ -2338,10 +2335,11 @@ def read(self): if self.index_col is not None: index_col = [frame.columns[i] for i in self.index_col] frame.set_index(index_col, drop=True, inplace=True) + + frame.columns, frame = self._do_date_conversions(frame.columns, frame) + if self.kwds.get("dtype") is not None: frame = frame.astype(self.kwds.get("dtype")) - else: - frame = frame.infer_objects() return frame diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 2e6165619f318..d2ae4c160d519 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -132,6 +132,7 @@ def test_pyarrow_engine(self): 1,2,3,4,""" for default in pa_unsupported: + print(default) msg = ( f"The {repr(default)} option is not " f"supported with the 'pyarrow' engine" From 2dddae747d4d612ab8e78761bd058ff76a13a5eb Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 12 Jun 2020 21:33:34 -0700 Subject: [PATCH 26/95] Date parsing fixes and address comments --- asv_bench/benchmarks/io/csv.py | 68 +++++++++---------- doc/source/user_guide/io.rst | 8 ++- doc/source/whatsnew/v1.1.0.rst | 6 +- pandas/io/parsers.py | 102 ++++++++++++++++++++++++----- pandas/tests/io/parser/conftest.py | 16 ++--- 5 files changed, 130 insertions(+), 70 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index f2462184abb37..3681cd4df481f 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): fname = "__test__.csv" - params = [None, 10000] - param_names = ["skiprows"] + params = ([None, 10000], ["c", "pyarrow"]) + param_names = ["skiprows", "engine"] - def setup(self, skiprows): + def setup(self, skiprows, engine): N = 20000 index = tm.makeStringIndex(N) df = DataFrame( @@ -164,8 +164,8 
@@ def setup(self, skiprows): ) df.to_csv(self.fname) - def time_skipprows(self, skiprows): - read_csv(self.fname, skiprows=skiprows) + def time_skipprows(self, skiprows, engine): + read_csv(self.fname, skiprows=skiprows, engine=engine) class ReadUint64Integers(StringIORewind): @@ -261,31 +261,20 @@ def time_read_csv_arrow(self, sep, decimal, float_precision): class ReadCSVEngine(StringIORewind): - def setup(self): - data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 1000000) + params = ["c", "python", "pyarrow"] + param_names = ["engine"] + + def setup(self, engine): + data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 100000) self.StringIO_input = StringIO("\n".join(data)) # simulate reading from file self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8")) - def time_read_stringcsv_c(self): - read_csv(self.data(self.StringIO_input)) - - def time_read_stringcsv_arrow(self): - read_csv(self.data(self.StringIO_input), engine="pyarrow") - - def time_read_stringcsv_python_engine(self): - read_csv( - self.data(self.StringIO_input), engine="python", - ) - - def time_read_bytescsv_c(self): - read_csv(self.BytesIO_input) - - def time_read_bytescsv_arrow(self): - read_csv(self.BytesIO_input, engine="pyarrow") + def time_read_stringcsv(self, engine): + read_csv(self.data(self.StringIO_input), engine=engine) - def time_read_bytescsv_python_engine(self): - read_csv(self.BytesIO_input, engine="python") + def time_read_bytescsv(self, engine): + read_csv(self.data(self.BytesIO_input), engine=engine) class ReadCSVCategorical(BaseIO): @@ -305,7 +294,10 @@ def time_convert_direct(self): class ReadCSVParseDates(StringIORewind): - def setup(self): + params = ["c", "pyarrow", "python"] + param_names = ["engine"] + + def setup(self, engine): data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n {},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n {},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n @@ -316,18 +308,20 @@ def setup(self): data = 
data.format(*two_cols) self.StringIO_input = StringIO(data) - def time_multiple_date(self): + def time_multiple_date(self, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, names=list(string.digits[:9]), parse_dates=[[1, 2], [1, 3]], ) - def time_baseline(self): + def time_baseline(self, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, parse_dates=[1], @@ -336,17 +330,18 @@ def time_baseline(self): class ReadCSVCachedParseDates(StringIORewind): - params = ([True, False],) - param_names = ["do_cache"] + params = ([True, False], ["c", "pyarrow", "python"]) + param_names = ["do_cache", "engine"] - def setup(self, do_cache): + def setup(self, do_cache, engine): data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10 self.StringIO_input = StringIO(data) - def time_read_csv_cached(self, do_cache): + def time_read_csv_cached(self, do_cache, engine): try: read_csv( self.data(self.StringIO_input), + engine=engine, header=None, parse_dates=[0], cache_dates=do_cache, @@ -376,22 +371,23 @@ def mem_parser_chunks(self): class ReadCSVParseSpecialDate(StringIORewind): - params = (["mY", "mdY", "hm"],) - param_names = ["value"] + params = (["mY", "mdY", "hm"], ["c", "pyarrow", "python"]) + param_names = ["value", "engine"] objects = { "mY": "01-2019\n10-2019\n02/2000\n", "mdY": "12/02/2010\n", "hm": "21:34\n", } - def setup(self, value): + def setup(self, value, engine): count_elem = 10000 data = self.objects[value] * count_elem self.StringIO_input = StringIO(data) - def time_read_special_date(self, value): + def time_read_special_date(self, value, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, names=["Date"], diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index df6b44ac654ce..9ff714a8211bb 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -160,9 +160,11 @@ dtype : Type name or dict of 
column -> type, default ``None`` (unsupported with ``engine='python'``). Use `str` or `object` together with suitable ``na_values`` settings to preserve and not interpret dtype. -engine : {``'c'``, ``'python'``} - Parser engine to use. The C engine is faster while the Python engine is - currently more feature-complete. +engine : {``'c'``, ``'pyarrow'``,``'python'``} + Parser engine to use. In terms of performance, the pyarrow engine, + which requires pyarrow>=0.15.0, is faster than the C engine, which + is faster than the python engine. However, the pyarrow and C engines + are currently less feature complete than their Python counterpart. converters : dict, default ``None`` Dict of functions for converting values in certain columns. Keys can either be integers or column labels. diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 44a56e0818ae8..dee66257f2d56 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -288,6 +288,9 @@ Other enhancements - :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`). - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`) - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). +- :func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing + if pyarrow>=0.15 is installed. However, the pyarrow engine is less feature-complete than its "c" or + "python" counterparts. See the :doc:`I/O docs ` for more info. (:issue:`23697`) .. 
--------------------------------------------------------------------------- @@ -901,9 +904,6 @@ I/O - Bug in :meth:`~DataFrame.read_feather` was raising an `ArrowIOError` when reading an s3 or http file path (:issue:`29055`) - Bug in :meth:`~DataFrame.to_excel` could not handle the column name `render` and was raising an ``KeyError`` (:issue:`34331`) - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`) -- :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing - if pyarrow>0.15 is installed. However, the pyarrow engine is less feature-complete than its "c" or - "python" counterparts. (:issue:`23697`) Plotting ^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 24aff9ddba376..d8ef6488dc02a 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -170,9 +170,8 @@ of dtype conversion. engine : {{'c', 'python', 'pyarrow'}}, optional Parser engine to use. The C and pyarrow engines are faster, while the python engine - is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.15 + is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15 as a dependency however. - .. versionchanged:: 1.1 The "pyarrow" engine was added. converters : dict, optional @@ -445,7 +444,8 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # Extract some of the arguments (pass chunksize on). 
iterator = kwds.get("iterator", False) chunksize = kwds.get("chunksize", None) - if kwds.get("engine") == "pyarrow": # chunksize not supported for pyarrow + # chunksize and iterator not supported for pyarrow + if kwds.get("engine") == "pyarrow": if iterator: raise ValueError( "The 'iterator' option is not supported with the 'pyarrow' engine" @@ -523,6 +523,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skip_blank_lines": True, } + _c_parser_defaults = { "delim_whitespace": False, "na_filter": True, @@ -553,12 +554,11 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "converters", "decimal", "iterator", - "cache_dates", "dayfirst", "infer_datetime_format", "verbose", "skipinitialspace", - "cache_dates", + "low_memory", } _python_unsupported = {"low_memory", "float_precision"} @@ -939,10 +939,6 @@ def _get_options_with_defaults(self, engine): f"The {repr(argname)} option is not supported with the " f"'pyarrow' engine" ) - if argname == "iterator" and engine == "pyarrow": - raise ValueError( - "The iterator option is not supported with the 'pyarrow' engine" - ) # see gh-12935 if argname == "mangle_dupe_cols" and not value: raise ValueError("Setting mangle_dupe_cols=False is not supported yet") @@ -2255,14 +2251,18 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): class BytesIOWrapper: - def __init__(self, string_buffer, encoding="utf-8"): + """ + Allows the pyarrow engine for read_csv() to read from string buffers + """ + + def __init__(self, string_buffer: StringIO, encoding: str = "utf-8"): self.string_buffer = string_buffer self.encoding = encoding - def __getattr__(self, attr): + def __getattr__(self, attr: str): return getattr(self.string_buffer, attr) - def read(self, size=-1): + def read(self, size: int = -1): content = self.string_buffer.read(size) return content.encode(self.encoding) @@ -2332,16 +2332,85 @@ def read(self): elif self.header is None: self.names = range(num_cols) frame.columns = self.names - if 
self.index_col is not None: - index_col = [frame.columns[i] for i in self.index_col] - frame.set_index(index_col, drop=True, inplace=True) - frame.columns, frame = self._do_date_conversions(frame.columns, frame) + frame = self._date_conversion( + frame, self._date_conv, self.parse_dates, keep_date_col=self.keep_date_col + ) + + if self.index_col is not None: + for i, item in enumerate(self.index_col): + if is_integer(item): + self.index_col[i] = frame.columns[item] + frame.set_index(self.index_col, drop=True, inplace=True) if self.kwds.get("dtype") is not None: frame = frame.astype(self.kwds.get("dtype")) return frame + def _date_conversion( + self, data, converter, parse_spec, keep_date_col=False, + ): + + orig_names = data.columns + columns = list(data.columns) + + date_cols = set() + + if parse_spec is None or isinstance(parse_spec, bool): + return data, columns + + if isinstance(parse_spec, list): + # list of column lists + for colspec in parse_spec: + if is_scalar(colspec): + if isinstance(colspec, int) and colspec not in data: + colspec = orig_names[colspec] + data[colspec] = converter(data[colspec].values) + else: + new_name, col, old_names = self._try_convert_dates( + converter, colspec, data, orig_names + ) + if new_name in data: + raise ValueError(f"New date column already in dict {new_name}") + data[new_name] = col + date_cols.update(old_names) + + elif isinstance(parse_spec, dict): + # dict of new name to column list + for new_name, colspec in parse_spec.items(): + if new_name in data: + raise ValueError(f"Date column {new_name} already in dict") + + _, col, old_names = self._try_convert_dates( + converter, colspec, data, orig_names + ) + + data[new_name] = col + date_cols.update(old_names) + + if not keep_date_col: + data = data.drop(date_cols, axis=1) + + return data + + def _try_convert_dates(self, parser, colspec, data, columns): + colset = set(columns) + colnames = [] + + for c in colspec: + if c in colset: + colnames.append(c) + elif isinstance(c, 
int) and c not in columns: + colnames.append(columns[c]) + else: + colnames.append(c) + + new_name = "_".join(str(x) for x in colnames) + to_parse = [data[c].values for c in colnames if c in data] + + new_col = parser(*to_parse) + return new_name, new_col, colnames + def TextParser(*args, **kwds): """ @@ -3548,6 +3617,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns): def _clean_na_values(na_values, keep_default_na=True): + if na_values is None: if keep_default_na: na_values = STR_NA_VALUES diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 9aa23bd739d24..11710fda521f1 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -1,8 +1,6 @@ -import distutils.version import os from typing import List, Optional -import pkg_resources import pytest from pandas import read_csv, read_table @@ -80,16 +78,8 @@ def csv1(csv_dir_path): _c_parser_ids = ["c_high", "c_low"] _pyarrow_parser_ids = ["pyarrow"] -try: - pyarrow_version = pkg_resources.get_distribution("pyarrow").version -except pkg_resources.DistributionNotFound: - pyarrow_version = "0" # represents pyarrow not found -if distutils.version.LooseVersion(pyarrow_version) > "0.15.0": - _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] - _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] -else: - _all_parsers = [*_c_parsers_only, *_py_parsers_only] - _all_parser_ids = [*_c_parser_ids, *_py_parser_ids] +_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) @@ -97,6 +87,8 @@ def all_parsers(request): """ Fixture all of the CSV parsers. 
""" + if request.param.engine == "pyarrow": + pytest.importorskip("pyarrow", "0.15.0") return request.param From 88e200a108985baa5ac05e5c07287b8971ea091d Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 29 Jun 2020 11:04:49 -0700 Subject: [PATCH 27/95] Clean/Address comments/Update docs --- asv_bench/benchmarks/io/csv.py | 2 +- doc/source/whatsnew/v1.1.0.rst | 11 ++- pandas/compat/_optional.py | 16 ++-- pandas/io/parsers.py | 108 ++++++----------------- pandas/tests/test_optional_dependency.py | 7 +- 5 files changed, 51 insertions(+), 93 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 3681cd4df481f..8792fff5300d3 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -294,7 +294,7 @@ def time_convert_direct(self): class ReadCSVParseDates(StringIORewind): - params = ["c", "pyarrow", "python"] + params = ["c", "python"] param_names = ["engine"] def setup(self, engine): diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7c0a707c964c5..d54935c2bdc08 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -245,6 +245,14 @@ If needed you can adjust the bins with the argument ``offset`` (a Timedelta) tha For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. +.. _whatsnew_110.enhancements.read_csv_pyarrow_engine_support: + +read_csv() now accepts pyarrow as an engine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines +with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) + .. _whatsnew_110.enhancements.other: @@ -293,9 +301,6 @@ Other enhancements - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`). - :meth `~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). 
- :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). -- :func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing - if pyarrow>=0.15 is installed. However, the pyarrow engine is less feature-complete than its "c" or - "python" counterparts. See the :doc:`I/O docs ` for more info. (:issue:`23697`) .. --------------------------------------------------------------------------- diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index ed025ec36dafd..f65d53c05257c 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -2,6 +2,7 @@ import importlib import sys import types +from typing import Optional import warnings # Update install.rst when updating versions! @@ -46,7 +47,11 @@ def _get_version(module: types.ModuleType) -> str: def import_optional_dependency( - name: str, extra: str = "", raise_on_missing: bool = True, on_version: str = "raise" + name: str, + extra: str = "", + raise_on_missing: bool = True, + on_version: str = "raise", + min_version: Optional[str] = None, ): """ Import an optional dependency. @@ -58,8 +63,7 @@ def import_optional_dependency( Parameters ---------- name : str - The module name. This should be top-level only, so that the - version may be checked. + The module name. extra : str Additional text to include in the ImportError message. raise_on_missing : bool, default True @@ -73,6 +77,8 @@ def import_optional_dependency( * ignore: Return the module, even if the version is too old. It's expected that users validate the version locally when using ``on_version="ignore"`` (see. 
``io/html.py``) + min_version: Optional[str] + Specify the minimum version Returns ------- @@ -93,14 +99,14 @@ def import_optional_dependency( raise ImportError(msg) from None else: return None - # Grab parent module if submodule being imported + # Handle submodules: if we have submodule, grab parent module from sys.modules parent = name.split(".")[0] if parent != name: name = parent module_to_get = sys.modules[name] else: module_to_get = module - minimum_version = VERSIONS.get(name) + minimum_version = min_version if min_version is not None else VERSIONS.get(name) if minimum_version: version = _get_version(module_to_get) if distutils.version.LooseVersion(version) < minimum_version: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3563a1ea0f04e..ebaefafd8b5b8 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -21,7 +21,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import FilePathOrBuffer, Union -from pandas.compat._optional import VERSIONS, import_optional_dependency +from pandas.compat._optional import import_optional_dependency from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -172,6 +172,7 @@ Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15 as a dependency however. + .. versionchanged:: 1.1 The "pyarrow" engine was added. 
converters : dict, optional @@ -1015,7 +1016,7 @@ def _clean_options(self, options, engine): elif engine not in ("python", "python-fwf"): # wait until regex engine integrated fallback_reason = ( - "the 'c' engine does not support " + f"the '{engine}' engine does not support " "regex separators (separators > 1 char and " r"different from '\s+' are interpreted as regex)" ) @@ -2302,9 +2303,10 @@ def __init__(self, src, **kwds): self.src = BytesIOWrapper(self.src, encoding=encoding) def read(self): - VERSIONS["pyarrow"] = "0.15.0" pyarrow = import_optional_dependency( - "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" + "pyarrow.csv", + min_version="0.15.0", + extra="pyarrow is required to use the pyarrow engine", ) kwdscopy = {k: v for k, v in self.kwds.items() if v is not None} # these are kwargs passed to pyarrow @@ -2315,15 +2317,26 @@ def read(self): "true_values", "false_values", } + # rename some arguments to pass to pyarrow + kwdscopy["include_columns"] = kwdscopy.get("usecols") + kwdscopy["null_values"] = kwdscopy.get("na_values") + kwdscopy["escape_char"] = kwdscopy.get("escapechar") + kwdscopy["ignore_empty_lines"] = kwdscopy.get("skip_blank_lines") + parse_options = {k: v for k, v in kwdscopy.items() if k in parseoptions} convert_options = {k: v for k, v in kwdscopy.items() if k in convertoptions} - read_options = pyarrow.ReadOptions(autogenerate_column_names=True) - headerexists = True if self.header is not None and self.header >= 0 else False + headerexists = True if self.header is not None else False + read_options = dict() + skiprows = self.kwds.get("skiprows") - if skiprows is not None: - read_options = pyarrow.ReadOptions(skip_rows=skiprows) - elif headerexists: - read_options = pyarrow.ReadOptions(skip_rows=self.header) + if headerexists: + read_options["skip_rows"] = self.header + read_options["autogenerate_column_names"] = False + else: + if skiprows is not None: + read_options["skip_rows"] = skiprows + 
read_options["autogenerate_column_names"] = True + read_options = pyarrow.ReadOptions(**read_options) table = pyarrow.read_csv( self.src, read_options=read_options, @@ -2339,11 +2352,8 @@ def read(self): elif self.header is None: self.names = range(num_cols) frame.columns = self.names - - frame = self._date_conversion( - frame, self._date_conv, self.parse_dates, keep_date_col=self.keep_date_col - ) - + # we only need the frame not the names + frame.columns, frame = self._do_date_conversions(frame.columns, frame) if self.index_col is not None: for i, item in enumerate(self.index_col): if is_integer(item): @@ -2354,70 +2364,6 @@ def read(self): frame = frame.astype(self.kwds.get("dtype")) return frame - def _date_conversion( - self, data, converter, parse_spec, keep_date_col=False, - ): - - orig_names = data.columns - columns = list(data.columns) - - date_cols = set() - - if parse_spec is None or isinstance(parse_spec, bool): - return data, columns - - if isinstance(parse_spec, list): - # list of column lists - for colspec in parse_spec: - if is_scalar(colspec): - if isinstance(colspec, int) and colspec not in data: - colspec = orig_names[colspec] - data[colspec] = converter(data[colspec].values) - else: - new_name, col, old_names = self._try_convert_dates( - converter, colspec, data, orig_names - ) - if new_name in data: - raise ValueError(f"New date column already in dict {new_name}") - data[new_name] = col - date_cols.update(old_names) - - elif isinstance(parse_spec, dict): - # dict of new name to column list - for new_name, colspec in parse_spec.items(): - if new_name in data: - raise ValueError(f"Date column {new_name} already in dict") - - _, col, old_names = self._try_convert_dates( - converter, colspec, data, orig_names - ) - - data[new_name] = col - date_cols.update(old_names) - - if not keep_date_col: - data = data.drop(date_cols, axis=1) - - return data - - def _try_convert_dates(self, parser, colspec, data, columns): - colset = set(columns) - colnames = 
[] - - for c in colspec: - if c in colset: - colnames.append(c) - elif isinstance(c, int) and c not in columns: - colnames.append(columns[c]) - else: - colnames.append(c) - - new_name = "_".join(str(x) for x in colnames) - to_parse = [data[c].values for c in colnames if c in data] - - new_col = parser(*to_parse) - return new_name, new_col, colnames - def TextParser(*args, **kwds): """ @@ -3568,7 +3514,7 @@ def _isindex(colspec): colspec = orig_names[colspec] if _isindex(colspec): continue - data_dict[colspec] = converter(data_dict[colspec]) + data_dict[colspec] = converter(np.array(data_dict[colspec])) else: new_name, col, old_names = _try_convert_dates( converter, colspec, data_dict, orig_names @@ -3617,7 +3563,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns): colnames.append(c) new_name = "_".join(str(x) for x in colnames) - to_parse = [data_dict[c] for c in colnames if c in data_dict] + to_parse = [np.array(data_dict[c]) for c in colnames if c in data_dict] new_col = parser(*to_parse) return new_name, new_col, colnames diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index e5ed69b7703b1..61dbd81e2cee5 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -27,14 +27,15 @@ def test_bad_version(monkeypatch): module = types.ModuleType(name) module.__version__ = "0.9.0" sys.modules[name] = module - monkeypatch.setitem(VERSIONS, name, "1.0.0") match = "Pandas requires .*1.0.0.* of .fakemodule.*'0.9.0'" with pytest.raises(ImportError, match=match): - import_optional_dependency("fakemodule") + import_optional_dependency("fakemodule", min_version="1.0.0") with tm.assert_produces_warning(UserWarning): - result = import_optional_dependency("fakemodule", on_version="warn") + result = import_optional_dependency( + "fakemodule", min_version="1.0.0", on_version="warn" + ) assert result is None module.__version__ = "1.0.0" # exact match is OK From 
ede279925c591f42a1585d0aae9e186a3b936cd0 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 29 Jun 2020 11:08:18 -0700 Subject: [PATCH 28/95] Fix typo Co-authored-by: Joris Van den Bossche --- pandas/io/parsers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b3127d4f84cd8..de2a833e51ea0 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -173,6 +173,7 @@ is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15 as a dependency however. + .. versionchanged:: 1.1 The "pyarrow" engine was added. converters : dict, optional From e8eff08c8b939539ecbe6e9466f9248722fd0927 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 8 Jul 2020 16:46:46 -0700 Subject: [PATCH 29/95] Fix doc failures --- doc/source/user_guide/io.rst | 21 ++++++++++++++------- pandas/io/parsers.py | 1 - 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 2fcffcd814195..e4da778ee7378 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -160,9 +160,9 @@ dtype : Type name or dict of column -> type, default ``None`` (unsupported with ``engine='python'``). Use `str` or `object` together with suitable ``na_values`` settings to preserve and not interpret dtype. -engine : {``'c'``, ``'pyarrow'``,``'python'``} +engine : {``'c'``, ``'pyarrow'``, ``'python'``} Parser engine to use. In terms of performance, the pyarrow engine, - which requires pyarrow>=0.15.0, is faster than the C engine, which + which requires ``pyarrow`` >= 0.15.0, is faster than the C engine, which is faster than the python engine. However, the pyarrow and C engines are currently less feature complete than their Python counterpart. 
converters : dict, default ``None`` @@ -1621,11 +1621,18 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object: Specifying the parser engine '''''''''''''''''''''''''''' -Under the hood pandas uses a fast and efficient parser implemented in C as well -as a Python implementation which is currently more feature-complete. Where -possible pandas uses the C parser (specified as ``engine='c'``), but may fall -back to Python if C-unsupported options are specified. Currently, C-unsupported -options include: +Currently, pandas supports using three engines, the C engine, the python engine, +and an optional pyarrow engine(requires ``pyarrow`` >= 0.15). In terms of performance +the pyarrow engine is fastest, followed by the C and Python engines. However, +the pyarrow engine is much less robust than the C engine, which in turn lacks a +couple of features present in the Python parser. + +Where possible pandas uses the C parser (specified as ``engine='c'``), but may fall +back to Python if C-unsupported options are specified. If pyarrow unsupported options are +specified while using ``engine='pyarrow'``, the parser will error out +(a full list of unsupported options is available at ``pandas.io.parsers._pyarrow_unsupported``). + +Currently, C-unsupported options include: * ``sep`` other than a single character (e.g. regex separators) * ``skipfooter`` diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index de2a833e51ea0..b3127d4f84cd8 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -173,7 +173,6 @@ is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15 as a dependency however. - .. versionchanged:: 1.1 The "pyarrow" engine was added. 
converters : dict, optional From 55139ee19a512c3bd83b3c07caa4c44a92a49a59 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 22 Oct 2020 16:35:14 +0100 Subject: [PATCH 30/95] wip --- pandas/tests/io/parser/conftest.py | 14 +++++- pandas/tests/io/parser/test_comment.py | 2 + pandas/tests/io/parser/test_common.py | 64 ++++++++++++++++++++++++-- 3 files changed, 76 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 05fae470f5a88..a179c1b82baae 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -142,4 +142,16 @@ def pyarrow_xfail(request): if "all_parsers" in request.fixturenames: parser = request.getfixturevalue("all_parsers") if parser.engine == "pyarrow": - pytest.xfail("pyarrow doesn't support this.") + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.node.add_marker(mark) + + +@pytest.fixture +def pyarrow_skip(request): + """ + Fixture that skips a test if the engine is pyarrow. 
+ """ + if "all_parsers" in request.fixturenames: + parser = request.getfixturevalue("all_parsers") + if parser.engine == "pyarrow": + pytest.skip("pyarrow doesn't support this.") diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index 60e32d7c27200..a9a03f006668b 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -10,6 +10,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.mark.parametrize("na_values", [None, ["NaN"]]) def test_comment(all_parsers, na_values): diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 753189ea7c8d2..1295f0061f808 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -23,6 +23,9 @@ from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + def test_override_set_noconvert_columns(): # see gh-17351 @@ -84,7 +87,8 @@ def test_empty_decimal_marker(all_parsers, pyarrow_xfail): parser.read_csv(StringIO(data), decimal="") -def test_bad_stream_exception(all_parsers, csv_dir_path, pyarrow_xfail): +@skip_pyarrow +def test_bad_stream_exception(all_parsers, csv_dir_path): # see gh-13652 # # This test validates that both the Python engine and C engine will @@ -139,6 +143,7 @@ def test_read_csv_local(all_parsers, csv1): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_1000_sep(all_parsers): parser = all_parsers data = """A|B|C @@ -232,6 +237,7 @@ def test_csv_mixed_type(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_low_memory_no_rows_with_index(all_parsers): # see gh-21141 parser = all_parsers @@ -280,6 +286,7 @@ def test_read_csv_dataframe(all_parsers, csv1): tm.assert_frame_equal(result, expected) +@xfail_pyarrow 
def test_read_csv_no_index_name(all_parsers, csv_dir_path): parser = all_parsers csv2 = os.path.join(csv_dir_path, "test2.csv") @@ -348,6 +355,7 @@ def test_read_duplicate_index_explicit(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_duplicate_index_implicit(all_parsers): data = """A,B,C,D foo,2,3,4,5 @@ -728,7 +736,7 @@ def test_iterator_skipfooter_errors(all_parsers, kwargs, pyarrow_xfail): parser.read_csv(StringIO(data), skipfooter=1, **kwargs) -def test_nrows_skipfooter_errors(all_parsers, pyarrow_xfail): +def test_nrows_skipfooter_errors(all_parsers): msg = "'skipfooter' not supported with 'nrows'" data = "a\n1\n2\n3\n4\n5\n6" parser = all_parsers @@ -799,6 +807,7 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_no_level_names(all_parsers, index_col): data = """index1,index2,A,B,C,D @@ -823,6 +832,7 @@ def test_multi_index_no_level_names(all_parsers, index_col): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multi_index_no_level_names_implicit(all_parsers): parser = all_parsers data = """A,B,C,D @@ -856,6 +866,7 @@ def test_multi_index_no_level_names_implicit(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected,header", [ @@ -877,6 +888,7 @@ def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_no_unnamed_index(all_parsers): parser = all_parsers data = """ id c0 c1 c2 @@ -939,6 +951,7 @@ def test_local_file(all_parsers, csv_dir_path): pytest.skip("Failing on: " + " ".join(platform.uname())) +@xfail_pyarrow def test_path_path_lib(all_parsers): parser = all_parsers df = tm.makeDataFrame() @@ -946,6 +959,7 @@ def test_path_path_lib(all_parsers): tm.assert_frame_equal(df, result) +@xfail_pyarrow def 
test_path_local_path(all_parsers): parser = all_parsers df = tm.makeDataFrame() @@ -955,6 +969,7 @@ def test_path_local_path(all_parsers): tm.assert_frame_equal(df, result) +@xfail_pyarrow def test_nonexistent_path(all_parsers): # gh-2428: pls no segfault # gh-14086: raise more helpful FileNotFoundError @@ -968,6 +983,7 @@ def test_nonexistent_path(all_parsers): assert path == e.value.filename +@xfail_pyarrow @td.skip_if_windows # os.chmod does not work in windows def test_no_permission(all_parsers): # GH 23784 @@ -990,6 +1006,7 @@ def test_no_permission(all_parsers): assert path == e.value.filename +@xfail_pyarrow def test_missing_trailing_delimiters(all_parsers): parser = all_parsers data = """A,B,C,D @@ -1005,6 +1022,7 @@ def test_missing_trailing_delimiters(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_skip_initial_space(all_parsers): data = ( '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' @@ -1065,6 +1083,7 @@ def test_skip_initial_space(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_trailing_delimiters(all_parsers): # see gh-2442 data = """A,B,C @@ -1168,6 +1187,7 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): assert df.a.dtype == object +@skip_pyarrow @pytest.mark.parametrize("sep", [" ", r"\s+"]) def test_integer_overflow_bug(all_parsers, sep): # see gh-2601 @@ -1179,6 +1199,7 @@ def test_integer_overflow_bug(all_parsers, sep): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_catch_too_many_names(all_parsers): # see gh-5156 data = """\ @@ -1198,6 +1219,7 @@ def test_catch_too_many_names(all_parsers): parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) +@xfail_pyarrow def test_ignore_leading_whitespace(all_parsers): # see gh-3374, gh-6607 parser = all_parsers @@ -1218,6 +1240,7 @@ def test_chunk_begins_with_newline_whitespace(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_index(all_parsers): 
# see gh-10184 data = "x,y" @@ -1228,6 +1251,7 @@ def test_empty_with_index(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_multi_index(all_parsers): # see gh-10467 data = "x,y,z" @@ -1240,6 +1264,7 @@ def test_empty_with_multi_index(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_reversed_multi_index(all_parsers): data = "x,y,z" parser = all_parsers @@ -1251,6 +1276,7 @@ def test_empty_with_reversed_multi_index(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_float_parser(all_parsers): # see gh-9565 parser = all_parsers @@ -1272,6 +1298,7 @@ def test_scientific_no_exponent(all_parsers): tm.assert_frame_equal(df_roundtrip, df) +@xfail_pyarrow @pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) def test_int64_overflow(all_parsers, conv): data = """ID @@ -1315,6 +1342,7 @@ def test_int64_overflow(all_parsers, conv): parser.read_csv(StringIO(data), converters={"ID": conv}) +@xfail_pyarrow @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] ) @@ -1328,6 +1356,7 @@ def test_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] ) @@ -1341,6 +1370,7 @@ def test_outside_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("exp_data", [[str(-1), str(2 ** 63)], [str(2 ** 63), str(-1)]]) def test_numeric_range_too_wide(all_parsers, exp_data): # No numerical dtype can hold both negative and uint64 @@ -1353,6 +1383,7 @@ def test_numeric_range_too_wide(all_parsers, exp_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("iterator", [True, False]) def test_empty_with_nrows_chunksize(all_parsers, iterator): # see gh-9535 @@ -1370,6 +1401,7 @@ def 
test_empty_with_nrows_chunksize(all_parsers, iterator): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected,msg", [ @@ -1477,6 +1509,7 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) def test_uneven_lines_with_usecols(all_parsers, usecols): # see gh-12203 @@ -1531,6 +1564,7 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected, pyarrow_xf tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -1562,7 +1596,7 @@ def test_trailing_spaces(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) -def test_raise_on_sep_with_delim_whitespace(all_parsers, pyarrow_xfail): +def test_raise_on_sep_with_delim_whitespace(all_parsers): # see gh-6607 data = "a b c\n1 2 3" parser = all_parsers @@ -1571,6 +1605,7 @@ def test_raise_on_sep_with_delim_whitespace(all_parsers, pyarrow_xfail): parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) +@xfail_pyarrow @pytest.mark.parametrize("delim_whitespace", [True, False]) def test_single_char_leading_whitespace(all_parsers, delim_whitespace): # see gh-9710 @@ -1589,6 +1624,7 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "sep,skip_blank_lines,exp_data", [ @@ -1628,6 +1664,7 @@ def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_whitespace_lines(all_parsers): parser = all_parsers data = """ @@ -1643,6 +1680,7 @@ def test_whitespace_lines(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -1671,6 +1709,7 @@ def test_whitespace_regex_separator(all_parsers, data, expected): tm.assert_frame_equal(result, 
expected) +@xfail_pyarrow def test_verbose_read(all_parsers, capsys): parser = all_parsers data = """a,b,c,d @@ -1694,6 +1733,7 @@ def test_verbose_read(all_parsers, capsys): assert captured.out == "Filled 3 NA values in column a\n" +@xfail_pyarrow def test_verbose_read2(all_parsers, capsys): parser = all_parsers data = """a,b,c,d @@ -1735,6 +1775,7 @@ def test_iteration_open_handle(all_parsers): tm.assert_series_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,thousands,decimal", [ @@ -1766,6 +1807,7 @@ def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_euro_decimal_format(all_parsers): parser = all_parsers data = """Id;Number1;Number2;Text1;Text2;Number3 @@ -1785,6 +1827,7 @@ def test_euro_decimal_format(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_inf_parsing(all_parsers, na_filter): parser = all_parsers @@ -1808,6 +1851,7 @@ def test_inf_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_infinity_parsing(all_parsers, na_filter): parser = all_parsers @@ -1825,6 +1869,7 @@ def test_infinity_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) def test_raise_on_no_columns(all_parsers, nrows): parser = all_parsers @@ -1835,6 +1880,7 @@ def test_raise_on_no_columns(all_parsers, nrows): parser.read_csv(StringIO(data)) +@xfail_pyarrow @td.check_file_leaks def test_memory_map(all_parsers, csv_dir_path): mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") @@ -1848,6 +1894,7 @@ def test_memory_map(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_null_byte_char(all_parsers): # see gh-2741 data = "\x00,foo" @@ -1864,6 +1911,7 @@ def 
test_null_byte_char(all_parsers): parser.read_csv(StringIO(data), names=names) +@xfail_pyarrow def test_temporary_file(all_parsers): # see gh-13398 parser = all_parsers @@ -1985,6 +2033,7 @@ def seek(self, pos, whence=0): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [dict(), dict(error_bad_lines=True)], # Default is True. # Explicitly pass in. @@ -2003,6 +2052,7 @@ def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): parser.read_csv(StringIO(data), **kwargs) +@xfail_pyarrow def test_warn_bad_lines(all_parsers, capsys): # see gh-15925 parser = all_parsers @@ -2017,6 +2067,7 @@ def test_warn_bad_lines(all_parsers, capsys): assert "Skipping line 5" in captured.err +@xfail_pyarrow def test_suppress_error_output(all_parsers, capsys): # see gh-15925 parser = all_parsers @@ -2045,6 +2096,7 @@ def test_filename_with_special_chars(all_parsers, filename): tm.assert_frame_equal(result, df) +@xfail_pyarrow def test_read_csv_memory_growth_chunksize(all_parsers): # see gh-24805 # @@ -2127,6 +2179,7 @@ def test_first_row_bom(all_parsers, pyarrow_xfail): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_first_row_bom_unquoted(all_parsers): # see gh-36343 parser = all_parsers @@ -2147,6 +2200,7 @@ def test_integer_precision(all_parsers): tm.assert_series_equal(result, expected) +@xfail_pyarrow def test_file_descriptor_leak(all_parsers): # GH 31488 @@ -2160,6 +2214,7 @@ def test(): td.check_file_leaks(test)() +@xfail_pyarrow @pytest.mark.parametrize("nrows", range(1, 6)) def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): # GH 28071 @@ -2173,6 +2228,7 @@ def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): tm.assert_frame_equal(df, ref[:nrows]) +@xfail_pyarrow def test_no_header_two_extra_columns(all_parsers): # GH 26218 column_names = ["one", "two", "three"] @@ -2203,6 +2259,7 @@ def test_read_csv_with_use_inf_as_na(all_parsers): tm.assert_frame_equal(result, expected) 
+@xfail_pyarrow def test_read_table_delim_whitespace_default_sep(all_parsers): # GH: 35958 f = StringIO("a b c\n1 -2 -3\n4 5 6") @@ -2244,6 +2301,7 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): parser.read_table(f, delim_whitespace=True, delimiter=delimiter) +@xfail_pyarrow def test_dict_keys_as_names(all_parsers): # GH: 36928 data = "1,2" From c1aeecf20a519d3ae5b198097a4746291942c936 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 22 Oct 2020 20:27:33 +0100 Subject: [PATCH 31/95] more xfails and skips --- pandas/tests/io/parser/test_common.py | 8 ++--- pandas/tests/io/parser/test_compression.py | 5 +++- pandas/tests/io/parser/test_converters.py | 2 ++ pandas/tests/io/parser/test_dialect.py | 2 ++ pandas/tests/io/parser/test_dtypes.py | 25 ++++++++++++++++ pandas/tests/io/parser/test_encoding.py | 13 ++++++++ pandas/tests/io/parser/test_header.py | 18 +++++++++++ pandas/tests/io/parser/test_index_col.py | 11 +++++++ pandas/tests/io/parser/test_mangle_dupes.py | 6 ++++ pandas/tests/io/parser/test_multi_thread.py | 2 ++ pandas/tests/io/parser/test_na_values.py | 24 +++++++++++++++ pandas/tests/io/parser/test_parse_dates.py | 33 +++++++++++++++++++++ pandas/tests/io/parser/test_quoting.py | 10 +++++++ pandas/tests/io/parser/test_skiprows.py | 13 ++++++++ pandas/tests/io/parser/test_usecols.py | 25 ++++++++++++++++ 15 files changed, 192 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 1295f0061f808..cbf474ad5e5c6 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1240,7 +1240,7 @@ def test_chunk_begins_with_newline_whitespace(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_index(all_parsers): # see gh-10184 data = "x,y" @@ -1264,7 +1264,7 @@ def test_empty_with_multi_index(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow 
+@skip_pyarrow def test_empty_with_reversed_multi_index(all_parsers): data = "x,y,z" parser = all_parsers @@ -1869,7 +1869,7 @@ def test_infinity_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) def test_raise_on_no_columns(all_parsers, nrows): parser = all_parsers @@ -2301,7 +2301,7 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): parser.read_table(f, delim_whitespace=True, delimiter=delimiter) -@xfail_pyarrow +@skip_pyarrow def test_dict_keys_as_names(all_parsers): # GH: 36928 data = "1,2" diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index ecc35dd6644c8..e23b91373f611 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -11,6 +11,8 @@ import pandas as pd import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + @pytest.fixture(params=[True, False]) def buffer(request): @@ -80,6 +82,7 @@ def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): parser.read_csv(f, compression="zip") +@skip_pyarrow @pytest.mark.parametrize("filename", [None, "test.{ext}"]) def test_compression( parser_and_data, compression_only, buffer, filename, pyarrow_xfail @@ -147,7 +150,7 @@ def test_compression_utf_encoding( @pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) -def test_invalid_compression(all_parsers, invalid_compression, pyarrow_xfail): +def test_invalid_compression(all_parsers, invalid_compression): parser = all_parsers compress_kwargs = dict(compression=invalid_compression) diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 88b400d9a11df..a70fe847b6ae9 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -12,6 +12,8 @@ from pandas import DataFrame, Index import 
pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + def test_converters_type_must_be_dict(all_parsers): parser = all_parsers diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index cc65def0fd096..7a65e46ba670f 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -13,6 +13,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.fixture def custom_dialect(): diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 1ba6f0ea0a342..8e6462767513a 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -16,7 +16,11 @@ from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) def test_dtype_all_columns(all_parsers, dtype, check_orig): @@ -43,6 +47,7 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_dtype_all_columns_empty(all_parsers): # see gh-12048 parser = all_parsers @@ -52,6 +57,7 @@ def test_dtype_all_columns_empty(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_dtype_per_column(all_parsers): parser = all_parsers data = """\ @@ -70,6 +76,7 @@ def test_dtype_per_column(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_invalid_dtype_per_column(all_parsers): parser = all_parsers data = """\ @@ -83,6 +90,7 @@ def test_invalid_dtype_per_column(all_parsers): parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) +@xfail_pyarrow @pytest.mark.parametrize( "dtype", [ @@ -109,6 
+117,7 @@ def test_categorical_dtype(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@skip_pyarrow @pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) def test_categorical_dtype_single(all_parsers, dtype): # see gh-10153 @@ -124,6 +133,7 @@ def test_categorical_dtype_single(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_unsorted(all_parsers): # see gh-10153 parser = all_parsers @@ -142,6 +152,7 @@ def test_categorical_dtype_unsorted(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_missing(all_parsers): # see gh-10153 parser = all_parsers @@ -160,6 +171,7 @@ def test_categorical_dtype_missing(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow @pytest.mark.slow def test_categorical_dtype_high_cardinality_numeric(all_parsers): # see gh-18186 @@ -187,6 +199,7 @@ def test_categorical_dtype_latin1(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_utf16(all_parsers, csv_dir_path): # see gh-10153 pth = os.path.join(csv_dir_path, "utf16_ex.txt") @@ -201,6 +214,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_chunksize_infer_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -219,6 +233,7 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_chunksize_explicit_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -320,6 +335,7 @@ def test_categorical_coerces_timestamp(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_categorical_coerces_timedelta(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} @@ -361,6 +377,7 @@ def 
test_categorical_unexpected_categories(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_pass_dtype(all_parsers): parser = all_parsers @@ -374,6 +391,7 @@ def test_empty_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_index_pass_dtype(all_parsers): parser = all_parsers @@ -388,6 +406,7 @@ def test_empty_with_index_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_multi_index_pass_dtype(all_parsers): parser = all_parsers @@ -416,6 +435,7 @@ def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers, pyarrow_xfai tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): parser = all_parsers @@ -429,6 +449,7 @@ def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): # see gh-9424 parser = all_parsers @@ -457,6 +478,7 @@ def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) +@xfail_pyarrow def test_raise_on_passed_int_dtype_with_nas(all_parsers): # see gh-2631 parser = all_parsers @@ -474,6 +496,7 @@ def test_raise_on_passed_int_dtype_with_nas(all_parsers): parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) +@xfail_pyarrow def test_dtype_with_converters(all_parsers): parser = all_parsers data = """a,b @@ -489,6 +512,7 @@ def test_dtype_with_converters(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "dtype,expected", [ @@ -553,6 +577,7 @@ def test_numeric_dtype(all_parsers, dtype): tm.assert_frame_equal(expected, result) +@xfail_pyarrow def test_boolean_dtype(all_parsers): parser = all_parsers data = "\n".join( diff --git 
a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 876696ecdad9c..eac906601876b 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -13,7 +13,11 @@ from pandas import DataFrame, read_csv import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_bytes_io_input(all_parsers): encoding = "cp1255" parser = all_parsers @@ -25,6 +29,7 @@ def test_bytes_io_input(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_unicode(all_parsers): parser = all_parsers data = BytesIO("\u0141aski, Jan;1".encode()) @@ -34,6 +39,7 @@ def test_read_csv_unicode(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("sep", [",", "\t"]) @pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) def test_utf16_bom_skiprows(all_parsers, sep, encoding): @@ -68,6 +74,7 @@ def test_utf16_bom_skiprows(all_parsers, sep, encoding): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_utf16_example(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "utf16_ex.txt") parser = all_parsers @@ -75,6 +82,7 @@ def test_utf16_example(all_parsers, csv_dir_path): assert len(result) == 50 +@xfail_pyarrow def test_unicode_encoding(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "unicode_series.csv") parser = all_parsers @@ -87,6 +95,7 @@ def test_unicode_encoding(all_parsers, csv_dir_path): assert got == expected +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -120,6 +129,7 @@ def _encode_data_with_bom(_data): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): # see gh-13549 expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) @@ -132,6 +142,7 @@ def test_read_csv_utf_aliases(all_parsers, 
utf_value, encoding_fmt): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "file_path,encoding", [ @@ -163,6 +174,7 @@ def test_binary_mode_file_buffers( tm.assert_frame_equal(expected, result) +@skip_pyarrow @pytest.mark.parametrize("pass_encoding", [True, False]) def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding): # see gh-24130 @@ -179,6 +191,7 @@ def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding) tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_encoding_named_temp_file(all_parsers): # see gh-31819 parser = all_parsers diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 4cd110136d7b0..34eaf6ae306b4 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -14,7 +14,11 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_read_with_bad_header(all_parsers): parser = all_parsers msg = r"but only \d+ lines in file" @@ -82,6 +86,7 @@ def test_no_header_prefix(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_with_index_col(all_parsers): parser = all_parsers data = """foo,1,2,3 @@ -119,6 +124,7 @@ def test_header_not_first_line(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_multi_index(all_parsers): parser = all_parsers expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) @@ -184,6 +190,7 @@ def test_header_multi_index_invalid(all_parsers, kwargs, msg): _TestTuple = namedtuple("names", ["first", "second"]) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -231,6 +238,7 @@ def test_header_multi_index_common_format1(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow 
@pytest.mark.parametrize( "kwargs", [ @@ -277,6 +285,7 @@ def test_header_multi_index_common_format2(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -324,6 +333,7 @@ def test_header_multi_index_common_format3(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_multi_index_common_format_malformed1(all_parsers): parser = all_parsers expected = DataFrame( @@ -344,6 +354,7 @@ def test_header_multi_index_common_format_malformed1(all_parsers): tm.assert_frame_equal(expected, result) +@xfail_pyarrow def test_header_multi_index_common_format_malformed2(all_parsers): parser = all_parsers expected = DataFrame( @@ -365,6 +376,7 @@ def test_header_multi_index_common_format_malformed2(all_parsers): tm.assert_frame_equal(expected, result) +@xfail_pyarrow def test_header_multi_index_common_format_malformed3(all_parsers): parser = all_parsers expected = DataFrame( @@ -385,6 +397,7 @@ def test_header_multi_index_common_format_malformed3(all_parsers): tm.assert_frame_equal(expected, result) +@skip_pyarrow @pytest.mark.parametrize( "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)] ) @@ -397,6 +410,7 @@ def test_header_names_backward_compat(all_parsers, data, header): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("kwargs", [dict(), dict(index_col=False)]) def test_read_only_header_no_rows(all_parsers, kwargs): # See gh-7773 @@ -442,6 +456,7 @@ def test_non_int_header(all_parsers, header): parser.read_csv(StringIO(data), header=header) +@xfail_pyarrow def test_singleton_header(all_parsers): # see gh-7757 data = """a,b,c\n0,1,2\n1,2,3""" @@ -452,6 +467,7 @@ def test_singleton_header(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -498,6 +514,7 @@ def test_mangles_multi_index(all_parsers, data, expected): tm.assert_frame_equal(result, expected) 
+@xfail_pyarrow @pytest.mark.parametrize("index_col", [None, [0]]) @pytest.mark.parametrize( "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])] @@ -541,6 +558,7 @@ def test_multi_index_unnamed(all_parsers, index_col, columns): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_multiindex_columns(all_parsers): # GH#6051 parser = all_parsers diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 4d64f2bf411bd..a0a4fdbc25d49 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -11,7 +11,11 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@skip_pyarrow @pytest.mark.parametrize("with_header", [True, False]) def test_index_col_named(all_parsers, with_header): parser = all_parsers @@ -66,6 +70,7 @@ def test_index_col_is_true(all_parsers): parser.read_csv(StringIO(data), index_col=True) +@xfail_pyarrow def test_infer_index_col(all_parsers): data = """A,B,C foo,1,2,3 @@ -83,6 +88,7 @@ def test_infer_index_col(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "index_col,kwargs", [ @@ -127,6 +133,7 @@ def test_index_col_empty_data(all_parsers, index_col, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_index_col_false(all_parsers): # see gh-10413 data = "x,y" @@ -137,6 +144,7 @@ def test_empty_with_index_col_false(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "index_names", [ @@ -161,6 +169,7 @@ def test_multi_index_naming(all_parsers, index_names): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multi_index_naming_not_all_at_beginning(all_parsers): parser = all_parsers data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" @@ -175,6 
+184,7 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_no_multi_index_level_names_empty(all_parsers): # GH 10984 parser = all_parsers @@ -186,6 +196,7 @@ def test_no_multi_index_level_names_empty(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_with_index_col(all_parsers): # GH 33476 parser = all_parsers diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 5c4e642115798..cc88a1d974767 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -10,7 +10,10 @@ from pandas import DataFrame import pandas._testing as tm +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)]) def test_basic(all_parsers, kwargs): # TODO: add test for condition "mangle_dupe_cols=False" @@ -24,6 +27,7 @@ def test_basic(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_basic_names(all_parsers): # See gh-7160 parser = all_parsers @@ -44,6 +48,7 @@ def test_basic_names_raise(all_parsers): parser.read_csv(StringIO(data), names=["a", "b", "a"]) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -111,6 +116,7 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) +@xfail_pyarrow def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 orig_key = "0" diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index d50560c684084..06f14e28435ef 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -12,6 +12,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + def _construct_dataframe(num_rows): """ 
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 9f86bbd65640e..9e7a445234a45 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -12,7 +12,11 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_string_nas(all_parsers): parser = all_parsers data = """A,B,C @@ -28,6 +32,7 @@ def test_string_nas(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_detect_string_na(all_parsers): parser = all_parsers data = """A,B @@ -42,6 +47,7 @@ def test_detect_string_na(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "na_values", [ @@ -79,6 +85,7 @@ def test_non_string_na_values(all_parsers, data, na_values): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_default_na_values(all_parsers): _NA_VALUES = { "-1.#IND", @@ -126,6 +133,7 @@ def f(i, v): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_values", ["baz", ["baz"]]) def test_custom_na_values(all_parsers, na_values): parser = all_parsers @@ -159,6 +167,7 @@ def test_bool_na_values(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_na_value_dict(all_parsers): data = """A,B,C foo,bar,NA @@ -177,6 +186,7 @@ def test_na_value_dict(all_parsers): tm.assert_frame_equal(df, expected) +@xfail_pyarrow @pytest.mark.parametrize( "index_col,expected", [ @@ -210,6 +220,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -297,6 +308,7 @@ def test_no_na_values_no_keep_default(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def 
test_no_keep_default_na_dict_na_values(all_parsers): # see gh-19227 data = "a,b\n,2" @@ -308,6 +320,7 @@ def test_no_keep_default_na_dict_na_values(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_no_keep_default_na_dict_na_scalar_values(all_parsers): # see gh-19227 # @@ -319,6 +332,7 @@ def test_no_keep_default_na_dict_na_scalar_values(all_parsers): tm.assert_frame_equal(df, expected) +@xfail_pyarrow @pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): # see gh-19227 @@ -348,6 +362,7 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "na_filter,row_data", [ @@ -369,6 +384,7 @@ def test_na_values_na_filter_override(all_parsers, na_filter, row_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_na_trailing_columns(all_parsers): parser = all_parsers data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax @@ -396,6 +412,7 @@ def test_na_trailing_columns(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "na_values,row_data", [ @@ -414,6 +431,7 @@ def test_na_values_scalar(all_parsers, na_values, row_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_na_values_dict_aliasing(all_parsers): parser = all_parsers na_values = {"a": 2, "b": 1} @@ -429,6 +447,7 @@ def test_na_values_dict_aliasing(all_parsers): tm.assert_dict_equal(na_values, na_values_copy) +@xfail_pyarrow def test_na_values_dict_col_index(all_parsers): # see gh-14203 data = "a\nfoo\n1" @@ -440,6 +459,7 @@ def test_na_values_dict_col_index(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -469,6 +489,7 @@ def test_empty_na_values_no_default_with_index(all_parsers): tm.assert_frame_equal(result, expected) 
+@skip_pyarrow @pytest.mark.parametrize( "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])] ) @@ -497,6 +518,7 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): # see gh-20377 @@ -512,6 +534,7 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data, na_values", [ @@ -540,6 +563,7 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): ) +@xfail_pyarrow def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 662659982c0b3..722170c9b76df 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -34,7 +34,10 @@ else: date_strategy = st.datetimes() +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_separator_date_conflict(all_parsers): # Regression test for gh-4678 # @@ -56,6 +59,7 @@ def test_separator_date_conflict(all_parsers): tm.assert_frame_equal(df, expected) +@xfail_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col_custom(all_parsers, keep_date_col): data = """\ @@ -199,6 +203,7 @@ def date_parser(*date_cols): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("container", [list, tuple, Index, Series]) @pytest.mark.parametrize("dim", [1, 2]) def test_concat_date_col_fail(container, dim): @@ -211,6 +216,7 @@ def test_concat_date_col_fail(container, dim): parsing.concat_date_cols(date_cols) +@xfail_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col(all_parsers, keep_date_col): data = """\ @@ -370,6 +376,7 @@ def 
test_date_col_as_index_col(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -434,6 +441,7 @@ def test_multiple_date_cols_int_cast(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multiple_date_col_timestamp_parse(all_parsers): parser = all_parsers data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 @@ -468,6 +476,7 @@ def test_multiple_date_col_timestamp_parse(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multiple_date_cols_with_header(all_parsers): parser = all_parsers data = """\ @@ -637,6 +646,7 @@ def test_date_parser_int_bug(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_nat_parse(all_parsers): # see gh-3062 parser = all_parsers @@ -652,6 +662,7 @@ def test_nat_parse(all_parsers): tm.assert_frame_equal(result, df) +@xfail_pyarrow def test_csv_custom_parser(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -666,6 +677,7 @@ def test_csv_custom_parser(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_parse_dates_implicit_first_col(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -679,6 +691,7 @@ def test_parse_dates_implicit_first_col(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_parse_dates_string(all_parsers): data = """date,A,B,C 20090101,a,1,2 @@ -723,6 +736,7 @@ def test_yy_format_with_year_first(all_parsers, parse_dates): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]]) def test_parse_dates_column_list(all_parsers, parse_dates): data = "a,b,c\n01/01/2010,1,15/02/2010" @@ -739,6 +753,7 @@ def test_parse_dates_column_list(all_parsers, parse_dates): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def 
test_multi_index_parse_dates(all_parsers, index_col): data = """index1,index2,A,B,C @@ -784,6 +799,7 @@ def test_multi_index_parse_dates(all_parsers, index_col): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("kwargs", [dict(dayfirst=True), dict(day_first=True)]) def test_parse_dates_custom_euro_format(all_parsers, kwargs): parser = all_parsers @@ -828,6 +844,7 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs): ) +@xfail_pyarrow def test_parse_tz_aware(all_parsers): # See gh-1693 parser = all_parsers @@ -841,6 +858,7 @@ def test_parse_tz_aware(all_parsers): assert result.index.tz is pytz.utc +@xfail_pyarrow @pytest.mark.parametrize( "parse_dates,index_col", [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], @@ -941,6 +959,7 @@ def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multiple_date_cols_chunked(all_parsers): parser = all_parsers data = """\ @@ -1033,6 +1052,7 @@ def test_multiple_date_cols_chunked(all_parsers): tm.assert_frame_equal(chunks[2], expected[4:]) +@xfail_pyarrow def test_multiple_date_col_named_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1056,6 +1076,7 @@ def test_multiple_date_col_named_index_compat(all_parsers): tm.assert_frame_equal(with_indices, with_names) +@xfail_pyarrow def test_multiple_date_col_multiple_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1123,6 +1144,7 @@ def test_bad_date_parse(all_parsers, cache_dates, value): ) +@xfail_pyarrow def test_parse_dates_empty_string(all_parsers): # see gh-2263 parser = all_parsers @@ -1135,6 +1157,7 @@ def test_parse_dates_empty_string(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -1174,6 +1197,7 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) 
+@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -1202,6 +1226,7 @@ def test_parse_date_time_multi_level_column_name(all_parsers, date_parser, warni tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -1290,6 +1315,7 @@ def test_parse_date_time(all_parsers, data, kwargs, expected, date_parser, warni tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_fields, FutureWarning], [pd.to_datetime, None]), @@ -1312,6 +1338,7 @@ def test_parse_date_fields(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ( @@ -1343,6 +1370,7 @@ def test_parse_date_all_fields(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ( @@ -1374,6 +1402,7 @@ def test_datetime_fractional_seconds(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_generic(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." 
@@ -1392,6 +1421,7 @@ def test_generic(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_date_parser_resolution_if_not_ns(all_parsers): # see gh-10245 parser = all_parsers @@ -1489,6 +1519,7 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_string", ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], @@ -1500,6 +1531,7 @@ def test_invalid_parse_delimited_date(all_parsers, date_string): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_string,dayfirst,expected", [ @@ -1565,6 +1597,7 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti assert result == expected +@xfail_pyarrow @pytest.mark.parametrize( "names, usecols, parse_dates, missing_cols", [ diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index 14773dfbea20e..8b010df470386 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -13,7 +13,11 @@ from pandas import DataFrame import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,msg", [ @@ -33,6 +37,7 @@ def test_bad_quote_char(all_parsers, kwargs, msg): parser.read_csv(StringIO(data), **kwargs) +@xfail_pyarrow @pytest.mark.parametrize( "quoting,msg", [ @@ -57,6 +62,7 @@ def test_quote_char_basic(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) def test_quote_char_various(all_parsers, quote_char): parser = all_parsers @@ -69,6 +75,7 @@ def test_quote_char_various(all_parsers, quote_char): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) 
@pytest.mark.parametrize("quote_char", ["", None]) def test_null_quote_char(all_parsers, quoting, quote_char): @@ -88,6 +95,7 @@ def test_null_quote_char(all_parsers, quoting, quote_char): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,exp_data", [ @@ -114,6 +122,7 @@ def test_quoting_various(all_parsers, kwargs, exp_data): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] ) @@ -137,6 +146,7 @@ def test_quotechar_unicode(all_parsers, quotechar): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("balanced", [True, False]) def test_unbalanced_quoting(all_parsers, balanced): # see gh-22789. diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index fdccef1127c7e..732f2eb18fdd9 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -14,7 +14,10 @@ from pandas import DataFrame, Index import pandas._testing as tm +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize("skiprows", [list(range(6)), 6]) def test_skip_rows_bug(all_parsers, skiprows): # see gh-505 @@ -42,6 +45,7 @@ def test_skip_rows_bug(all_parsers, skiprows): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_deep_skip_rows(all_parsers): # see gh-4382 parser = all_parsers @@ -57,6 +61,7 @@ def test_deep_skip_rows(all_parsers): tm.assert_frame_equal(result, condensed_result) +@xfail_pyarrow def test_skip_rows_blank(all_parsers): # see gh-9832 parser = all_parsers @@ -83,6 +88,7 @@ def test_skip_rows_blank(all_parsers): tm.assert_frame_equal(data, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -123,6 +129,7 @@ def test_skip_row_with_newline(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def 
test_skip_row_with_quote(all_parsers): # see gh-12775 and gh-10911 parser = all_parsers @@ -138,6 +145,7 @@ def test_skip_row_with_quote(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,exp_data", [ @@ -173,6 +181,7 @@ def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "line_terminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" ) @@ -209,6 +218,7 @@ def test_skiprows_lineterminator(all_parsers, line_terminator): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_skiprows_infield_quote(all_parsers): # see gh-14459 parser = all_parsers @@ -219,6 +229,7 @@ def test_skiprows_infield_quote(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -234,6 +245,7 @@ def test_skip_rows_callable(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_skip_rows_skip_all(all_parsers): parser = all_parsers data = "a\n1\n2\n3\n4\n5" @@ -243,6 +255,7 @@ def test_skip_rows_skip_all(all_parsers): parser.read_csv(StringIO(data), skiprows=lambda x: True) +@xfail_pyarrow def test_skip_rows_bad_callable(all_parsers): msg = "by zero" parser = all_parsers diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index 7e9c9866a666d..0f2e5882439f8 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -12,6 +12,9 @@ from pandas import DataFrame, Index import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + _msg_validate_usecols_arg = ( "'usecols' must either be list-like " "of all strings, all unicode, all " @@ -22,6 +25,7 @@ ) +@skip_pyarrow def test_raise_on_mixed_dtype_usecols(all_parsers): # See gh-12678 data = """a,b,c @@ -35,6 +39,7 @@ def 
test_raise_on_mixed_dtype_usecols(all_parsers): parser.read_csv(StringIO(data), usecols=usecols) +@skip_pyarrow @pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")]) def test_usecols(all_parsers, usecols): data = """\ @@ -50,6 +55,7 @@ def test_usecols(all_parsers, usecols): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_usecols_with_names(all_parsers): data = """\ a,b,c @@ -65,6 +71,7 @@ def test_usecols_with_names(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] ) @@ -81,6 +88,7 @@ def test_usecols_relative_to_names(all_parsers, names, usecols): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_usecols_relative_to_names2(all_parsers): # see gh-5766 data = """\ @@ -97,6 +105,7 @@ def test_usecols_relative_to_names2(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_usecols_name_length_conflict(all_parsers): data = """\ 1,2,3 @@ -125,6 +134,7 @@ def test_usecols_single_string(all_parsers): parser.read_csv(StringIO(data), usecols="foo") +@xfail_pyarrow @pytest.mark.parametrize( "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] ) @@ -138,6 +148,7 @@ def test_usecols_index_col_false(all_parsers, data): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("index_col", ["b", 0]) @pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]]) def test_usecols_index_col_conflict(all_parsers, usecols, index_col): @@ -164,6 +175,7 @@ def test_usecols_index_col_conflict2(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_usecols_implicit_index_col(all_parsers): # see gh-2654 parser = all_parsers @@ -174,6 +186,7 @@ def test_usecols_implicit_index_col(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_usecols_regex_sep(all_parsers): # see gh-2733 parser = all_parsers @@ -184,6 +197,7 @@ def 
test_usecols_regex_sep(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_usecols_with_whitespace(all_parsers): parser = all_parsers data = "a b c\n4 apple bat 5.7\n8 orange cow 10" @@ -193,6 +207,7 @@ def test_usecols_with_whitespace(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "usecols,expected", [ @@ -212,6 +227,7 @@ def test_usecols_with_integer_like_header(all_parsers, usecols, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) def test_usecols_with_parse_dates(all_parsers, usecols): # see gh-9755 @@ -230,6 +246,7 @@ def test_usecols_with_parse_dates(all_parsers, usecols): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_usecols_with_parse_dates2(all_parsers): # see gh-13604 parser = all_parsers @@ -290,6 +307,7 @@ def test_usecols_with_parse_dates3(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_usecols_with_parse_dates4(all_parsers): data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" usecols = list("abcdefghij") @@ -313,6 +331,7 @@ def test_usecols_with_parse_dates4(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) @pytest.mark.parametrize( "names", @@ -406,6 +425,7 @@ def test_usecols_with_multi_byte_characters(all_parsers, usecols): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_usecols(all_parsers): data = "a,b,c\n1,2,3\n4,5,6" expected = DataFrame() @@ -426,6 +446,7 @@ def test_np_array_usecols(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "usecols,expected", [ @@ -458,6 +479,7 @@ def test_callable_usecols(all_parsers, usecols, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) def 
test_incomplete_first_row(all_parsers, usecols): # see gh-6710 @@ -470,6 +492,7 @@ def test_incomplete_first_row(all_parsers, usecols): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,usecols,kwargs,expected", [ @@ -502,6 +525,7 @@ def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "usecols,kwargs,expected,msg", [ @@ -558,6 +582,7 @@ def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request): if all_parsers.engine != "c": From b53a620b8fb77e1ab804a18e01662d85cf653bf7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 28 Oct 2020 04:07:45 +0000 Subject: [PATCH 32/95] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- asv_bench/benchmarks/io/csv.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 8792fff5300d3..c1fad1efde082 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -256,7 +256,10 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): def time_read_csv_arrow(self, sep, decimal, float_precision): read_csv( - self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), + self.data(self.StringIO_input), + sep=sep, + header=None, + names=list("abc"), ) From f13113d37ccad7f16d493931dac876d4cd246d96 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 28 Oct 2020 10:39:52 -0700 Subject: [PATCH 33/95] Fix typos --- pandas/compat/_optional.py | 30 +++++++++++----------- pandas/io/parsers.py | 10 +++----- 
pandas/tests/io/parser/test_unsupported.py | 3 ++- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 6f00c8ddb37af..6569b077069e2 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -45,6 +45,7 @@ "pandas_gbq": "pandas-gbq", "sqlalchemy": "SQLAlchemy", "jinja2": "Jinja2", + "pyarrow.csv": "pyarrow", } @@ -119,23 +120,22 @@ def import_optional_dependency( # Handle submodules: if we have submodule, grab parent module from sys.modules parent = name.split(".")[0] if parent != name: - name = parent - module_to_get = sys.modules[name] + install_name = parent + module_to_get = sys.modules[install_name] else: module_to_get = module minimum_version = min_version if min_version is not None else VERSIONS.get(name) - if minimum_version: - version = _get_version(module_to_get) - if distutils.version.LooseVersion(version) < minimum_version: - assert on_version in {"warn", "raise", "ignore"} - msg = ( - f"Pandas requires version '{minimum_version}' or newer of '{name}' " - f"(version '{version}' currently installed)." - ) - if on_version == "warn": - warnings.warn(msg, UserWarning) - return None - elif on_version == "raise": - raise ImportError(msg) + version = _get_version(module_to_get) + if distutils.version.LooseVersion(version) < minimum_version: + assert on_version in {"warn", "raise", "ignore"} + msg = ( + f"Pandas requires version '{minimum_version}' or newer of '{name}' " + f"(version '{version}' currently installed)." 
+ ) + if on_version == "warn": + warnings.warn(msg, UserWarning) + return None + elif on_version == "raise": + raise ImportError(msg) return module diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 03a70615591a1..75c1d7b06b635 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -839,7 +839,7 @@ def __init__(self, f, engine=None, **kwds): if engine == "pyarrow": raise ValueError( "The 'dialect' option is not supported with the 'pyarrow' engine" - + ) kwds = _merge_with_dialect_properties(dialect, kwds) if kwds.get("header", "infer") == "infer": @@ -2223,11 +2223,7 @@ def __init__(self, src, **kwds): self.src = BytesIOWrapper(self.src, encoding=encoding) def read(self): - pyarrow = import_optional_dependency( - "pyarrow.csv", - min_version="0.15.0", - extra="pyarrow is required to use the pyarrow engine", - ) + pyarrow = import_optional_dependency("pyarrow.csv", min_version="0.15.0") kwdscopy = {k: v for k, v in self.kwds.items() if v is not None} # these are kwargs passed to pyarrow parseoptions = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"} @@ -3434,7 +3430,7 @@ def _isindex(colspec): colspec = orig_names[colspec] if _isindex(colspec): continue - data_dict[colspec] = converter(np.array(data_dict[colspec])) + data_dict[colspec] = converter(np.asarray(data_dict[colspec])) else: new_name, col, old_names = _try_convert_dates( converter, colspec, data_dict, orig_names diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index d2ae4c160d519..6e9cdacd40586 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -132,11 +132,12 @@ def test_pyarrow_engine(self): 1,2,3,4,""" for default in pa_unsupported: - print(default) msg = ( f"The {repr(default)} option is not " f"supported with the 'pyarrow' engine" ) kwargs = {default: object()} + if default == "dialect": + kwargs[default] = "excel" # test a random dialect with 
pytest.raises(ValueError, match=msg): read_csv(StringIO(data), engine="pyarrow", **kwargs) From f9ce2e46838a0aec07d180dc8e909573b5408918 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 28 Oct 2020 11:47:47 -0700 Subject: [PATCH 34/95] Doc fixes and more typo fixes --- doc/source/whatsnew/v1.1.0.rst | 8 -------- doc/source/whatsnew/v1.2.0.rst | 6 ++++++ pandas/compat/_optional.py | 23 ++++++++++++----------- pandas/io/parsers.py | 7 ++++--- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a0383d7248624..50443f8810e5f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -270,14 +270,6 @@ change, as ``fsspec`` will still bring in the same packages as before. .. _fsspec docs: https://filesystem-spec.readthedocs.io/en/latest/ - -read_csv() now accepts pyarrow as an engine -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines -with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) - - .. _whatsnew_110.enhancements.other: Other enhancements diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f1f24ab7a101b..16b0324acaf6c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -203,6 +203,12 @@ example where the index name is preserved: The same is true for :class:`MultiIndex`, but the logic is applied separately on a level-by-level basis. +read_csv() now accepts pyarrow as an engine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines +with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) + .. 
_whatsnew_120.enhancements.other: Other enhancements diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 6569b077069e2..a6a14fcbee757 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -126,16 +126,17 @@ def import_optional_dependency( module_to_get = module minimum_version = min_version if min_version is not None else VERSIONS.get(name) version = _get_version(module_to_get) - if distutils.version.LooseVersion(version) < minimum_version: - assert on_version in {"warn", "raise", "ignore"} - msg = ( - f"Pandas requires version '{minimum_version}' or newer of '{name}' " - f"(version '{version}' currently installed)." - ) - if on_version == "warn": - warnings.warn(msg, UserWarning) - return None - elif on_version == "raise": - raise ImportError(msg) + if minimum_version: + if distutils.version.LooseVersion(version) < minimum_version: + assert on_version in {"warn", "raise", "ignore"} + msg = ( + f"Pandas requires version '{minimum_version}' or newer of '{name}' " + f"(version '{version}' currently installed)." + ) + if on_version == "warn": + warnings.warn(msg, UserWarning) + return None + elif on_version == "raise": + raise ImportError(msg) return module diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 75c1d7b06b635..5c70e31aca041 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -165,10 +165,11 @@ of dtype conversion. engine : {{'c', 'python', 'pyarrow'}}, optional Parser engine to use. The C and pyarrow engines are faster, while the python engine - is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15 - as a dependency however. + is currently more feature-complete. The pyarrow engine also supports multithreading + something that is not present in the C or python engines. It requires + ``pyarrow`` >= 0.15 as a dependency however. - .. versionchanged:: 1.1 + .. versionchanged:: 1.2 The "pyarrow" engine was added. 
converters : dict, optional Dict of functions for converting values in certain columns. Keys can either From 4158d6af395ba4335a59001010621ae0479abf48 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 2 Nov 2020 09:59:01 -0800 Subject: [PATCH 35/95] Green? --- pandas/compat/_optional.py | 2 +- pandas/tests/io/parser/test_dialect.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index a6a14fcbee757..28741c1560543 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -125,8 +125,8 @@ def import_optional_dependency( else: module_to_get = module minimum_version = min_version if min_version is not None else VERSIONS.get(name) - version = _get_version(module_to_get) if minimum_version: + version = _get_version(module_to_get) if distutils.version.LooseVersion(version) < minimum_version: assert on_version in {"warn", "raise", "ignore"} msg = ( diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index 7a65e46ba670f..afdd7548ed0dd 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -13,7 +13,7 @@ from pandas import DataFrame import pandas._testing as tm -pytestmark = pytest.mark.usefixtures("pyarrow_xfail") +pytestmark = pytest.mark.usefixtures("pyarrow_skip") @pytest.fixture From 10be581b3da43373c0b28fd928aa692e4a847e1c Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 8 Dec 2020 12:39:35 -0500 Subject: [PATCH 36/95] xfail tests --- pandas/tests/io/parser/test_common.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 1f4419988fe5a..2bdc43b892e42 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -109,6 +109,7 @@ def test_bad_stream_exception(all_parsers, csv_dir_path): parser.read_csv(stream) +@skip_pyarrow def 
test_read_csv_local(all_parsers, csv1): prefix = "file:///" if compat.is_platform_windows() else "file://" parser = all_parsers @@ -915,6 +916,7 @@ def test_read_csv_parse_simple_list(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @tm.network def test_url(all_parsers, csv_dir_path): # TODO: FTP testing @@ -932,6 +934,7 @@ def test_url(all_parsers, csv_dir_path): tm.assert_frame_equal(url_result, local_result) +@skip_pyarrow @pytest.mark.slow def test_local_file(all_parsers, csv_dir_path): parser = all_parsers @@ -1986,6 +1989,7 @@ def test_file_handles_with_open(all_parsers, csv1): assert not f.closed +@skip_pyarrow def test_invalid_file_buffer_class(all_parsers): # see gh-15337 class InvalidBuffer: @@ -1998,6 +2002,7 @@ class InvalidBuffer: parser.read_csv(InvalidBuffer()) +@skip_pyarrow def test_invalid_file_buffer_mock(all_parsers): # see gh-15337 parser = all_parsers @@ -2332,6 +2337,7 @@ def test_read_csv_file_handle(all_parsers, io_class, encoding): assert not handle.closed +@skip_pyarrow def test_memory_map_file_handle_silent_fallback(all_parsers, compression): """ Do not fail for buffers with memory_map=True (cannot memory map BytesIO). @@ -2351,6 +2357,7 @@ def test_memory_map_file_handle_silent_fallback(all_parsers, compression): ) +@skip_pyarrow def test_memory_map_compression(all_parsers, compression): """ Support memory map for compressed files. 
@@ -2369,6 +2376,7 @@ def test_memory_map_compression(all_parsers, compression): ) +@skip_pyarrow def test_context_manager(all_parsers, datapath): # make sure that opened files are closed parser = all_parsers @@ -2385,6 +2393,7 @@ def test_context_manager(all_parsers, datapath): assert reader._engine.handles.handle.closed +@skip_pyarrow def test_context_manageri_user_provided(all_parsers, datapath): # make sure that user-provided handles are not closed parser = all_parsers From fcc7e043e730bb13de7dd58dcbe6519aff870793 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 8 Dec 2020 12:41:11 -0500 Subject: [PATCH 37/95] xfail test --- pandas/tests/io/parser/test_mangle_dupes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index f151ee1d27f99..bef2b08a308f6 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -12,6 +12,8 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize("kwargs", [{}, {"mangle_dupe_cols": True}]) def test_basic(all_parsers, kwargs): # TODO: add test for condition "mangle_dupe_cols=False" From d7959a102acdecadcaf88dcf47acc752548d7db5 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 8 Dec 2020 12:41:30 -0500 Subject: [PATCH 38/95] fix import --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bcd5e33cc2f67..6c44f1ff0077a 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,7 +5,7 @@ from collections import abc, defaultdict import csv import datetime -from io import StringIO +from io import StringIO, TextIOBase import itertools import re import sys From e37d12698a1c76e0594b7891b3c9371d8fa0a14b Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 8 Dec 2020 12:49:15 -0500 Subject: [PATCH 39/95] xfail tests --- 
pandas/tests/io/parser/test_compression.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 1c6726f13b843..0af10c4124072 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -150,6 +150,7 @@ def test_compression_utf_encoding( tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) def test_invalid_compression(all_parsers, invalid_compression): parser = all_parsers From 3bc4775068baf7dd1c70ad807bf064b136d9b0ee Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 13:29:01 -0500 Subject: [PATCH 40/95] skip tests --- pandas/tests/io/parser/test_common.py | 93 +++++++++++++-------------- 1 file changed, 46 insertions(+), 47 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 2bdc43b892e42..a9ebd9004e9f8 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -24,7 +24,6 @@ from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") def test_override_set_noconvert_columns(): @@ -144,7 +143,7 @@ def test_read_csv_local(all_parsers, csv1): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_1000_sep(all_parsers): parser = all_parsers data = """A|B|C @@ -237,7 +236,7 @@ def test_csv_mixed_type(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_read_csv_low_memory_no_rows_with_index(all_parsers): # see gh-21141 parser = all_parsers @@ -286,7 +285,7 @@ def test_read_csv_dataframe(all_parsers, csv1): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_read_csv_no_index_name(all_parsers, csv_dir_path): parser = all_parsers 
csv2 = os.path.join(csv_dir_path, "test2.csv") @@ -355,7 +354,7 @@ def test_read_duplicate_index_explicit(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_read_duplicate_index_implicit(all_parsers): data = """A,B,C,D foo,2,3,4,5 @@ -806,7 +805,7 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_no_level_names(all_parsers, index_col): data = """index1,index2,A,B,C,D @@ -831,7 +830,7 @@ def test_multi_index_no_level_names(all_parsers, index_col): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_multi_index_no_level_names_implicit(all_parsers): parser = all_parsers data = """A,B,C,D @@ -865,7 +864,7 @@ def test_multi_index_no_level_names_implicit(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,expected,header", [ @@ -887,7 +886,7 @@ def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_no_unnamed_index(all_parsers): parser = all_parsers data = """ id c0 c1 c2 @@ -952,7 +951,7 @@ def test_local_file(all_parsers, csv_dir_path): pytest.skip("Failing on: " + " ".join(platform.uname())) -@xfail_pyarrow +@skip_pyarrow def test_path_path_lib(all_parsers): parser = all_parsers df = tm.makeDataFrame() @@ -960,7 +959,7 @@ def test_path_path_lib(all_parsers): tm.assert_frame_equal(df, result) -@xfail_pyarrow +@skip_pyarrow def test_path_local_path(all_parsers): parser = all_parsers df = tm.makeDataFrame() @@ -970,7 +969,7 @@ def test_path_local_path(all_parsers): tm.assert_frame_equal(df, result) -@xfail_pyarrow +@skip_pyarrow def test_nonexistent_path(all_parsers): # gh-2428: pls no segfault # gh-14086: raise more helpful FileNotFoundError @@ -984,7 +983,7 @@ def 
test_nonexistent_path(all_parsers): assert path == e.value.filename -@xfail_pyarrow +@skip_pyarrow @td.skip_if_windows # os.chmod does not work in windows def test_no_permission(all_parsers): # GH 23784 @@ -1007,7 +1006,7 @@ def test_no_permission(all_parsers): assert path == e.value.filename -@xfail_pyarrow +@skip_pyarrow def test_missing_trailing_delimiters(all_parsers): parser = all_parsers data = """A,B,C,D @@ -1023,7 +1022,7 @@ def test_missing_trailing_delimiters(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_skip_initial_space(all_parsers): data = ( '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' @@ -1084,7 +1083,7 @@ def test_skip_initial_space(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_trailing_delimiters(all_parsers): # see gh-2442 data = """A,B,C @@ -1200,7 +1199,7 @@ def test_integer_overflow_bug(all_parsers, sep): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_catch_too_many_names(all_parsers): # see gh-5156 data = """\ @@ -1220,7 +1219,7 @@ def test_catch_too_many_names(all_parsers): parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) -@xfail_pyarrow +@skip_pyarrow def test_ignore_leading_whitespace(all_parsers): # see gh-3374, gh-6607 parser = all_parsers @@ -1252,7 +1251,7 @@ def test_empty_with_index(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_multi_index(all_parsers): # see gh-10467 data = "x,y,z" @@ -1299,7 +1298,7 @@ def test_scientific_no_exponent(all_parsers): tm.assert_frame_equal(df_roundtrip, df) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) def test_int64_overflow(all_parsers, conv): data = """ID @@ -1343,7 +1342,7 @@ def test_int64_overflow(all_parsers, conv): parser.read_csv(StringIO(data), converters={"ID": conv}) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( 
"val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] ) @@ -1357,7 +1356,7 @@ def test_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] ) @@ -1371,7 +1370,7 @@ def test_outside_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("exp_data", [[str(-1), str(2 ** 63)], [str(2 ** 63), str(-1)]]) def test_numeric_range_too_wide(all_parsers, exp_data): # No numerical dtype can hold both negative and uint64 @@ -1384,7 +1383,7 @@ def test_numeric_range_too_wide(all_parsers, exp_data): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("iterator", [True, False]) def test_empty_with_nrows_chunksize(all_parsers, iterator): # see gh-9535 @@ -1511,7 +1510,7 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) def test_uneven_lines_with_usecols(all_parsers, usecols): # see gh-12203 @@ -1566,7 +1565,7 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected, pyarrow_xf tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -1607,7 +1606,7 @@ def test_raise_on_sep_with_delim_whitespace(all_parsers): parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("delim_whitespace", [True, False]) def test_single_char_leading_whitespace(all_parsers, delim_whitespace): # see gh-9710 @@ -1666,7 +1665,7 @@ def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_whitespace_lines(all_parsers): parser = all_parsers 
data = """ @@ -1682,7 +1681,7 @@ def test_whitespace_lines(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -1711,7 +1710,7 @@ def test_whitespace_regex_separator(all_parsers, data, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_verbose_read(all_parsers, capsys): parser = all_parsers data = """a,b,c,d @@ -1735,7 +1734,7 @@ def test_verbose_read(all_parsers, capsys): assert captured.out == "Filled 3 NA values in column a\n" -@xfail_pyarrow +@skip_pyarrow def test_verbose_read2(all_parsers, capsys): parser = all_parsers data = """a,b,c,d @@ -1777,7 +1776,7 @@ def test_iteration_open_handle(all_parsers): tm.assert_series_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,thousands,decimal", [ @@ -1809,7 +1808,7 @@ def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_euro_decimal_format(all_parsers): parser = all_parsers data = """Id;Number1;Number2;Text1;Text2;Number3 @@ -1829,7 +1828,7 @@ def test_euro_decimal_format(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_inf_parsing(all_parsers, na_filter): parser = all_parsers @@ -1853,7 +1852,7 @@ def test_inf_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_infinity_parsing(all_parsers, na_filter): parser = all_parsers @@ -1882,7 +1881,7 @@ def test_raise_on_no_columns(all_parsers, nrows): parser.read_csv(StringIO(data)) -@xfail_pyarrow +@skip_pyarrow @td.check_file_leaks def test_memory_map(all_parsers, csv_dir_path): mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") @@ -1896,7 +1895,7 @@ def test_memory_map(all_parsers, csv_dir_path): 
tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_null_byte_char(all_parsers): # see gh-2741 data = "\x00,foo" @@ -1913,7 +1912,7 @@ def test_null_byte_char(all_parsers): parser.read_csv(StringIO(data), names=names) -@xfail_pyarrow +@skip_pyarrow def test_temporary_file(all_parsers): # see gh-13398 parser = all_parsers @@ -2037,7 +2036,7 @@ def seek(self, pos, whence=0): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "kwargs", [dict(), dict(error_bad_lines=True)], # Default is True. # Explicitly pass in. @@ -2056,7 +2055,7 @@ def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): parser.read_csv(StringIO(data), **kwargs) -@xfail_pyarrow +@skip_pyarrow def test_warn_bad_lines(all_parsers, capsys): # see gh-15925 parser = all_parsers @@ -2071,7 +2070,7 @@ def test_warn_bad_lines(all_parsers, capsys): assert "Skipping line 5" in captured.err -@xfail_pyarrow +@skip_pyarrow def test_suppress_error_output(all_parsers, capsys): # see gh-15925 parser = all_parsers @@ -2100,7 +2099,7 @@ def test_filename_with_special_chars(all_parsers, filename): tm.assert_frame_equal(result, df) -@xfail_pyarrow +@skip_pyarrow def test_read_csv_memory_growth_chunksize(all_parsers): # see gh-24805 # @@ -2182,7 +2181,7 @@ def test_first_row_bom(all_parsers, pyarrow_xfail): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_first_row_bom_unquoted(all_parsers): # see gh-36343 parser = all_parsers @@ -2203,7 +2202,7 @@ def test_integer_precision(all_parsers): tm.assert_series_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_file_descriptor_leak(all_parsers): # GH 31488 @@ -2217,7 +2216,7 @@ def test(): td.check_file_leaks(test)() -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("nrows", range(1, 6)) def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): # GH 28071 @@ -2231,7 +2230,7 @@ def 
test_blank_lines_between_header_and_data_rows(all_parsers, nrows): tm.assert_frame_equal(df, ref[:nrows]) -@xfail_pyarrow +@skip_pyarrow def test_no_header_two_extra_columns(all_parsers): # GH 26218 column_names = ["one", "two", "three"] @@ -2262,7 +2261,7 @@ def test_read_csv_with_use_inf_as_na(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_read_table_delim_whitespace_default_sep(all_parsers): # GH: 35958 f = StringIO("a b c\n1 -2 -3\n4 5 6") From 17a502d806063ce77bafc75a5367e450a4cb609b Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 13:43:37 -0500 Subject: [PATCH 41/95] skip tests --- pandas/tests/io/parser/test_dtypes.py | 43 +++++++++++++-------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 23d4fef424624..4ef609cb87980 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -17,10 +17,9 @@ import pandas._testing as tm skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) def test_dtype_all_columns(all_parsers, dtype, check_orig): @@ -47,7 +46,7 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_dtype_all_columns_empty(all_parsers): # see gh-12048 parser = all_parsers @@ -57,7 +56,7 @@ def test_dtype_all_columns_empty(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_dtype_per_column(all_parsers): parser = all_parsers data = """\ @@ -76,7 +75,7 @@ def test_dtype_per_column(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_invalid_dtype_per_column(all_parsers): parser = all_parsers 
data = """\ @@ -90,7 +89,7 @@ def test_invalid_dtype_per_column(all_parsers): parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "dtype", [ @@ -133,7 +132,7 @@ def test_categorical_dtype_single(all_parsers, dtype): tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@skip_pyarrow def test_categorical_dtype_unsorted(all_parsers): # see gh-10153 parser = all_parsers @@ -152,7 +151,7 @@ def test_categorical_dtype_unsorted(all_parsers): tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@skip_pyarrow def test_categorical_dtype_missing(all_parsers): # see gh-10153 parser = all_parsers @@ -171,7 +170,7 @@ def test_categorical_dtype_missing(all_parsers): tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.slow def test_categorical_dtype_high_cardinality_numeric(all_parsers): # see gh-18186 @@ -199,7 +198,7 @@ def test_categorical_dtype_latin1(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@skip_pyarrow def test_categorical_dtype_utf16(all_parsers, csv_dir_path): # see gh-10153 pth = os.path.join(csv_dir_path, "utf16_ex.txt") @@ -214,7 +213,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@skip_pyarrow def test_categorical_dtype_chunksize_infer_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -234,7 +233,7 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers): tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@skip_pyarrow def test_categorical_dtype_chunksize_explicit_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -335,7 +334,7 @@ def test_categorical_coerces_timestamp(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_categorical_coerces_timedelta(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} @@ 
-377,7 +376,7 @@ def test_categorical_unexpected_categories(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_pass_dtype(all_parsers): parser = all_parsers @@ -391,7 +390,7 @@ def test_empty_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_index_pass_dtype(all_parsers): parser = all_parsers @@ -406,7 +405,7 @@ def test_empty_with_index_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_multi_index_pass_dtype(all_parsers): parser = all_parsers @@ -435,7 +434,7 @@ def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers, pyarrow_xfai tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): parser = all_parsers @@ -449,7 +448,7 @@ def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): # see gh-9424 parser = all_parsers @@ -478,7 +477,7 @@ def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) -@xfail_pyarrow +@skip_pyarrow def test_raise_on_passed_int_dtype_with_nas(all_parsers): # see gh-2631 parser = all_parsers @@ -496,7 +495,7 @@ def test_raise_on_passed_int_dtype_with_nas(all_parsers): parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) -@xfail_pyarrow +@skip_pyarrow def test_dtype_with_converters(all_parsers): parser = all_parsers data = """a,b @@ -512,7 +511,7 @@ def test_dtype_with_converters(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "dtype,expected", [ @@ -577,7 +576,7 @@ def test_numeric_dtype(all_parsers, dtype): 
tm.assert_frame_equal(expected, result) -@xfail_pyarrow +@skip_pyarrow def test_boolean_dtype(all_parsers): parser = all_parsers data = "\n".join( From e27d7ef5d3b3b043d547593a8612681aafb253d7 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 13:51:12 -0500 Subject: [PATCH 42/95] C408 failure --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 6c44f1ff0077a..e7089f708d47f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2292,7 +2292,7 @@ def read(self): parse_options = {k: v for k, v in kwdscopy.items() if k in parseoptions} convert_options = {k: v for k, v in kwdscopy.items() if k in convertoptions} headerexists = True if self.header is not None else False - read_options = dict() + read_options = {} skiprows = self.kwds.get("skiprows") if headerexists: From 4e638e9019cf530e53b2670b24204fac8b432db5 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 17:27:47 -0500 Subject: [PATCH 43/95] skip tests --- pandas/tests/io/parser/test_common.py | 1 + pandas/tests/io/parser/test_encoding.py | 12 ++-- pandas/tests/io/parser/test_header.py | 26 ++++----- pandas/tests/io/parser/test_index_col.py | 13 +++-- pandas/tests/io/parser/test_mangle_dupes.py | 10 ++-- pandas/tests/io/parser/test_na_values.py | 40 ++++++------- pandas/tests/io/parser/test_parse_dates.py | 64 ++++++++++----------- pandas/tests/io/parser/test_quoting.py | 10 ++-- pandas/tests/io/parser/test_skiprows.py | 24 ++++---- pandas/tests/io/parser/test_usecols.py | 32 +++++------ 10 files changed, 117 insertions(+), 115 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 8c8ef1ef26de5..443af3a33be18 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -73,6 +73,7 @@ def _set_noconvert_columns(self): tm.assert_frame_equal(result, expected) +@skip_pyarrow def 
test_empty_decimal_marker(all_parsers, pyarrow_xfail): data = """A|B|C 1|2,334|5 diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index c248a878a9d23..dde8277f1732a 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -17,7 +17,7 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@xfail_pyarrow +@skip_pyarrow def test_bytes_io_input(all_parsers): encoding = "cp1255" parser = all_parsers @@ -29,7 +29,7 @@ def test_bytes_io_input(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_read_csv_unicode(all_parsers): parser = all_parsers data = BytesIO("\u0141aski, Jan;1".encode()) @@ -39,7 +39,7 @@ def test_read_csv_unicode(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("sep", [",", "\t"]) @pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) def test_utf16_bom_skiprows(all_parsers, sep, encoding): @@ -74,7 +74,7 @@ def test_utf16_bom_skiprows(all_parsers, sep, encoding): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_utf16_example(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "utf16_ex.txt") parser = all_parsers @@ -82,7 +82,7 @@ def test_utf16_example(all_parsers, csv_dir_path): assert len(result) == 50 -@xfail_pyarrow +@skip_pyarrow def test_unicode_encoding(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "unicode_series.csv") parser = all_parsers @@ -194,7 +194,7 @@ def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding) tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_encoding_named_temp_file(all_parsers): # see gh-31819 parser = all_parsers diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 95d9a23eb4d92..0de6e389dd09b 100644 --- a/pandas/tests/io/parser/test_header.py +++ 
b/pandas/tests/io/parser/test_header.py @@ -18,7 +18,7 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@xfail_pyarrow +@skip_pyarrow def test_read_with_bad_header(all_parsers): parser = all_parsers msg = r"but only \d+ lines in file" @@ -86,7 +86,7 @@ def test_no_header_prefix(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_header_with_index_col(all_parsers): parser = all_parsers data = """foo,1,2,3 @@ -124,7 +124,7 @@ def test_header_not_first_line(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_header_multi_index(all_parsers): parser = all_parsers expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) @@ -190,7 +190,7 @@ def test_header_multi_index_invalid(all_parsers, kwargs, msg): _TestTuple = namedtuple("names", ["first", "second"]) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -238,7 +238,7 @@ def test_header_multi_index_common_format1(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -285,7 +285,7 @@ def test_header_multi_index_common_format2(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -333,7 +333,7 @@ def test_header_multi_index_common_format3(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_header_multi_index_common_format_malformed1(all_parsers): parser = all_parsers expected = DataFrame( @@ -354,7 +354,7 @@ def test_header_multi_index_common_format_malformed1(all_parsers): tm.assert_frame_equal(expected, result) -@xfail_pyarrow +@skip_pyarrow def test_header_multi_index_common_format_malformed2(all_parsers): parser = all_parsers expected = DataFrame( @@ -376,7 +376,7 @@ def test_header_multi_index_common_format_malformed2(all_parsers): tm.assert_frame_equal(expected, result) 
-@xfail_pyarrow +@skip_pyarrow def test_header_multi_index_common_format_malformed3(all_parsers): parser = all_parsers expected = DataFrame( @@ -456,7 +456,7 @@ def test_non_int_header(all_parsers, header): parser.read_csv(StringIO(data), header=header) -@xfail_pyarrow +@skip_pyarrow def test_singleton_header(all_parsers): # see gh-7757 data = """a,b,c\n0,1,2\n1,2,3""" @@ -467,7 +467,7 @@ def test_singleton_header(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -514,7 +514,7 @@ def test_mangles_multi_index(all_parsers, data, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("index_col", [None, [0]]) @pytest.mark.parametrize( "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])] @@ -558,7 +558,7 @@ def test_multi_index_unnamed(all_parsers, index_col, columns): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_read_csv_multiindex_columns(all_parsers): # GH#6051 parser = all_parsers diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index a17b858940b2f..0ef11f8a91576 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -70,7 +70,7 @@ def test_index_col_is_true(all_parsers): parser.read_csv(StringIO(data), index_col=True) -@xfail_pyarrow +@skip_pyarrow def test_infer_index_col(all_parsers): data = """A,B,C foo,1,2,3 @@ -88,7 +88,7 @@ def test_infer_index_col(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "index_col,kwargs", [ @@ -137,7 +137,7 @@ def test_index_col_empty_data(all_parsers, index_col, kwargs): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_index_col_false(all_parsers): # see gh-10413 data = "x,y" @@ -173,7 +173,7 @@ def 
test_multi_index_naming(all_parsers, index_names): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_multi_index_naming_not_all_at_beginning(all_parsers): parser = all_parsers data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" @@ -188,7 +188,7 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_no_multi_index_level_names_empty(all_parsers): # GH 10984 parser = all_parsers @@ -200,7 +200,7 @@ def test_no_multi_index_level_names_empty(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_header_with_index_col(all_parsers): # GH 33476 parser = all_parsers @@ -224,6 +224,7 @@ def test_header_with_index_col(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.slow def test_index_col_large_csv(all_parsers): # https://github.com/pandas-dev/pandas/issues/37094 diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index bef2b08a308f6..8fb7f3c093ae0 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -10,10 +10,10 @@ from pandas import DataFrame import pandas._testing as tm -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("kwargs", [{}, {"mangle_dupe_cols": True}]) def test_basic(all_parsers, kwargs): # TODO: add test for condition "mangle_dupe_cols=False" @@ -27,7 +27,7 @@ def test_basic(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_basic_names(all_parsers): # See gh-7160 parser = all_parsers @@ -48,7 +48,7 @@ def test_basic_names_raise(all_parsers): parser.read_csv(StringIO(data), names=["a", "b", "a"]) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -116,7 +116,7 
@@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) -@xfail_pyarrow +@skip_pyarrow def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 orig_key = "0" diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index d84a886e2451b..6e56d325efdad 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -16,7 +16,7 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@xfail_pyarrow +@skip_pyarrow def test_string_nas(all_parsers): parser = all_parsers data = """A,B,C @@ -32,7 +32,7 @@ def test_string_nas(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_detect_string_na(all_parsers): parser = all_parsers data = """A,B @@ -47,7 +47,7 @@ def test_detect_string_na(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "na_values", [ @@ -85,7 +85,7 @@ def test_non_string_na_values(all_parsers, data, na_values): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_default_na_values(all_parsers): _NA_VALUES = { "-1.#IND", @@ -133,7 +133,7 @@ def f(i, v): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("na_values", ["baz", ["baz"]]) def test_custom_na_values(all_parsers, na_values): parser = all_parsers @@ -167,7 +167,7 @@ def test_bool_na_values(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_na_value_dict(all_parsers): data = """A,B,C foo,bar,NA @@ -186,7 +186,7 @@ def test_na_value_dict(all_parsers): tm.assert_frame_equal(df, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "index_col,expected", [ @@ -220,7 +220,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow 
@pytest.mark.parametrize( "kwargs,expected", [ @@ -308,7 +308,7 @@ def test_no_na_values_no_keep_default(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_no_keep_default_na_dict_na_values(all_parsers): # see gh-19227 data = "a,b\n,2" @@ -320,7 +320,7 @@ def test_no_keep_default_na_dict_na_values(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_no_keep_default_na_dict_na_scalar_values(all_parsers): # see gh-19227 # @@ -332,7 +332,7 @@ def test_no_keep_default_na_dict_na_scalar_values(all_parsers): tm.assert_frame_equal(df, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): # see gh-19227 @@ -362,7 +362,7 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "na_filter,row_data", [ @@ -384,7 +384,7 @@ def test_na_values_na_filter_override(all_parsers, na_filter, row_data): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_na_trailing_columns(all_parsers): parser = all_parsers data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax @@ -412,7 +412,7 @@ def test_na_trailing_columns(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "na_values,row_data", [ @@ -431,7 +431,7 @@ def test_na_values_scalar(all_parsers, na_values, row_data): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_na_values_dict_aliasing(all_parsers): parser = all_parsers na_values = {"a": 2, "b": 1} @@ -447,7 +447,7 @@ def test_na_values_dict_aliasing(all_parsers): tm.assert_dict_equal(na_values, na_values_copy) -@xfail_pyarrow +@skip_pyarrow def test_na_values_dict_col_index(all_parsers): # see gh-14203 data = 
"a\nfoo\n1" @@ -459,7 +459,7 @@ def test_na_values_dict_col_index(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -518,7 +518,7 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): # see gh-20377 @@ -534,7 +534,7 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data, na_values", [ @@ -563,7 +563,7 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): ) -@xfail_pyarrow +@skip_pyarrow def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 07faab37c6997..d4c0d28214849 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -34,10 +34,10 @@ else: date_strategy = st.datetimes() -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@xfail_pyarrow +@skip_pyarrow def test_separator_date_conflict(all_parsers): # Regression test for gh-4678 # @@ -59,7 +59,7 @@ def test_separator_date_conflict(all_parsers): tm.assert_frame_equal(df, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col_custom(all_parsers, keep_date_col): data = """\ @@ -203,7 +203,7 @@ def date_parser(*date_cols): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("container", [list, tuple, Index, Series]) @pytest.mark.parametrize("dim", [1, 2]) def test_concat_date_col_fail(container, dim): @@ -216,7 +216,7 @@ def 
test_concat_date_col_fail(container, dim): parsing.concat_date_cols(date_cols) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col(all_parsers, keep_date_col): data = """\ @@ -376,7 +376,7 @@ def test_date_col_as_index_col(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -441,7 +441,7 @@ def test_multiple_date_cols_int_cast(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_multiple_date_col_timestamp_parse(all_parsers): parser = all_parsers data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 @@ -476,7 +476,7 @@ def test_multiple_date_col_timestamp_parse(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_multiple_date_cols_with_header(all_parsers): parser = all_parsers data = """\ @@ -646,7 +646,7 @@ def test_date_parser_int_bug(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_nat_parse(all_parsers): # see gh-3062 parser = all_parsers @@ -662,7 +662,7 @@ def test_nat_parse(all_parsers): tm.assert_frame_equal(result, df) -@xfail_pyarrow +@skip_pyarrow def test_csv_custom_parser(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -677,7 +677,7 @@ def test_csv_custom_parser(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_parse_dates_implicit_first_col(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -691,7 +691,7 @@ def test_parse_dates_implicit_first_col(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_parse_dates_string(all_parsers): data = """date,A,B,C 20090101,a,1,2 @@ -736,7 +736,7 @@ def test_yy_format_with_year_first(all_parsers, parse_dates): tm.assert_frame_equal(result, expected) -@xfail_pyarrow 
+@skip_pyarrow @pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]]) def test_parse_dates_column_list(all_parsers, parse_dates): data = "a,b,c\n01/01/2010,1,15/02/2010" @@ -753,7 +753,7 @@ def test_parse_dates_column_list(all_parsers, parse_dates): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_parse_dates(all_parsers, index_col): data = """index1,index2,A,B,C @@ -799,7 +799,7 @@ def test_multi_index_parse_dates(all_parsers, index_col): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("kwargs", [{"dayfirst": True}, {"day_first": True}]) def test_parse_dates_custom_euro_format(all_parsers, kwargs): parser = all_parsers @@ -844,7 +844,7 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs): ) -@xfail_pyarrow +@skip_pyarrow def test_parse_tz_aware(all_parsers): # See gh-1693 parser = all_parsers @@ -858,7 +858,7 @@ def test_parse_tz_aware(all_parsers): assert result.index.tz is pytz.utc -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "parse_dates,index_col", [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], @@ -959,7 +959,7 @@ def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_multiple_date_cols_chunked(all_parsers): parser = all_parsers data = """\ @@ -1052,7 +1052,7 @@ def test_multiple_date_cols_chunked(all_parsers): tm.assert_frame_equal(chunks[2], expected[4:]) -@xfail_pyarrow +@skip_pyarrow def test_multiple_date_col_named_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1076,7 +1076,7 @@ def test_multiple_date_col_named_index_compat(all_parsers): tm.assert_frame_equal(with_indices, with_names) -@xfail_pyarrow +@skip_pyarrow def test_multiple_date_col_multiple_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1144,7 +1144,7 @@ def 
test_bad_date_parse(all_parsers, cache_dates, value): ) -@xfail_pyarrow +@skip_pyarrow def test_parse_dates_empty_string(all_parsers): # see gh-2263 parser = all_parsers @@ -1157,7 +1157,7 @@ def test_parse_dates_empty_string(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -1197,7 +1197,7 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -1226,7 +1226,7 @@ def test_parse_date_time_multi_level_column_name(all_parsers, date_parser, warni tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -1315,7 +1315,7 @@ def test_parse_date_time(all_parsers, data, kwargs, expected, date_parser, warni tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_fields, FutureWarning], [pd.to_datetime, None]), @@ -1338,7 +1338,7 @@ def test_parse_date_fields(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ( @@ -1370,7 +1370,7 @@ def test_parse_date_all_fields(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ( @@ -1402,7 +1402,7 @@ def test_datetime_fractional_seconds(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_generic(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." 
@@ -1421,7 +1421,7 @@ def test_generic(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_date_parser_resolution_if_not_ns(all_parsers): # see gh-10245 parser = all_parsers @@ -1519,7 +1519,7 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "date_string", ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], @@ -1531,7 +1531,7 @@ def test_invalid_parse_delimited_date(all_parsers, date_string): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "date_string,dayfirst,expected", [ @@ -1597,7 +1597,7 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti assert result == expected -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "names, usecols, parse_dates, missing_cols", [ diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index 30ffc598563e7..a93dbde24b001 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -17,7 +17,7 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "kwargs,msg", [ @@ -37,7 +37,7 @@ def test_bad_quote_char(all_parsers, kwargs, msg): parser.read_csv(StringIO(data), **kwargs) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "quoting,msg", [ @@ -62,7 +62,7 @@ def test_quote_char_basic(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) def test_quote_char_various(all_parsers, quote_char): parser = all_parsers @@ -75,7 +75,7 @@ def test_quote_char_various(all_parsers, quote_char): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) 
@pytest.mark.parametrize("quote_char", ["", None]) def test_null_quote_char(all_parsers, quoting, quote_char): @@ -95,7 +95,7 @@ def test_null_quote_char(all_parsers, quoting, quote_char): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "kwargs,exp_data", [ diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index cf5eb3f813169..6d85e01c6fd4a 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -14,10 +14,10 @@ from pandas import DataFrame, Index import pandas._testing as tm -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("skiprows", [list(range(6)), 6]) def test_skip_rows_bug(all_parsers, skiprows): # see gh-505 @@ -45,7 +45,7 @@ def test_skip_rows_bug(all_parsers, skiprows): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_deep_skip_rows(all_parsers): # see gh-4382 parser = all_parsers @@ -61,7 +61,7 @@ def test_deep_skip_rows(all_parsers): tm.assert_frame_equal(result, condensed_result) -@xfail_pyarrow +@skip_pyarrow def test_skip_rows_blank(all_parsers): # see gh-9832 parser = all_parsers @@ -88,7 +88,7 @@ def test_skip_rows_blank(all_parsers): tm.assert_frame_equal(data, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -129,7 +129,7 @@ def test_skip_row_with_newline(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_skip_row_with_quote(all_parsers): # see gh-12775 and gh-10911 parser = all_parsers @@ -145,7 +145,7 @@ def test_skip_row_with_quote(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,exp_data", [ @@ -181,7 +181,7 @@ def test_skip_row_with_newline_and_quote(all_parsers, data, 
exp_data): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "line_terminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" ) @@ -218,7 +218,7 @@ def test_skiprows_lineterminator(all_parsers, line_terminator): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_skiprows_infield_quote(all_parsers): # see gh-14459 parser = all_parsers @@ -229,7 +229,7 @@ def test_skiprows_infield_quote(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -245,7 +245,7 @@ def test_skip_rows_callable(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_skip_rows_skip_all(all_parsers): parser = all_parsers data = "a\n1\n2\n3\n4\n5" @@ -255,7 +255,7 @@ def test_skip_rows_skip_all(all_parsers): parser.read_csv(StringIO(data), skiprows=lambda x: True) -@xfail_pyarrow +@skip_pyarrow def test_skip_rows_bad_callable(all_parsers): msg = "by zero" parser = all_parsers diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index 8f72a32ed99e5..a3a2b3e984339 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -55,7 +55,7 @@ def test_usecols(all_parsers, usecols): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_usecols_with_names(all_parsers): data = """\ a,b,c @@ -71,7 +71,7 @@ def test_usecols_with_names(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] ) @@ -88,7 +88,7 @@ def test_usecols_relative_to_names(all_parsers, names, usecols): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_usecols_relative_to_names2(all_parsers): # see gh-5766 data = """\ @@ -105,7 +105,7 @@ def 
test_usecols_relative_to_names2(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_usecols_name_length_conflict(all_parsers): data = """\ 1,2,3 @@ -134,7 +134,7 @@ def test_usecols_single_string(all_parsers): parser.read_csv(StringIO(data), usecols="foo") -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] ) @@ -175,7 +175,7 @@ def test_usecols_index_col_conflict2(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_usecols_implicit_index_col(all_parsers): # see gh-2654 parser = all_parsers @@ -186,7 +186,7 @@ def test_usecols_implicit_index_col(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_usecols_regex_sep(all_parsers): # see gh-2733 parser = all_parsers @@ -197,7 +197,7 @@ def test_usecols_regex_sep(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_usecols_with_whitespace(all_parsers): parser = all_parsers data = "a b c\n4 apple bat 5.7\n8 orange cow 10" @@ -227,7 +227,7 @@ def test_usecols_with_integer_like_header(all_parsers, usecols, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) def test_usecols_with_parse_dates(all_parsers, usecols): # see gh-9755 @@ -307,7 +307,7 @@ def test_usecols_with_parse_dates3(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_usecols_with_parse_dates4(all_parsers): data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" usecols = list("abcdefghij") @@ -331,7 +331,7 @@ def test_usecols_with_parse_dates4(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) @pytest.mark.parametrize( "names", @@ -425,7 +425,7 @@ def 
test_usecols_with_multi_byte_characters(all_parsers, usecols): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_usecols(all_parsers): data = "a,b,c\n1,2,3\n4,5,6" expected = DataFrame() @@ -446,7 +446,7 @@ def test_np_array_usecols(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "usecols,expected", [ @@ -479,7 +479,7 @@ def test_callable_usecols(all_parsers, usecols, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) def test_incomplete_first_row(all_parsers, usecols): # see gh-6710 @@ -492,7 +492,7 @@ def test_incomplete_first_row(all_parsers, usecols): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,usecols,kwargs,expected", [ @@ -582,7 +582,7 @@ def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request): if all_parsers.engine != "c": From 4f7ebd05133cb6340c70d6226741ce1dbf7d199a Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 23:24:00 -0500 Subject: [PATCH 44/95] simplify import_optional_dependency code --- pandas/compat/_optional.py | 26 ++++++-------------------- pandas/io/parsers.py | 2 +- 2 files changed, 7 insertions(+), 21 deletions(-) diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 89b36e07f3c1d..533e67acfa2f4 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -1,8 +1,6 @@ import distutils.version import importlib -import sys import types -from typing import Optional import warnings # Update install.rst when updating versions! 
@@ -45,7 +43,6 @@ "pandas_gbq": "pandas-gbq", "sqlalchemy": "SQLAlchemy", "jinja2": "Jinja2", - "pyarrow.csv": "pyarrow", } @@ -61,11 +58,7 @@ def _get_version(module: types.ModuleType) -> str: def import_optional_dependency( - name: str, - extra: str = "", - raise_on_missing: bool = True, - on_version: str = "raise", - min_version: Optional[str] = None, + name: str, extra: str = "", raise_on_missing: bool = True, on_version: str = "raise" ): """ Import an optional dependency. @@ -77,7 +70,8 @@ def import_optional_dependency( Parameters ---------- name : str - The module name. + The module name. This should be top-level only, so that the + version may be checked. extra : str Additional text to include in the ImportError message. raise_on_missing : bool, default True @@ -91,8 +85,6 @@ def import_optional_dependency( * ignore: Return the module, even if the version is too old. It's expected that users validate the version locally when using ``on_version="ignore"`` (see. ``io/html.py``) - min_version: Optional[str] - Specify the minimum version Returns ------- @@ -117,16 +109,10 @@ def import_optional_dependency( raise ImportError(msg) from None else: return None - # Handle submodules: if we have submodule, grab parent module from sys.modules - parent = name.split(".")[0] - if parent != name: - install_name = parent - module_to_get = sys.modules[install_name] - else: - module_to_get = module - minimum_version = min_version if min_version is not None else VERSIONS.get(name) + + minimum_version = VERSIONS.get(name) if minimum_version: - version = _get_version(module_to_get) + version = _get_version(module) if distutils.version.LooseVersion(version) < minimum_version: assert on_version in {"warn", "raise", "ignore"} msg = ( diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e7089f708d47f..a0bc537dce6dc 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2273,7 +2273,7 @@ def __init__(self, src, **kwds): self.src = BytesIOWrapper(self.src, 
encoding=encoding) def read(self): - pyarrow = import_optional_dependency("pyarrow.csv", min_version="0.15.0") + pyarrow = import_optional_dependency("pyarrow.csv") kwdscopy = {k: v for k, v in self.kwds.items() if v is not None} # these are kwargs passed to pyarrow parseoptions = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"} From 69b3b42ef3521bb3fb81e78b640bcc96745271db Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 23:29:06 -0500 Subject: [PATCH 45/95] move whatsnew to 1.3 --- doc/source/whatsnew/v1.1.0.rst | 1 - doc/source/whatsnew/v1.2.0.rst | 5 ----- doc/source/whatsnew/v1.3.0.rst | 6 ++++++ 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 50443f8810e5f..e054ac830ce41 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -252,7 +252,6 @@ If needed you can adjust the bins with the argument ``offset`` (a :class:`Timede For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. - fsspec now used for filesystem handling ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f6493c68e5aa4..af9219bc25931 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -249,11 +249,6 @@ example where the index name is preserved: The same is true for :class:`MultiIndex`, but the logic is applied separately on a level-by-level basis. -read_csv() now accepts pyarrow as an engine -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines -with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) .. 
_whatsnew_120.groupby_ewm: Groupby supports EWM operations directly diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 90f611c55e710..130ee90349cfe 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -13,6 +13,12 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +read_csv() now accepts pyarrow as an engine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines +with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) + .. _whatsnew_130.enhancements.other: From 9d5cf249e74dce82a30656cedd0cf514aa9c3960 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 23:40:15 -0500 Subject: [PATCH 46/95] clean _get_options_with_defaults --- pandas/io/parsers.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a0bc537dce6dc..68657a86ea27f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -868,14 +868,17 @@ def _get_options_with_defaults(self, engine): for argname, default in _parser_defaults.items(): value = kwds.get(argname, default) - if argname in _pyarrow_unsupported: - if engine == "pyarrow" and value != default: - raise ValueError( - f"The {repr(argname)} option is not supported with the " - f"'pyarrow' engine" - ) - # see gh-12935 - if argname == "mangle_dupe_cols" and not value: + if ( + engine == "pyarrow" + and argname in _pyarrow_unsupported + and value != default + ): + raise ValueError( + f"The {repr(argname)} option is not supported with the " + f"'pyarrow' engine" + ) + elif argname == "mangle_dupe_cols" and value is False: + # GH12935 raise ValueError("Setting mangle_dupe_cols=False is not supported yet") else: options[argname] = value From 2d4a0aa70ae26bc48a328bcb5f240d4e8d677b34 Mon Sep 17 00:00:00 
2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 23:46:10 -0500 Subject: [PATCH 47/95] clean _clean_options --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 68657a86ea27f..5a111385c4455 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -936,7 +936,7 @@ def _clean_options(self, options, engine): delim_whitespace = options["delim_whitespace"] if sep is None and not delim_whitespace: - if engine == "c" or engine == "pyarrow": + if engine in ("c", "pyarrow"): fallback_reason = ( f"the {engine} engine does not support " "sep=None with delim_whitespace=False" From e46b95d7b4432ec5b00a1094dbbf9f984d8d8e7e Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 9 Dec 2020 23:54:48 -0500 Subject: [PATCH 48/95] clean _read --- pandas/io/parsers.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5a111385c4455..bf33040b075d4 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -452,20 +452,20 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): kwds["parse_dates"] = True # Extract some of the arguments (pass chunksize on). 
+ iterator = kwds.get("iterator", False) + if kwds.get("engine") == "pyarrow" and iterator is True: + raise ValueError( + "The 'iterator' option is not supported with the 'pyarrow' engine" + ) + chunksize = kwds.get("chunksize", None) - # chunksize and iterator not supported for pyarrow - if kwds.get("engine") == "pyarrow": - if iterator: - raise ValueError( - "The 'iterator' option is not supported with the 'pyarrow' engine" - ) - if chunksize is not None: - raise ValueError( - "The 'chunksize' option is not supported with the 'pyarrow' engine" - ) - else: - chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1) + if kwds.get("engine") == "pyarrow" and chunksize is not None: + raise ValueError( + "The 'chunksize' option is not supported with the 'pyarrow' engine" + ) + chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1) + nrows = kwds.get("nrows", None) # Check for duplicates in names. From 1844a6c48ebdedfb77e50facb19462f3b7f49727 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Thu, 10 Dec 2020 01:20:52 -0500 Subject: [PATCH 49/95] extract kwd validation from __init__ --- pandas/io/parsers.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bf33040b075d4..4122d2d263d3f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2259,7 +2259,16 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - encoding = kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8" + self._validate_kwds() + + if isinstance(self.src, TextIOBase): + self.src = BytesIOWrapper(self.src, encoding=self.encoding) + + def _validate_kwds(self): + kwds = self.kwds + self.encoding = ( + kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8" + ) self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) na_values = kwds["na_values"] @@ -2272,8 +2281,6 @@ def __init__(self, src, **kwds): kwds["na_values"], 
keep_default_na=kwds["keep_default_na"] )[0] ) - if isinstance(self.src, TextIOBase): - self.src = BytesIOWrapper(self.src, encoding=encoding) def read(self): pyarrow = import_optional_dependency("pyarrow.csv") From 94178e467697793af549e858bf2c5d7164c45353 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 11 Dec 2020 17:13:47 -0500 Subject: [PATCH 50/95] revert mistaken refactor --- pandas/io/parsers.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4122d2d263d3f..fd674d301d9ac 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -454,17 +454,19 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # Extract some of the arguments (pass chunksize on). iterator = kwds.get("iterator", False) - if kwds.get("engine") == "pyarrow" and iterator is True: - raise ValueError( - "The 'iterator' option is not supported with the 'pyarrow' engine" - ) - chunksize = kwds.get("chunksize", None) - if kwds.get("engine") == "pyarrow" and chunksize is not None: - raise ValueError( - "The 'chunksize' option is not supported with the 'pyarrow' engine" - ) - chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1) + if kwds.get("engine") == "pyarrow": + if iterator: + raise ValueError( + "The 'iterator' option is not supported with the 'pyarrow' engine" + ) + + if chunksize is not None: + raise ValueError( + "The 'chunksize' option is not supported with the 'pyarrow' engine" + ) + else: + chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1) nrows = kwds.get("nrows", None) From 13a24880e8c575b37e119073c78405b294adb66d Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 11 Dec 2020 17:17:59 -0500 Subject: [PATCH 51/95] typing --- pandas/io/parsers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index fd674d301d9ac..c3b0b2d46d7ee 100644 --- a/pandas/io/parsers.py 
+++ b/pandas/io/parsers.py @@ -2238,7 +2238,11 @@ class BytesIOWrapper: Allows the pyarrow engine for read_csv() to read from string buffers """ - def __init__(self, string_buffer: StringIO, encoding: str = "utf-8"): + def __init__( + self, + string_buffer: Union[StringIO, TextIOBase], + encoding: Optional[str] = "utf-8", + ): self.string_buffer = string_buffer self.encoding = encoding From a98cffd33a89ec98ed1f1ea88e8534fdc56d6095 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 11 Dec 2020 17:25:18 -0500 Subject: [PATCH 52/95] REF: ArrowParserWrapper.read --- pandas/io/parsers.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c3b0b2d46d7ee..93d5171fbad45 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2290,7 +2290,7 @@ def _validate_kwds(self): def read(self): pyarrow = import_optional_dependency("pyarrow.csv") - kwdscopy = {k: v for k, v in self.kwds.items() if v is not None} + self.kwds = {k: v for k, v in self.kwds.items() if v is not None} # these are kwargs passed to pyarrow parseoptions = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"} convertoptions = { @@ -2300,13 +2300,13 @@ def read(self): "false_values", } # rename some arguments to pass to pyarrow - kwdscopy["include_columns"] = kwdscopy.get("usecols") - kwdscopy["null_values"] = kwdscopy.get("na_values") - kwdscopy["escape_char"] = kwdscopy.get("escapechar") - kwdscopy["ignore_empty_lines"] = kwdscopy.get("skip_blank_lines") + self.kwds["include_columns"] = self.kwds.pop("usecols") + self.kwds["null_values"] = self.kwds.pop("na_values") + self.kwds["escape_char"] = self.kwds.pop("escapechar") + self.kwds["ignore_empty_lines"] = self.kwds.pop("skip_blank_lines") - parse_options = {k: v for k, v in kwdscopy.items() if k in parseoptions} - convert_options = {k: v for k, v in kwdscopy.items() if k in convertoptions} + parse_options = {k: v for k, v in self.kwds.items() if k in 
parseoptions} + convert_options = {k: v for k, v in self.kwds.items() if k in convertoptions} headerexists = True if self.header is not None else False read_options = {} From a32e3a595114650e7272a212552dd1ad7a6f7e22 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 11 Dec 2020 18:07:52 -0500 Subject: [PATCH 53/95] REF: ArrowParserWrapper.read --- pandas/io/parsers.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 93d5171fbad45..5efedb1d5a026 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2292,21 +2292,26 @@ def read(self): pyarrow = import_optional_dependency("pyarrow.csv") self.kwds = {k: v for k, v in self.kwds.items() if v is not None} # these are kwargs passed to pyarrow - parseoptions = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"} - convertoptions = { + parse_kwargs = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"} + convert_kwargs = { "include_columns", "null_values", "true_values", "false_values", } # rename some arguments to pass to pyarrow - self.kwds["include_columns"] = self.kwds.pop("usecols") - self.kwds["null_values"] = self.kwds.pop("na_values") - self.kwds["escape_char"] = self.kwds.pop("escapechar") - self.kwds["ignore_empty_lines"] = self.kwds.pop("skip_blank_lines") + mapping = { + "usecols": "include_columns", + "na_values": "null_values", + "escapechar": "escape_char", + "skip_blank_lines": "ignore_empty_lines", + } + for pandas_name, pyarrow_name in mapping.items(): + if pandas_name in self.kwds: + self.kwds[pyarrow_name] = self.kwds.pop(pandas_name) - parse_options = {k: v for k, v in self.kwds.items() if k in parseoptions} - convert_options = {k: v for k, v in self.kwds.items() if k in convertoptions} + parse_options = {k: v for k, v in self.kwds.items() if k in parse_kwargs} + convert_options = {k: v for k, v in self.kwds.items() if k in convert_kwargs} headerexists = True if self.header 
is not None else False read_options = {} From 89416cc217dc61b510da8b796511d6ea24fbff36 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 12:16:23 -0500 Subject: [PATCH 54/95] remove optional dependency code --- pandas/tests/test_optional_dependency.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index 61dbd81e2cee5..e5ed69b7703b1 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -27,15 +27,14 @@ def test_bad_version(monkeypatch): module = types.ModuleType(name) module.__version__ = "0.9.0" sys.modules[name] = module + monkeypatch.setitem(VERSIONS, name, "1.0.0") match = "Pandas requires .*1.0.0.* of .fakemodule.*'0.9.0'" with pytest.raises(ImportError, match=match): - import_optional_dependency("fakemodule", min_version="1.0.0") + import_optional_dependency("fakemodule") with tm.assert_produces_warning(UserWarning): - result = import_optional_dependency( - "fakemodule", min_version="1.0.0", on_version="warn" - ) + result = import_optional_dependency("fakemodule", on_version="warn") assert result is None module.__version__ = "1.0.0" # exact match is OK From 9687990b0c2ed662af78bcdf0314ca7cfce5bd2f Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 13:08:31 -0500 Subject: [PATCH 55/95] REF: ArrowParserWrapper.read --- pandas/io/parsers.py | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5efedb1d5a026..41ed74309a934 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2292,13 +2292,6 @@ def read(self): pyarrow = import_optional_dependency("pyarrow.csv") self.kwds = {k: v for k, v in self.kwds.items() if v is not None} # these are kwargs passed to pyarrow - parse_kwargs = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"} - 
convert_kwargs = { - "include_columns", - "null_values", - "true_values", - "false_values", - } # rename some arguments to pass to pyarrow mapping = { "usecols": "include_columns", @@ -2310,19 +2303,25 @@ def read(self): if pandas_name in self.kwds: self.kwds[pyarrow_name] = self.kwds.pop(pandas_name) - parse_options = {k: v for k, v in self.kwds.items() if k in parse_kwargs} - convert_options = {k: v for k, v in self.kwds.items() if k in convert_kwargs} - headerexists = True if self.header is not None else False - read_options = {} + parse_options = { + k: v + for k, v in self.kwds.items() + if k is not None + and k in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines") + } + convert_options = { + k: v + for k, v in self.kwds.items() + if k is not None + and k in ("include_columns", "null_values", "true_values", "false_values") + } - skiprows = self.kwds.get("skiprows") - if headerexists: + read_options = {"autogenerate_column_names": self.header is None} + if self.header is not None: read_options["skip_rows"] = self.header - read_options["autogenerate_column_names"] = False - else: - if skiprows is not None: - read_options["skip_rows"] = skiprows - read_options["autogenerate_column_names"] = True + elif self.kwds.get("skiprows") is not None: + read_options["skip_rows"] = self.kwds.get("skiprows") + read_options = pyarrow.ReadOptions(**read_options) table = pyarrow.read_csv( self.src, @@ -2332,7 +2331,7 @@ def read(self): ) frame = table.to_pandas() num_cols = len(frame.columns) - if not headerexists: + if self.header is None: if self.names is None: if self.prefix is not None: self.names = [f"{self.prefix}{i}" for i in range(num_cols)] From 98f20617aa22a8443339962f3ba6f1ad955a4246 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 13:14:47 -0500 Subject: [PATCH 56/95] REF: ArrowParserWrapper.read --- pandas/io/parsers.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/io/parsers.py 
b/pandas/io/parsers.py index 41ed74309a934..f2544c078f2c3 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2301,19 +2301,23 @@ def read(self): } for pandas_name, pyarrow_name in mapping.items(): if pandas_name in self.kwds: - self.kwds[pyarrow_name] = self.kwds.pop(pandas_name) + value = self.kwds.pop(pandas_name) + if value is not None: + self.kwds[pyarrow_name] = value parse_options = { - k: v - for k, v in self.kwds.items() - if k is not None - and k in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines") + option_name: option_value + for option_name, option_value in self.kwds.items() + if option_value is not None + and option_name + in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines") } convert_options = { - k: v - for k, v in self.kwds.items() - if k is not None - and k in ("include_columns", "null_values", "true_values", "false_values") + option_name: option_value + for option_name, option_value in self.kwds.items() + if option_value is not None + and option_name + in ("include_columns", "null_values", "true_values", "false_values") } read_options = {"autogenerate_column_names": self.header is None} From ec01fad2dc7864955d0edd34bfeb71b849e656ea Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 13:34:05 -0500 Subject: [PATCH 57/95] REF: ArrowParserWrapper.read --- pandas/io/parsers.py | 46 +++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f2544c078f2c3..474171223292e 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2288,10 +2288,7 @@ def _validate_kwds(self): )[0] ) - def read(self): - pyarrow = import_optional_dependency("pyarrow.csv") - self.kwds = {k: v for k, v in self.kwds.items() if v is not None} - # these are kwargs passed to pyarrow + def _get_pyarrow_options(self): # rename some arguments to pass to pyarrow mapping = { "usecols": "include_columns", @@ -2300,40 
+2297,30 @@ def read(self): "skip_blank_lines": "ignore_empty_lines", } for pandas_name, pyarrow_name in mapping.items(): - if pandas_name in self.kwds: - value = self.kwds.pop(pandas_name) - if value is not None: - self.kwds[pyarrow_name] = value + if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None: + self.kwds[pyarrow_name] = self.kwds.pop(pandas_name) - parse_options = { + self.parse_options = { option_name: option_value for option_name, option_value in self.kwds.items() if option_value is not None and option_name in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines") } - convert_options = { + self.convert_options = { option_name: option_value for option_name, option_value in self.kwds.items() if option_value is not None and option_name in ("include_columns", "null_values", "true_values", "false_values") } - - read_options = {"autogenerate_column_names": self.header is None} + self.read_options = {"autogenerate_column_names": self.header is None} if self.header is not None: - read_options["skip_rows"] = self.header + self.read_options["skip_rows"] = self.header elif self.kwds.get("skiprows") is not None: - read_options["skip_rows"] = self.kwds.get("skiprows") + self.read_options["skip_rows"] = self.kwds.get("skiprows") - read_options = pyarrow.ReadOptions(**read_options) - table = pyarrow.read_csv( - self.src, - read_options=read_options, - parse_options=pyarrow.ParseOptions(**parse_options), - convert_options=pyarrow.ConvertOptions(**convert_options), - ) - frame = table.to_pandas() + def _finalize_output(self, frame): num_cols = len(frame.columns) if self.header is None: if self.names is None: @@ -2354,6 +2341,21 @@ def read(self): frame = frame.astype(self.kwds.get("dtype")) return frame + def read(self): + pyarrow = import_optional_dependency("pyarrow.csv") + + self._get_pyarrow_options() + + table = pyarrow.read_csv( + self.src, + read_options=pyarrow.ReadOptions(**self.read_options), + 
parse_options=pyarrow.ParseOptions(**self.parse_options), + convert_options=pyarrow.ConvertOptions(**self.convert_options), + ) + + frame = table.to_pandas() + return self._finalize_output(frame) + def TextParser(*args, **kwds): """ From 7b9572b960492a2ee6d2d6fd25dab031130ee260 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 13:48:40 -0500 Subject: [PATCH 58/95] rewrite docs --- doc/source/user_guide/io.rst | 21 ++++++++++----------- pandas/io/parsers.py | 7 +++---- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 6828dbf319a7e..47295f2cb5bc1 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -159,10 +159,10 @@ dtype : Type name or dict of column -> type, default ``None`` with suitable ``na_values`` settings to preserve and not interpret dtype. engine : {``'c'``, ``'pyarrow'``, ``'python'``} - Parser engine to use. In terms of performance, the pyarrow engine, - which requires ``pyarrow`` >= 0.15.0, is faster than the C engine, which - is faster than the python engine. However, the pyarrow and C engines - are currently less feature complete than their Python counterpart. + Parser engine to use. The pyarrow engine is the most performant, followed by + the C engine, which in turn is faster than the python engine. However, the + pyarrow and C engine are currently less feature complete than their Python + counterpart. converters : dict, default ``None`` Dict of functions for converting values in certain columns. Keys can either be integers or column labels. @@ -1604,15 +1604,14 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object: Specifying the parser engine '''''''''''''''''''''''''''' -Currently, pandas supports using three engines, the C engine, the python engine, -and an optional pyarrow engine(requires ``pyarrow`` >= 0.15). 
In terms of performance -the pyarrow engine is fastest, followed by the C and Python engines. However, -the pyarrow engine is much less robust than the C engine, which in turn lacks a -couple of features present in the Python parser. +Pandas currently supports three engines, the C engine, the python engine, and an optional +pyarrow engine. The pyarrow engine is fastest, followed by the C and Python engines. However, +the pyarrow engine is much less robust than the C engine, and the C engine is less feature-rich +than the Python engine. -Where possible pandas uses the C parser (specified as ``engine='c'``), but may fall +Where possible pandas uses the C parser (specified as ``engine='c'``), but it may fall back to Python if C-unsupported options are specified. If pyarrow unsupported options are -specified while using ``engine='pyarrow'``, the parser will error out +specified while using ``engine='pyarrow'``, the parser will throw an error. (a full list of unsupported options is available at ``pandas.io.parsers._pyarrow_unsupported``). Currently, C-unsupported options include: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 474171223292e..13e9939f850e8 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -176,11 +176,10 @@ of dtype conversion. engine : {{'c', 'python', 'pyarrow'}}, optional Parser engine to use. The C and pyarrow engines are faster, while the python engine - is currently more feature-complete. The pyarrow engine also supports multithreading - something that is not present in the C or python engines. It requires - ``pyarrow`` >= 0.15 as a dependency however. + is currently more feature-complete. Multithreading is currently only supported by + the pyarrow engine. - .. versionchanged:: 1.2 + .. versionchanged:: 1.3 The "pyarrow" engine was added. converters : dict, optional Dict of functions for converting values in certain columns. 
Keys can either From 6773a719ab06e4c5ab443086d4a6c634ebe5a53d Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 13:50:15 -0500 Subject: [PATCH 59/95] rewrite docs --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index b2cc1450d1efb..a45d651440ffb 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -17,7 +17,7 @@ read_csv() now accepts pyarrow as an engine ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines -with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) +with pyarrow installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) .. _whatsnew_130.enhancements.other: From d63f5d0b5d87174f50703464b4dddca6b7b551d4 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 14:05:46 -0500 Subject: [PATCH 60/95] remove datetime hadling --- pandas/io/parsers.py | 4 ++-- pandas/tests/io/parser/test_parse_dates.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 13e9939f850e8..a9ad6bba531e1 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -3557,7 +3557,7 @@ def _isindex(colspec): colspec = orig_names[colspec] if _isindex(colspec): continue - data_dict[colspec] = converter(np.asarray(data_dict[colspec])) + data_dict[colspec] = converter(data_dict[colspec]) else: new_name, col, old_names = _try_convert_dates( converter, colspec, data_dict, orig_names @@ -3606,7 +3606,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns): colnames.append(c) new_name = "_".join(str(x) for x in colnames) - to_parse = [np.array(data_dict[c]) for c in colnames if c in data_dict] + to_parse = [data_dict[c] for c in colnames if c in 
data_dict] new_col = parser(*to_parse) return new_name, new_col, colnames diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index d4c0d28214849..641579922e506 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -339,6 +339,7 @@ def test_multiple_date_col(all_parsers, keep_date_col): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_date_col_as_index_col(all_parsers): data = """\ KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 @@ -566,6 +567,7 @@ def test_multiple_date_cols_with_header(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,parse_dates,msg", [ @@ -594,6 +596,7 @@ def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): parser.read_csv(StringIO(data), parse_dates=parse_dates) +@skip_pyarrow def test_date_parser_int_bug(all_parsers): # see gh-3071 parser = all_parsers @@ -713,6 +716,7 @@ def test_parse_dates_string(all_parsers): # Bug in https://github.com/dateutil/dateutil/issues/217 # has been addressed, but we just don't pass in the `yearfirst` +@skip_pyarrow @pytest.mark.xfail(reason="yearfirst is not surfaced in read_*") @pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]]) def test_yy_format_with_year_first(all_parsers, parse_dates): @@ -1126,6 +1130,7 @@ def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): parser.read_csv(StringIO(data), parse_dates=(1,)) +@skip_pyarrow @pytest.mark.parametrize("cache_dates", [True, False]) @pytest.mark.parametrize("value", ["nan", "0", ""]) def test_bad_date_parse(all_parsers, cache_dates, value): @@ -1455,6 +1460,7 @@ def date_parser(dt, time): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_parse_date_column_with_empty_string(all_parsers): # see gh-6428 parser = all_parsers @@ -1466,6 +1472,7 @@ def 
test_parse_date_column_with_empty_string(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -1491,6 +1498,7 @@ def test_parse_date_float(all_parsers, data, expected, parse_dates): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_parse_timezone(all_parsers): # see gh-22256 parser = all_parsers @@ -1554,6 +1562,7 @@ def test_parse_delimited_date_swap(all_parsers, date_string, dayfirst, expected) tm.assert_frame_equal(result, expected) +@skip_pyarrow def _helper_hypothesis_delimited_date(call, date_string, **kwargs): msg, result = None, None try: @@ -1564,6 +1573,7 @@ def _helper_hypothesis_delimited_date(call, date_string, **kwargs): return msg, result +@skip_pyarrow @given(date_strategy) @settings(deadline=None) @pytest.mark.parametrize("delimiter", list(" -./")) From 9ff95ad01cab0d715ad9c3b867569c9d3d795922 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 14:39:13 -0500 Subject: [PATCH 61/95] skiprows cannot be None --- pandas/io/parsers.py | 16 +++++++++++----- pandas/tests/io/parser/test_usecols.py | 1 + 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a9ad6bba531e1..61b546a4ccf31 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2313,11 +2313,17 @@ def _get_pyarrow_options(self): and option_name in ("include_columns", "null_values", "true_values", "false_values") } - self.read_options = {"autogenerate_column_names": self.header is None} - if self.header is not None: - self.read_options["skip_rows"] = self.header - elif self.kwds.get("skiprows") is not None: - self.read_options["skip_rows"] = self.kwds.get("skiprows") + # self.read_options = {"autogenerate_column_names": self.header is None} + # if self.header is not None: + # self.read_options["skip_rows"] = self.header + # elif self.kwds.get("skiprows") is not None: + # self.read_options["skip_rows"] = self.kwds.get("skiprows") + 
self.read_options = { + "autogenerate_column_names": self.header is None, + "skip_rows": self.header + if self.header is not None + else self.kwds["skiprows"], + } def _finalize_output(self, frame): num_cols = len(frame.columns) diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index a3a2b3e984339..bdfe121bae179 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -280,6 +280,7 @@ def test_usecols_with_parse_dates2(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_usecols_with_parse_dates3(all_parsers): # see gh-14792 parser = all_parsers From 6133a4c0d60bcbf559f8665231c726a790c3954a Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 16:34:09 -0500 Subject: [PATCH 62/95] REF: ArrowParserWrapper.read --- pandas/io/parsers.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 61b546a4ccf31..14e0ad591db07 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2240,7 +2240,7 @@ class BytesIOWrapper: def __init__( self, string_buffer: Union[StringIO, TextIOBase], - encoding: Optional[str] = "utf-8", + encoding: str = "utf-8", ): self.string_buffer = string_buffer self.encoding = encoding @@ -2264,29 +2264,27 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - self._validate_kwds() + self._parse_kwds() - if isinstance(self.src, TextIOBase): - self.src = BytesIOWrapper(self.src, encoding=self.encoding) - - def _validate_kwds(self): - kwds = self.kwds - self.encoding = ( - kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8" - ) + def _parse_kwds(self): + encoding: Optional[str] = self.kwds.get("encoding") + self.encoding = "utf-8" if encoding is None else encoding - self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) - na_values = kwds["na_values"] + self.usecols, 
self.usecols_dtype = _validate_usecols_arg(self.kwds["usecols"]) + na_values = self.kwds["na_values"] if isinstance(na_values, dict): raise ValueError( "The pyarrow engine doesn't support passing a dict for na_values" ) self.na_values = list( _clean_na_values( - kwds["na_values"], keep_default_na=kwds["keep_default_na"] + self.kwds["na_values"], keep_default_na=self.kwds["keep_default_na"] )[0] ) + if isinstance(self.src, TextIOBase): + self.src = BytesIOWrapper(self.src, encoding=self.encoding) + def _get_pyarrow_options(self): # rename some arguments to pass to pyarrow mapping = { From 454892f6420cd7bb85f549983d274d4d21ce0fd7 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 16:38:00 -0500 Subject: [PATCH 63/95] REF: ArrowParserWrapper.read --- pandas/io/parsers.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 14e0ad591db07..dfef7e32836ab 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2311,11 +2311,6 @@ def _get_pyarrow_options(self): and option_name in ("include_columns", "null_values", "true_values", "false_values") } - # self.read_options = {"autogenerate_column_names": self.header is None} - # if self.header is not None: - # self.read_options["skip_rows"] = self.header - # elif self.kwds.get("skiprows") is not None: - # self.read_options["skip_rows"] = self.kwds.get("skiprows") self.read_options = { "autogenerate_column_names": self.header is None, "skip_rows": self.header From e0503945d704d7d8e8db28ecc2f8f13fd7b8edb5 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 16:54:50 -0500 Subject: [PATCH 64/95] skip all pyarrow csv datetime tests --- pandas/tests/io/parser/test_parse_dates.py | 43 +--------------------- 1 file changed, 1 insertion(+), 42 deletions(-) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 641579922e506..77c0e3a9c4f6f 100644 --- 
a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -34,10 +34,9 @@ else: date_strategy = st.datetimes() -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +pytestmark = pytest.mark.usefixtures("pyarrow_skip") -@skip_pyarrow def test_separator_date_conflict(all_parsers): # Regression test for gh-4678 # @@ -59,7 +58,6 @@ def test_separator_date_conflict(all_parsers): tm.assert_frame_equal(df, expected) -@skip_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col_custom(all_parsers, keep_date_col): data = """\ @@ -203,7 +201,6 @@ def date_parser(*date_cols): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("container", [list, tuple, Index, Series]) @pytest.mark.parametrize("dim", [1, 2]) def test_concat_date_col_fail(container, dim): @@ -216,7 +213,6 @@ def test_concat_date_col_fail(container, dim): parsing.concat_date_cols(date_cols) -@skip_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col(all_parsers, keep_date_col): data = """\ @@ -339,7 +335,6 @@ def test_multiple_date_col(all_parsers, keep_date_col): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_date_col_as_index_col(all_parsers): data = """\ KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 @@ -377,7 +372,6 @@ def test_date_col_as_index_col(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -442,7 +436,6 @@ def test_multiple_date_cols_int_cast(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_multiple_date_col_timestamp_parse(all_parsers): parser = all_parsers data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 @@ -477,7 +470,6 @@ def test_multiple_date_col_timestamp_parse(all_parsers): tm.assert_frame_equal(result, 
expected) -@skip_pyarrow def test_multiple_date_cols_with_header(all_parsers): parser = all_parsers data = """\ @@ -567,7 +559,6 @@ def test_multiple_date_cols_with_header(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "data,parse_dates,msg", [ @@ -596,7 +587,6 @@ def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): parser.read_csv(StringIO(data), parse_dates=parse_dates) -@skip_pyarrow def test_date_parser_int_bug(all_parsers): # see gh-3071 parser = all_parsers @@ -649,7 +639,6 @@ def test_date_parser_int_bug(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_nat_parse(all_parsers): # see gh-3062 parser = all_parsers @@ -665,7 +654,6 @@ def test_nat_parse(all_parsers): tm.assert_frame_equal(result, df) -@skip_pyarrow def test_csv_custom_parser(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -680,7 +668,6 @@ def test_csv_custom_parser(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_parse_dates_implicit_first_col(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -694,7 +681,6 @@ def test_parse_dates_implicit_first_col(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_parse_dates_string(all_parsers): data = """date,A,B,C 20090101,a,1,2 @@ -716,7 +702,6 @@ def test_parse_dates_string(all_parsers): # Bug in https://github.com/dateutil/dateutil/issues/217 # has been addressed, but we just don't pass in the `yearfirst` -@skip_pyarrow @pytest.mark.xfail(reason="yearfirst is not surfaced in read_*") @pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]]) def test_yy_format_with_year_first(all_parsers, parse_dates): @@ -740,7 +725,6 @@ def test_yy_format_with_year_first(all_parsers, parse_dates): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]]) def test_parse_dates_column_list(all_parsers, parse_dates): data = 
"a,b,c\n01/01/2010,1,15/02/2010" @@ -757,7 +741,6 @@ def test_parse_dates_column_list(all_parsers, parse_dates): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_parse_dates(all_parsers, index_col): data = """index1,index2,A,B,C @@ -803,7 +786,6 @@ def test_multi_index_parse_dates(all_parsers, index_col): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("kwargs", [{"dayfirst": True}, {"day_first": True}]) def test_parse_dates_custom_euro_format(all_parsers, kwargs): parser = all_parsers @@ -848,7 +830,6 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs): ) -@skip_pyarrow def test_parse_tz_aware(all_parsers): # See gh-1693 parser = all_parsers @@ -862,7 +843,6 @@ def test_parse_tz_aware(all_parsers): assert result.index.tz is pytz.utc -@skip_pyarrow @pytest.mark.parametrize( "parse_dates,index_col", [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], @@ -963,7 +943,6 @@ def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_multiple_date_cols_chunked(all_parsers): parser = all_parsers data = """\ @@ -1056,7 +1035,6 @@ def test_multiple_date_cols_chunked(all_parsers): tm.assert_frame_equal(chunks[2], expected[4:]) -@skip_pyarrow def test_multiple_date_col_named_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1080,7 +1058,6 @@ def test_multiple_date_col_named_index_compat(all_parsers): tm.assert_frame_equal(with_indices, with_names) -@skip_pyarrow def test_multiple_date_col_multiple_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1130,7 +1107,6 @@ def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): parser.read_csv(StringIO(data), parse_dates=(1,)) -@skip_pyarrow @pytest.mark.parametrize("cache_dates", [True, False]) @pytest.mark.parametrize("value", ["nan", "0", ""]) def 
test_bad_date_parse(all_parsers, cache_dates, value): @@ -1149,7 +1125,6 @@ def test_bad_date_parse(all_parsers, cache_dates, value): ) -@skip_pyarrow def test_parse_dates_empty_string(all_parsers): # see gh-2263 parser = all_parsers @@ -1162,7 +1137,6 @@ def test_parse_dates_empty_string(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -1202,7 +1176,6 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -1231,7 +1204,6 @@ def test_parse_date_time_multi_level_column_name(all_parsers, date_parser, warni tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -1320,7 +1292,6 @@ def test_parse_date_time(all_parsers, data, kwargs, expected, date_parser, warni tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_fields, FutureWarning], [pd.to_datetime, None]), @@ -1343,7 +1314,6 @@ def test_parse_date_fields(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ( @@ -1375,7 +1345,6 @@ def test_parse_date_all_fields(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_parser, warning", ( @@ -1407,7 +1376,6 @@ def test_datetime_fractional_seconds(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_generic(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." 
@@ -1426,7 +1394,6 @@ def test_generic(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_date_parser_resolution_if_not_ns(all_parsers): # see gh-10245 parser = all_parsers @@ -1460,7 +1427,6 @@ def date_parser(dt, time): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_parse_date_column_with_empty_string(all_parsers): # see gh-6428 parser = all_parsers @@ -1472,7 +1438,6 @@ def test_parse_date_column_with_empty_string(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -1498,7 +1463,6 @@ def test_parse_date_float(all_parsers, data, expected, parse_dates): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_parse_timezone(all_parsers): # see gh-22256 parser = all_parsers @@ -1527,7 +1491,6 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_string", ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], @@ -1539,7 +1502,6 @@ def test_invalid_parse_delimited_date(all_parsers, date_string): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_string,dayfirst,expected", [ @@ -1562,7 +1524,6 @@ def test_parse_delimited_date_swap(all_parsers, date_string, dayfirst, expected) tm.assert_frame_equal(result, expected) -@skip_pyarrow def _helper_hypothesis_delimited_date(call, date_string, **kwargs): msg, result = None, None try: @@ -1573,7 +1534,6 @@ def _helper_hypothesis_delimited_date(call, date_string, **kwargs): return msg, result -@skip_pyarrow @given(date_strategy) @settings(deadline=None) @pytest.mark.parametrize("delimiter", list(" -./")) @@ -1607,7 +1567,6 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti assert result == expected -@skip_pyarrow @pytest.mark.parametrize( "names, usecols, parse_dates, missing_cols", [ From 09fca60e1634a6be4231c6d10b5090d7e5f453e6 Mon Sep 17 
00:00:00 2001 From: Andrew Wieteska Date: Sat, 12 Dec 2020 17:54:29 -0500 Subject: [PATCH 65/95] rewrite benchmarks --- asv_bench/benchmarks/io/csv.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index c1fad1efde082..db271c84ceed0 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -254,7 +254,7 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): names=list("abc"), ) - def time_read_csv_arrow(self, sep, decimal, float_precision): + def time_read_csv_pyarrow(self, sep, decimal, float_precision): read_csv( self.data(self.StringIO_input), sep=sep, @@ -333,7 +333,7 @@ def time_baseline(self, engine): class ReadCSVCachedParseDates(StringIORewind): - params = ([True, False], ["c", "pyarrow", "python"]) + params = ([True, False], ["c", "python"]) param_names = ["do_cache", "engine"] def setup(self, do_cache, engine): @@ -374,7 +374,7 @@ def mem_parser_chunks(self): class ReadCSVParseSpecialDate(StringIORewind): - params = (["mY", "mdY", "hm"], ["c", "pyarrow", "python"]) + params = (["mY", "mdY", "hm"], ["c", "python"]) param_names = ["value", "engine"] objects = { "mY": "01-2019\n10-2019\n02/2000\n", From f9bf5f1dbaf62958bb84d552b75faefcaec97ec5 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 01:19:55 -0500 Subject: [PATCH 66/95] typo --- asv_bench/benchmarks/io/csv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index f5fad16d5afbb..83eb1bea42a14 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -263,6 +263,7 @@ def time_read_csv_pyarrow(self, sep, decimal, float_precision): sep=sep, header=None, names=list("abc"), + engine="pyarrow", ) From 922bf4fa87dac6daaf83faddc32bf9abf10ef92f Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 01:20:45 -0500 Subject: [PATCH 67/95] typo --- 
asv_bench/benchmarks/io/csv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 83eb1bea42a14..287f1d997d665 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -285,6 +285,7 @@ def time_read_bytescsv(self, engine): class ReadCSVCategorical(BaseIO): + fname = "__test__.csv" params = ["c", "python"] param_names = ["engine"] From 1252a054e3bc61e2daef1c75600977123121676a Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 01:38:34 -0500 Subject: [PATCH 68/95] test reorg --- pandas/tests/io/parser/test_usecols.py | 1014 ++++++++++++------------ 1 file changed, 501 insertions(+), 513 deletions(-) diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index d774ca4113ab3..d0de6e5c2c95f 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -26,566 +26,554 @@ @skip_pyarrow -def test_raise_on_mixed_dtype_usecols(all_parsers): - # See gh-12678 - data = """a,b,c - 1000,2000,3000 - 4000,5000,6000 - """ - usecols = [0, "b", 2] - parser = all_parsers - - with pytest.raises(ValueError, match=_msg_validate_usecols_arg): - parser.read_csv(StringIO(data), usecols=usecols) - - -@skip_pyarrow -@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")]) -def test_usecols(all_parsers, usecols): - data = """\ -a,b,c -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - parser = all_parsers - result = parser.read_csv(StringIO(data), usecols=usecols) - - expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_usecols_with_names(all_parsers): - data = """\ -a,b,c -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - parser = all_parsers - names = ["foo", "bar"] - result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0) - - expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names) - 
tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] -) -def test_usecols_relative_to_names(all_parsers, names, usecols): - data = """\ -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - parser = all_parsers - result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols) - - expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) - tm.assert_frame_equal(result, expected) +class TestParserUsecolsBasic: + def test_raise_on_mixed_dtype_usecols(self, all_parsers): + # See gh-12678 + data = """a,b,c + 1000,2000,3000 + 4000,5000,6000 + """ + usecols = [0, "b", 2] + parser = all_parsers + + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols=usecols) + + @pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")]) + def test_usecols(self, all_parsers, usecols): + data = """\ + a,b,c + 1,2,3 + 4,5,6 + 7,8,9 + 10,11,12""" + parser = all_parsers + result = parser.read_csv(StringIO(data), usecols=usecols) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) + tm.assert_frame_equal(result, expected) + def test_usecols_with_names(self, all_parsers): + data = """\ + a,b,c + 1,2,3 + 4,5,6 + 7,8,9 + 10,11,12""" + parser = all_parsers + names = ["foo", "bar"] + result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -def test_usecols_relative_to_names2(all_parsers): - # see gh-5766 - data = """\ -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - parser = all_parsers - result = parser.read_csv( - StringIO(data), names=["a", "b"], header=None, usecols=[0, 1] + @pytest.mark.parametrize( + "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] ) + def test_usecols_relative_to_names(self, all_parsers, names, usecols): + 
data = """\ + 1,2,3 + 4,5,6 + 7,8,9 + 10,11,12""" + parser = all_parsers + result = parser.read_csv( + StringIO(data), names=names, header=None, usecols=usecols + ) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) + tm.assert_frame_equal(result, expected) - expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_usecols_name_length_conflict(all_parsers): - data = """\ -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - parser = all_parsers - msg = "Number of passed names did not match number of header fields in the file" - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1]) - - -def test_usecols_single_string(all_parsers): - # see gh-20558 - parser = all_parsers - data = """foo, bar, baz -1000, 2000, 3000 -4000, 5000, 6000""" - - with pytest.raises(ValueError, match=_msg_validate_usecols_arg): - parser.read_csv(StringIO(data), usecols="foo") - - -@skip_pyarrow -@pytest.mark.parametrize( - "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] -) -def test_usecols_index_col_false(all_parsers, data): - # see gh-9082 - parser = all_parsers - usecols = ["a", "c", "d"] - expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]}) - - result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("index_col", ["b", 0]) -@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]]) -def test_usecols_index_col_conflict(all_parsers, usecols, index_col): - # see gh-4201: test that index_col as integer reflects usecols - parser = all_parsers - data = "a,b,c,d\nA,a,1,one\nB,b,2,two" - expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b")) + def test_usecols_relative_to_names2(self, all_parsers): + # see gh-5766 + data = """\ + 1,2,3 + 4,5,6 + 7,8,9 + 10,11,12""" + parser = 
all_parsers + result = parser.read_csv( + StringIO(data), names=["a", "b"], header=None, usecols=[0, 1] + ) + + expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) - tm.assert_frame_equal(result, expected) + def test_usecols_name_length_conflict(self, all_parsers): + data = """\ + 1,2,3 + 4,5,6 + 7,8,9 + 10,11,12""" + parser = all_parsers + msg = "Number of passed names did not match number of header fields in the file" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1]) -def test_usecols_index_col_conflict2(all_parsers): - # see gh-4201: test that index_col as integer reflects usecols - parser = all_parsers - data = "a,b,c,d\nA,a,1,one\nB,b,2,two" + def test_usecols_single_string(self, all_parsers): + # see gh-20558 + parser = all_parsers + data = """foo, bar, baz + 1000, 2000, 3000 + 4000, 5000, 6000""" - expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")}) - expected = expected.set_index(["b", "c"]) + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols="foo") - result = parser.read_csv( - StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"] + @pytest.mark.parametrize( + "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_usecols_implicit_index_col(all_parsers): - # see gh-2654 - parser = all_parsers - data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" - - result = parser.read_csv(StringIO(data), usecols=["a", "b"]) - expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) - tm.assert_frame_equal(result, expected) + def test_usecols_index_col_false(self, all_parsers, data): + # see gh-9082 + parser = all_parsers + usecols = ["a", "c", "d"] + expected 
= DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]}) + result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -def test_usecols_regex_sep(all_parsers): - # see gh-2733 - parser = all_parsers - data = "a b c\n4 apple bat 5.7\n8 orange cow 10" - result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) + @pytest.mark.parametrize("index_col", ["b", 0]) + @pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]]) + def test_usecols_index_col_conflict(self, all_parsers, usecols, index_col): + # see gh-4201: test that index_col as integer reflects usecols + parser = all_parsers + data = "a,b,c,d\nA,a,1,one\nB,b,2,two" + expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b")) - expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) + tm.assert_frame_equal(result, expected) + def test_usecols_index_col_conflict2(self, all_parsers): + # see gh-4201: test that index_col as integer reflects usecols + parser = all_parsers + data = "a,b,c,d\nA,a,1,one\nB,b,2,two" -@skip_pyarrow -def test_usecols_with_whitespace(all_parsers): - parser = all_parsers - data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")}) + expected = expected.set_index(["b", "c"]) - result = parser.read_csv(StringIO(data), delim_whitespace=True, usecols=("a", "b")) - expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) - tm.assert_frame_equal(result, expected) + result = parser.read_csv( + StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"] + ) + tm.assert_frame_equal(result, expected) + def test_usecols_implicit_index_col(self, all_parsers): + # see gh-2654 + parser = all_parsers + data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" -@skip_pyarrow 
-@pytest.mark.parametrize( - "usecols,expected", - [ - # Column selection by index. - ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), - # Column selection by name. - (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"])), - ], -) -def test_usecols_with_integer_like_header(all_parsers, usecols, expected): - parser = all_parsers - data = """2,0,1 -1000,2000,3000 -4000,5000,6000""" + result = parser.read_csv(StringIO(data), usecols=["a", "b"]) + expected = DataFrame( + {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8] + ) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) + def test_usecols_regex_sep(self, all_parsers): + # see gh-2733 + parser = all_parsers + data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) + expected = DataFrame( + {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8] + ) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -def test_usecols_with_parse_dates(all_parsers, usecols): - # see gh-9755 - data = """a,b,c,d,e -0,1,20140101,0900,4 -0,1,20140102,1000,4""" - parser = all_parsers - parse_dates = [[1, 2]] - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) - tm.assert_frame_equal(result, expected) + def test_usecols_with_whitespace(self, all_parsers): + parser = all_parsers + data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + result = parser.read_csv( + StringIO(data), delim_whitespace=True, usecols=("a", "b") + ) + expected = DataFrame( + {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8] + ) + tm.assert_frame_equal(result, expected) 
-@skip_pyarrow -def test_usecols_with_parse_dates2(all_parsers): - # see gh-13604 - parser = all_parsers - data = """2008-02-07 09:40,1032.43 -2008-02-07 09:50,1042.54 -2008-02-07 10:00,1051.65""" - - names = ["date", "values"] - usecols = names[:] - parse_dates = [0] - - index = Index( + @pytest.mark.parametrize( + "usecols,expected", [ - Timestamp("2008-02-07 09:40"), - Timestamp("2008-02-07 09:50"), - Timestamp("2008-02-07 10:00"), + # Column selection by index. + ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), + # Column selection by name. + ( + ["0", "1"], + DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]), + ), ], - name="date", ) - cols = {"values": [1032.43, 1042.54, 1051.65]} - expected = DataFrame(cols, index=index) - - result = parser.read_csv( - StringIO(data), - parse_dates=parse_dates, - index_col=0, - usecols=usecols, - header=None, - names=names, - ) - tm.assert_frame_equal(result, expected) + def test_usecols_with_integer_like_header(self, all_parsers, usecols, expected): + parser = all_parsers + data = """2,0,1 + 1000,2000,3000 + 4000,5000,6000""" + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -def test_usecols_with_parse_dates3(all_parsers): - # see gh-14792 - parser = all_parsers - data = """a,b,c,d,e,f,g,h,i,j -2016/09/21,1,1,2,3,4,5,6,7,8""" - - usecols = list("abcdefghij") - parse_dates = [0] - - cols = { - "a": Timestamp("2016-09-21"), - "b": [1], - "c": [1], - "d": [2], - "e": [3], - "f": [4], - "g": [5], - "h": [6], - "i": [7], - "j": [8], - } - expected = DataFrame(cols, columns=usecols) - - result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) - tm.assert_frame_equal(result, expected) + def test_empty_usecols(self, all_parsers): + data = "a,b,c\n1,2,3\n4,5,6" + expected = DataFrame() + parser = all_parsers + result = parser.read_csv(StringIO(data), usecols=set()) + tm.assert_frame_equal(result, 
expected) -@skip_pyarrow -def test_usecols_with_parse_dates4(all_parsers): - data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" - usecols = list("abcdefghij") - parse_dates = [[0, 1]] - parser = all_parsers - - cols = { - "a_b": "2016/09/21 1", - "c": [1], - "d": [2], - "e": [3], - "f": [4], - "g": [5], - "h": [6], - "i": [7], - "j": [8], - } - expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) - - result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) - tm.assert_frame_equal(result, expected) + def test_np_array_usecols(self, all_parsers): + # see gh-12546 + parser = all_parsers + data = "a,b,c\n1,2,3" + usecols = np.array(["a", "b"]) + expected = DataFrame([[1, 2]], columns=usecols) + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -@pytest.mark.parametrize( - "names", - [ - list("abcde"), # Names span all columns in original data. - list("acd"), # Names span only the selected columns. 
- ], -) -def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names): - # see gh-9755 - s = """0,1,20140101,0900,4 -0,1,20140102,1000,4""" - parse_dates = [[1, 2]] - parser = all_parsers - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - - result = parser.read_csv( - StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols + @pytest.mark.parametrize( + "usecols,expected", + [ + ( + lambda x: x.upper() in ["AAA", "BBB", "DDD"], + DataFrame( + { + "AaA": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "bBb": {0: 8, 1: 2, 2: 7}, + "ddd": {0: "a", 1: "b", 2: "a"}, + } + ), + ), + (lambda x: False, DataFrame()), + ], ) - tm.assert_frame_equal(result, expected) - - -def test_usecols_with_unicode_strings(all_parsers): - # see gh-13219 - data = """AAA,BBB,CCC,DDD -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers - - exp_data = { - "AAA": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, - "BBB": {0: 8, 1: 2, 2: 7}, - } - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"]) - tm.assert_frame_equal(result, expected) - - -def test_usecols_with_single_byte_unicode_strings(all_parsers): - # see gh-13219 - data = """A,B,C,D -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers - - exp_data = { - "A": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, - "B": {0: 8, 1: 2, 2: 7}, - } - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), usecols=["A", "B"]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]]) -def test_usecols_with_mixed_encoding_strings(all_parsers, usecols): - data = """AAA,BBB,CCC,DDD -0.056674973,8,True,a -2.613230982,2,False,b 
-3.568935038,7,False,a""" - parser = all_parsers - - with pytest.raises(ValueError, match=_msg_validate_usecols_arg): - parser.read_csv(StringIO(data), usecols=usecols) - - -@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]]) -def test_usecols_with_multi_byte_characters(all_parsers, usecols): - data = """あああ,いい,ううう,ええええ -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers - - exp_data = { - "あああ": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, - "いい": {0: 8, 1: 2, 2: 7}, - } - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_empty_usecols(all_parsers): - data = "a,b,c\n1,2,3\n4,5,6" - expected = DataFrame() - parser = all_parsers - - result = parser.read_csv(StringIO(data), usecols=set()) - tm.assert_frame_equal(result, expected) - - -def test_np_array_usecols(all_parsers): - # see gh-12546 - parser = all_parsers - data = "a,b,c\n1,2,3" - usecols = np.array(["a", "b"]) + def test_callable_usecols(self, all_parsers, usecols, expected): + # see gh-14154 + data = """AaA,bBb,CCC,ddd + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a""" + parser = all_parsers + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) - expected = DataFrame([[1, 2]], columns=usecols) - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) + def test_incomplete_first_row(self, all_parsers, usecols): + # see gh-6710 + data = "1,2\n1,2,3" + parser = all_parsers + names = ["a", "b", "c"] + expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]}) + result = parser.read_csv(StringIO(data), names=names, usecols=usecols) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize( - 
"usecols,expected", - [ - ( - lambda x: x.upper() in ["AAA", "BBB", "DDD"], - DataFrame( - { - "AaA": { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002, - }, - "bBb": {0: 8, 1: 2, 2: 7}, - "ddd": {0: "a", 1: "b", 2: "a"}, - } + @pytest.mark.parametrize( + "data,usecols,kwargs,expected", + [ + # see gh-8985 + ( + "19,29,39\n" * 2 + "10,20,30,40", + [0, 1, 2], + {"header": None}, + DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]), ), - ), - (lambda x: False, DataFrame()), - ], -) -def test_callable_usecols(all_parsers, usecols, expected): - # see gh-14154 - data = """AaA,bBb,CCC,ddd -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers + # see gh-9549 + ( + ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"), + ["A", "B", "C"], + {}, + DataFrame( + { + "A": [1, 3, 1, 1, 1, 5], + "B": [2, 4, 2, 2, 2, 6], + "C": [3, 5, 4, 3, 3, 7], + } + ), + ), + ], + ) + def test_uneven_length_cols(self, all_parsers, data, usecols, kwargs, expected): + # see gh-8985 + parser = all_parsers + result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "usecols,kwargs,expected,msg", + [ + ( + ["a", "b", "c", "d"], + {}, + DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), + None, + ), + ( + ["a", "b", "c", "f"], + {}, + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + (["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")), + ( + ["a", "b", "f", "g"], + {}, + None, + _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"), + ), + # see gh-14671 + ( + None, + {"header": 0, "names": ["A", "B", "C", "D"]}, + DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}), + None, + ), + ( + ["A", "B", "C", "f"], + {"header": 0, "names": ["A", "B", "C", "D"]}, + None, + 
_msg_validate_usecols_names.format(r"\['f'\]"), + ), + ( + ["A", "B", "f"], + {"names": ["A", "B", "C", "D"]}, + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + ], + ) + def test_raises_on_usecols_names_mismatch( + self, all_parsers, usecols, kwargs, expected, msg + ): + data = "a,b,c,d\n1,2,3,4\n5,6,7,8" + kwargs.update(usecols=usecols) + parser = all_parsers + + if expected is None: + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) + def test_usecols_subset_names_mismatch_orig_columns(self, all_parsers, usecols): + data = "a,b,c,d\n1,2,3,4\n5,6,7,8" + names = ["A", "B", "C", "D"] + parser = all_parsers + + result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols) + expected = DataFrame({"A": [1, 5], "C": [3, 7]}) + tm.assert_frame_equal(result, expected) @skip_pyarrow -@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) -def test_incomplete_first_row(all_parsers, usecols): - # see gh-6710 - data = "1,2\n1,2,3" - parser = all_parsers - names = ["a", "b", "c"] - expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]}) +class TestUsecolsParseDates: + @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) + def test_usecols_with_parse_dates(self, all_parsers, usecols): + # see gh-9755 + data = """a,b,c,d,e + 0,1,20140101,0900,4 + 0,1,20140102,1000,4""" + parser = all_parsers + parse_dates = [[1, 2]] + + cols = { + "a": [0, 0], + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], + } + expected = DataFrame(cols, columns=["c_d", "a"]) + result = parser.read_csv( + StringIO(data), usecols=usecols, parse_dates=parse_dates + ) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), names=names, usecols=usecols) - tm.assert_frame_equal(result, expected) + 
def test_usecols_with_parse_dates2(self, all_parsers): + # see gh-13604 + parser = all_parsers + data = """2008-02-07 09:40,1032.43 + 2008-02-07 09:50,1042.54 + 2008-02-07 10:00,1051.65""" + + names = ["date", "values"] + usecols = names[:] + parse_dates = [0] + + index = Index( + [ + Timestamp("2008-02-07 09:40"), + Timestamp("2008-02-07 09:50"), + Timestamp("2008-02-07 10:00"), + ], + name="date", + ) + cols = {"values": [1032.43, 1042.54, 1051.65]} + expected = DataFrame(cols, index=index) + + result = parser.read_csv( + StringIO(data), + parse_dates=parse_dates, + index_col=0, + usecols=usecols, + header=None, + names=names, + ) + tm.assert_frame_equal(result, expected) + def test_usecols_with_parse_dates3(self, all_parsers): + # see gh-14792 + parser = all_parsers + data = """a,b,c,d,e,f,g,h,i,j + 2016/09/21,1,1,2,3,4,5,6,7,8""" + + usecols = list("abcdefghij") + parse_dates = [0] + + cols = { + "a": Timestamp("2016-09-21"), + "b": [1], + "c": [1], + "d": [2], + "e": [3], + "f": [4], + "g": [5], + "h": [6], + "i": [7], + "j": [8], + } + expected = DataFrame(cols, columns=usecols) + + result = parser.read_csv( + StringIO(data), usecols=usecols, parse_dates=parse_dates + ) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize( - "data,usecols,kwargs,expected", - [ - # see gh-8985 - ( - "19,29,39\n" * 2 + "10,20,30,40", - [0, 1, 2], - {"header": None}, - DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]), - ), - # see gh-9549 - ( - ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"), - ["A", "B", "C"], - {}, - DataFrame( - { - "A": [1, 3, 1, 1, 1, 5], - "B": [2, 4, 2, 2, 2, 6], - "C": [3, 5, 4, 3, 3, 7], - } - ), - ), - ], -) -def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected): - # see gh-8985 - parser = all_parsers - result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs) - tm.assert_frame_equal(result, expected) + def test_usecols_with_parse_dates4(self, all_parsers): + data = 
"a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" + usecols = list("abcdefghij") + parse_dates = [[0, 1]] + parser = all_parsers + + cols = { + "a_b": "2016/09/21 1", + "c": [1], + "d": [2], + "e": [3], + "f": [4], + "g": [5], + "h": [6], + "i": [7], + "j": [8], + } + expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) + + result = parser.read_csv( + StringIO(data), usecols=usecols, parse_dates=parse_dates + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) + @pytest.mark.parametrize( + "names", + [ + list("abcde"), # Names span all columns in original data. + list("acd"), # Names span only the selected columns. + ], + ) + def test_usecols_with_parse_dates_and_names(self, all_parsers, usecols, names): + # see gh-9755 + s = """0,1,20140101,0900,4 + 0,1,20140102,1000,4""" + parse_dates = [[1, 2]] + parser = all_parsers + + cols = { + "a": [0, 0], + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], + } + expected = DataFrame(cols, columns=["c_d", "a"]) + + result = parser.read_csv( + StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols + ) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize( - "usecols,kwargs,expected,msg", - [ - ( - ["a", "b", "c", "d"], - {}, - DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), - None, - ), - ( - ["a", "b", "c", "f"], - {}, - None, - _msg_validate_usecols_names.format(r"\['f'\]"), - ), - (["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")), - ( - ["a", "b", "f", "g"], - {}, - None, - _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"), - ), - # see gh-14671 - ( - None, - {"header": 0, "names": ["A", "B", "C", "D"]}, - DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}), - None, - ), - ( - ["A", "B", "C", "f"], - {"header": 0, "names": ["A", "B", "C", "D"]}, - None, - _msg_validate_usecols_names.format(r"\['f'\]"), - ), - ( - ["A", "B", 
"f"], - {"names": ["A", "B", "C", "D"]}, - None, - _msg_validate_usecols_names.format(r"\['f'\]"), - ), - ], -) -def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected, msg): - data = "a,b,c,d\n1,2,3,4\n5,6,7,8" - kwargs.update(usecols=usecols) - parser = all_parsers - if expected is None: - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - else: - result = parser.read_csv(StringIO(data), **kwargs) +class TestUsecolsStrings: + def test_usecols_with_unicode_strings(self, all_parsers): + # see gh-13219 + data = """AAA,BBB,CCC,DDD + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "AAA": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "BBB": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"]) tm.assert_frame_equal(result, expected) + def test_usecols_with_single_byte_unicode_strings(self, all_parsers): + # see gh-13219 + data = """A,B,C,D + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "A": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "B": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=["A", "B"]) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) -def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols): - data = "a,b,c,d\n1,2,3,4\n5,6,7,8" - names = ["A", "B", "C", "D"] - parser = all_parsers - - result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols) - expected = DataFrame({"A": [1, 5], "C": [3, 7]}) - tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]]) + def 
test_usecols_with_mixed_encoding_strings(self, all_parsers, usecols): + data = """AAA,BBB,CCC,DDD + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a""" + parser = all_parsers + + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols=usecols) + + @pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]]) + def test_usecols_with_multi_byte_characters(self, all_parsers, usecols): + data = """あああ,いい,ううう,ええええ + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "あああ": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "いい": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) From 0af7291047c5a64418634ab6fe23f2dcd8bc1df8 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 02:05:33 -0500 Subject: [PATCH 69/95] test reorg --- pandas/tests/io/parser/test_skiprows.py | 450 ++++++++++++------------ 1 file changed, 222 insertions(+), 228 deletions(-) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 6d85e01c6fd4a..f043861b36e4a 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -18,248 +18,242 @@ @skip_pyarrow -@pytest.mark.parametrize("skiprows", [list(range(6)), 6]) -def test_skip_rows_bug(all_parsers, skiprows): - # see gh-505 - parser = all_parsers - text = """#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -1/1/2000,1.,2.,3. 
-1/2/2000,4,5,6 -1/3/2000,7,8,9 -""" - result = parser.read_csv( - StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True - ) - index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 - ) - - expected = DataFrame( - np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_deep_skip_rows(all_parsers): - # see gh-4382 - parser = all_parsers - data = "a,b,c\n" + "\n".join( - [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)] - ) - condensed_data = "a,b,c\n" + "\n".join( - [",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]] +class TestParserSkiprows: + @pytest.mark.parametrize("skiprows", [list(range(6)), 6]) + def test_skip_rows_bug(self, all_parsers, skiprows): + # see gh-505 + parser = all_parsers + text = """#foo,a,b,c + #foo,a,b,c + #foo,a,b,c + #foo,a,b,c + #foo,a,b,c + #foo,a,b,c + 1/1/2000,1.,2.,3. + 1/2/2000,4,5,6 + 1/3/2000,7,8,9 + """ + result = parser.read_csv( + StringIO(text), + skiprows=skiprows, + header=None, + index_col=0, + parse_dates=True, + ) + index = Index( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + ) + + expected = DataFrame( + np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index + ) + tm.assert_frame_equal(result, expected) + + def test_deep_skip_rows(self, all_parsers): + # see gh-4382 + parser = all_parsers + data = "a,b,c\n" + "\n".join( + [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)] + ) + condensed_data = "a,b,c\n" + "\n".join( + [ + ",".join([str(i), str(i + 1), str(i + 2)]) + for i in [0, 1, 2, 3, 4, 6, 8, 9] + ] + ) + + result = parser.read_csv(StringIO(data), skiprows=[6, 8]) + condensed_result = parser.read_csv(StringIO(condensed_data)) + tm.assert_frame_equal(result, condensed_result) + + def test_skip_rows_blank(self, all_parsers): + # see gh-9832 + parser = all_parsers + text = 
"""#foo,a,b,c + #foo,a,b,c + + #foo,a,b,c + #foo,a,b,c + + 1/1/2000,1.,2.,3. + 1/2/2000,4,5,6 + 1/3/2000,7,8,9 + """ + data = parser.read_csv( + StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True + ) + index = Index( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + ) + + expected = DataFrame( + np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index + ) + tm.assert_frame_equal(data, expected) + + def test_skip_row_with_quote(self, all_parsers): + # see gh-12775 and gh-10911 + parser = all_parsers + data = """id,text,num_lines + 1,"line '11' line 12",2 + 2,"line '21' line 22",2 + 3,"line '31' line 32",1""" + + exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]] + expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) + + result = parser.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(result, expected) + + def test_skip_rows_skip_all(self, all_parsers): + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + msg = "No columns to parse from file" + + with pytest.raises(EmptyDataError, match=msg): + parser.read_csv(StringIO(data), skiprows=lambda x: True) + + def test_skip_rows_bad_callable(self, all_parsers): + msg = "by zero" + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + + with pytest.raises(ZeroDivisionError, match=msg): + parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0) + + def test_skiprows_infield_quote(self, all_parsers): + # see gh-14459 + parser = all_parsers + data = 'a"\nb"\na\n1' + expected = DataFrame({"a": [1]}) + + result = parser.read_csv(StringIO(data), skiprows=2) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "kwargs,expected", + [ + ({}, DataFrame({"1": [3, 5]})), + ({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})), + ], ) + def test_skip_rows_callable(self, all_parsers, kwargs, expected): + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" - result = parser.read_csv(StringIO(data), 
skiprows=[6, 8]) - condensed_result = parser.read_csv(StringIO(condensed_data)) - tm.assert_frame_equal(result, condensed_result) - - -@skip_pyarrow -def test_skip_rows_blank(all_parsers): - # see gh-9832 - parser = all_parsers - text = """#foo,a,b,c -#foo,a,b,c - -#foo,a,b,c -#foo,a,b,c + result = parser.read_csv( + StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs + ) + tm.assert_frame_equal(result, expected) -1/1/2000,1.,2.,3. -1/2/2000,4,5,6 -1/3/2000,7,8,9 -""" - data = parser.read_csv( - StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True - ) - index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + @pytest.mark.parametrize( + "line_terminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" ) - - expected = DataFrame( - np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index + def test_skiprows_lineterminator(self, all_parsers, line_terminator): + # see gh-9079 + parser = all_parsers + data = "\n".join( + [ + "SMOSMANIA ThetaProbe-ML2X ", + "2007/01/01 01:00 0.2140 U M ", + "2007/01/01 02:00 0.2141 M O ", + "2007/01/01 04:00 0.2142 D M ", + ] + ) + expected = DataFrame( + [ + ["2007/01/01", "01:00", 0.2140, "U", "M"], + ["2007/01/01", "02:00", 0.2141, "M", "O"], + ["2007/01/01", "04:00", 0.2142, "D", "M"], + ], + columns=["date", "time", "var", "flag", "oflag"], + ) + + if parser.engine == "python" and line_terminator == "\r": + pytest.skip("'CR' not respect with the Python parser yet") + + data = data.replace("\n", line_terminator) + result = parser.read_csv( + StringIO(data), + skiprows=1, + delim_whitespace=True, + names=["date", "time", "var", "flag", "oflag"], + ) + tm.assert_frame_equal(result, expected) + + @skip_pyarrow + @pytest.mark.parametrize( + "data,exp_data", + [ + ( + """id,text,num_lines + 1,"line \n'11' line 12",2 + 2,"line \n'21' line 22",2 + 3,"line \n'31' line 32",1""", + [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]], + ), + ( + """id,text,num_lines 
+ 1,"line '11\n' line 12",2 + 2,"line '21\n' line 22",2 + 3,"line '31\n' line 32",1""", + [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]], + ), + ( + """id,text,num_lines + 1,"line '11\n' \r\tline 12",2 + 2,"line '21\n' \r\tline 22",2 + 3,"line '31\n' \r\tline 32",1""", + [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]], + ), + ], ) - tm.assert_frame_equal(data, expected) + def test_skip_row_with_newline_and_quote(self, all_parsers, data, exp_data): + # see gh-12775 and gh-10911 + parser = all_parsers + result = parser.read_csv(StringIO(data), skiprows=[1]) + expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - ( - """id,text,num_lines + @skip_pyarrow + @pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + """id,text,num_lines 1,"line 11 line 12",2 2,"line 21 line 22",2 3,"line 31",1""", - {"skiprows": [1]}, - DataFrame( - [[2, "line 21\nline 22", 2], [3, "line 31", 1]], - columns=["id", "text", "num_lines"], + {"skiprows": [1]}, + DataFrame( + [[2, "line 21\nline 22", 2], [3, "line 31", 1]], + columns=["id", "text", "num_lines"], + ), ), - ), - ( - "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~", - {"quotechar": "~", "skiprows": [2]}, - DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]), - ), - ( ( - "Text,url\n~example\n " - "sentence\n one~,url1\n~" - "example\n sentence\n two~,url2\n~" - "example\n sentence\n three~,url3" + "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~", + {"quotechar": "~", "skiprows": [2]}, + DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]), + ), + ( + ( + "Text,url\n~example\n " + "sentence\n one~,url1\n~" + "example\n sentence\n two~,url2\n~" + "example\n sentence\n three~,url3" + ), + {"quotechar": "~", "skiprows": [1, 3]}, + DataFrame( + [["example\n sentence\n two", "url2"]], columns=["Text", "url"] + ), ), - {"quotechar": 
"~", "skiprows": [1, 3]}, - DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]), - ), - ], -) -def test_skip_row_with_newline(all_parsers, data, kwargs, expected): - # see gh-12775 and gh-10911 - parser = all_parsers - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_skip_row_with_quote(all_parsers): - # see gh-12775 and gh-10911 - parser = all_parsers - data = """id,text,num_lines -1,"line '11' line 12",2 -2,"line '21' line 22",2 -3,"line '31' line 32",1""" - - exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]] - expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) - - result = parser.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "data,exp_data", - [ - ( - """id,text,num_lines -1,"line \n'11' line 12",2 -2,"line \n'21' line 22",2 -3,"line \n'31' line 32",1""", - [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]], - ), - ( - """id,text,num_lines -1,"line '11\n' line 12",2 -2,"line '21\n' line 22",2 -3,"line '31\n' line 32",1""", - [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]], - ), - ( - """id,text,num_lines -1,"line '11\n' \r\tline 12",2 -2,"line '21\n' \r\tline 22",2 -3,"line '31\n' \r\tline 32",1""", - [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]], - ), - ], -) -def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data): - # see gh-12775 and gh-10911 - parser = all_parsers - result = parser.read_csv(StringIO(data), skiprows=[1]) - - expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "line_terminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" -) -def test_skiprows_lineterminator(all_parsers, line_terminator): - # see gh-9079 - parser = all_parsers - data = "\n".join( - [ - 
"SMOSMANIA ThetaProbe-ML2X ", - "2007/01/01 01:00 0.2140 U M ", - "2007/01/01 02:00 0.2141 M O ", - "2007/01/01 04:00 0.2142 D M ", - ] - ) - expected = DataFrame( - [ - ["2007/01/01", "01:00", 0.2140, "U", "M"], - ["2007/01/01", "02:00", 0.2141, "M", "O"], - ["2007/01/01", "04:00", 0.2142, "D", "M"], ], - columns=["date", "time", "var", "flag", "oflag"], - ) - - if parser.engine == "python" and line_terminator == "\r": - pytest.skip("'CR' not respect with the Python parser yet") - - data = data.replace("\n", line_terminator) - result = parser.read_csv( - StringIO(data), - skiprows=1, - delim_whitespace=True, - names=["date", "time", "var", "flag", "oflag"], ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_skiprows_infield_quote(all_parsers): - # see gh-14459 - parser = all_parsers - data = 'a"\nb"\na\n1' - expected = DataFrame({"a": [1]}) - - result = parser.read_csv(StringIO(data), skiprows=2) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "kwargs,expected", - [ - ({}, DataFrame({"1": [3, 5]})), - ({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})), - ], -) -def test_skip_rows_callable(all_parsers, kwargs, expected): - parser = all_parsers - data = "a\n1\n2\n3\n4\n5" - - result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_skip_rows_skip_all(all_parsers): - parser = all_parsers - data = "a\n1\n2\n3\n4\n5" - msg = "No columns to parse from file" - - with pytest.raises(EmptyDataError, match=msg): - parser.read_csv(StringIO(data), skiprows=lambda x: True) - - -@skip_pyarrow -def test_skip_rows_bad_callable(all_parsers): - msg = "by zero" - parser = all_parsers - data = "a\n1\n2\n3\n4\n5" - - with pytest.raises(ZeroDivisionError, match=msg): - parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0) + def test_skip_row_with_newline(self, all_parsers, data, kwargs, expected): + # see gh-12775 
and gh-10911 + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) From 361aab6548ca66826880c37ec6c81f80906c626e Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 02:09:14 -0500 Subject: [PATCH 70/95] test reorg --- pandas/tests/io/parser/test_quoting.py | 256 ++++++++++++------------- 1 file changed, 121 insertions(+), 135 deletions(-) diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index a93dbde24b001..1617160d2089a 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -14,156 +14,142 @@ import pandas._testing as tm skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @skip_pyarrow -@pytest.mark.parametrize( - "kwargs,msg", - [ - ({"quotechar": "foo"}, '"quotechar" must be a(n)? 1-character string'), - ( - {"quotechar": None, "quoting": csv.QUOTE_MINIMAL}, - "quotechar must be set if quoting enabled", - ), - ({"quotechar": 2}, '"quotechar" must be string, not int'), - ], -) -def test_bad_quote_char(all_parsers, kwargs, msg): - data = "1,2,3" - parser = all_parsers - - with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), **kwargs) +class TestParserQuoting: + @pytest.mark.parametrize( + "kwargs,msg", + [ + ({"quotechar": "foo"}, '"quotechar" must be a(n)? 
1-character string'), + ( + {"quotechar": None, "quoting": csv.QUOTE_MINIMAL}, + "quotechar must be set if quoting enabled", + ), + ({"quotechar": 2}, '"quotechar" must be string, not int'), + ], + ) + def test_bad_quote_char(self, all_parsers, kwargs, msg): + data = "1,2,3" + parser = all_parsers + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), **kwargs) -@skip_pyarrow -@pytest.mark.parametrize( - "quoting,msg", - [ - ("foo", '"quoting" must be an integer'), - (5, 'bad "quoting" value'), # quoting must be in the range [0, 3] - ], -) -def test_bad_quoting(all_parsers, quoting, msg): - data = "1,2,3" - parser = all_parsers - - with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), quoting=quoting) - - -def test_quote_char_basic(all_parsers): - parser = all_parsers - data = 'a,b,c\n1,2,"cat"' - expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) - - result = parser.read_csv(StringIO(data), quotechar='"') - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) -def test_quote_char_various(all_parsers, quote_char): - parser = all_parsers - expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) - - data = 'a,b,c\n1,2,"cat"' - new_data = data.replace('"', quote_char) - - result = parser.read_csv(StringIO(new_data), quotechar=quote_char) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) -@pytest.mark.parametrize("quote_char", ["", None]) -def test_null_quote_char(all_parsers, quoting, quote_char): - kwargs = {"quotechar": quote_char, "quoting": quoting} - data = "a,b,c\n1,2,3" - parser = all_parsers - - if quoting != csv.QUOTE_NONE: - # Sanity checking. 
- msg = "quotechar must be set if quoting enabled" + @pytest.mark.parametrize( + "quoting,msg", + [ + ("foo", '"quoting" must be an integer'), + (5, 'bad "quoting" value'), # quoting must be in the range [0, 3] + ], + ) + def test_bad_quoting(self, all_parsers, quoting, msg): + data = "1,2,3" + parser = all_parsers with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - else: - expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) + parser.read_csv(StringIO(data), quoting=quoting) + def test_quote_char_basic(self, all_parsers): + parser = all_parsers + data = 'a,b,c\n1,2,"cat"' + expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) -@skip_pyarrow -@pytest.mark.parametrize( - "kwargs,exp_data", - [ - ({}, [[1, 2, "foo"]]), # Test default. - # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. - ({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]), - # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. - ({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]), - # QUOTE_NONE tells the reader to do no special handling - # of quote characters and leave them alone. 
- ({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]), - # QUOTE_NONNUMERIC tells the reader to cast - # all non-quoted fields to float - ({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]), - ], -) -def test_quoting_various(all_parsers, kwargs, exp_data): - data = '1,2,"foo"' - parser = all_parsers - columns = ["a", "b", "c"] - - result = parser.read_csv(StringIO(data), names=columns, **kwargs) - expected = DataFrame(exp_data, columns=columns) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), quotechar='"') + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) + def test_quote_char_various(self, all_parsers, quote_char): + parser = all_parsers + expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) -@skip_pyarrow -@pytest.mark.parametrize( - "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] -) -def test_double_quote(all_parsers, doublequote, exp_data): - parser = all_parsers - data = 'a,b\n3,"4 "" 5"' + data = 'a,b,c\n1,2,"cat"' + new_data = data.replace('"', quote_char) - result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote) - expected = DataFrame(exp_data, columns=["a", "b"]) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(new_data), quotechar=quote_char) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) + @pytest.mark.parametrize("quote_char", ["", None]) + def test_null_quote_char(self, all_parsers, quoting, quote_char): + kwargs = {"quotechar": quote_char, "quoting": quoting} + data = "a,b,c\n1,2,3" + parser = all_parsers + + if quoting != csv.QUOTE_NONE: + # Sanity checking. 
+ msg = "quotechar must be set if quoting enabled" + + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "kwargs,exp_data", + [ + ({}, [[1, 2, "foo"]]), # Test default. + # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. + ({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]), + # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. + ({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]), + # QUOTE_NONE tells the reader to do no special handling + # of quote characters and leave them alone. + ({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]), + # QUOTE_NONNUMERIC tells the reader to cast + # all non-quoted fields to float + ({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]), + ], + ) + def test_quoting_various(self, all_parsers, kwargs, exp_data): + data = '1,2,"foo"' + parser = all_parsers + columns = ["a", "b", "c"] + + result = parser.read_csv(StringIO(data), names=columns, **kwargs) + expected = DataFrame(exp_data, columns=columns) + tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("quotechar", ['"', "\u0001"]) -def test_quotechar_unicode(all_parsers, quotechar): - # see gh-14477 - data = "a\n1" - parser = all_parsers - expected = DataFrame({"a": [1]}) + @pytest.mark.parametrize( + "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] + ) + def test_double_quote(self, all_parsers, doublequote, exp_data): + parser = all_parsers + data = 'a,b\n3,"4 "" 5"' - result = parser.read_csv(StringIO(data), quotechar=quotechar) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote) + expected = DataFrame(exp_data, columns=["a", "b"]) + 
tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("quotechar", ['"', "\u0001"]) + def test_quotechar_unicode(self, all_parsers, quotechar): + # see gh-14477 + data = "a\n1" + parser = all_parsers + expected = DataFrame({"a": [1]}) -@skip_pyarrow -@pytest.mark.parametrize("balanced", [True, False]) -def test_unbalanced_quoting(all_parsers, balanced): - # see gh-22789. - parser = all_parsers - data = 'a,b,c\n1,2,"3' - - if balanced: - # Re-balance the quoting and read in without errors. - expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) - result = parser.read_csv(StringIO(data + '"')) + result = parser.read_csv(StringIO(data), quotechar=quotechar) tm.assert_frame_equal(result, expected) - else: - msg = ( - "EOF inside string starting at row 1" - if parser.engine == "c" - else "unexpected end of data" - ) - - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data)) + + @pytest.mark.parametrize("balanced", [True, False]) + def test_unbalanced_quoting(self, all_parsers, balanced): + # see gh-22789. + parser = all_parsers + data = 'a,b,c\n1,2,"3' + + if balanced: + # Re-balance the quoting and read in without errors. 
+ expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + result = parser.read_csv(StringIO(data + '"')) + tm.assert_frame_equal(result, expected) + else: + msg = ( + "EOF inside string starting at row 1" + if parser.engine == "c" + else "unexpected end of data" + ) + + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data)) From 75de071b6699cec5d3ef484a42889e33e47a34db Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 02:12:08 -0500 Subject: [PATCH 71/95] test reorg --- pandas/tests/io/parser/test_quoting.py | 256 +++++++++++++------------ 1 file changed, 131 insertions(+), 125 deletions(-) diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index 1617160d2089a..6995965467d05 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -13,143 +13,149 @@ from pandas import DataFrame import pandas._testing as tm -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") - - -@skip_pyarrow -class TestParserQuoting: - @pytest.mark.parametrize( - "kwargs,msg", - [ - ({"quotechar": "foo"}, '"quotechar" must be a(n)? 1-character string'), - ( - {"quotechar": None, "quoting": csv.QUOTE_MINIMAL}, - "quotechar must be set if quoting enabled", - ), - ({"quotechar": 2}, '"quotechar" must be string, not int'), - ], - ) - def test_bad_quote_char(self, all_parsers, kwargs, msg): - data = "1,2,3" - parser = all_parsers +pytestmark = pytest.mark.usefixtures("pyarrow_skip") - with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - @pytest.mark.parametrize( - "quoting,msg", - [ - ("foo", '"quoting" must be an integer'), - (5, 'bad "quoting" value'), # quoting must be in the range [0, 3] - ], - ) - def test_bad_quoting(self, all_parsers, quoting, msg): - data = "1,2,3" - parser = all_parsers +@pytest.mark.parametrize( + "kwargs,msg", + [ + ({"quotechar": "foo"}, '"quotechar" must be a(n)? 
1-character string'), + ( + {"quotechar": None, "quoting": csv.QUOTE_MINIMAL}, + "quotechar must be set if quoting enabled", + ), + ({"quotechar": 2}, '"quotechar" must be string, not int'), + ], +) +def test_bad_quote_char(all_parsers, kwargs, msg): + data = "1,2,3" + parser = all_parsers - with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), quoting=quoting) + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), **kwargs) - def test_quote_char_basic(self, all_parsers): - parser = all_parsers - data = 'a,b,c\n1,2,"cat"' - expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) - result = parser.read_csv(StringIO(data), quotechar='"') - tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "quoting,msg", + [ + ("foo", '"quoting" must be an integer'), + (5, 'bad "quoting" value'), # quoting must be in the range [0, 3] + ], +) +def test_bad_quoting(all_parsers, quoting, msg): + data = "1,2,3" + parser = all_parsers - @pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) - def test_quote_char_various(self, all_parsers, quote_char): - parser = all_parsers - expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), quoting=quoting) - data = 'a,b,c\n1,2,"cat"' - new_data = data.replace('"', quote_char) - result = parser.read_csv(StringIO(new_data), quotechar=quote_char) - tm.assert_frame_equal(result, expected) +def test_quote_char_basic(all_parsers): + parser = all_parsers + data = 'a,b,c\n1,2,"cat"' + expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) - @pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) - @pytest.mark.parametrize("quote_char", ["", None]) - def test_null_quote_char(self, all_parsers, quoting, quote_char): - kwargs = {"quotechar": quote_char, "quoting": quoting} - data = "a,b,c\n1,2,3" - parser = all_parsers - - if quoting != csv.QUOTE_NONE: - # Sanity 
checking. - msg = "quotechar must be set if quoting enabled" - - with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - else: - expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "kwargs,exp_data", - [ - ({}, [[1, 2, "foo"]]), # Test default. - # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. - ({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]), - # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. - ({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]), - # QUOTE_NONE tells the reader to do no special handling - # of quote characters and leave them alone. - ({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]), - # QUOTE_NONNUMERIC tells the reader to cast - # all non-quoted fields to float - ({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]), - ], - ) - def test_quoting_various(self, all_parsers, kwargs, exp_data): - data = '1,2,"foo"' - parser = all_parsers - columns = ["a", "b", "c"] - - result = parser.read_csv(StringIO(data), names=columns, **kwargs) - expected = DataFrame(exp_data, columns=columns) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), quotechar='"') + tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] - ) - def test_double_quote(self, all_parsers, doublequote, exp_data): - parser = all_parsers - data = 'a,b\n3,"4 "" 5"' - result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote) - expected = DataFrame(exp_data, columns=["a", "b"]) - tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) +def test_quote_char_various(all_parsers, quote_char): + parser = all_parsers + expected 
= DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) + + data = 'a,b,c\n1,2,"cat"' + new_data = data.replace('"', quote_char) - @pytest.mark.parametrize("quotechar", ['"', "\u0001"]) - def test_quotechar_unicode(self, all_parsers, quotechar): - # see gh-14477 - data = "a\n1" - parser = all_parsers - expected = DataFrame({"a": [1]}) + result = parser.read_csv(StringIO(new_data), quotechar=quote_char) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), quotechar=quotechar) + +@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) +@pytest.mark.parametrize("quote_char", ["", None]) +def test_null_quote_char(all_parsers, quoting, quote_char): + kwargs = {"quotechar": quote_char, "quoting": quoting} + data = "a,b,c\n1,2,3" + parser = all_parsers + + if quoting != csv.QUOTE_NONE: + # Sanity checking. + msg = "quotechar must be set if quoting enabled" + + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + result = parser.read_csv(StringIO(data), **kwargs) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("balanced", [True, False]) - def test_unbalanced_quoting(self, all_parsers, balanced): - # see gh-22789. - parser = all_parsers - data = 'a,b,c\n1,2,"3' - - if balanced: - # Re-balance the quoting and read in without errors. - expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) - result = parser.read_csv(StringIO(data + '"')) - tm.assert_frame_equal(result, expected) - else: - msg = ( - "EOF inside string starting at row 1" - if parser.engine == "c" - else "unexpected end of data" - ) - - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data)) + +@pytest.mark.parametrize( + "kwargs,exp_data", + [ + ({}, [[1, 2, "foo"]]), # Test default. + # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. 
+ ({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]), + # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. + ({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]), + # QUOTE_NONE tells the reader to do no special handling + # of quote characters and leave them alone. + ({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]), + # QUOTE_NONNUMERIC tells the reader to cast + # all non-quoted fields to float + ({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]), + ], +) +def test_quoting_various(all_parsers, kwargs, exp_data): + data = '1,2,"foo"' + parser = all_parsers + columns = ["a", "b", "c"] + + result = parser.read_csv(StringIO(data), names=columns, **kwargs) + expected = DataFrame(exp_data, columns=columns) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] +) +def test_double_quote(all_parsers, doublequote, exp_data): + parser = all_parsers + data = 'a,b\n3,"4 "" 5"' + + result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote) + expected = DataFrame(exp_data, columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("quotechar", ['"', "\u0001"]) +def test_quotechar_unicode(all_parsers, quotechar): + # see gh-14477 + data = "a\n1" + parser = all_parsers + expected = DataFrame({"a": [1]}) + + result = parser.read_csv(StringIO(data), quotechar=quotechar) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("balanced", [True, False]) +def test_unbalanced_quoting(all_parsers, balanced): + # see gh-22789. + parser = all_parsers + data = 'a,b,c\n1,2,"3' + + if balanced: + # Re-balance the quoting and read in without errors. 
+ expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + result = parser.read_csv(StringIO(data + '"')) + tm.assert_frame_equal(result, expected) + else: + msg = ( + "EOF inside string starting at row 1" + if parser.engine == "c" + else "unexpected end of data" + ) + + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data)) From a1dfcb2d5ace8099cee4faed8e930bc6d8828759 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 02:14:03 -0500 Subject: [PATCH 72/95] test reorg --- pandas/tests/io/parser/test_skiprows.py | 447 ++++++++++++------------ 1 file changed, 221 insertions(+), 226 deletions(-) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index f043861b36e4a..ffd4f3aecb5d0 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -14,246 +14,241 @@ from pandas import DataFrame, Index import pandas._testing as tm -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") - - -@skip_pyarrow -class TestParserSkiprows: - @pytest.mark.parametrize("skiprows", [list(range(6)), 6]) - def test_skip_rows_bug(self, all_parsers, skiprows): - # see gh-505 - parser = all_parsers - text = """#foo,a,b,c - #foo,a,b,c - #foo,a,b,c - #foo,a,b,c - #foo,a,b,c - #foo,a,b,c - 1/1/2000,1.,2.,3. 
- 1/2/2000,4,5,6 - 1/3/2000,7,8,9 - """ - result = parser.read_csv( - StringIO(text), - skiprows=skiprows, - header=None, - index_col=0, - parse_dates=True, - ) - index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 - ) - - expected = DataFrame( - np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index - ) - tm.assert_frame_equal(result, expected) - - def test_deep_skip_rows(self, all_parsers): - # see gh-4382 - parser = all_parsers - data = "a,b,c\n" + "\n".join( - [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)] - ) - condensed_data = "a,b,c\n" + "\n".join( - [ - ",".join([str(i), str(i + 1), str(i + 2)]) - for i in [0, 1, 2, 3, 4, 6, 8, 9] - ] - ) - - result = parser.read_csv(StringIO(data), skiprows=[6, 8]) - condensed_result = parser.read_csv(StringIO(condensed_data)) - tm.assert_frame_equal(result, condensed_result) - - def test_skip_rows_blank(self, all_parsers): - # see gh-9832 - parser = all_parsers - text = """#foo,a,b,c - #foo,a,b,c - - #foo,a,b,c - #foo,a,b,c - - 1/1/2000,1.,2.,3. 
- 1/2/2000,4,5,6 - 1/3/2000,7,8,9 - """ - data = parser.read_csv( - StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True - ) - index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 - ) - - expected = DataFrame( - np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index - ) - tm.assert_frame_equal(data, expected) - - def test_skip_row_with_quote(self, all_parsers): - # see gh-12775 and gh-10911 - parser = all_parsers - data = """id,text,num_lines - 1,"line '11' line 12",2 - 2,"line '21' line 22",2 - 3,"line '31' line 32",1""" - - exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]] - expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) - - result = parser.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(result, expected) - - def test_skip_rows_skip_all(self, all_parsers): - parser = all_parsers - data = "a\n1\n2\n3\n4\n5" - msg = "No columns to parse from file" - - with pytest.raises(EmptyDataError, match=msg): - parser.read_csv(StringIO(data), skiprows=lambda x: True) - - def test_skip_rows_bad_callable(self, all_parsers): - msg = "by zero" - parser = all_parsers - data = "a\n1\n2\n3\n4\n5" - - with pytest.raises(ZeroDivisionError, match=msg): - parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0) - - def test_skiprows_infield_quote(self, all_parsers): - # see gh-14459 - parser = all_parsers - data = 'a"\nb"\na\n1' - expected = DataFrame({"a": [1]}) - - result = parser.read_csv(StringIO(data), skiprows=2) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "kwargs,expected", - [ - ({}, DataFrame({"1": [3, 5]})), - ({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})), - ], +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + + +@pytest.mark.parametrize("skiprows", [list(range(6)), 6]) +def test_skip_rows_bug(all_parsers, skiprows): + # see gh-505 + parser = all_parsers + text = """#foo,a,b,c +#foo,a,b,c +#foo,a,b,c 
+#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +1/1/2000,1.,2.,3. +1/2/2000,4,5,6 +1/3/2000,7,8,9 +""" + result = parser.read_csv( + StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True + ) + index = Index( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + ) + + expected = DataFrame( + np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index ) - def test_skip_rows_callable(self, all_parsers, kwargs, expected): - parser = all_parsers - data = "a\n1\n2\n3\n4\n5" + tm.assert_frame_equal(result, expected) - result = parser.read_csv( - StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs - ) - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "line_terminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" +def test_deep_skip_rows(all_parsers): + # see gh-4382 + parser = all_parsers + data = "a,b,c\n" + "\n".join( + [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)] ) - def test_skiprows_lineterminator(self, all_parsers, line_terminator): - # see gh-9079 - parser = all_parsers - data = "\n".join( - [ - "SMOSMANIA ThetaProbe-ML2X ", - "2007/01/01 01:00 0.2140 U M ", - "2007/01/01 02:00 0.2141 M O ", - "2007/01/01 04:00 0.2142 D M ", - ] - ) - expected = DataFrame( - [ - ["2007/01/01", "01:00", 0.2140, "U", "M"], - ["2007/01/01", "02:00", 0.2141, "M", "O"], - ["2007/01/01", "04:00", 0.2142, "D", "M"], - ], - columns=["date", "time", "var", "flag", "oflag"], - ) - - if parser.engine == "python" and line_terminator == "\r": - pytest.skip("'CR' not respect with the Python parser yet") - - data = data.replace("\n", line_terminator) - result = parser.read_csv( - StringIO(data), - skiprows=1, - delim_whitespace=True, - names=["date", "time", "var", "flag", "oflag"], - ) - tm.assert_frame_equal(result, expected) - - @skip_pyarrow - @pytest.mark.parametrize( - "data,exp_data", - [ - ( - """id,text,num_lines - 1,"line \n'11' line 12",2 - 2,"line \n'21' line 22",2 - 3,"line \n'31' line 
32",1""", - [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]], - ), - ( - """id,text,num_lines - 1,"line '11\n' line 12",2 - 2,"line '21\n' line 22",2 - 3,"line '31\n' line 32",1""", - [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]], - ), - ( - """id,text,num_lines - 1,"line '11\n' \r\tline 12",2 - 2,"line '21\n' \r\tline 22",2 - 3,"line '31\n' \r\tline 32",1""", - [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]], - ), - ], + condensed_data = "a,b,c\n" + "\n".join( + [",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]] ) - def test_skip_row_with_newline_and_quote(self, all_parsers, data, exp_data): - # see gh-12775 and gh-10911 - parser = all_parsers - result = parser.read_csv(StringIO(data), skiprows=[1]) - expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), skiprows=[6, 8]) + condensed_result = parser.read_csv(StringIO(condensed_data)) + tm.assert_frame_equal(result, condensed_result) - @skip_pyarrow - @pytest.mark.parametrize( - "data,kwargs,expected", - [ - ( - """id,text,num_lines + +def test_skip_rows_blank(all_parsers): + # see gh-9832 + parser = all_parsers + text = """#foo,a,b,c +#foo,a,b,c + +#foo,a,b,c +#foo,a,b,c + +1/1/2000,1.,2.,3. 
+1/2/2000,4,5,6 +1/3/2000,7,8,9 +""" + data = parser.read_csv( + StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True + ) + index = Index( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + ) + + expected = DataFrame( + np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index + ) + tm.assert_frame_equal(data, expected) + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + """id,text,num_lines 1,"line 11 line 12",2 2,"line 21 line 22",2 3,"line 31",1""", - {"skiprows": [1]}, - DataFrame( - [[2, "line 21\nline 22", 2], [3, "line 31", 1]], - columns=["id", "text", "num_lines"], - ), + {"skiprows": [1]}, + DataFrame( + [[2, "line 21\nline 22", 2], [3, "line 31", 1]], + columns=["id", "text", "num_lines"], ), + ), + ( + "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~", + {"quotechar": "~", "skiprows": [2]}, + DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]), + ), + ( ( - "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~", - {"quotechar": "~", "skiprows": [2]}, - DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]), - ), - ( - ( - "Text,url\n~example\n " - "sentence\n one~,url1\n~" - "example\n sentence\n two~,url2\n~" - "example\n sentence\n three~,url3" - ), - {"quotechar": "~", "skiprows": [1, 3]}, - DataFrame( - [["example\n sentence\n two", "url2"]], columns=["Text", "url"] - ), + "Text,url\n~example\n " + "sentence\n one~,url1\n~" + "example\n sentence\n two~,url2\n~" + "example\n sentence\n three~,url3" ), + {"quotechar": "~", "skiprows": [1, 3]}, + DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]), + ), + ], +) +def test_skip_row_with_newline(all_parsers, data, kwargs, expected): + # see gh-12775 and gh-10911 + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_skip_row_with_quote(all_parsers): + # see gh-12775 and gh-10911 + parser = all_parsers + data = 
"""id,text,num_lines +1,"line '11' line 12",2 +2,"line '21' line 22",2 +3,"line '31' line 32",1""" + + exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]] + expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) + + result = parser.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,exp_data", + [ + ( + """id,text,num_lines +1,"line \n'11' line 12",2 +2,"line \n'21' line 22",2 +3,"line \n'31' line 32",1""", + [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]], + ), + ( + """id,text,num_lines +1,"line '11\n' line 12",2 +2,"line '21\n' line 22",2 +3,"line '31\n' line 32",1""", + [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]], + ), + ( + """id,text,num_lines +1,"line '11\n' \r\tline 12",2 +2,"line '21\n' \r\tline 22",2 +3,"line '31\n' \r\tline 32",1""", + [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]], + ), + ], +) +def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data): + # see gh-12775 and gh-10911 + parser = all_parsers + result = parser.read_csv(StringIO(data), skiprows=[1]) + + expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "line_terminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" +) +def test_skiprows_lineterminator(all_parsers, line_terminator): + # see gh-9079 + parser = all_parsers + data = "\n".join( + [ + "SMOSMANIA ThetaProbe-ML2X ", + "2007/01/01 01:00 0.2140 U M ", + "2007/01/01 02:00 0.2141 M O ", + "2007/01/01 04:00 0.2142 D M ", + ] + ) + expected = DataFrame( + [ + ["2007/01/01", "01:00", 0.2140, "U", "M"], + ["2007/01/01", "02:00", 0.2141, "M", "O"], + ["2007/01/01", "04:00", 0.2142, "D", "M"], ], + columns=["date", "time", "var", "flag", "oflag"], + ) + + if parser.engine == "python" and line_terminator == "\r": + pytest.skip("'CR' not respect with the Python parser yet") + + data = 
data.replace("\n", line_terminator) + result = parser.read_csv( + StringIO(data), + skiprows=1, + delim_whitespace=True, + names=["date", "time", "var", "flag", "oflag"], ) - def test_skip_row_with_newline(self, all_parsers, data, kwargs, expected): - # see gh-12775 and gh-10911 - parser = all_parsers - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) + + +def test_skiprows_infield_quote(all_parsers): + # see gh-14459 + parser = all_parsers + data = 'a"\nb"\na\n1' + expected = DataFrame({"a": [1]}) + + result = parser.read_csv(StringIO(data), skiprows=2) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs,expected", + [ + ({}, DataFrame({"1": [3, 5]})), + ({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})), + ], +) +def test_skip_rows_callable(all_parsers, kwargs, expected): + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + + result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_skip_rows_skip_all(all_parsers): + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + msg = "No columns to parse from file" + + with pytest.raises(EmptyDataError, match=msg): + parser.read_csv(StringIO(data), skiprows=lambda x: True) + + +def test_skip_rows_bad_callable(all_parsers): + msg = "by zero" + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + + with pytest.raises(ZeroDivisionError, match=msg): + parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0) From 24331709f3d77b365a466e47b08ebe6ef626e657 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 02:36:11 -0500 Subject: [PATCH 73/95] test reorg --- pandas/tests/io/parser/test_dtypes.py | 686 +++++++++++++------------- 1 file changed, 330 insertions(+), 356 deletions(-) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 4ef609cb87980..739c49cb87b3f 100644 
--- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -20,461 +20,435 @@ @skip_pyarrow -@pytest.mark.parametrize("dtype", [str, object]) -@pytest.mark.parametrize("check_orig", [True, False]) -def test_dtype_all_columns(all_parsers, dtype, check_orig): - # see gh-3795, gh-6607 - parser = all_parsers - - df = DataFrame( - np.random.rand(5, 2).round(4), - columns=list("AB"), - index=["1A", "1B", "1C", "1D", "1E"], - ) - - with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: - df.to_csv(path) - - result = parser.read_csv(path, dtype=dtype, index_col=0) - - if check_orig: - expected = df.copy() - result = result.astype(float) - else: - expected = df.astype(str) - - tm.assert_frame_equal(result, expected) +class TestParserDtypesBasic: + @pytest.mark.parametrize("dtype", [str, object]) + @pytest.mark.parametrize("check_orig", [True, False]) + def test_dtype_all_columns(self, all_parsers, dtype, check_orig): + # see gh-3795, gh-6607 + parser = all_parsers + + df = DataFrame( + np.random.rand(5, 2).round(4), + columns=list("AB"), + index=["1A", "1B", "1C", "1D", "1E"], + ) + with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: + df.to_csv(path) -@skip_pyarrow -def test_dtype_all_columns_empty(all_parsers): - # see gh-12048 - parser = all_parsers - result = parser.read_csv(StringIO("A,B"), dtype=str) + result = parser.read_csv(path, dtype=dtype, index_col=0) - expected = DataFrame({"A": [], "B": []}, index=[], dtype=str) - tm.assert_frame_equal(result, expected) + if check_orig: + expected = df.copy() + result = result.astype(float) + else: + expected = df.astype(str) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -def test_dtype_per_column(all_parsers): - parser = all_parsers - data = """\ + def test_dtype_per_column(self, all_parsers): + parser = all_parsers + data = """\ one,two 1,2.5 2,3.5 3,4.5 4,5.5""" - expected = DataFrame( - [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] - ) 
- expected["one"] = expected["one"].astype(np.float64) - expected["two"] = expected["two"].astype(object) - - result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) - tm.assert_frame_equal(result, expected) + expected = DataFrame( + [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] + ) + expected["one"] = expected["one"].astype(np.float64) + expected["two"] = expected["two"].astype(object) + result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -def test_invalid_dtype_per_column(all_parsers): - parser = all_parsers - data = """\ + def test_invalid_dtype_per_column(self, all_parsers): + parser = all_parsers + data = """\ one,two 1,2.5 2,3.5 3,4.5 4,5.5""" - with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): - parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) + with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): + parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) @skip_pyarrow -@pytest.mark.parametrize( - "dtype", - [ - "category", - CategoricalDtype(), - {"a": "category", "b": "category", "c": CategoricalDtype()}, - ], -) -def test_categorical_dtype(all_parsers, dtype): - # see gh-10153 - parser = all_parsers - data = """a,b,c +class TestParserDtypesCategorical1: + @pytest.mark.parametrize( + "dtype", + [ + "category", + CategoricalDtype(), + {"a": "category", "b": "category", "c": CategoricalDtype()}, + ], + ) + def test_categorical_dtype(self, all_parsers, dtype): + # see gh-10153 + parser = all_parsers + data = """a,b,c 1,a,3.4 1,a,3.4 2,b,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["a", "a", "b"]), - "c": Categorical(["3.4", "3.4", "4.5"]), - } - ) - actual = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(actual, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("dtype", [{"b": 
"category"}, {1: "category"}]) -def test_categorical_dtype_single(all_parsers, dtype): - # see gh-10153 - parser = all_parsers - data = """a,b,c + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["a", "a", "b"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(actual, expected) + + @pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) + def test_categorical_dtype_single(self, all_parsers, dtype): + # see gh-10153 + parser = all_parsers + data = """a,b,c 1,a,3.4 1,a,3.4 2,b,4.5""" - expected = DataFrame( - {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} - ) - actual = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(actual, expected) - + expected = DataFrame( + {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} + ) + actual = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(actual, expected) -@skip_pyarrow -def test_categorical_dtype_unsorted(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b,c + def test_categorical_dtype_unsorted(self, all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b,c 1,b,3.4 1,b,3.4 2,a,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["b", "b", "a"]), - "c": Categorical(["3.4", "3.4", "4.5"]), - } - ) - actual = parser.read_csv(StringIO(data), dtype="category") - tm.assert_frame_equal(actual, expected) - + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", "b", "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype="category") + tm.assert_frame_equal(actual, expected) -@skip_pyarrow -def test_categorical_dtype_missing(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b,c + def test_categorical_dtype_missing(self, all_parsers): + # see 
gh-10153 + parser = all_parsers + data = """a,b,c 1,b,3.4 1,nan,3.4 2,a,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["b", np.nan, "a"]), - "c": Categorical(["3.4", "3.4", "4.5"]), - } - ) - actual = parser.read_csv(StringIO(data), dtype="category") - tm.assert_frame_equal(actual, expected) - - -@skip_pyarrow -@pytest.mark.slow -def test_categorical_dtype_high_cardinality_numeric(all_parsers): - # see gh-18186 - parser = all_parsers - data = np.sort([str(i) for i in range(524289)]) - expected = DataFrame({"a": Categorical(data, ordered=True)}) - - actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") - actual["a"] = actual["a"].cat.reorder_categories( - np.sort(actual.a.cat.categories), ordered=True - ) - tm.assert_frame_equal(actual, expected) - - -def test_categorical_dtype_latin1(all_parsers, csv_dir_path): - # see gh-10153 - pth = os.path.join(csv_dir_path, "unicode_series.csv") - parser = all_parsers - encoding = "latin-1" - - expected = parser.read_csv(pth, header=None, encoding=encoding) - expected[1] = Categorical(expected[1]) - - actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"}) - tm.assert_frame_equal(actual, expected) - - -@skip_pyarrow -def test_categorical_dtype_utf16(all_parsers, csv_dir_path): - # see gh-10153 - pth = os.path.join(csv_dir_path, "utf16_ex.txt") - parser = all_parsers - encoding = "utf-16" - sep = "\t" + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", np.nan, "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype="category") + tm.assert_frame_equal(actual, expected) + + @pytest.mark.slow + def test_categorical_dtype_high_cardinality_numeric(self, all_parsers): + # see gh-18186 + parser = all_parsers + data = np.sort([str(i) for i in range(524289)]) + expected = DataFrame({"a": Categorical(data, ordered=True)}) + + actual = 
parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") + actual["a"] = actual["a"].cat.reorder_categories( + np.sort(actual.a.cat.categories), ordered=True + ) + tm.assert_frame_equal(actual, expected) - expected = parser.read_csv(pth, sep=sep, encoding=encoding) - expected = expected.apply(Categorical) + def test_categorical_dtype_utf16(self, all_parsers, csv_dir_path): + # see gh-10153 + pth = os.path.join(csv_dir_path, "utf16_ex.txt") + parser = all_parsers + encoding = "utf-16" + sep = "\t" - actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category") - tm.assert_frame_equal(actual, expected) + expected = parser.read_csv(pth, sep=sep, encoding=encoding) + expected = expected.apply(Categorical) + actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category") + tm.assert_frame_equal(actual, expected) -@skip_pyarrow -def test_categorical_dtype_chunksize_infer_categories(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b + def test_categorical_dtype_chunksize_infer_categories(self, all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b 1,a 1,b 1,b 2,c""" - expecteds = [ - DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), - DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), - ] - with parser.read_csv( - StringIO(data), dtype={"b": "category"}, chunksize=2 - ) as actuals: - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - - -@skip_pyarrow -def test_categorical_dtype_chunksize_explicit_categories(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b + expecteds = [ + DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), + DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), + ] + with parser.read_csv( + StringIO(data), dtype={"b": "category"}, chunksize=2 + ) as actuals: + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + def 
test_categorical_dtype_chunksize_explicit_categories(self, all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b 1,a 1,b 1,b 2,c""" - cats = ["a", "b", "c"] - expecteds = [ - DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}), - DataFrame( - {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, index=[2, 3] - ), - ] - dtype = CategoricalDtype(cats) - with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals: - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - + cats = ["a", "b", "c"] + expecteds = [ + DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}), + DataFrame( + {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, + index=[2, 3], + ), + ] + dtype = CategoricalDtype(cats) + with parser.read_csv( + StringIO(data), dtype={"b": dtype}, chunksize=2 + ) as actuals: + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + +class TestParserDtypesCategorical2: + def test_categorical_dtype_latin1(self, all_parsers, csv_dir_path): + # see gh-10153 + pth = os.path.join(csv_dir_path, "unicode_series.csv") + parser = all_parsers + encoding = "latin-1" + + expected = parser.read_csv(pth, header=None, encoding=encoding) + expected[1] = Categorical(expected[1]) + + actual = parser.read_csv( + pth, header=None, encoding=encoding, dtype={1: "category"} + ) + tm.assert_frame_equal(actual, expected) -@pytest.mark.parametrize("ordered", [False, True]) -@pytest.mark.parametrize( - "categories", - [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]], -) -def test_categorical_category_dtype(all_parsers, categories, ordered): - parser = all_parsers - data = """a,b + @pytest.mark.parametrize("ordered", [False, True]) + @pytest.mark.parametrize( + "categories", + [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]], + ) + def test_categorical_category_dtype(self, 
all_parsers, categories, ordered): + parser = all_parsers + data = """a,b 1,a 1,b 1,b 2,c""" - expected = DataFrame( - { - "a": [1, 1, 1, 2], - "b": Categorical( - ["a", "b", "b", "c"], categories=categories, ordered=ordered - ), - } - ) - - dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)} - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical( + ["a", "b", "b", "c"], categories=categories, ordered=ordered + ), + } + ) + dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)} + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) -def test_categorical_category_dtype_unsorted(all_parsers): - parser = all_parsers - data = """a,b + def test_categorical_category_dtype_unsorted(self, all_parsers): + parser = all_parsers + data = """a,b 1,a 1,b 1,b 2,c""" - dtype = CategoricalDtype(["c", "b", "a"]) - expected = DataFrame( - { - "a": [1, 1, 1, 2], - "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]), - } - ) - - result = parser.read_csv(StringIO(data), dtype={"b": dtype}) - tm.assert_frame_equal(result, expected) - - -def test_categorical_coerces_numeric(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype([1, 2, 3])} - - data = "b\n1\n1\n2\n3" - expected = DataFrame({"b": Categorical([1, 1, 2, 3])}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -def test_categorical_coerces_datetime(all_parsers): - parser = all_parsers - dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None) - dtype = {"b": CategoricalDtype(dti)} - - data = "b\n2017-01-01\n2018-01-01\n2019-01-01" - expected = DataFrame({"b": Categorical(dtype["b"].categories)}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - + dtype = CategoricalDtype(["c", "b", 
"a"]) + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]), + } + ) -def test_categorical_coerces_timestamp(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype([Timestamp("2014")])} + result = parser.read_csv(StringIO(data), dtype={"b": dtype}) + tm.assert_frame_equal(result, expected) - data = "b\n2014-01-01\n2014-01-01T00:00:00" - expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) + def test_categorical_coerces_numeric(self, all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype([1, 2, 3])} - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) + data = "b\n1\n1\n2\n3" + expected = DataFrame({"b": Categorical([1, 1, 2, 3])}) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -def test_categorical_coerces_timedelta(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} + def test_categorical_coerces_datetime(self, all_parsers): + parser = all_parsers + dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None) + dtype = {"b": CategoricalDtype(dti)} - data = "b\n1H\n2H\n3H" - expected = DataFrame({"b": Categorical(dtype["b"].categories)}) + data = "b\n2017-01-01\n2018-01-01\n2019-01-01" + expected = DataFrame({"b": Categorical(dtype["b"].categories)}) - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + def test_categorical_coerces_timestamp(self, all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype([Timestamp("2014")])} -@pytest.mark.parametrize( - "data", - [ - "b\nTrue\nFalse\nNA\nFalse", - "b\ntrue\nfalse\nNA\nfalse", - "b\nTRUE\nFALSE\nNA\nFALSE", - "b\nTrue\nFalse\nNA\nFALSE", - ], -) -def 
test_categorical_dtype_coerces_boolean(all_parsers, data): - # see gh-20498 - parser = all_parsers - dtype = {"b": CategoricalDtype([False, True])} - expected = DataFrame({"b": Categorical([True, False, None, False])}) + data = "b\n2014-01-01\n2014-01-01T00:00:00" + expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + @skip_pyarrow + def test_categorical_coerces_timedelta(self, all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} -def test_categorical_unexpected_categories(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} + data = "b\n1H\n2H\n3H" + expected = DataFrame({"b": Categorical(dtype["b"].categories)}) - data = "b\nd\na\nc\nd" # Unexpected c - expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])}) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "data", + [ + "b\nTrue\nFalse\nNA\nFalse", + "b\ntrue\nfalse\nNA\nfalse", + "b\nTRUE\nFALSE\nNA\nFALSE", + "b\nTrue\nFalse\nNA\nFALSE", + ], + ) + def test_categorical_dtype_coerces_boolean(self, all_parsers, data): + # see gh-20498 + parser = all_parsers + dtype = {"b": CategoricalDtype([False, True])} + expected = DataFrame({"b": Categorical([True, False, None, False])}) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) -@skip_pyarrow -def test_empty_pass_dtype(all_parsers): - parser = all_parsers + def test_categorical_unexpected_categories(self, all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} - data = "one,two" - result = 
parser.read_csv(StringIO(data), dtype={"one": "u1"}) + data = "b\nd\na\nc\nd" # Unexpected c + expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])}) - expected = DataFrame( - {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=object)}, - index=Index([], dtype=object), - ) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) @skip_pyarrow -def test_empty_with_index_pass_dtype(all_parsers): - parser = all_parsers +class TestParserDtypesEmpty: + def test_dtype_all_columns_empty(self, all_parsers): + # see gh-12048 + parser = all_parsers + result = parser.read_csv(StringIO("A,B"), dtype=str) - data = "one,two" - result = parser.read_csv( - StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"} - ) + expected = DataFrame({"A": [], "B": []}, index=[], dtype=str) + tm.assert_frame_equal(result, expected) - expected = DataFrame( - {"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one") - ) - tm.assert_frame_equal(result, expected) + def test_empty_pass_dtype(self, all_parsers): + parser = all_parsers + data = "one,two" + result = parser.read_csv(StringIO(data), dtype={"one": "u1"}) -@skip_pyarrow -def test_empty_with_multi_index_pass_dtype(all_parsers): - parser = all_parsers + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=object)}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, expected) - data = "one,two,three" - result = parser.read_csv( - StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"} - ) + def test_empty_with_index_pass_dtype(self, all_parsers): + parser = all_parsers - exp_idx = MultiIndex.from_arrays( - [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)], names=["one", "two"] - ) - expected = DataFrame({"three": np.empty(0, dtype=object)}, index=exp_idx) - tm.assert_frame_equal(result, expected) + data = "one,two" + result = parser.read_csv( + 
StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"} + ) + expected = DataFrame( + {"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one") + ) + tm.assert_frame_equal(result, expected) -def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers, pyarrow_xfail): - parser = all_parsers + def test_empty_with_multi_index_pass_dtype(self, all_parsers): + parser = all_parsers - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"}) + data = "one,two,three" + result = parser.read_csv( + StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"} + ) - expected = DataFrame( - {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, - index=Index([], dtype=object), - ) - tm.assert_frame_equal(result, expected) + exp_idx = MultiIndex.from_arrays( + [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)], + names=["one", "two"], + ) + expected = DataFrame({"three": np.empty(0, dtype=object)}, index=exp_idx) + tm.assert_frame_equal(result, expected) + def test_empty_with_mangled_column_pass_dtype_by_names(self, all_parsers): + parser = all_parsers -@skip_pyarrow -def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): - parser = all_parsers + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"}) - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, expected) - expected = DataFrame( - {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, - index=Index([], dtype=object), - ) - tm.assert_frame_equal(result, expected) + def test_empty_with_mangled_column_pass_dtype_by_indexes(self, all_parsers): + parser = all_parsers + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) -@skip_pyarrow -def 
test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): - # see gh-9424 - parser = all_parsers - expected = concat( - [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], - axis=1, - ) - expected.index = expected.index.astype(object) + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, expected) - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) - tm.assert_frame_equal(result, expected) + def test_empty_with_dup_column_pass_dtype_by_indexes(self, all_parsers): + # see gh-9424 + parser = all_parsers + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) + expected.index = expected.index.astype(object) + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) + tm.assert_frame_equal(result, expected) -def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): - # see gh-9424 - parser = all_parsers - expected = concat( - [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], - axis=1, - ) - expected.index = expected.index.astype(object) + def test_empty_with_dup_column_pass_dtype_by_indexes_raises(self, all_parsers): + # see gh-9424 + parser = all_parsers + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) + expected.index = expected.index.astype(object) - with pytest.raises(ValueError, match="Duplicate names"): - data = "" - parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) + with pytest.raises(ValueError, match="Duplicate names"): + data = "" + parser.read_csv( + StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"} + ) @skip_pyarrow From 1a9f185b8e1cac6a12b94343ab76882f871d247a Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 02:41:39 -0500 Subject: [PATCH 
74/95] test reorg --- pandas/tests/io/parser/test_dtypes.py | 110 +++++++++++++------------- 1 file changed, 57 insertions(+), 53 deletions(-) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 739c49cb87b3f..dcc58b1f2a484 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -164,6 +164,7 @@ def test_categorical_dtype_high_cardinality_numeric(self, all_parsers): ) tm.assert_frame_equal(actual, expected) + @pytest.mark.slow def test_categorical_dtype_utf16(self, all_parsers, csv_dir_path): # see gh-10153 pth = os.path.join(csv_dir_path, "utf16_ex.txt") @@ -220,6 +221,7 @@ def test_categorical_dtype_chunksize_explicit_categories(self, all_parsers): class TestParserDtypesCategorical2: + @pytest.mark.slow def test_categorical_dtype_latin1(self, all_parsers, csv_dir_path): # see gh-10153 pth = os.path.join(csv_dir_path, "unicode_series.csv") @@ -450,6 +452,61 @@ def test_empty_with_dup_column_pass_dtype_by_indexes_raises(self, all_parsers): StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"} ) + @skip_pyarrow + @pytest.mark.parametrize( + "dtype,expected", + [ + (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), + ( + "category", + DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), + ), + ( + {"a": "category", "b": "category"}, + DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), + ), + ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), + ( + "timedelta64[ns]", + DataFrame( + { + "a": Series([], dtype="timedelta64[ns]"), + "b": Series([], dtype="timedelta64[ns]"), + }, + index=[], + ), + ), + ( + {"a": np.int64, "b": np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ( + {0: np.int64, 1: np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ( + {"a": np.int64, 1: np.int32}, + 
DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ], + ) + def test_empty_dtype(self, all_parsers, dtype, expected): + # see gh-14712 + parser = all_parsers + data = "a,b" + + result = parser.read_csv(StringIO(data), header=0, dtype=dtype) + tm.assert_frame_equal(result, expected) + @skip_pyarrow def test_raise_on_passed_int_dtype_with_nas(all_parsers): @@ -485,59 +542,6 @@ def test_dtype_with_converters(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize( - "dtype,expected", - [ - (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), - ("category", DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[])), - ( - {"a": "category", "b": "category"}, - DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), - ), - ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), - ( - "timedelta64[ns]", - DataFrame( - { - "a": Series([], dtype="timedelta64[ns]"), - "b": Series([], dtype="timedelta64[ns]"), - }, - index=[], - ), - ), - ( - {"a": np.int64, "b": np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - index=[], - ), - ), - ( - {0: np.int64, 1: np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - index=[], - ), - ), - ( - {"a": np.int64, 1: np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - index=[], - ), - ), - ], -) -def test_empty_dtype(all_parsers, dtype, expected): - # see gh-14712 - parser = all_parsers - data = "a,b" - - result = parser.read_csv(StringIO(data), header=0, dtype=dtype) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) ) From 16d37dbe17a24d8d847fb5b35c5a6030035ceee4 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 02:46:36 -0500 Subject: [PATCH 75/95] test reorg --- 
pandas/tests/io/parser/test_dtypes.py | 193 +++++++++++++------------- 1 file changed, 94 insertions(+), 99 deletions(-) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index dcc58b1f2a484..452cbc635b470 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -75,6 +75,100 @@ def test_invalid_dtype_per_column(self, all_parsers): with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) + def test_raise_on_passed_int_dtype_with_nas(self, all_parsers): + # see gh-2631 + parser = all_parsers + data = """YEAR, DOY, a + 2001,106380451,10 + 2001,,11 + 2001,106380451,67""" + + msg = ( + "Integer column has NA values" + if parser.engine == "c" + else "Unable to convert column DOY" + ) + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True + ) + + def test_dtype_with_converters(self, all_parsers): + parser = all_parsers + data = """a,b +1.1,2.2 +1.2,2.3""" + + # Dtype spec ignored if converted specified. 
+ with tm.assert_produces_warning(ParserWarning): + result = parser.read_csv( + StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} + ) + expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) + ) + def test_numeric_dtype(self, all_parsers, dtype): + data = "0\n1" + parser = all_parsers + expected = DataFrame([0, 1], dtype=dtype) + + result = parser.read_csv(StringIO(data), header=None, dtype=dtype) + tm.assert_frame_equal(expected, result) + + def test_boolean_dtype(self, all_parsers): + parser = all_parsers + data = "\n".join( + [ + "a", + "True", + "TRUE", + "true", + "1", + "1.0", + "False", + "FALSE", + "false", + "0", + "0.0", + "NaN", + "nan", + "NA", + "null", + "NULL", + ] + ) + + result = parser.read_csv(StringIO(data), dtype="boolean") + expected = DataFrame( + { + "a": pd.array( + [ + True, + True, + True, + True, + True, + False, + False, + False, + False, + False, + None, + None, + None, + None, + None, + ], + dtype="boolean", + ) + } + ) + + tm.assert_frame_equal(result, expected) + @skip_pyarrow class TestParserDtypesCategorical1: @@ -506,102 +600,3 @@ def test_empty_dtype(self, all_parsers, dtype, expected): result = parser.read_csv(StringIO(data), header=0, dtype=dtype) tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_raise_on_passed_int_dtype_with_nas(all_parsers): - # see gh-2631 - parser = all_parsers - data = """YEAR, DOY, a -2001,106380451,10 -2001,,11 -2001,106380451,67""" - - msg = ( - "Integer column has NA values" - if parser.engine == "c" - else "Unable to convert column DOY" - ) - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) - - -@skip_pyarrow -def test_dtype_with_converters(all_parsers): - parser = all_parsers - data = """a,b -1.1,2.2 -1.2,2.3""" - - # Dtype spec ignored if 
converted specified. - with tm.assert_produces_warning(ParserWarning): - result = parser.read_csv( - StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} - ) - expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) -) -def test_numeric_dtype(all_parsers, dtype): - data = "0\n1" - parser = all_parsers - expected = DataFrame([0, 1], dtype=dtype) - - result = parser.read_csv(StringIO(data), header=None, dtype=dtype) - tm.assert_frame_equal(expected, result) - - -@skip_pyarrow -def test_boolean_dtype(all_parsers): - parser = all_parsers - data = "\n".join( - [ - "a", - "True", - "TRUE", - "true", - "1", - "1.0", - "False", - "FALSE", - "false", - "0", - "0.0", - "NaN", - "nan", - "NA", - "null", - "NULL", - ] - ) - - result = parser.read_csv(StringIO(data), dtype="boolean") - expected = DataFrame( - { - "a": pd.array( - [ - True, - True, - True, - True, - True, - False, - False, - False, - False, - False, - None, - None, - None, - None, - None, - ], - dtype="boolean", - ) - } - ) - - tm.assert_frame_equal(result, expected) From e124df0cd2f2c4f0000a3c0fb7a0d6cda9794a43 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Thu, 31 Dec 2020 00:21:25 -0500 Subject: [PATCH 76/95] pyarrow_xfail->pyarrow_skip --- pandas/tests/io/parser/test_common.py | 62 ++++++++++++++++++--------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 443af3a33be18..d9631c5657e33 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -74,7 +74,7 @@ def _set_noconvert_columns(self): @skip_pyarrow -def test_empty_decimal_marker(all_parsers, pyarrow_xfail): +def test_empty_decimal_marker(all_parsers): data = """A|B|C 1|2,334|5 10|13|10. 
@@ -176,7 +176,8 @@ def test_squeeze(all_parsers): assert not result._is_view -def test_malformed(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_malformed(all_parsers): # see gh-6607 parser = all_parsers data = """ignore @@ -190,8 +191,9 @@ def test_malformed(all_parsers, pyarrow_xfail): parser.read_csv(StringIO(data), header=1, comment="#") +@skip_pyarrow @pytest.mark.parametrize("nrows", [5, 3, None]) -def test_malformed_chunks(all_parsers, nrows, pyarrow_xfail): +def test_malformed_chunks(all_parsers, nrows): data = """ignore A,B,C skip @@ -209,7 +211,8 @@ def test_malformed_chunks(all_parsers, nrows, pyarrow_xfail): reader.read(nrows) -def test_unnamed_columns(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_unnamed_columns(all_parsers): data = """A,B,C,, 1,2,3,4,5 6,7,8,9,10 @@ -314,7 +317,8 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) -def test_read_csv_wrong_num_columns(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_read_csv_wrong_num_columns(all_parsers): # Too few columns. 
data = """A,B,C,D,E,F 1,2,3,4,5,6 @@ -430,8 +434,9 @@ def test_int_conversion(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("nrows", [3, 3.0]) -def test_read_nrows(all_parsers, nrows, pyarrow_xfail): +def test_read_nrows(all_parsers, nrows): # see gh-10476 data = """index,A,B,C,D foo,2,3,4,5 @@ -451,8 +456,9 @@ def test_read_nrows(all_parsers, nrows, pyarrow_xfail): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("nrows", [1.2, "foo", -1]) -def test_read_nrows_bad(all_parsers, nrows, pyarrow_xfail): +def test_read_nrows_bad(all_parsers, nrows): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -468,8 +474,9 @@ def test_read_nrows_bad(all_parsers, nrows, pyarrow_xfail): parser.read_csv(StringIO(data), nrows=nrows) +@skip_pyarrow @pytest.mark.parametrize("index_col", [0, "index"]) -def test_read_chunksize_with_index(all_parsers, index_col, pyarrow_xfail): +def test_read_chunksize_with_index(all_parsers, index_col): parser = all_parsers data = """index,A,B,C,D foo,2,3,4,5 @@ -500,8 +507,9 @@ def test_read_chunksize_with_index(all_parsers, index_col, pyarrow_xfail): tm.assert_frame_equal(chunks[2], expected[4:]) +@skip_pyarrow @pytest.mark.parametrize("chunksize", [1.3, "foo", 0]) -def test_read_chunksize_bad(all_parsers, chunksize, pyarrow_xfail): +def test_read_chunksize_bad(all_parsers, chunksize): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -518,8 +526,9 @@ def test_read_chunksize_bad(all_parsers, chunksize, pyarrow_xfail): pass +@skip_pyarrow @pytest.mark.parametrize("chunksize", [2, 8]) -def test_read_chunksize_and_nrows(all_parsers, chunksize, pyarrow_xfail): +def test_read_chunksize_and_nrows(all_parsers, chunksize): # see gh-15755 data = """index,A,B,C,D foo,2,3,4,5 @@ -537,7 +546,8 @@ def test_read_chunksize_and_nrows(all_parsers, chunksize, pyarrow_xfail): tm.assert_frame_equal(concat(reader), expected) -def test_read_chunksize_and_nrows_changing_size(all_parsers, 
pyarrow_xfail): +@skip_pyarrow +def test_read_chunksize_and_nrows_changing_size(all_parsers): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -558,7 +568,8 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers, pyarrow_xfail): reader.get_chunk(size=3) -def test_get_chunk_passed_chunksize(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_get_chunk_passed_chunksize(all_parsers): parser = all_parsers data = """A,B,C 1,2,3 @@ -573,8 +584,9 @@ def test_get_chunk_passed_chunksize(all_parsers, pyarrow_xfail): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}]) -def test_read_chunksize_compat(all_parsers, kwargs, pyarrow_xfail): +def test_read_chunksize_compat(all_parsers, kwargs): # see gh-12185 data = """index,A,B,C,D foo,2,3,4,5 @@ -590,7 +602,8 @@ def test_read_chunksize_compat(all_parsers, kwargs, pyarrow_xfail): tm.assert_frame_equal(concat(reader), result) -def test_read_chunksize_jagged_names(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_read_chunksize_jagged_names(all_parsers): # see gh-23509 parser = all_parsers data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) @@ -601,7 +614,8 @@ def test_read_chunksize_jagged_names(all_parsers, pyarrow_xfail): tm.assert_frame_equal(result, expected) -def test_read_data_list(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_read_data_list(all_parsers): parser = all_parsers kwargs = {"index_col": 0} data = "A,B,C\nfoo,1,2,3\nbar,4,5,6" @@ -615,7 +629,8 @@ def test_read_data_list(all_parsers, pyarrow_xfail): tm.assert_frame_equal(result, expected) -def test_iterator(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_iterator(all_parsers): # see gh-6607 data = """index,A,B,C,D foo,2,3,4,5 @@ -638,7 +653,8 @@ def test_iterator(all_parsers, pyarrow_xfail): tm.assert_frame_equal(last_chunk, expected[3:]) -def test_iterator2(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_iterator2(all_parsers): parser = all_parsers data = """A,B,C 
foo,1,2,3 @@ -701,7 +717,8 @@ def test_reader_list_skiprows(all_parsers): tm.assert_frame_equal(chunks[0], expected[1:3]) -def test_iterator_stop_on_chunksize(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_iterator_stop_on_chunksize(all_parsers): # gh-3967: stopping iteration when chunksize is specified parser = all_parsers data = """A,B,C @@ -722,10 +739,11 @@ def test_iterator_stop_on_chunksize(all_parsers, pyarrow_xfail): tm.assert_frame_equal(concat(result), expected) +@skip_pyarrow @pytest.mark.parametrize( "kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}] ) -def test_iterator_skipfooter_errors(all_parsers, kwargs, pyarrow_xfail): +def test_iterator_skipfooter_errors(all_parsers, kwargs): msg = "'skipfooter' not supported for iteration" parser = all_parsers data = "a\n1\n2" @@ -1534,6 +1552,7 @@ def test_uneven_lines_with_usecols(all_parsers, usecols): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -1553,7 +1572,7 @@ def test_uneven_lines_with_usecols(all_parsers, usecols): ), ], ) -def test_read_empty_with_usecols(all_parsers, data, kwargs, expected, pyarrow_xfail): +def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): # see gh-12493 parser = all_parsers @@ -2174,7 +2193,8 @@ def test_read_table_equivalency_to_read_csv(all_parsers): tm.assert_frame_equal(result, expected) -def test_first_row_bom(all_parsers, pyarrow_xfail): +@skip_pyarrow +def test_first_row_bom(all_parsers): # see gh-26545 parser = all_parsers data = '''\ufeff"Head1" "Head2" "Head3"''' From fe253bacf5e7c5eb682f183453a5719003cbeb1e Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Thu, 31 Dec 2020 00:26:53 -0500 Subject: [PATCH 77/95] pyarrow_xfail->pyarrow_skip --- pandas/tests/io/parser/test_converters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 
955f249cdf9ae..158f924882503 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -12,7 +12,7 @@ from pandas import DataFrame, Index import pandas._testing as tm -pytestmark = pytest.mark.usefixtures("pyarrow_xfail") +pytestmark = pytest.mark.usefixtures("pyarrow_skip") def test_converters_type_must_be_dict(all_parsers): From 72c7c448916d716186795c8063ed7fb3c05a4282 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Thu, 31 Dec 2020 00:28:59 -0500 Subject: [PATCH 78/95] pyarrow_xfail->pyarrow_skip --- pandas/tests/io/parser/test_compression.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 0af10c4124072..c01542d7d38c5 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -31,8 +31,9 @@ def parser_and_data(all_parsers, csv1): return parser, data, expected +@skip_pyarrow @pytest.mark.parametrize("compression", ["zip", "infer", "zip2"]) -def test_zip(parser_and_data, compression, pyarrow_xfail): +def test_zip(parser_and_data, compression): parser, data, expected = parser_and_data with tm.ensure_clean("test_file.zip") as path: @@ -48,8 +49,9 @@ def test_zip(parser_and_data, compression, pyarrow_xfail): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("compression", ["zip", "infer"]) -def test_zip_error_multiple_files(parser_and_data, compression, pyarrow_xfail): +def test_zip_error_multiple_files(parser_and_data, compression): parser, data, expected = parser_and_data with tm.ensure_clean("combined_zip.zip") as path: @@ -63,7 +65,8 @@ def test_zip_error_multiple_files(parser_and_data, compression, pyarrow_xfail): parser.read_csv(path, compression=compression) -def test_zip_error_no_files(parser_and_data, pyarrow_xfail): +@skip_pyarrow +def test_zip_error_no_files(parser_and_data): parser, _, _ = 
parser_and_data with tm.ensure_clean() as path: @@ -74,7 +77,8 @@ def test_zip_error_no_files(parser_and_data, pyarrow_xfail): parser.read_csv(path, compression="zip") -def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): +@skip_pyarrow +def test_zip_error_invalid_zip(parser_and_data): parser, _, _ = parser_and_data with tm.ensure_clean() as path: @@ -85,9 +89,7 @@ def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): @skip_pyarrow @pytest.mark.parametrize("filename", [None, "test.{ext}"]) -def test_compression( - parser_and_data, compression_only, buffer, filename, pyarrow_xfail -): +def test_compression(parser_and_data, compression_only, buffer, filename): parser, data, expected = parser_and_data compress_type = compression_only @@ -131,9 +133,8 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) -def test_compression_utf_encoding( - all_parsers, csv_dir_path, utf_value, encoding_fmt, pyarrow_xfail -): +@skip_pyarrow +def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt): # see gh-18071, gh-24130 parser = all_parsers encoding = encoding_fmt.format(utf_value) From 26710071f57ae90cf2285f6e5d473d268901d517 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Thu, 31 Dec 2020 02:48:32 -0500 Subject: [PATCH 79/95] xfail more tests --- pandas/tests/io/parser/test_common.py | 57 ++++++++++++------------ pandas/tests/io/parser/test_index_col.py | 5 ++- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 155799d512b8d..14241140c1b25 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -15,7 +15,6 @@ import pytest from pandas._libs.tslib import Timestamp -from pandas.compat import is_platform_linux from pandas.errors import DtypeWarning, EmptyDataError, ParserError import pandas.util._test_decorators as td @@ -1403,34 +1402,34 @@ 
def test_numeric_range_too_wide(all_parsers, exp_data): tm.assert_frame_equal(result, expected) -@skip_pyarrow -@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) -def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): - # GH#38753 - parser, precision = all_parsers_all_precisions - data = f"data\n10E{neg_exp}" - result = parser.read_csv(StringIO(data), float_precision=precision) - expected = DataFrame({"data": [0.0]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) -def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): - # GH#38753 - parser, precision = all_parsers_all_precisions - data = f"data\n10E{exp}" - result = parser.read_csv(StringIO(data), float_precision=precision) - if precision == "round_trip": - if exp == 999999999999999999 and is_platform_linux(): - mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") - request.node.add_marker(mark) - - value = np.inf if exp > 0 else 0.0 - expected = DataFrame({"data": [value]}) - else: - expected = DataFrame({"data": [f"10E{exp}"]}) - - tm.assert_frame_equal(result, expected) +# @skip_pyarrow +# @pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) +# def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): +# # GH#38753 +# parser, precision = all_parsers_all_precisions +# data = f"data\n10E{neg_exp}" +# result = parser.read_csv(StringIO(data), float_precision=precision) +# expected = DataFrame({"data": [0.0]}) +# tm.assert_frame_equal(result, expected) + +# @skip_pyarrow +# @pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) +# def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): +# # GH#38753 +# parser, precision = all_parsers_all_precisions +# data = f"data\n10E{exp}" +# result = parser.read_csv(StringIO(data), float_precision=precision) +# if precision == "round_trip": +# if exp == 
999999999999999999 and is_platform_linux(): +# mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") +# request.node.add_marker(mark) + +# value = np.inf if exp > 0 else 0.0 +# expected = DataFrame({"data": [value]}) +# else: +# expected = DataFrame({"data": [f"10E{exp}"]}) + +# tm.assert_frame_equal(result, expected) @skip_pyarrow diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 607fd021b0662..87abeaf18ac76 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -12,7 +12,6 @@ import pandas._testing as tm skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @skip_pyarrow @@ -240,6 +239,7 @@ def test_index_col_large_csv(all_parsers): tm.assert_frame_equal(result, df.set_index("a")) +@skip_pyarrow def test_index_col_multiindex_columns_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -255,6 +255,7 @@ def test_index_col_multiindex_columns_no_data(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_index_col_header_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -267,6 +268,7 @@ def test_index_col_header_no_data(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_multiindex_columns_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -277,6 +279,7 @@ def test_multiindex_columns_no_data(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_multiindex_columns_index_col_with_data(all_parsers): # GH#38292 parser = all_parsers From 73ca5d4c17d481aa303902ed965a372f8da647e4 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Thu, 31 Dec 2020 03:18:18 -0500 Subject: [PATCH 80/95] xfail more tests --- pandas/tests/io/parser/test_common.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 
14241140c1b25..a4d02b1bb7873 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1307,14 +1307,14 @@ def test_float_parser(all_parsers): tm.assert_frame_equal(result, expected) -def test_scientific_no_exponent(all_parsers_all_precisions): - # see gh-12215 - df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) - data = df.to_csv(index=False) - parser, precision = all_parsers_all_precisions - - df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) - tm.assert_frame_equal(df_roundtrip, df) +# def test_scientific_no_exponent(all_parsers_all_precisions): +# # see gh-12215 +# df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) +# data = df.to_csv(index=False) +# parser, precision = all_parsers_all_precisions + +# df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) +# tm.assert_frame_equal(df_roundtrip, df) @skip_pyarrow From 639ca283fa39837bafa901da07b4b17e9cb634e6 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 1 Jan 2021 14:41:25 -0500 Subject: [PATCH 81/95] update refactoredt tests --- pandas/tests/io/parser/dtypes/test_categorical.py | 12 ++++++++++++ pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 2 ++ pandas/tests/io/parser/dtypes/test_empty.py | 2 ++ pandas/tests/io/parser/usecols/test_parse_dates.py | 2 ++ pandas/tests/io/parser/usecols/test_usecols_basic.py | 2 ++ 5 files changed, 20 insertions(+) diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py index 2f569424a82f5..a4e59899f304e 100644 --- a/pandas/tests/io/parser/dtypes/test_categorical.py +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -14,7 +14,10 @@ from pandas import Categorical, DataFrame, Timestamp import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + +@skip_pyarrow @pytest.mark.parametrize( "dtype", [ @@ -41,6 +44,7 @@ def 
test_categorical_dtype(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@skip_pyarrow @pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) def test_categorical_dtype_single(all_parsers, dtype): # see gh-10153 @@ -56,6 +60,7 @@ def test_categorical_dtype_single(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@skip_pyarrow def test_categorical_dtype_unsorted(all_parsers): # see gh-10153 parser = all_parsers @@ -74,6 +79,7 @@ def test_categorical_dtype_unsorted(all_parsers): tm.assert_frame_equal(actual, expected) +@skip_pyarrow def test_categorical_dtype_missing(all_parsers): # see gh-10153 parser = all_parsers @@ -92,6 +98,7 @@ def test_categorical_dtype_missing(all_parsers): tm.assert_frame_equal(actual, expected) +@skip_pyarrow @pytest.mark.slow def test_categorical_dtype_high_cardinality_numeric(all_parsers): # see gh-18186 @@ -106,6 +113,7 @@ def test_categorical_dtype_high_cardinality_numeric(all_parsers): tm.assert_frame_equal(actual, expected) +@skip_pyarrow def test_categorical_dtype_utf16(all_parsers, csv_dir_path): # see gh-10153 pth = os.path.join(csv_dir_path, "utf16_ex.txt") @@ -120,6 +128,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) +@skip_pyarrow def test_categorical_dtype_chunksize_infer_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -139,6 +148,7 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers): tm.assert_frame_equal(actual, expected) +@skip_pyarrow def test_categorical_dtype_chunksize_explicit_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -242,6 +252,7 @@ def test_categorical_coerces_datetime(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_categorical_coerces_timestamp(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype([Timestamp("2014")])} @@ -253,6 +264,7 @@ def test_categorical_coerces_timestamp(all_parsers): tm.assert_frame_equal(result, 
expected) +@skip_pyarrow def test_categorical_coerces_timedelta(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index e416d8dcdd905..b6814e39241f0 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -13,6 +13,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) diff --git a/pandas/tests/io/parser/dtypes/test_empty.py b/pandas/tests/io/parser/dtypes/test_empty.py index 57d729fb4b7fc..618af246e4db3 100644 --- a/pandas/tests/io/parser/dtypes/test_empty.py +++ b/pandas/tests/io/parser/dtypes/test_empty.py @@ -10,6 +10,8 @@ from pandas import Categorical, DataFrame, Index, MultiIndex, Series, concat import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_dtype_all_columns_empty(all_parsers): # see gh-12048 diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index c6b700c0adfff..34d5b4b7d183b 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -20,6 +20,8 @@ "Usecols do not match columns, columns expected but not found: {0}" ) +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) def test_usecols_with_parse_dates(all_parsers, usecols): diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 7d81a88e09012..a163326124878 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -19,6 +19,8 @@ 
"Usecols do not match columns, columns expected but not found: {0}" ) +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + def test_raise_on_mixed_dtype_usecols(all_parsers): # See gh-12678 From 1994fadebb4e8b04cf5c91e2386f90f1ee86ce62 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 01:40:13 -0500 Subject: [PATCH 82/95] float precision tests --- pandas/tests/io/parser/conftest.py | 4 ++ pandas/tests/io/parser/test_common.py | 74 ++++++++++++++------------- 2 files changed, 43 insertions(+), 35 deletions(-) diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 27c81aa435e57..bda4c771c6511 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -116,10 +116,14 @@ def _get_all_parser_float_precision_combinations(): params = [] ids = [] for parser, parser_id in zip(_all_parsers, _all_parser_ids): + if parser_id == "pyarrow": + # GH38370 + continue for precision in parser.float_precision_choices: params.append((parser, precision)) ids.append(f"{parser_id}-{precision}") + print(params) return {"params": params, "ids": ids} diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index a4d02b1bb7873..f06d1476c515a 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -15,6 +15,7 @@ import pytest from pandas._libs.tslib import Timestamp +from pandas.compat import is_platform_linux from pandas.errors import DtypeWarning, EmptyDataError, ParserError import pandas.util._test_decorators as td @@ -1307,14 +1308,16 @@ def test_float_parser(all_parsers): tm.assert_frame_equal(result, expected) -# def test_scientific_no_exponent(all_parsers_all_precisions): -# # see gh-12215 -# df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) -# data = df.to_csv(index=False) -# parser, precision = all_parsers_all_precisions +def test_scientific_no_exponent(all_parsers_all_precisions): + # see 
gh-12215 + df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) + data = df.to_csv(index=False) + parser, precision = all_parsers_all_precisions + if parser == "pyarrow": + pytest.skip() -# df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) -# tm.assert_frame_equal(df_roundtrip, df) + df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) + tm.assert_frame_equal(df_roundtrip, df) @skip_pyarrow @@ -1402,34 +1405,35 @@ def test_numeric_range_too_wide(all_parsers, exp_data): tm.assert_frame_equal(result, expected) -# @skip_pyarrow -# @pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) -# def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): -# # GH#38753 -# parser, precision = all_parsers_all_precisions -# data = f"data\n10E{neg_exp}" -# result = parser.read_csv(StringIO(data), float_precision=precision) -# expected = DataFrame({"data": [0.0]}) -# tm.assert_frame_equal(result, expected) - -# @skip_pyarrow -# @pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) -# def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): -# # GH#38753 -# parser, precision = all_parsers_all_precisions -# data = f"data\n10E{exp}" -# result = parser.read_csv(StringIO(data), float_precision=precision) -# if precision == "round_trip": -# if exp == 999999999999999999 and is_platform_linux(): -# mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") -# request.node.add_marker(mark) - -# value = np.inf if exp > 0 else 0.0 -# expected = DataFrame({"data": [value]}) -# else: -# expected = DataFrame({"data": [f"10E{exp}"]}) - -# tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) +def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): + # GH#38753 + parser, precision = all_parsers_all_precisions + if parser == "pyarrow": + pytest.skip() + data = 
f"data\n10E{neg_exp}" + result = parser.read_csv(StringIO(data), float_precision=precision) + expected = DataFrame({"data": [0.0]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) +def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): + # GH#38753 + parser, precision = all_parsers_all_precisions + data = f"data\n10E{exp}" + result = parser.read_csv(StringIO(data), float_precision=precision) + if precision == "round_trip": + if exp == 999999999999999999 and is_platform_linux(): + mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") + request.node.add_marker(mark) + + value = np.inf if exp > 0 else 0.0 + expected = DataFrame({"data": [value]}) + else: + expected = DataFrame({"data": [f"10E{exp}"]}) + + tm.assert_frame_equal(result, expected) @skip_pyarrow From 566f1b4c157fa5c5b4c53e22bd1344ccef91c625 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:13:24 -0500 Subject: [PATCH 83/95] TST/REF: io/parsers/test_common.py --- .../io/parser/common/test_common_basic.py | 752 ++++++++++++++++++ 1 file changed, 752 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_common_basic.py diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py new file mode 100644 index 0000000000000..4dd75dff16095 --- /dev/null +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -0,0 +1,752 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +from datetime import datetime +from inspect import signature +from io import StringIO +import os + +import numpy as np +import pytest + +from pandas._libs.tslib import Timestamp +from pandas.errors import EmptyDataError, ParserError + +from pandas import DataFrame, Index, Series, compat +import pandas._testing as tm + +from pandas.io.parsers import CParserWrapper, TextFileReader + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +def test_override_set_noconvert_columns(): + # see gh-17351 + # + # Usecols needs to be sorted in _set_noconvert_columns based + # on the test_usecols_with_parse_dates test from test_usecols.py + class MyTextFileReader(TextFileReader): + def __init__(self): + self._currow = 0 + self.squeeze = False + + class MyCParserWrapper(CParserWrapper): + def _set_noconvert_columns(self): + if self.usecols_dtype == "integer": + # self.usecols is a set, which is documented as unordered + # but in practice, a CPython set of integers is sorted. + # In other implementations this assumption does not hold. 
+ # The following code simulates a different order, which + # before GH 17351 would cause the wrong columns to be + # converted via the parse_dates parameter + self.usecols = list(self.usecols) + self.usecols.reverse() + return CParserWrapper._set_noconvert_columns(self) + + data = """a,b,c,d,e +0,1,20140101,0900,4 +0,1,20140102,1000,4""" + + parse_dates = [[1, 2]] + cols = { + "a": [0, 0], + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], + } + expected = DataFrame(cols, columns=["c_d", "a"]) + + parser = MyTextFileReader() + parser.options = { + "usecols": [0, 2, 3], + "parse_dates": parse_dates, + "delimiter": ",", + } + parser.engine = "c" + parser._engine = MyCParserWrapper(StringIO(data), **parser.options) + + result = parser.read() + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_read_csv_local(all_parsers, csv1): + prefix = "file:///" if compat.is_platform_windows() else "file://" + parser = all_parsers + + fname = prefix + str(os.path.abspath(csv1)) + result = parser.read_csv(fname, index_col=0, parse_dates=True) + + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738], + [1.047916, -0.041232, -0.16181208307, 0.212549], + [0.498581, 0.731168, -0.537677223318, 1.346270], + [1.120202, 1.567621, 0.00364077397681, 0.675253], + [-0.487094, 0.571455, -1.6116394093, 0.103469], + [0.836649, 0.246462, 0.588542635376, 1.062782], + [-0.157161, 1.340307, 1.1957779562, -1.097007], + ], + columns=["A", "B", "C", "D"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + datetime(2000, 1, 10), + datetime(2000, 1, 11), + ], + name="index", + ), + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_1000_sep(all_parsers): + parser = all_parsers + data = """A|B|C +1|2,334|5 +10|13|10. 
+""" + expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]}) + + result = parser.read_csv(StringIO(data), sep="|", thousands=",") + tm.assert_frame_equal(result, expected) + + +def test_squeeze(all_parsers): + data = """\ +a,1 +b,2 +c,3 +""" + parser = all_parsers + index = Index(["a", "b", "c"], name=0) + expected = Series([1, 2, 3], name=1, index=index) + + result = parser.read_csv(StringIO(data), index_col=0, header=None, squeeze=True) + tm.assert_series_equal(result, expected) + + # see gh-8217 + # + # Series should not be a view. + assert not result._is_view + + +@skip_pyarrow +def test_unnamed_columns(all_parsers): + data = """A,B,C,, +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + parser = all_parsers + expected = DataFrame( + [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], + dtype=np.int64, + columns=["A", "B", "C", "Unnamed: 3", "Unnamed: 4"], + ) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +def test_csv_mixed_type(all_parsers): + data = """A,B,C +a,1,2 +b,3,4 +c,4,5 +""" + parser = all_parsers + expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_read_csv_low_memory_no_rows_with_index(all_parsers): + # see gh-21141 + parser = all_parsers + + if not parser.low_memory: + pytest.skip("This is a low-memory specific test") + + data = """A,B,C +1,1,1,2 +2,2,3,4 +3,3,4,5 +""" + result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0) + expected = DataFrame(columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_dataframe(all_parsers, csv1): + parser = all_parsers + result = parser.read_csv(csv1, index_col=0, parse_dates=True) + + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738], + [1.047916, -0.041232, -0.16181208307, 0.212549], + [0.498581, 0.731168, -0.537677223318, 
1.346270], + [1.120202, 1.567621, 0.00364077397681, 0.675253], + [-0.487094, 0.571455, -1.6116394093, 0.103469], + [0.836649, 0.246462, 0.588542635376, 1.062782], + [-0.157161, 1.340307, 1.1957779562, -1.097007], + ], + columns=["A", "B", "C", "D"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + datetime(2000, 1, 10), + datetime(2000, 1, 11), + ], + name="index", + ), + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("nrows", [3, 3.0]) +def test_read_nrows(all_parsers, nrows): + # see gh-10476 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + expected = DataFrame( + [["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]], + columns=["index", "A", "B", "C", "D"], + ) + parser = all_parsers + + result = parser.read_csv(StringIO(data), nrows=nrows) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("nrows", [1.2, "foo", -1]) +def test_read_nrows_bad(all_parsers, nrows): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + msg = r"'nrows' must be an integer >=0" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), nrows=nrows) + + +def test_nrows_skipfooter_errors(all_parsers): + msg = "'skipfooter' not supported with 'nrows'" + data = "a\n1\n2\n3\n4\n5\n6" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), skipfooter=1, nrows=5) + + +@skip_pyarrow +def test_missing_trailing_delimiters(all_parsers): + parser = all_parsers + data = """A,B,C,D +1,2,3,4 +1,3,3, +1,4,5""" + + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]], + columns=["A", "B", "C", "D"], + ) + 
tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_skip_initial_space(all_parsers): + data = ( + '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' + "1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, " + "314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, " + "70.06056, 344.98370, 1, 1, -0.689265, -0.692787, " + "0.212036, 14.7674, 41.605, -9999.0, -9999.0, " + "-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128" + ) + parser = all_parsers + + result = parser.read_csv( + StringIO(data), + names=list(range(33)), + header=None, + na_values=["-9999.0"], + skipinitialspace=True, + ) + expected = DataFrame( + [ + [ + "09-Apr-2012", + "01:10:18.300", + 2456026.548822908, + 12849, + 1.00361, + 1.12551, + 330.65659, + 355626618.16711, + 73.48821, + 314.11625, + 1917.09447, + 179.71425, + 80.0, + 240.0, + -350, + 70.06056, + 344.9837, + 1, + 1, + -0.689265, + -0.692787, + 0.212036, + 14.7674, + 41.605, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 0, + 12, + 128, + ] + ] + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_trailing_delimiters(all_parsers): + # see gh-2442 + data = """A,B,C +1,2,3, +4,5,6, +7,8,9,""" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=False) + + expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]}) + tm.assert_frame_equal(result, expected) + + +def test_escapechar(all_parsers): + # https://stackoverflow.com/questions/13824840/feature-request-for- + # pandas-read-csv + data = '''SEARCH_TERM,ACTUAL_URL +"bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" +"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" +"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals 
series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa + + parser = all_parsers + result = parser.read_csv( + StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8" + ) + + assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals series' + + tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) + + +@skip_pyarrow +def test_ignore_leading_whitespace(all_parsers): + # see gh-3374, gh-6607 + parser = all_parsers + data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9" + result = parser.read_csv(StringIO(data), sep=r"\s+") + + expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]}) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) +def test_uneven_lines_with_usecols(all_parsers, usecols): + # see gh-12203 + parser = all_parsers + data = r"""a,b,c +0,1,2 +3,4,5,6,7 +8,9,10""" + + if usecols is None: + # Make sure that an error is still raised + # when the "usecols" parameter is not provided. + msg = r"Expected \d+ fields in line \d+, saw \d+" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data)) + else: + expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]}) + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + # First, check to see that the response of parser when faced with no + # provided columns raises the correct error, with or without usecols. 
+ ("", {}, None), + ("", {"usecols": ["X"]}, None), + ( + ",,", + {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]}, + DataFrame(columns=["X"], index=[0], dtype=np.float64), + ), + ( + "", + {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]}, + DataFrame(columns=["X"]), + ), + ], +) +def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): + # see gh-12493 + parser = all_parsers + + if expected is None: + msg = "No columns to parse from file" + with pytest.raises(EmptyDataError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize( + "kwargs,expected", + [ + # gh-8661, gh-8679: this should ignore six lines, including + # lines with trailing whitespace and blank lines. + ( + { + "header": None, + "delim_whitespace": True, + "skiprows": [0, 1, 2, 3, 5, 6], + "skip_blank_lines": True, + }, + DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]), + ), + # gh-8983: test skipping set of rows after a row with trailing spaces. 
+ ( + { + "delim_whitespace": True, + "skiprows": [1, 2, 3, 5, 6], + "skip_blank_lines": True, + }, + DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}), + ), + ], +) +def test_trailing_spaces(all_parsers, kwargs, expected): + data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa + parser = all_parsers + + result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_raise_on_sep_with_delim_whitespace(all_parsers): + # see gh-6607 + data = "a b c\n1 2 3" + parser = all_parsers + + with pytest.raises(ValueError, match="you can only specify one"): + parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) + + +@skip_pyarrow +@pytest.mark.parametrize("delim_whitespace", [True, False]) +def test_single_char_leading_whitespace(all_parsers, delim_whitespace): + # see gh-9710 + parser = all_parsers + data = """\ +MyColumn +a +b +a +b\n""" + + expected = DataFrame({"MyColumn": list("abab")}) + result = parser.read_csv( + StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize( + "sep,skip_blank_lines,exp_data", + [ + (",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]), + (r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]), + ( + ",", + False, + [ + [1.0, 2.0, 4.0], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [5.0, np.nan, 10.0], + [np.nan, np.nan, np.nan], + [-70.0, 0.4, 1.0], + ], + ), + ], +) +def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): + parser = all_parsers + data = """\ +A,B,C +1,2.,4. 
+ + +5.,NaN,10.0 + +-70,.4,1 +""" + + if sep == r"\s+": + data = data.replace(",", " ") + + result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines) + expected = DataFrame(exp_data, columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_whitespace_lines(all_parsers): + parser = all_parsers + data = """ + +\t \t\t +\t +A,B,C +\t 1,2.,4. +5.,NaN,10.0 +""" + expected = DataFrame([[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize( + "data,expected", + [ + ( + """ A B C D +a 1 2 3 4 +b 1 2 3 4 +c 1 2 3 4 +""", + DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], + columns=["A", "B", "C", "D"], + index=["a", "b", "c"], + ), + ), + ( + " a b c\n1 2 3 \n4 5 6\n 7 8 9", + DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]), + ), + ], +) +def test_whitespace_regex_separator(all_parsers, data, expected): + # see gh-6607 + parser = all_parsers + result = parser.read_csv(StringIO(data), sep=r"\s+") + tm.assert_frame_equal(result, expected) + + +def test_sub_character(all_parsers, csv_dir_path): + # see gh-16893 + filename = os.path.join(csv_dir_path, "sub_char.csv") + expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"]) + + parser = all_parsers + result = parser.read_csv(filename) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"]) +def test_filename_with_special_chars(all_parsers, filename): + # see gh-15086. 
+ parser = all_parsers + df = DataFrame({"a": [1, 2, 3]}) + + with tm.ensure_clean(filename) as path: + df.to_csv(path, index=False) + + result = parser.read_csv(path) + tm.assert_frame_equal(result, df) + + +def test_read_table_same_signature_as_read_csv(all_parsers): + # GH-34976 + parser = all_parsers + + table_sign = signature(parser.read_table) + csv_sign = signature(parser.read_csv) + + assert table_sign.parameters.keys() == csv_sign.parameters.keys() + assert table_sign.return_annotation == csv_sign.return_annotation + + for key, csv_param in csv_sign.parameters.items(): + table_param = table_sign.parameters[key] + if key == "sep": + assert csv_param.default == "," + assert table_param.default == "\t" + assert table_param.annotation == csv_param.annotation + assert table_param.kind == csv_param.kind + continue + else: + assert table_param == csv_param + + +def test_read_table_equivalency_to_read_csv(all_parsers): + # see gh-21948 + # As of 0.25.0, read_table is undeprecated + parser = all_parsers + data = "a\tb\n1\t2\n3\t4" + expected = parser.read_csv(StringIO(data), sep="\t") + result = parser.read_table(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_first_row_bom(all_parsers): + # see gh-26545 + parser = all_parsers + data = '''\ufeff"Head1" "Head2" "Head3"''' + + result = parser.read_csv(StringIO(data), delimiter="\t") + expected = DataFrame(columns=["Head1", "Head2", "Head3"]) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_first_row_bom_unquoted(all_parsers): + # see gh-36343 + parser = all_parsers + data = """\ufeffHead1 Head2 Head3""" + + result = parser.read_csv(StringIO(data), delimiter="\t") + expected = DataFrame(columns=["Head1", "Head2", "Head3"]) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("nrows", range(1, 6)) +def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): + # GH 28071 + ref = DataFrame( + [[np.nan, np.nan], 
[np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]], + columns=list("ab"), + ) + csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4" + parser = all_parsers + df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False) + tm.assert_frame_equal(df, ref[:nrows]) + + +@skip_pyarrow +def test_no_header_two_extra_columns(all_parsers): + # GH 26218 + column_names = ["one", "two", "three"] + ref = DataFrame([["foo", "bar", "baz"]], columns=column_names) + stream = StringIO("foo,bar,baz,bam,blah") + parser = all_parsers + df = parser.read_csv(stream, header=None, names=column_names, index_col=False) + tm.assert_frame_equal(df, ref) + + +def test_read_csv_names_not_accepting_sets(all_parsers): + # GH 34946 + data = """\ + 1,2,3 + 4,5,6\n""" + parser = all_parsers + with pytest.raises(ValueError, match="Names should be an ordered collection."): + parser.read_csv(StringIO(data), names=set("QAZ")) + + +@skip_pyarrow +def test_read_table_delim_whitespace_default_sep(all_parsers): + # GH: 35958 + f = StringIO("a b c\n1 -2 -3\n4 5 6") + parser = all_parsers + result = parser.read_table(f, delim_whitespace=True) + expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("delimiter", [",", "\t"]) +def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter): + # GH: 35958 + f = StringIO("a b c\n1 -2 -3\n4 5 6") + parser = all_parsers + msg = ( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." 
+ ) + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, delim_whitespace=True, sep=delimiter) + + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, delim_whitespace=True, delimiter=delimiter) + + +@pytest.mark.parametrize("delimiter", [",", "\t"]) +def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): + # GH: 35958 + f = StringIO("a b c\n1 -2 -3\n4 5 6") + parser = all_parsers + msg = ( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." + ) + with pytest.raises(ValueError, match=msg): + parser.read_table(f, delim_whitespace=True, sep=delimiter) + + with pytest.raises(ValueError, match=msg): + parser.read_table(f, delim_whitespace=True, delimiter=delimiter) + + +@skip_pyarrow +def test_dict_keys_as_names(all_parsers): + # GH: 36928 + data = "1,2" + + keys = {"a": int, "b": int}.keys() + parser = all_parsers + + result = parser.read_csv(StringIO(data), names=keys) + expected = DataFrame({"a": [1], "b": [2]}) + tm.assert_frame_equal(result, expected) From cd9b3004724ffb666c4d9897cdc13943f66ca3b7 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:17:31 -0500 Subject: [PATCH 84/95] TST/REF: io/parsers/test_common.py --- pandas/tests/io/parser/common/test_ints.py | 210 +++++++++++++++++++++ 1 file changed, 210 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_ints.py diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py new file mode 100644 index 0000000000000..4b31447b638f7 --- /dev/null +++ b/pandas/tests/io/parser/common/test_ints.py @@ -0,0 +1,210 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +from io import StringIO + +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +def test_int_conversion(all_parsers): + data = """A,B +1.0,1 +2.0,2 +3.0,3 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + + expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + "A,B\nTrue,1\nFalse,2\nTrue,3", + {}, + DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), + ), + ( + "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3", + {"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]}, + DataFrame( + [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]], + columns=["A", "B"], + ), + ), + ( + "A,B\nTRUE,1\nFALSE,2\nTRUE,3", + {}, + DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), + ), + ( + "A,B\nfoo,bar\nbar,foo", + {"true_values": ["foo"], "false_values": ["bar"]}, + DataFrame([[True, False], [False, True]], columns=["A", "B"]), + ), + ], +) +def test_parse_bool(all_parsers, data, kwargs, expected): + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_parse_integers_above_fp_precision(all_parsers): + data = """Numbers +17007000002000191 +17007000002000191 +17007000002000191 +17007000002000191 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000194""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + { + "Numbers": [ + 17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000194, + ] + } + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow 
+@pytest.mark.parametrize("sep", [" ", r"\s+"]) +def test_integer_overflow_bug(all_parsers, sep): + # see gh-2601 + data = "65248E10 11\n55555E55 22\n" + parser = all_parsers + + result = parser.read_csv(StringIO(data), header=None, sep=sep) + expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]]) + tm.assert_frame_equal(result, expected) + + +def test_int64_min_issues(all_parsers): + # see gh-2599 + parser = all_parsers + data = "A,B\n0,0\n0," + result = parser.read_csv(StringIO(data)) + + expected = DataFrame({"A": [0, 0], "B": [0, np.nan]}) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) +def test_int64_overflow(all_parsers, conv): + data = """ID +00013007854817840016671868 +00013007854817840016749251 +00013007854817840016754630 +00013007854817840016781876 +00013007854817840017028824 +00013007854817840017963235 +00013007854817840018860166""" + parser = all_parsers + + if conv is None: + # 13007854817840016671868 > UINT64_MAX, so this + # will overflow and return object as the dtype. + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [ + "00013007854817840016671868", + "00013007854817840016749251", + "00013007854817840016754630", + "00013007854817840016781876", + "00013007854817840017028824", + "00013007854817840017963235", + "00013007854817840018860166", + ], + columns=["ID"], + ) + tm.assert_frame_equal(result, expected) + else: + # 13007854817840016671868 > UINT64_MAX, so attempts + # to cast to either int64 or uint64 will result in + # an OverflowError being raised. 
+    msg = (
+        "(Python int too large to convert to C long)|"
+        "(long too big to convert)|"
+        "(int too big to convert)"
+    )
+
+    with pytest.raises(OverflowError, match=msg):
+        parser.read_csv(StringIO(data), converters={"ID": conv})
+
+
+@skip_pyarrow
+@pytest.mark.parametrize(
+    "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min]
+)
+def test_int64_uint64_range(all_parsers, val):
+    # These numbers fall right inside the int64-uint64
+    # range, so they should be parsed as integers.
+    parser = all_parsers
+    result = parser.read_csv(StringIO(str(val)), header=None)
+
+    expected = DataFrame([val])
+    tm.assert_frame_equal(result, expected)
+
+
+@skip_pyarrow
+@pytest.mark.parametrize(
+    "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
+)
+def test_outside_int64_uint64_range(all_parsers, val):
+    # These numbers fall just outside the int64-uint64
+    # range, so they should be parsed as string.
+    parser = all_parsers
+    result = parser.read_csv(StringIO(str(val)), header=None)
+
+    expected = DataFrame([str(val)])
+    tm.assert_frame_equal(result, expected)
+
+
+@skip_pyarrow
+@pytest.mark.parametrize("exp_data", [[str(-1), str(2 ** 63)], [str(2 ** 63), str(-1)]])
+def test_numeric_range_too_wide(all_parsers, exp_data):
+    # No numerical dtype can hold both negative and uint64
+    # values, so they should be cast as string.
+ parser = all_parsers + data = "\n".join(exp_data) + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), header=None) + tm.assert_frame_equal(result, expected) + + +def test_integer_precision(all_parsers): + # Gh 7072 + s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765 +5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389""" + parser = all_parsers + result = parser.read_csv(StringIO(s), header=None)[4] + expected = Series([4321583677327450765, 4321113141090630389], name=4) + tm.assert_series_equal(result, expected) From 4a7dc0f99152dc9a78f07f16f6cb7d2e295da66e Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:19:19 -0500 Subject: [PATCH 85/95] TST/REF: io/parsers/test_common.py --- .../tests/io/parser/common/test_chunksize.py | 232 ++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_chunksize.py diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py new file mode 100644 index 0000000000000..f7db9a5546d62 --- /dev/null +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -0,0 +1,232 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +from io import StringIO + +import numpy as np +import pytest + +from pandas.errors import DtypeWarning + +from pandas import DataFrame, concat +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +@pytest.mark.parametrize("index_col", [0, "index"]) +def test_read_chunksize_with_index(all_parsers, index_col): + parser = all_parsers + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + expected = DataFrame( + [ + ["foo", 2, 3, 4, 5], + ["bar", 7, 8, 9, 10], + ["baz", 12, 13, 14, 15], + ["qux", 12, 13, 14, 15], + ["foo2", 12, 13, 14, 15], + ["bar2", 12, 13, 14, 15], + ], + columns=["index", "A", "B", "C", "D"], + ) + expected = expected.set_index("index") + + with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader: + chunks = list(reader) + tm.assert_frame_equal(chunks[0], expected[:2]) + tm.assert_frame_equal(chunks[1], expected[2:4]) + tm.assert_frame_equal(chunks[2], expected[4:]) + + +@skip_pyarrow +@pytest.mark.parametrize("chunksize", [1.3, "foo", 0]) +def test_read_chunksize_bad(all_parsers, chunksize): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + msg = r"'chunksize' must be an integer >=1" + + with pytest.raises(ValueError, match=msg): + with parser.read_csv(StringIO(data), chunksize=chunksize) as _: + pass + + +@skip_pyarrow +@pytest.mark.parametrize("chunksize", [2, 8]) +def test_read_chunksize_and_nrows(all_parsers, chunksize): + # see gh-15755 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = {"index_col": 0, "nrows": 5} + + expected = parser.read_csv(StringIO(data), **kwargs) + with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader: + 
tm.assert_frame_equal(concat(reader), expected) + + +@skip_pyarrow +def test_read_chunksize_and_nrows_changing_size(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = {"index_col": 0, "nrows": 5} + + expected = parser.read_csv(StringIO(data), **kwargs) + with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader: + tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2]) + tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5]) + + with pytest.raises(StopIteration, match=""): + reader.get_chunk(size=3) + + +@skip_pyarrow +def test_get_chunk_passed_chunksize(all_parsers): + parser = all_parsers + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +1,2,3""" + + with parser.read_csv(StringIO(data), chunksize=2) as reader: + result = reader.get_chunk() + + expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}]) +def test_read_chunksize_compat(all_parsers, kwargs): + # see gh-12185 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader: + tm.assert_frame_equal(concat(reader), result) + + +@skip_pyarrow +def test_read_chunksize_jagged_names(all_parsers): + # see gh-23509 + parser = all_parsers + data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) + + expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10]) + with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader: + result = concat(reader) + tm.assert_frame_equal(result, expected) + + +def test_chunk_begins_with_newline_whitespace(all_parsers): + # see gh-10022 + parser = all_parsers + data = "\n 
hello\nworld\n" + + result = parser.read_csv(StringIO(data), header=None) + expected = DataFrame([" hello", "world"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail(reason="GH38630, sometimes gives ResourceWarning", strict=False) +def test_chunks_have_consistent_numerical_type(all_parsers): + parser = all_parsers + integers = [str(i) for i in range(499999)] + data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers) + + # Coercions should work without warnings. + with tm.assert_produces_warning(None): + result = parser.read_csv(StringIO(data)) + + assert type(result.a[0]) is np.float64 + assert result.a.dtype == float + + +def test_warn_if_chunks_have_mismatched_type(all_parsers): + warning_type = None + parser = all_parsers + integers = [str(i) for i in range(499999)] + data = "a\n" + "\n".join(integers + ["a", "b"] + integers) + + # see gh-3866: if chunks are different types and can't + # be coerced using numerical types, then issue warning. + if parser.engine == "c" and parser.low_memory: + warning_type = DtypeWarning + + with tm.assert_produces_warning(warning_type): + df = parser.read_csv(StringIO(data)) + assert df.a.dtype == object + + +@skip_pyarrow +@pytest.mark.parametrize("iterator", [True, False]) +def test_empty_with_nrows_chunksize(all_parsers, iterator): + # see gh-9535 + parser = all_parsers + expected = DataFrame(columns=["foo", "bar"]) + + nrows = 10 + data = StringIO("foo,bar\n") + + if iterator: + with parser.read_csv(data, chunksize=nrows) as reader: + result = next(iter(reader)) + else: + result = parser.read_csv(data, nrows=nrows) + + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_read_csv_memory_growth_chunksize(all_parsers): + # see gh-24805 + # + # Let's just make sure that we don't crash + # as we iteratively process all chunks. 
+ parser = all_parsers + + with tm.ensure_clean() as path: + with open(path, "w") as f: + for i in range(1000): + f.write(str(i) + "\n") + + with parser.read_csv(path, chunksize=20) as result: + for _ in result: + pass From 3b24fe74ea5e851ad8be2c6c96fa70e70c3e88b5 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:22:44 -0500 Subject: [PATCH 86/95] TST/REF: io/parsers/test_common.py --- pandas/tests/io/parser/common/test_decimal.py | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_decimal.py diff --git a/pandas/tests/io/parser/common/test_decimal.py b/pandas/tests/io/parser/common/test_decimal.py new file mode 100644 index 0000000000000..21eadc51d25b6 --- /dev/null +++ b/pandas/tests/io/parser/common/test_decimal.py @@ -0,0 +1,64 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. +""" +from io import StringIO + +import pytest + +from pandas import DataFrame +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +@pytest.mark.parametrize( + "data,thousands,decimal", + [ + ( + """A|B|C +1|2,334.01|5 +10|13|10. 
+""", + ",", + ".", + ), + ( + """A|B|C +1|2.334,01|5 +10|13|10, +""", + ".", + ",", + ), + ], +) +def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): + parser = all_parsers + expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) + + result = parser.read_csv( + StringIO(data), sep="|", thousands=thousands, decimal=decimal + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_euro_decimal_format(all_parsers): + parser = all_parsers + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,738797819 +2;121,12;14897,76;DEF;uyt;0,377320872 +3;878,158;108013,434;GHI;rez;2,735694704""" + + result = parser.read_csv(StringIO(data), sep=";", decimal=",") + expected = DataFrame( + [ + [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819], + [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872], + [3, 878.158, 108013.434, "GHI", "rez", 2.735694704], + ], + columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"], + ) + tm.assert_frame_equal(result, expected) From c33bf46783e24baee51a89961d0249c520329911 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:24:39 -0500 Subject: [PATCH 87/95] TST/REF: io/parsers/test_common.py --- .../tests/io/parser/common/test_iterator.py | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_iterator.py diff --git a/pandas/tests/io/parser/common/test_iterator.py b/pandas/tests/io/parser/common/test_iterator.py new file mode 100644 index 0000000000000..f19ae55ecb8ac --- /dev/null +++ b/pandas/tests/io/parser/common/test_iterator.py @@ -0,0 +1,110 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +from io import StringIO + +import pytest + +from pandas import DataFrame, Series, concat +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +def test_iterator(all_parsers): + # see gh-6607 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = {"index_col": 0} + + expected = parser.read_csv(StringIO(data), **kwargs) + with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader: + + first_chunk = reader.read(3) + tm.assert_frame_equal(first_chunk, expected[:3]) + + last_chunk = reader.read(5) + tm.assert_frame_equal(last_chunk, expected[3:]) + + +@skip_pyarrow +def test_iterator2(all_parsers): + parser = all_parsers + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + + with parser.read_csv(StringIO(data), iterator=True) as reader: + result = list(reader) + + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(result[0], expected) + + +@skip_pyarrow +def test_iterator_stop_on_chunksize(all_parsers): + # gh-3967: stopping iteration when chunksize is specified + parser = all_parsers + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + + with parser.read_csv(StringIO(data), chunksize=1) as reader: + result = list(reader) + + assert len(result) == 3 + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(concat(result), expected) + + +@skip_pyarrow +@pytest.mark.parametrize( + "kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}] +) +def test_iterator_skipfooter_errors(all_parsers, kwargs): + msg = "'skipfooter' not supported for iteration" + parser = all_parsers + data = "a\n1\n2" + + with pytest.raises(ValueError, match=msg): + with parser.read_csv(StringIO(data), 
skipfooter=1, **kwargs) as _: + pass + + +def test_iteration_open_handle(all_parsers): + parser = all_parsers + kwargs = {"squeeze": True, "header": None} + + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG") + + with open(path) as f: + for line in f: + if "CCC" in line: + break + + result = parser.read_csv(f, **kwargs) + expected = Series(["DDD", "EEE", "FFF", "GGG"], name=0) + tm.assert_series_equal(result, expected) From dc9530baa77b9a5e6d5e30f6f366d4794b271824 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:27:14 -0500 Subject: [PATCH 88/95] TST/REF: io/parsers/test_common.py --- pandas/tests/io/parser/common/test_index.py | 292 ++++++++++++++++++++ 1 file changed, 292 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_index.py diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py new file mode 100644 index 0000000000000..fd999fcdabac3 --- /dev/null +++ b/pandas/tests/io/parser/common/test_index.py @@ -0,0 +1,292 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +from datetime import datetime +from io import StringIO +import os + +import pytest + +from pandas import DataFrame, Index, MultiIndex +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + """foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""", + {"index_col": 0, "names": ["index", "A", "B", "C", "D"]}, + DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"), + columns=["A", "B", "C", "D"], + ), + ), + ( + """foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""", + {"index_col": [0, 1], "names": ["index1", "index2", "A", "B", "C", "D"]}, + DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + index=MultiIndex.from_tuples( + [ + ("foo", "one"), + ("foo", "two"), + ("foo", "three"), + ("bar", "one"), + ("bar", "two"), + ], + names=["index1", "index2"], + ), + columns=["A", "B", "C", "D"], + ), + ), + ], +) +def test_pass_names_with_index(all_parsers, data, kwargs, expected): + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) +def test_multi_index_no_level_names(all_parsers, index_col): + data = """index1,index2,A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + headless_data = "\n".join(data.split("\n")[1:]) + + names = ["A", "B", "C", "D"] + parser = all_parsers + + result = parser.read_csv( + StringIO(headless_data), index_col=index_col, header=None, names=names + ) + expected = parser.read_csv(StringIO(data), index_col=index_col) + + # No index names 
in headless data. + expected.index.names = [None] * 2 + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_multi_index_no_level_names_implicit(all_parsers): + parser = all_parsers + data = """A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=MultiIndex.from_tuples( + [ + ("foo", "one"), + ("foo", "two"), + ("foo", "three"), + ("bar", "one"), + ("bar", "two"), + ] + ), + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize( + "data,expected,header", + [ + ("a,b", DataFrame(columns=["a", "b"]), [0]), + ( + "a,b\nc,d", + DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])), + [0, 1], + ), + ], +) +@pytest.mark.parametrize("round_trip", [True, False]) +def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): + # see gh-14545 + parser = all_parsers + data = expected.to_csv(index=False) if round_trip else data + + result = parser.read_csv(StringIO(data), header=header) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_no_unnamed_index(all_parsers): + parser = all_parsers + data = """ id c0 c1 c2 +0 1 0 a b +1 2 0 c d +2 2 2 e f +""" + result = parser.read_csv(StringIO(data), sep=" ") + expected = DataFrame( + [[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]], + columns=["Unnamed: 0", "id", "c0", "c1", "c2"], + ) + tm.assert_frame_equal(result, expected) + + +def test_read_duplicate_index_explicit(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=0) + + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + 
[12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"), + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_read_duplicate_index_implicit(all_parsers): + data = """A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]), + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_read_csv_no_index_name(all_parsers, csv_dir_path): + parser = all_parsers + csv2 = os.path.join(csv_dir_path, "test2.csv") + result = parser.read_csv(csv2, index_col=0, parse_dates=True) + + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738, "foo"], + [1.047916, -0.041232, -0.16181208307, 0.212549, "bar"], + [0.498581, 0.731168, -0.537677223318, 1.346270, "baz"], + [1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"], + [-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"], + ], + columns=["A", "B", "C", "D", "E"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + ] + ), + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_empty_with_index(all_parsers): + # see gh-10184 + data = "x,y" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=0) + + expected = DataFrame(columns=["y"], index=Index([], name="x")) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_empty_with_multi_index(all_parsers): + # see gh-10467 + data = "x,y,z" + parser = all_parsers + result = parser.read_csv(StringIO(data), 
index_col=["x", "y"]) + + expected = DataFrame( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"]) + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_empty_with_reversed_multi_index(all_parsers): + data = "x,y,z" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=[1, 0]) + + expected = DataFrame( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"]) + ) + tm.assert_frame_equal(result, expected) From d83b2e0bbf341f79d741b41aa6f7853c3bb78f8d Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:28:54 -0500 Subject: [PATCH 89/95] TST/REF: io/parsers/test_common.py --- .../tests/io/parser/common/test_data_list.py | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_data_list.py diff --git a/pandas/tests/io/parser/common/test_data_list.py b/pandas/tests/io/parser/common/test_data_list.py new file mode 100644 index 0000000000000..d67f728ad87e5 --- /dev/null +++ b/pandas/tests/io/parser/common/test_data_list.py @@ -0,0 +1,87 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +import csv +from io import StringIO + +import pytest + +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.parsers import TextParser + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +def test_read_data_list(all_parsers): + parser = all_parsers + kwargs = {"index_col": 0} + data = "A,B,C\nfoo,1,2,3\nbar,4,5,6" + + data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]] + expected = parser.read_csv(StringIO(data), **kwargs) + + with TextParser(data_list, chunksize=2, **kwargs) as parser: + result = parser.read() + + tm.assert_frame_equal(result, expected) + + +def test_reader_list(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = {"index_col": 0} + + lines = list(csv.reader(StringIO(data))) + with TextParser(lines, chunksize=2, **kwargs) as reader: + chunks = list(reader) + + expected = parser.read_csv(StringIO(data), **kwargs) + + tm.assert_frame_equal(chunks[0], expected[:2]) + tm.assert_frame_equal(chunks[1], expected[2:4]) + tm.assert_frame_equal(chunks[2], expected[4:]) + + +def test_reader_list_skiprows(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = {"index_col": 0} + + lines = list(csv.reader(StringIO(data))) + with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader: + chunks = list(reader) + + expected = parser.read_csv(StringIO(data), **kwargs) + + tm.assert_frame_equal(chunks[0], expected[1:3]) + + +def test_read_csv_parse_simple_list(all_parsers): + parser = all_parsers + data = """foo +bar baz +qux foo +foo +bar""" + + result = parser.read_csv(StringIO(data), header=None) + expected = DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"]) + tm.assert_frame_equal(result, expected) From 
6205bedf0e4a4d9397ae781244043e58f87edfd9 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:30:45 -0500 Subject: [PATCH 90/95] TST/REF: io/parsers/test_common.py --- pandas/tests/io/parser/common/test_float.py | 69 +++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_float.py diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py new file mode 100644 index 0000000000000..c9dcc5189de06 --- /dev/null +++ b/pandas/tests/io/parser/common/test_float.py @@ -0,0 +1,69 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas.compat import is_platform_linux + +from pandas import DataFrame +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +def test_float_parser(all_parsers): + # see gh-9565 + parser = all_parsers + data = "45e-1,4.5,45.,inf,-inf" + result = parser.read_csv(StringIO(data), header=None) + + expected = DataFrame([[float(s) for s in data.split(",")]]) + tm.assert_frame_equal(result, expected) + + +def test_scientific_no_exponent(all_parsers_all_precisions): + # see gh-12215 + df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) + data = df.to_csv(index=False) + parser, precision = all_parsers_all_precisions + if parser == "pyarrow": + pytest.skip() + + df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) + tm.assert_frame_equal(df_roundtrip, df) + + +@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) +def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): + # GH#38753 + parser, precision = all_parsers_all_precisions + if parser == "pyarrow": + pytest.skip() + data = f"data\n10E{neg_exp}" + result = parser.read_csv(StringIO(data), 
float_precision=precision) + expected = DataFrame({"data": [0.0]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) +def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): + # GH#38753 + parser, precision = all_parsers_all_precisions + data = f"data\n10E{exp}" + result = parser.read_csv(StringIO(data), float_precision=precision) + if precision == "round_trip": + if exp == 999999999999999999 and is_platform_linux(): + mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") + request.node.add_marker(mark) + + value = np.inf if exp > 0 else 0.0 + expected = DataFrame({"data": [value]}) + else: + expected = DataFrame({"data": [f"10E{exp}"]}) + + tm.assert_frame_equal(result, expected) From c4b3bb72fa34c80012fe05089625cabef2b433a0 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:32:15 -0500 Subject: [PATCH 91/95] TST/REF: io/parsers/test_common.py --- pandas/tests/io/parser/common/test_inf.py | 65 +++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_inf.py diff --git a/pandas/tests/io/parser/common/test_inf.py b/pandas/tests/io/parser/common/test_inf.py new file mode 100644 index 0000000000000..9bc93171f9307 --- /dev/null +++ b/pandas/tests/io/parser/common/test_inf.py @@ -0,0 +1,65 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +from io import StringIO + +import numpy as np +import pytest + +from pandas import DataFrame, option_context +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +@pytest.mark.parametrize("na_filter", [True, False]) +def test_inf_parsing(all_parsers, na_filter): + parser = all_parsers + data = """\ +,A +a,inf +b,-inf +c,+Inf +d,-Inf +e,INF +f,-INF +g,+INf +h,-INf +i,inF +j,-inF""" + expected = DataFrame( + {"A": [float("inf"), float("-inf")] * 5}, + index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], + ) + result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("na_filter", [True, False]) +def test_infinity_parsing(all_parsers, na_filter): + parser = all_parsers + data = """\ +,A +a,Infinity +b,-Infinity +c,+Infinity +""" + expected = DataFrame( + {"A": [float("infinity"), float("-infinity"), float("+infinity")]}, + index=["a", "b", "c"], + ) + result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_with_use_inf_as_na(all_parsers): + # https://github.com/pandas-dev/pandas/issues/35493 + parser = all_parsers + data = "1.0\nNaN\n3.0" + with option_context("use_inf_as_na", True): + result = parser.read_csv(StringIO(data), header=None) + expected = DataFrame([1.0, np.nan, 3.0]) + tm.assert_frame_equal(result, expected) From a77b33eb6afd4f9e4c4fdc272ffe59869fe2c873 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:34:53 -0500 Subject: [PATCH 92/95] TST/REF: io/parsers/test_common.py --- pandas/tests/io/parser/common/test_verbose.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_verbose.py diff --git a/pandas/tests/io/parser/common/test_verbose.py b/pandas/tests/io/parser/common/test_verbose.py new file mode 100644 index 
0000000000000..e085d230d1acd --- /dev/null +++ b/pandas/tests/io/parser/common/test_verbose.py @@ -0,0 +1,57 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. +""" +from io import StringIO + +import pytest + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +def test_verbose_read(all_parsers, capsys): + parser = all_parsers + data = """a,b,c,d +one,1,2,3 +one,1,2,3 +,1,2,3 +one,1,2,3 +,1,2,3 +,1,2,3 +one,1,2,3 +two,1,2,3""" + + # Engines are verbose in different ways. + parser.read_csv(StringIO(data), verbose=True) + captured = capsys.readouterr() + + if parser.engine == "c": + assert "Tokenization took:" in captured.out + assert "Parser memory cleanup took:" in captured.out + else: # Python engine + assert captured.out == "Filled 3 NA values in column a\n" + + +@skip_pyarrow +def test_verbose_read2(all_parsers, capsys): + parser = all_parsers + data = """a,b,c,d +one,1,2,3 +two,1,2,3 +three,1,2,3 +four,1,2,3 +five,1,2,3 +,1,2,3 +seven,1,2,3 +eight,1,2,3""" + + parser.read_csv(StringIO(data), verbose=True, index_col=0) + captured = capsys.readouterr() + + # Engines are verbose in different ways. 
+ if parser.engine == "c": + assert "Tokenization took:" in captured.out + assert "Parser memory cleanup took:" in captured.out + else: # Python engine + assert captured.out == "Filled 1 NA values in column a\n" From 04c8d218a86880ca90ecee8b16158e865a8750ca Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:40:03 -0500 Subject: [PATCH 93/95] TST/REF: io/parsers/test_common.py --- .../io/parser/common/test_file_buffer_url.py | 452 ++++++++++++++++++ 1 file changed, 452 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_file_buffer_url.py diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py new file mode 100644 index 0000000000000..0a5bc4a135b9e --- /dev/null +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -0,0 +1,452 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. +""" +from io import BytesIO, StringIO +import os +import platform +from urllib.error import URLError + +import pytest + +from pandas.errors import EmptyDataError, ParserError +import pandas.util._test_decorators as td + +from pandas import DataFrame +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +@tm.network +def test_url(all_parsers, csv_dir_path): + # TODO: FTP testing + parser = all_parsers + kwargs = {"sep": "\t"} + + url = ( + "https://raw.github.com/pandas-dev/pandas/master/" + "pandas/tests/io/parser/data/salaries.csv" + ) + url_result = parser.read_csv(url, **kwargs) + + local_path = os.path.join(csv_dir_path, "salaries.csv") + local_result = parser.read_csv(local_path, **kwargs) + tm.assert_frame_equal(url_result, local_result) + + +@skip_pyarrow +@pytest.mark.slow +def test_local_file(all_parsers, csv_dir_path): + parser = all_parsers + kwargs = {"sep": "\t"} + + local_path = os.path.join(csv_dir_path, "salaries.csv") + local_result = 
parser.read_csv(local_path, **kwargs) + url = "file://localhost/" + local_path + + try: + url_result = parser.read_csv(url, **kwargs) + tm.assert_frame_equal(url_result, local_result) + except URLError: + # Fails on some systems. + pytest.skip("Failing on: " + " ".join(platform.uname())) + + +@skip_pyarrow +def test_path_path_lib(all_parsers): + parser = all_parsers + df = tm.makeDataFrame() + result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) + tm.assert_frame_equal(df, result) + + +@skip_pyarrow +def test_path_local_path(all_parsers): + parser = all_parsers + df = tm.makeDataFrame() + result = tm.round_trip_localpath( + df.to_csv, lambda p: parser.read_csv(p, index_col=0) + ) + tm.assert_frame_equal(df, result) + + +@skip_pyarrow +def test_nonexistent_path(all_parsers): + # gh-2428: pls no segfault + # gh-14086: raise more helpful FileNotFoundError + # GH#29233 "File foo" instead of "File b'foo'" + parser = all_parsers + path = f"{tm.rands(10)}.csv" + + msg = r"\[Errno 2\]" + with pytest.raises(FileNotFoundError, match=msg) as e: + parser.read_csv(path) + assert path == e.value.filename + + +@skip_pyarrow +@td.skip_if_windows # os.chmod does not work in windows +def test_no_permission(all_parsers): + # GH 23784 + parser = all_parsers + + msg = r"\[Errno 13\]" + with tm.ensure_clean() as path: + os.chmod(path, 0) # make file unreadable + + # verify that this process cannot open the file (not running as sudo) + try: + with open(path): + pass + pytest.skip("Running as sudo.") + except PermissionError: + pass + + with pytest.raises(PermissionError, match=msg) as e: + parser.read_csv(path) + assert path == e.value.filename + + +@skip_pyarrow +@pytest.mark.parametrize( + "data,kwargs,expected,msg", + [ + # gh-10728: WHITESPACE_LINE + ( + "a,b,c\n4,5,6\n ", + {}, + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # gh-10548: EAT_LINE_COMMENT + ( + "a,b,c\n4,5,6\n#comment", + {"comment": "#"}, + DataFrame([[4, 5, 6]], 
columns=["a", "b", "c"]), + None, + ), + # EAT_CRNL_NOP + ( + "a,b,c\n4,5,6\n\r", + {}, + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # EAT_COMMENT + ( + "a,b,c\n4,5,6#comment", + {"comment": "#"}, + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # SKIP_LINE + ( + "a,b,c\n4,5,6\nskipme", + {"skiprows": [2]}, + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # EAT_LINE_COMMENT + ( + "a,b,c\n4,5,6\n#comment", + {"comment": "#", "skip_blank_lines": False}, + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # IN_FIELD + ( + "a,b,c\n4,5,6\n ", + {"skip_blank_lines": False}, + DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]), + None, + ), + # EAT_CRNL + ( + "a,b,c\n4,5,6\n\r", + {"skip_blank_lines": False}, + DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]), + None, + ), + # ESCAPED_CHAR + ( + "a,b,c\n4,5,6\n\\", + {"escapechar": "\\"}, + None, + "(EOF following escape character)|(unexpected end of data)", + ), + # ESCAPE_IN_QUOTED_FIELD + ( + 'a,b,c\n4,5,6\n"\\', + {"escapechar": "\\"}, + None, + "(EOF inside string starting at row 2)|(unexpected end of data)", + ), + # IN_QUOTED_FIELD + ( + 'a,b,c\n4,5,6\n"', + {"escapechar": "\\"}, + None, + "(EOF inside string starting at row 2)|(unexpected end of data)", + ), + ], + ids=[ + "whitespace-line", + "eat-line-comment", + "eat-crnl-nop", + "eat-comment", + "skip-line", + "eat-line-comment", + "in-field", + "eat-crnl", + "escaped-char", + "escape-in-quoted-field", + "in-quoted-field", + ], +) +def test_eof_states(all_parsers, data, kwargs, expected, msg): + # see gh-10728, gh-10548 + parser = all_parsers + + if expected is None: + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_temporary_file(all_parsers): + # see gh-13398 + parser = all_parsers + 
data = "0 0" + + with tm.ensure_clean(mode="w+", return_filelike=True) as new_file: + new_file.write(data) + new_file.flush() + new_file.seek(0) + + result = parser.read_csv(new_file, sep=r"\s+", header=None) + + expected = DataFrame([[0, 0]]) + tm.assert_frame_equal(result, expected) + + +def test_internal_eof_byte(all_parsers): + # see gh-5500 + parser = all_parsers + data = "a,b\n1\x1a,2" + + expected = DataFrame([["1\x1a", 2]], columns=["a", "b"]) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +def test_internal_eof_byte_to_file(all_parsers): + # see gh-16559 + parser = all_parsers + data = b'c1,c2\r\n"test \x1a test", test\r\n' + expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"]) + path = f"__{tm.rands(10)}__.csv" + + with tm.ensure_clean(path) as path: + with open(path, "wb") as f: + f.write(data) + + result = parser.read_csv(path) + tm.assert_frame_equal(result, expected) + + +def test_file_handle_string_io(all_parsers): + # gh-14418 + # + # Don't close user provided file handles. + parser = all_parsers + data = "a,b\n1,2" + + fh = StringIO(data) + parser.read_csv(fh) + assert not fh.closed + + +def test_file_handles_with_open(all_parsers, csv1): + # gh-14418 + # + # Don't close user provided file handles. 
+ parser = all_parsers + + for mode in ["r", "rb"]: + with open(csv1, mode) as f: + parser.read_csv(f) + assert not f.closed + + +@skip_pyarrow +def test_invalid_file_buffer_class(all_parsers): + # see gh-15337 + class InvalidBuffer: + pass + + parser = all_parsers + msg = "Invalid file path or buffer object type" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(InvalidBuffer()) + + +@skip_pyarrow +def test_invalid_file_buffer_mock(all_parsers): + # see gh-15337 + parser = all_parsers + msg = "Invalid file path or buffer object type" + + class Foo: + pass + + with pytest.raises(ValueError, match=msg): + parser.read_csv(Foo()) + + +def test_valid_file_buffer_seems_invalid(all_parsers): + # gh-16135: we want to ensure that "tell" and "seek" + # aren't actually being used when we call `read_csv` + # + # Thus, while the object may look "invalid" (these + # methods are attributes of the `StringIO` class), + # it is still a valid file-object for our purposes. + class NoSeekTellBuffer(StringIO): + def tell(self): + raise AttributeError("No tell method") + + def seek(self, pos, whence=0): + raise AttributeError("No seek method") + + data = "a\n1" + parser = all_parsers + expected = DataFrame({"a": [1]}) + + result = parser.read_csv(NoSeekTellBuffer(data)) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("io_class", [StringIO, BytesIO]) +@pytest.mark.parametrize("encoding", [None, "utf-8"]) +def test_read_csv_file_handle(all_parsers, io_class, encoding): + """ + Test whether read_csv does not close user-provided file handles. 
+ + GH 36980 + """ + parser = all_parsers + expected = DataFrame({"a": [1], "b": [2]}) + + content = "a,b\n1,2" + if io_class == BytesIO: + content = content.encode("utf-8") + handle = io_class(content) + + tm.assert_frame_equal(parser.read_csv(handle, encoding=encoding), expected) + assert not handle.closed + + +@skip_pyarrow +def test_memory_map_file_handle_silent_fallback(all_parsers, compression): + """ + Do not fail for buffers with memory_map=True (cannot memory map BytesIO). + + GH 37621 + """ + parser = all_parsers + expected = DataFrame({"a": [1], "b": [2]}) + + handle = BytesIO() + expected.to_csv(handle, index=False, compression=compression, mode="wb") + handle.seek(0) + + tm.assert_frame_equal( + parser.read_csv(handle, memory_map=True, compression=compression), + expected, + ) + + +@skip_pyarrow +def test_memory_map_compression(all_parsers, compression): + """ + Support memory map for compressed files. + + GH 37621 + """ + parser = all_parsers + expected = DataFrame({"a": [1], "b": [2]}) + + with tm.ensure_clean() as path: + expected.to_csv(path, index=False, compression=compression) + + tm.assert_frame_equal( + parser.read_csv(path, memory_map=True, compression=compression), + expected, + ) + + +@skip_pyarrow +def test_context_manager(all_parsers, datapath): + # make sure that opened files are closed + parser = all_parsers + + path = datapath("io", "data", "csv", "iris.csv") + + reader = parser.read_csv(path, chunksize=1) + assert not reader._engine.handles.handle.closed + try: + with reader: + next(reader) + assert False + except AssertionError: + assert reader._engine.handles.handle.closed + + +@skip_pyarrow +def test_context_manageri_user_provided(all_parsers, datapath): + # make sure that user-provided handles are not closed + parser = all_parsers + + with open(datapath("io", "data", "csv", "iris.csv"), mode="r") as path: + + reader = parser.read_csv(path, chunksize=1) + assert not reader._engine.handles.handle.closed + try: + with reader: + 
next(reader) + assert False + except AssertionError: + assert not reader._engine.handles.handle.closed + + +@skip_pyarrow +def test_file_descriptor_leak(all_parsers): + # GH 31488 + + parser = all_parsers + with tm.ensure_clean() as path: + + def test(): + with pytest.raises(EmptyDataError, match="No columns to parse from file"): + parser.read_csv(path) + + td.check_file_leaks(test)() + + +@skip_pyarrow +@td.check_file_leaks +def test_memory_map(all_parsers, csv_dir_path): + mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") + parser = all_parsers + + expected = DataFrame( + {"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]} + ) + + result = parser.read_csv(mmap_file, memory_map=True) + tm.assert_frame_equal(result, expected) From 8bb69591dfc3ce64f8bc5e2ad95d8be081e3fcb1 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:42:46 -0500 Subject: [PATCH 94/95] TST/REF: io/parsers/test_common.py --- .../io/parser/common/test_read_errors.py | 223 ++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 pandas/tests/io/parser/common/test_read_errors.py diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py new file mode 100644 index 0000000000000..f68239bf5d48e --- /dev/null +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -0,0 +1,223 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. +""" +import codecs +from io import StringIO +import os + +import numpy as np +import pytest + +from pandas.errors import EmptyDataError, ParserError + +from pandas import DataFrame +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow +def test_empty_decimal_marker(all_parsers): + data = """A|B|C +1|2,334|5 +10|13|10. 
+""" + # Parsers support only length-1 decimals + msg = "Only length-1 decimal markers supported" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), decimal="") + + +@skip_pyarrow +def test_bad_stream_exception(all_parsers, csv_dir_path): + # see gh-13652 + # + # This test validates that both the Python engine and C engine will + # raise UnicodeDecodeError instead of C engine raising ParserError + # and swallowing the exception that caused read to fail. + path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv") + codec = codecs.lookup("utf-8") + utf8 = codecs.lookup("utf-8") + parser = all_parsers + msg = "'utf-8' codec can't decode byte" + + # Stream must be binary UTF8. + with open(path, "rb") as handle, codecs.StreamRecoder( + handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter + ) as stream: + + with pytest.raises(UnicodeDecodeError, match=msg): + parser.read_csv(stream) + + +@skip_pyarrow +def test_malformed(all_parsers): + # see gh-6607 + parser = all_parsers + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +""" + msg = "Expected 3 fields in line 4, saw 5" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), header=1, comment="#") + + +@skip_pyarrow +@pytest.mark.parametrize("nrows", [5, 3, None]) +def test_malformed_chunks(all_parsers, nrows): + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + parser = all_parsers + msg = "Expected 3 fields in line 6, saw 5" + with parser.read_csv( + StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2] + ) as reader: + with pytest.raises(ParserError, match=msg): + reader.read(nrows) + + +@skip_pyarrow +def test_catch_too_many_names(all_parsers): + # see gh-5156 + data = """\ +1,2,3 +4,,6 +7,8,9 +10,11,12\n""" + parser = all_parsers + msg = ( + "Too many columns specified: expected 4 and found 3" + if parser.engine == "c" + else "Number of passed names did 
not match " + "number of header fields in the file" + ) + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) + + +@skip_pyarrow +@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) +def test_raise_on_no_columns(all_parsers, nrows): + parser = all_parsers + data = "\n" * nrows + + msg = "No columns to parse from file" + with pytest.raises(EmptyDataError, match=msg): + parser.read_csv(StringIO(data)) + + +def test_read_csv_raises_on_header_prefix(all_parsers): + # gh-27394 + parser = all_parsers + msg = "Argument prefix must be None if argument header is not None" + + s = StringIO("0,1\n2,3") + + with pytest.raises(ValueError, match=msg): + parser.read_csv(s, header=0, prefix="_X") + + +def test_unexpected_keyword_parameter_exception(all_parsers): + # GH-34976 + parser = all_parsers + + msg = "{}\\(\\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg.format("read_csv")): + parser.read_csv("foo.csv", foo=1) + with pytest.raises(TypeError, match=msg.format("read_table")): + parser.read_table("foo.tsv", foo=1) + + +@skip_pyarrow +def test_suppress_error_output(all_parsers, capsys): + # see gh-15925 + parser = all_parsers + data = "a\n1\n1,2,3\n4\n5,6,7" + expected = DataFrame({"a": [1, 4]}) + + result = parser.read_csv( + StringIO(data), error_bad_lines=False, warn_bad_lines=False + ) + tm.assert_frame_equal(result, expected) + + captured = capsys.readouterr() + assert captured.err == "" + + +@skip_pyarrow +@pytest.mark.parametrize( + "kwargs", + [{}, {"error_bad_lines": True}], # Default is True. # Explicitly pass in. 
+) +@pytest.mark.parametrize( + "warn_kwargs", [{}, {"warn_bad_lines": True}, {"warn_bad_lines": False}] +) +def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): + # see gh-15925 + parser = all_parsers + kwargs.update(**warn_kwargs) + data = "a\n1\n1,2,3\n4\n5,6,7" + + msg = "Expected 1 fields in line 3, saw 3" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + + +@skip_pyarrow +def test_warn_bad_lines(all_parsers, capsys): + # see gh-15925 + parser = all_parsers + data = "a\n1\n1,2,3\n4\n5,6,7" + expected = DataFrame({"a": [1, 4]}) + + result = parser.read_csv(StringIO(data), error_bad_lines=False, warn_bad_lines=True) + tm.assert_frame_equal(result, expected) + + captured = capsys.readouterr() + assert "Skipping line 3" in captured.err + assert "Skipping line 5" in captured.err + + +@skip_pyarrow +def test_read_csv_wrong_num_columns(all_parsers): + # Too few columns. + data = """A,B,C,D,E,F +1,2,3,4,5,6 +6,7,8,9,10,11,12 +11,12,13,14,15,16 +""" + parser = all_parsers + msg = "Expected 6 fields in line 3, saw 7" + + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data)) + + +@skip_pyarrow +def test_null_byte_char(all_parsers): + # see gh-2741 + data = "\x00,foo" + names = ["a", "b"] + parser = all_parsers + + if parser.engine == "c": + expected = DataFrame([[np.nan, "foo"]], columns=names) + out = parser.read_csv(StringIO(data), names=names) + tm.assert_frame_equal(out, expected) + else: + msg = "NULL byte detected" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), names=names) From d9478d6487167aee6033fa2b8918906b8ab47a70 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 2 Jan 2021 02:43:57 -0500 Subject: [PATCH 95/95] TST/REF: remove test_common.py --- pandas/tests/io/parser/test_common.py | 2466 ------------------------- 1 file changed, 2466 deletions(-) delete mode 100644 pandas/tests/io/parser/test_common.py diff --git 
a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py deleted file mode 100644 index f06d1476c515a..0000000000000 --- a/pandas/tests/io/parser/test_common.py +++ /dev/null @@ -1,2466 +0,0 @@ -""" -Tests that work on both the Python and C engines but do not have a -specific classification into the other test modules. -""" -import codecs -import csv -from datetime import datetime -from inspect import signature -from io import BytesIO, StringIO -import os -import platform -from urllib.error import URLError - -import numpy as np -import pytest - -from pandas._libs.tslib import Timestamp -from pandas.compat import is_platform_linux -from pandas.errors import DtypeWarning, EmptyDataError, ParserError -import pandas.util._test_decorators as td - -from pandas import DataFrame, Index, MultiIndex, Series, compat, concat, option_context -import pandas._testing as tm - -from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser - -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") - - -def test_override_set_noconvert_columns(): - # see gh-17351 - # - # Usecols needs to be sorted in _set_noconvert_columns based - # on the test_usecols_with_parse_dates test from test_usecols.py - class MyTextFileReader(TextFileReader): - def __init__(self): - self._currow = 0 - self.squeeze = False - - class MyCParserWrapper(CParserWrapper): - def _set_noconvert_columns(self): - if self.usecols_dtype == "integer": - # self.usecols is a set, which is documented as unordered - # but in practice, a CPython set of integers is sorted. - # In other implementations this assumption does not hold. 
- # The following code simulates a different order, which - # before GH 17351 would cause the wrong columns to be - # converted via the parse_dates parameter - self.usecols = list(self.usecols) - self.usecols.reverse() - return CParserWrapper._set_noconvert_columns(self) - - data = """a,b,c,d,e -0,1,20140101,0900,4 -0,1,20140102,1000,4""" - - parse_dates = [[1, 2]] - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - - parser = MyTextFileReader() - parser.options = { - "usecols": [0, 2, 3], - "parse_dates": parse_dates, - "delimiter": ",", - } - parser.engine = "c" - parser._engine = MyCParserWrapper(StringIO(data), **parser.options) - - result = parser.read() - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_empty_decimal_marker(all_parsers): - data = """A|B|C -1|2,334|5 -10|13|10. -""" - # Parsers support only length-1 decimals - msg = "Only length-1 decimal markers supported" - parser = all_parsers - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), decimal="") - - -@skip_pyarrow -def test_bad_stream_exception(all_parsers, csv_dir_path): - # see gh-13652 - # - # This test validates that both the Python engine and C engine will - # raise UnicodeDecodeError instead of C engine raising ParserError - # and swallowing the exception that caused read to fail. - path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv") - codec = codecs.lookup("utf-8") - utf8 = codecs.lookup("utf-8") - parser = all_parsers - msg = "'utf-8' codec can't decode byte" - - # Stream must be binary UTF8. 
- with open(path, "rb") as handle, codecs.StreamRecoder( - handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter - ) as stream: - - with pytest.raises(UnicodeDecodeError, match=msg): - parser.read_csv(stream) - - -@skip_pyarrow -def test_read_csv_local(all_parsers, csv1): - prefix = "file:///" if compat.is_platform_windows() else "file://" - parser = all_parsers - - fname = prefix + str(os.path.abspath(csv1)) - result = parser.read_csv(fname, index_col=0, parse_dates=True) - - expected = DataFrame( - [ - [0.980269, 3.685731, -0.364216805298, -1.159738], - [1.047916, -0.041232, -0.16181208307, 0.212549], - [0.498581, 0.731168, -0.537677223318, 1.346270], - [1.120202, 1.567621, 0.00364077397681, 0.675253], - [-0.487094, 0.571455, -1.6116394093, 0.103469], - [0.836649, 0.246462, 0.588542635376, 1.062782], - [-0.157161, 1.340307, 1.1957779562, -1.097007], - ], - columns=["A", "B", "C", "D"], - index=Index( - [ - datetime(2000, 1, 3), - datetime(2000, 1, 4), - datetime(2000, 1, 5), - datetime(2000, 1, 6), - datetime(2000, 1, 7), - datetime(2000, 1, 10), - datetime(2000, 1, 11), - ], - name="index", - ), - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_1000_sep(all_parsers): - parser = all_parsers - data = """A|B|C -1|2,334|5 -10|13|10. -""" - expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]}) - - result = parser.read_csv(StringIO(data), sep="|", thousands=",") - tm.assert_frame_equal(result, expected) - - -def test_squeeze(all_parsers): - data = """\ -a,1 -b,2 -c,3 -""" - parser = all_parsers - index = Index(["a", "b", "c"], name=0) - expected = Series([1, 2, 3], name=1, index=index) - - result = parser.read_csv(StringIO(data), index_col=0, header=None, squeeze=True) - tm.assert_series_equal(result, expected) - - # see gh-8217 - # - # Series should not be a view. 
- assert not result._is_view - - -@skip_pyarrow -def test_malformed(all_parsers): - # see gh-6607 - parser = all_parsers - data = """ignore -A,B,C -1,2,3 # comment -1,2,3,4,5 -2,3,4 -""" - msg = "Expected 3 fields in line 4, saw 5" - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), header=1, comment="#") - - -@skip_pyarrow -@pytest.mark.parametrize("nrows", [5, 3, None]) -def test_malformed_chunks(all_parsers, nrows): - data = """ignore -A,B,C -skip -1,2,3 -3,5,10 # comment -1,2,3,4,5 -2,3,4 -""" - parser = all_parsers - msg = "Expected 3 fields in line 6, saw 5" - with parser.read_csv( - StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2] - ) as reader: - with pytest.raises(ParserError, match=msg): - reader.read(nrows) - - -@skip_pyarrow -def test_unnamed_columns(all_parsers): - data = """A,B,C,, -1,2,3,4,5 -6,7,8,9,10 -11,12,13,14,15 -""" - parser = all_parsers - expected = DataFrame( - [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], - dtype=np.int64, - columns=["A", "B", "C", "Unnamed: 3", "Unnamed: 4"], - ) - result = parser.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - -def test_csv_mixed_type(all_parsers): - data = """A,B,C -a,1,2 -b,3,4 -c,4,5 -""" - parser = all_parsers - expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}) - result = parser.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_read_csv_low_memory_no_rows_with_index(all_parsers): - # see gh-21141 - parser = all_parsers - - if not parser.low_memory: - pytest.skip("This is a low-memory specific test") - - data = """A,B,C -1,1,1,2 -2,2,3,4 -3,3,4,5 -""" - result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0) - expected = DataFrame(columns=["A", "B", "C"]) - tm.assert_frame_equal(result, expected) - - -def test_read_csv_dataframe(all_parsers, csv1): - parser = all_parsers - result = parser.read_csv(csv1, 
index_col=0, parse_dates=True) - - expected = DataFrame( - [ - [0.980269, 3.685731, -0.364216805298, -1.159738], - [1.047916, -0.041232, -0.16181208307, 0.212549], - [0.498581, 0.731168, -0.537677223318, 1.346270], - [1.120202, 1.567621, 0.00364077397681, 0.675253], - [-0.487094, 0.571455, -1.6116394093, 0.103469], - [0.836649, 0.246462, 0.588542635376, 1.062782], - [-0.157161, 1.340307, 1.1957779562, -1.097007], - ], - columns=["A", "B", "C", "D"], - index=Index( - [ - datetime(2000, 1, 3), - datetime(2000, 1, 4), - datetime(2000, 1, 5), - datetime(2000, 1, 6), - datetime(2000, 1, 7), - datetime(2000, 1, 10), - datetime(2000, 1, 11), - ], - name="index", - ), - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_read_csv_no_index_name(all_parsers, csv_dir_path): - parser = all_parsers - csv2 = os.path.join(csv_dir_path, "test2.csv") - result = parser.read_csv(csv2, index_col=0, parse_dates=True) - - expected = DataFrame( - [ - [0.980269, 3.685731, -0.364216805298, -1.159738, "foo"], - [1.047916, -0.041232, -0.16181208307, 0.212549, "bar"], - [0.498581, 0.731168, -0.537677223318, 1.346270, "baz"], - [1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"], - [-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"], - ], - columns=["A", "B", "C", "D", "E"], - index=Index( - [ - datetime(2000, 1, 3), - datetime(2000, 1, 4), - datetime(2000, 1, 5), - datetime(2000, 1, 6), - datetime(2000, 1, 7), - ] - ), - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_read_csv_wrong_num_columns(all_parsers): - # Too few columns. 
- data = """A,B,C,D,E,F -1,2,3,4,5,6 -6,7,8,9,10,11,12 -11,12,13,14,15,16 -""" - parser = all_parsers - msg = "Expected 6 fields in line 3, saw 7" - - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data)) - - -def test_read_duplicate_index_explicit(all_parsers): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo,12,13,14,15 -bar,12,13,14,15 -""" - parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=0) - - expected = DataFrame( - [ - [2, 3, 4, 5], - [7, 8, 9, 10], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - ], - columns=["A", "B", "C", "D"], - index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"), - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_read_duplicate_index_implicit(all_parsers): - data = """A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo,12,13,14,15 -bar,12,13,14,15 -""" - parser = all_parsers - result = parser.read_csv(StringIO(data)) - - expected = DataFrame( - [ - [2, 3, 4, 5], - [7, 8, 9, 10], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - ], - columns=["A", "B", "C", "D"], - index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]), - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - ( - "A,B\nTrue,1\nFalse,2\nTrue,3", - {}, - DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), - ), - ( - "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3", - {"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]}, - DataFrame( - [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]], - columns=["A", "B"], - ), - ), - ( - "A,B\nTRUE,1\nFALSE,2\nTRUE,3", - {}, - DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), - ), - ( - "A,B\nfoo,bar\nbar,foo", - {"true_values": ["foo"], "false_values": ["bar"]}, - DataFrame([[True, False], [False, True]], 
columns=["A", "B"]), - ), - ], -) -def test_parse_bool(all_parsers, data, kwargs, expected): - parser = all_parsers - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) - - -def test_int_conversion(all_parsers): - data = """A,B -1.0,1 -2.0,2 -3.0,3 -""" - parser = all_parsers - result = parser.read_csv(StringIO(data)) - - expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("nrows", [3, 3.0]) -def test_read_nrows(all_parsers, nrows): - # see gh-10476 - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - expected = DataFrame( - [["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]], - columns=["index", "A", "B", "C", "D"], - ) - parser = all_parsers - - result = parser.read_csv(StringIO(data), nrows=nrows) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("nrows", [1.2, "foo", -1]) -def test_read_nrows_bad(all_parsers, nrows): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - msg = r"'nrows' must be an integer >=0" - parser = all_parsers - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), nrows=nrows) - - -@skip_pyarrow -@pytest.mark.parametrize("index_col", [0, "index"]) -def test_read_chunksize_with_index(all_parsers, index_col): - parser = all_parsers - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - - expected = DataFrame( - [ - ["foo", 2, 3, 4, 5], - ["bar", 7, 8, 9, 10], - ["baz", 12, 13, 14, 15], - ["qux", 12, 13, 14, 15], - ["foo2", 12, 13, 14, 15], - ["bar2", 12, 13, 14, 15], - ], - columns=["index", "A", "B", "C", "D"], - ) - expected = expected.set_index("index") - - with 
parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader: - chunks = list(reader) - tm.assert_frame_equal(chunks[0], expected[:2]) - tm.assert_frame_equal(chunks[1], expected[2:4]) - tm.assert_frame_equal(chunks[2], expected[4:]) - - -@skip_pyarrow -@pytest.mark.parametrize("chunksize", [1.3, "foo", 0]) -def test_read_chunksize_bad(all_parsers, chunksize): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - parser = all_parsers - msg = r"'chunksize' must be an integer >=1" - - with pytest.raises(ValueError, match=msg): - with parser.read_csv(StringIO(data), chunksize=chunksize) as _: - pass - - -@skip_pyarrow -@pytest.mark.parametrize("chunksize", [2, 8]) -def test_read_chunksize_and_nrows(all_parsers, chunksize): - # see gh-15755 - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - parser = all_parsers - kwargs = {"index_col": 0, "nrows": 5} - - expected = parser.read_csv(StringIO(data), **kwargs) - with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader: - tm.assert_frame_equal(concat(reader), expected) - - -@skip_pyarrow -def test_read_chunksize_and_nrows_changing_size(all_parsers): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - parser = all_parsers - kwargs = {"index_col": 0, "nrows": 5} - - expected = parser.read_csv(StringIO(data), **kwargs) - with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader: - tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2]) - tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5]) - - with pytest.raises(StopIteration, match=""): - reader.get_chunk(size=3) - - -@skip_pyarrow -def test_get_chunk_passed_chunksize(all_parsers): - parser = all_parsers - data = """A,B,C -1,2,3 -4,5,6 -7,8,9 -1,2,3""" - - with 
parser.read_csv(StringIO(data), chunksize=2) as reader: - result = reader.get_chunk() - - expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}]) -def test_read_chunksize_compat(all_parsers, kwargs): - # see gh-12185 - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - parser = all_parsers - result = parser.read_csv(StringIO(data), **kwargs) - with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader: - tm.assert_frame_equal(concat(reader), result) - - -@skip_pyarrow -def test_read_chunksize_jagged_names(all_parsers): - # see gh-23509 - parser = all_parsers - data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) - - expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10]) - with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader: - result = concat(reader) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_read_data_list(all_parsers): - parser = all_parsers - kwargs = {"index_col": 0} - data = "A,B,C\nfoo,1,2,3\nbar,4,5,6" - - data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]] - expected = parser.read_csv(StringIO(data), **kwargs) - - with TextParser(data_list, chunksize=2, **kwargs) as parser: - result = parser.read() - - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_iterator(all_parsers): - # see gh-6607 - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - parser = all_parsers - kwargs = {"index_col": 0} - - expected = parser.read_csv(StringIO(data), **kwargs) - with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader: - - first_chunk = reader.read(3) - tm.assert_frame_equal(first_chunk, expected[:3]) - - last_chunk = reader.read(5) - 
tm.assert_frame_equal(last_chunk, expected[3:]) - - -@skip_pyarrow -def test_iterator2(all_parsers): - parser = all_parsers - data = """A,B,C -foo,1,2,3 -bar,4,5,6 -baz,7,8,9 -""" - - with parser.read_csv(StringIO(data), iterator=True) as reader: - result = list(reader) - - expected = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=["foo", "bar", "baz"], - columns=["A", "B", "C"], - ) - tm.assert_frame_equal(result[0], expected) - - -def test_reader_list(all_parsers): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - parser = all_parsers - kwargs = {"index_col": 0} - - lines = list(csv.reader(StringIO(data))) - with TextParser(lines, chunksize=2, **kwargs) as reader: - chunks = list(reader) - - expected = parser.read_csv(StringIO(data), **kwargs) - - tm.assert_frame_equal(chunks[0], expected[:2]) - tm.assert_frame_equal(chunks[1], expected[2:4]) - tm.assert_frame_equal(chunks[2], expected[4:]) - - -def test_reader_list_skiprows(all_parsers): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - parser = all_parsers - kwargs = {"index_col": 0} - - lines = list(csv.reader(StringIO(data))) - with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader: - chunks = list(reader) - - expected = parser.read_csv(StringIO(data), **kwargs) - - tm.assert_frame_equal(chunks[0], expected[1:3]) - - -@skip_pyarrow -def test_iterator_stop_on_chunksize(all_parsers): - # gh-3967: stopping iteration when chunksize is specified - parser = all_parsers - data = """A,B,C -foo,1,2,3 -bar,4,5,6 -baz,7,8,9 -""" - - with parser.read_csv(StringIO(data), chunksize=1) as reader: - result = list(reader) - - assert len(result) == 3 - expected = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=["foo", "bar", "baz"], - columns=["A", "B", "C"], - ) - tm.assert_frame_equal(concat(result), expected) - - -@skip_pyarrow 
-@pytest.mark.parametrize( - "kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}] -) -def test_iterator_skipfooter_errors(all_parsers, kwargs): - msg = "'skipfooter' not supported for iteration" - parser = all_parsers - data = "a\n1\n2" - - with pytest.raises(ValueError, match=msg): - with parser.read_csv(StringIO(data), skipfooter=1, **kwargs) as _: - pass - - -def test_nrows_skipfooter_errors(all_parsers): - msg = "'skipfooter' not supported with 'nrows'" - data = "a\n1\n2\n3\n4\n5\n6" - parser = all_parsers - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), skipfooter=1, nrows=5) - - -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - ( - """foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""", - {"index_col": 0, "names": ["index", "A", "B", "C", "D"]}, - DataFrame( - [ - [2, 3, 4, 5], - [7, 8, 9, 10], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - ], - index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"), - columns=["A", "B", "C", "D"], - ), - ), - ( - """foo,one,2,3,4,5 -foo,two,7,8,9,10 -foo,three,12,13,14,15 -bar,one,12,13,14,15 -bar,two,12,13,14,15 -""", - {"index_col": [0, 1], "names": ["index1", "index2", "A", "B", "C", "D"]}, - DataFrame( - [ - [2, 3, 4, 5], - [7, 8, 9, 10], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - ], - index=MultiIndex.from_tuples( - [ - ("foo", "one"), - ("foo", "two"), - ("foo", "three"), - ("bar", "one"), - ("bar", "two"), - ], - names=["index1", "index2"], - ), - columns=["A", "B", "C", "D"], - ), - ), - ], -) -def test_pass_names_with_index(all_parsers, data, kwargs, expected): - parser = all_parsers - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) -def test_multi_index_no_level_names(all_parsers, index_col): - data = 
"""index1,index2,A,B,C,D -foo,one,2,3,4,5 -foo,two,7,8,9,10 -foo,three,12,13,14,15 -bar,one,12,13,14,15 -bar,two,12,13,14,15 -""" - headless_data = "\n".join(data.split("\n")[1:]) - - names = ["A", "B", "C", "D"] - parser = all_parsers - - result = parser.read_csv( - StringIO(headless_data), index_col=index_col, header=None, names=names - ) - expected = parser.read_csv(StringIO(data), index_col=index_col) - - # No index names in headless data. - expected.index.names = [None] * 2 - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_multi_index_no_level_names_implicit(all_parsers): - parser = all_parsers - data = """A,B,C,D -foo,one,2,3,4,5 -foo,two,7,8,9,10 -foo,three,12,13,14,15 -bar,one,12,13,14,15 -bar,two,12,13,14,15 -""" - - result = parser.read_csv(StringIO(data)) - expected = DataFrame( - [ - [2, 3, 4, 5], - [7, 8, 9, 10], - [12, 13, 14, 15], - [12, 13, 14, 15], - [12, 13, 14, 15], - ], - columns=["A", "B", "C", "D"], - index=MultiIndex.from_tuples( - [ - ("foo", "one"), - ("foo", "two"), - ("foo", "three"), - ("bar", "one"), - ("bar", "two"), - ] - ), - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "data,expected,header", - [ - ("a,b", DataFrame(columns=["a", "b"]), [0]), - ( - "a,b\nc,d", - DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])), - [0, 1], - ), - ], -) -@pytest.mark.parametrize("round_trip", [True, False]) -def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): - # see gh-14545 - parser = all_parsers - data = expected.to_csv(index=False) if round_trip else data - - result = parser.read_csv(StringIO(data), header=header) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_no_unnamed_index(all_parsers): - parser = all_parsers - data = """ id c0 c1 c2 -0 1 0 a b -1 2 0 c d -2 2 2 e f -""" - result = parser.read_csv(StringIO(data), sep=" ") - expected = DataFrame( - [[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", 
"f"]], - columns=["Unnamed: 0", "id", "c0", "c1", "c2"], - ) - tm.assert_frame_equal(result, expected) - - -def test_read_csv_parse_simple_list(all_parsers): - parser = all_parsers - data = """foo -bar baz -qux foo -foo -bar""" - - result = parser.read_csv(StringIO(data), header=None) - expected = DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@tm.network -def test_url(all_parsers, csv_dir_path): - # TODO: FTP testing - parser = all_parsers - kwargs = {"sep": "\t"} - - url = ( - "https://raw.github.com/pandas-dev/pandas/master/" - "pandas/tests/io/parser/data/salaries.csv" - ) - url_result = parser.read_csv(url, **kwargs) - - local_path = os.path.join(csv_dir_path, "salaries.csv") - local_result = parser.read_csv(local_path, **kwargs) - tm.assert_frame_equal(url_result, local_result) - - -@skip_pyarrow -@pytest.mark.slow -def test_local_file(all_parsers, csv_dir_path): - parser = all_parsers - kwargs = {"sep": "\t"} - - local_path = os.path.join(csv_dir_path, "salaries.csv") - local_result = parser.read_csv(local_path, **kwargs) - url = "file://localhost/" + local_path - - try: - url_result = parser.read_csv(url, **kwargs) - tm.assert_frame_equal(url_result, local_result) - except URLError: - # Fails on some systems. 
- pytest.skip("Failing on: " + " ".join(platform.uname())) - - -@skip_pyarrow -def test_path_path_lib(all_parsers): - parser = all_parsers - df = tm.makeDataFrame() - result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) - tm.assert_frame_equal(df, result) - - -@skip_pyarrow -def test_path_local_path(all_parsers): - parser = all_parsers - df = tm.makeDataFrame() - result = tm.round_trip_localpath( - df.to_csv, lambda p: parser.read_csv(p, index_col=0) - ) - tm.assert_frame_equal(df, result) - - -@skip_pyarrow -def test_nonexistent_path(all_parsers): - # gh-2428: pls no segfault - # gh-14086: raise more helpful FileNotFoundError - # GH#29233 "File foo" instead of "File b'foo'" - parser = all_parsers - path = f"{tm.rands(10)}.csv" - - msg = r"\[Errno 2\]" - with pytest.raises(FileNotFoundError, match=msg) as e: - parser.read_csv(path) - assert path == e.value.filename - - -@skip_pyarrow -@td.skip_if_windows # os.chmod does not work in windows -def test_no_permission(all_parsers): - # GH 23784 - parser = all_parsers - - msg = r"\[Errno 13\]" - with tm.ensure_clean() as path: - os.chmod(path, 0) # make file unreadable - - # verify that this process cannot open the file (not running as sudo) - try: - with open(path): - pass - pytest.skip("Running as sudo.") - except PermissionError: - pass - - with pytest.raises(PermissionError, match=msg) as e: - parser.read_csv(path) - assert path == e.value.filename - - -@skip_pyarrow -def test_missing_trailing_delimiters(all_parsers): - parser = all_parsers - data = """A,B,C,D -1,2,3,4 -1,3,3, -1,4,5""" - - result = parser.read_csv(StringIO(data)) - expected = DataFrame( - [[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]], - columns=["A", "B", "C", "D"], - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_skip_initial_space(all_parsers): - data = ( - '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' - "1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, " - 
"314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, " - "70.06056, 344.98370, 1, 1, -0.689265, -0.692787, " - "0.212036, 14.7674, 41.605, -9999.0, -9999.0, " - "-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128" - ) - parser = all_parsers - - result = parser.read_csv( - StringIO(data), - names=list(range(33)), - header=None, - na_values=["-9999.0"], - skipinitialspace=True, - ) - expected = DataFrame( - [ - [ - "09-Apr-2012", - "01:10:18.300", - 2456026.548822908, - 12849, - 1.00361, - 1.12551, - 330.65659, - 355626618.16711, - 73.48821, - 314.11625, - 1917.09447, - 179.71425, - 80.0, - 240.0, - -350, - 70.06056, - 344.9837, - 1, - 1, - -0.689265, - -0.692787, - 0.212036, - 14.7674, - 41.605, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - 0, - 12, - 128, - ] - ] - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_trailing_delimiters(all_parsers): - # see gh-2442 - data = """A,B,C -1,2,3, -4,5,6, -7,8,9,""" - parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=False) - - expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]}) - tm.assert_frame_equal(result, expected) - - -def test_escapechar(all_parsers): - # https://stackoverflow.com/questions/13824840/feature-request-for- - # pandas-read-csv - data = '''SEARCH_TERM,ACTUAL_URL -"bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" -"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" -"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa - - parser = all_parsers - result = parser.read_csv( - StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8" - ) - - assert result["SEARCH_TERM"][2] == 'SLAGBORD, 
"Bergslagen", IKEA:s 1700-tals series' - - tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) - - -def test_int64_min_issues(all_parsers): - # see gh-2599 - parser = all_parsers - data = "A,B\n0,0\n0," - result = parser.read_csv(StringIO(data)) - - expected = DataFrame({"A": [0, 0], "B": [0, np.nan]}) - tm.assert_frame_equal(result, expected) - - -def test_parse_integers_above_fp_precision(all_parsers): - data = """Numbers -17007000002000191 -17007000002000191 -17007000002000191 -17007000002000191 -17007000002000192 -17007000002000192 -17007000002000192 -17007000002000192 -17007000002000192 -17007000002000194""" - parser = all_parsers - result = parser.read_csv(StringIO(data)) - expected = DataFrame( - { - "Numbers": [ - 17007000002000191, - 17007000002000191, - 17007000002000191, - 17007000002000191, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000194, - ] - } - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.xfail(reason="GH38630, sometimes gives ResourceWarning", strict=False) -def test_chunks_have_consistent_numerical_type(all_parsers): - parser = all_parsers - integers = [str(i) for i in range(499999)] - data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers) - - # Coercions should work without warnings. - with tm.assert_produces_warning(None): - result = parser.read_csv(StringIO(data)) - - assert type(result.a[0]) is np.float64 - assert result.a.dtype == float - - -def test_warn_if_chunks_have_mismatched_type(all_parsers): - warning_type = None - parser = all_parsers - integers = [str(i) for i in range(499999)] - data = "a\n" + "\n".join(integers + ["a", "b"] + integers) - - # see gh-3866: if chunks are different types and can't - # be coerced using numerical types, then issue warning. 
- if parser.engine == "c" and parser.low_memory: - warning_type = DtypeWarning - - with tm.assert_produces_warning(warning_type): - df = parser.read_csv(StringIO(data)) - assert df.a.dtype == object - - -@skip_pyarrow -@pytest.mark.parametrize("sep", [" ", r"\s+"]) -def test_integer_overflow_bug(all_parsers, sep): - # see gh-2601 - data = "65248E10 11\n55555E55 22\n" - parser = all_parsers - - result = parser.read_csv(StringIO(data), header=None, sep=sep) - expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_catch_too_many_names(all_parsers): - # see gh-5156 - data = """\ -1,2,3 -4,,6 -7,8,9 -10,11,12\n""" - parser = all_parsers - msg = ( - "Too many columns specified: expected 4 and found 3" - if parser.engine == "c" - else "Number of passed names did not match " - "number of header fields in the file" - ) - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) - - -@skip_pyarrow -def test_ignore_leading_whitespace(all_parsers): - # see gh-3374, gh-6607 - parser = all_parsers - data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9" - result = parser.read_csv(StringIO(data), sep=r"\s+") - - expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]}) - tm.assert_frame_equal(result, expected) - - -def test_chunk_begins_with_newline_whitespace(all_parsers): - # see gh-10022 - parser = all_parsers - data = "\n hello\nworld\n" - - result = parser.read_csv(StringIO(data), header=None) - expected = DataFrame([" hello", "world"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_empty_with_index(all_parsers): - # see gh-10184 - data = "x,y" - parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=0) - - expected = DataFrame(columns=["y"], index=Index([], name="x")) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_empty_with_multi_index(all_parsers): - # see gh-10467 - data = 
"x,y,z" - parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=["x", "y"]) - - expected = DataFrame( - columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"]) - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_empty_with_reversed_multi_index(all_parsers): - data = "x,y,z" - parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=[1, 0]) - - expected = DataFrame( - columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"]) - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_float_parser(all_parsers): - # see gh-9565 - parser = all_parsers - data = "45e-1,4.5,45.,inf,-inf" - result = parser.read_csv(StringIO(data), header=None) - - expected = DataFrame([[float(s) for s in data.split(",")]]) - tm.assert_frame_equal(result, expected) - - -def test_scientific_no_exponent(all_parsers_all_precisions): - # see gh-12215 - df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) - data = df.to_csv(index=False) - parser, precision = all_parsers_all_precisions - if parser == "pyarrow": - pytest.skip() - - df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) - tm.assert_frame_equal(df_roundtrip, df) - - -@skip_pyarrow -@pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) -def test_int64_overflow(all_parsers, conv): - data = """ID -00013007854817840016671868 -00013007854817840016749251 -00013007854817840016754630 -00013007854817840016781876 -00013007854817840017028824 -00013007854817840017963235 -00013007854817840018860166""" - parser = all_parsers - - if conv is None: - # 13007854817840016671868 > UINT64_MAX, so this - # will overflow and return object as the dtype. 
- result = parser.read_csv(StringIO(data)) - expected = DataFrame( - [ - "00013007854817840016671868", - "00013007854817840016749251", - "00013007854817840016754630", - "00013007854817840016781876", - "00013007854817840017028824", - "00013007854817840017963235", - "00013007854817840018860166", - ], - columns=["ID"], - ) - tm.assert_frame_equal(result, expected) - else: - # 13007854817840016671868 > UINT64_MAX, so attempts - # to cast to either int64 or uint64 will result in - # an OverflowError being raised. - msg = ( - "(Python int too large to convert to C long)|" - "(long too big to convert)|" - "(int too big to convert)" - ) - - with pytest.raises(OverflowError, match=msg): - parser.read_csv(StringIO(data), converters={"ID": conv}) - - -@skip_pyarrow -@pytest.mark.parametrize( - "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] -) -def test_int64_uint64_range(all_parsers, val): - # These numbers fall right inside the int64-uint64 - # range, so they should be parsed as string. - parser = all_parsers - result = parser.read_csv(StringIO(str(val)), header=None) - - expected = DataFrame([val]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] -) -def test_outside_int64_uint64_range(all_parsers, val): - # These numbers fall just outside the int64-uint64 - # range, so they should be parsed as string. - parser = all_parsers - result = parser.read_csv(StringIO(str(val)), header=None) - - expected = DataFrame([str(val)]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("exp_data", [[str(-1), str(2 ** 63)], [str(2 ** 63), str(-1)]]) -def test_numeric_range_too_wide(all_parsers, exp_data): - # No numerical dtype can hold both negative and uint64 - # values, so they should be cast as string. 
- parser = all_parsers - data = "\n".join(exp_data) - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), header=None) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) -def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): - # GH#38753 - parser, precision = all_parsers_all_precisions - if parser == "pyarrow": - pytest.skip() - data = f"data\n10E{neg_exp}" - result = parser.read_csv(StringIO(data), float_precision=precision) - expected = DataFrame({"data": [0.0]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) -def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): - # GH#38753 - parser, precision = all_parsers_all_precisions - data = f"data\n10E{exp}" - result = parser.read_csv(StringIO(data), float_precision=precision) - if precision == "round_trip": - if exp == 999999999999999999 and is_platform_linux(): - mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") - request.node.add_marker(mark) - - value = np.inf if exp > 0 else 0.0 - expected = DataFrame({"data": [value]}) - else: - expected = DataFrame({"data": [f"10E{exp}"]}) - - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("iterator", [True, False]) -def test_empty_with_nrows_chunksize(all_parsers, iterator): - # see gh-9535 - parser = all_parsers - expected = DataFrame(columns=["foo", "bar"]) - - nrows = 10 - data = StringIO("foo,bar\n") - - if iterator: - with parser.read_csv(data, chunksize=nrows) as reader: - result = next(iter(reader)) - else: - result = parser.read_csv(data, nrows=nrows) - - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "data,kwargs,expected,msg", - [ - # gh-10728: WHITESPACE_LINE - ( - "a,b,c\n4,5,6\n ", - {}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # gh-10548: 
EAT_LINE_COMMENT - ( - "a,b,c\n4,5,6\n#comment", - {"comment": "#"}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # EAT_CRNL_NOP - ( - "a,b,c\n4,5,6\n\r", - {}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # EAT_COMMENT - ( - "a,b,c\n4,5,6#comment", - {"comment": "#"}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # SKIP_LINE - ( - "a,b,c\n4,5,6\nskipme", - {"skiprows": [2]}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # EAT_LINE_COMMENT - ( - "a,b,c\n4,5,6\n#comment", - {"comment": "#", "skip_blank_lines": False}, - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), - None, - ), - # IN_FIELD - ( - "a,b,c\n4,5,6\n ", - {"skip_blank_lines": False}, - DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]), - None, - ), - # EAT_CRNL - ( - "a,b,c\n4,5,6\n\r", - {"skip_blank_lines": False}, - DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]), - None, - ), - # ESCAPED_CHAR - ( - "a,b,c\n4,5,6\n\\", - {"escapechar": "\\"}, - None, - "(EOF following escape character)|(unexpected end of data)", - ), - # ESCAPE_IN_QUOTED_FIELD - ( - 'a,b,c\n4,5,6\n"\\', - {"escapechar": "\\"}, - None, - "(EOF inside string starting at row 2)|(unexpected end of data)", - ), - # IN_QUOTED_FIELD - ( - 'a,b,c\n4,5,6\n"', - {"escapechar": "\\"}, - None, - "(EOF inside string starting at row 2)|(unexpected end of data)", - ), - ], - ids=[ - "whitespace-line", - "eat-line-comment", - "eat-crnl-nop", - "eat-comment", - "skip-line", - "eat-line-comment", - "in-field", - "eat-crnl", - "escaped-char", - "escape-in-quoted-field", - "in-quoted-field", - ], -) -def test_eof_states(all_parsers, data, kwargs, expected, msg): - # see gh-10728, gh-10548 - parser = all_parsers - - if expected is None: - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - else: - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) - - 
-@skip_pyarrow -@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) -def test_uneven_lines_with_usecols(all_parsers, usecols): - # see gh-12203 - parser = all_parsers - data = r"""a,b,c -0,1,2 -3,4,5,6,7 -8,9,10""" - - if usecols is None: - # Make sure that an error is still raised - # when the "usecols" parameter is not provided. - msg = r"Expected \d+ fields in line \d+, saw \d+" - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data)) - else: - expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]}) - - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - # First, check to see that the response of parser when faced with no - # provided columns raises the correct error, with or without usecols. - ("", {}, None), - ("", {"usecols": ["X"]}, None), - ( - ",,", - {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]}, - DataFrame(columns=["X"], index=[0], dtype=np.float64), - ), - ( - "", - {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]}, - DataFrame(columns=["X"]), - ), - ], -) -def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): - # see gh-12493 - parser = all_parsers - - if expected is None: - msg = "No columns to parse from file" - with pytest.raises(EmptyDataError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - else: - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "kwargs,expected", - [ - # gh-8661, gh-8679: this should ignore six lines, including - # lines with trailing whitespace and blank lines. - ( - { - "header": None, - "delim_whitespace": True, - "skiprows": [0, 1, 2, 3, 5, 6], - "skip_blank_lines": True, - }, - DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]), - ), - # gh-8983: test skipping set of rows after a row with trailing spaces. 
- ( - { - "delim_whitespace": True, - "skiprows": [1, 2, 3, 5, 6], - "skip_blank_lines": True, - }, - DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}), - ), - ], -) -def test_trailing_spaces(all_parsers, kwargs, expected): - data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa - parser = all_parsers - - result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) - tm.assert_frame_equal(result, expected) - - -def test_raise_on_sep_with_delim_whitespace(all_parsers): - # see gh-6607 - data = "a b c\n1 2 3" - parser = all_parsers - - with pytest.raises(ValueError, match="you can only specify one"): - parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) - - -@skip_pyarrow -@pytest.mark.parametrize("delim_whitespace", [True, False]) -def test_single_char_leading_whitespace(all_parsers, delim_whitespace): - # see gh-9710 - parser = all_parsers - data = """\ -MyColumn -a -b -a -b\n""" - - expected = DataFrame({"MyColumn": list("abab")}) - result = parser.read_csv( - StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "sep,skip_blank_lines,exp_data", - [ - (",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]), - (r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]), - ( - ",", - False, - [ - [1.0, 2.0, 4.0], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [5.0, np.nan, 10.0], - [np.nan, np.nan, np.nan], - [-70.0, 0.4, 1.0], - ], - ), - ], -) -def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): - parser = all_parsers - data = """\ -A,B,C -1,2.,4. 
- - -5.,NaN,10.0 - --70,.4,1 -""" - - if sep == r"\s+": - data = data.replace(",", " ") - - result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines) - expected = DataFrame(exp_data, columns=["A", "B", "C"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_whitespace_lines(all_parsers): - parser = all_parsers - data = """ - -\t \t\t -\t -A,B,C -\t 1,2.,4. -5.,NaN,10.0 -""" - expected = DataFrame([[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]) - result = parser.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "data,expected", - [ - ( - """ A B C D -a 1 2 3 4 -b 1 2 3 4 -c 1 2 3 4 -""", - DataFrame( - [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], - columns=["A", "B", "C", "D"], - index=["a", "b", "c"], - ), - ), - ( - " a b c\n1 2 3 \n4 5 6\n 7 8 9", - DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]), - ), - ], -) -def test_whitespace_regex_separator(all_parsers, data, expected): - # see gh-6607 - parser = all_parsers - result = parser.read_csv(StringIO(data), sep=r"\s+") - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_verbose_read(all_parsers, capsys): - parser = all_parsers - data = """a,b,c,d -one,1,2,3 -one,1,2,3 -,1,2,3 -one,1,2,3 -,1,2,3 -,1,2,3 -one,1,2,3 -two,1,2,3""" - - # Engines are verbose in different ways. 
- parser.read_csv(StringIO(data), verbose=True) - captured = capsys.readouterr() - - if parser.engine == "c": - assert "Tokenization took:" in captured.out - assert "Parser memory cleanup took:" in captured.out - else: # Python engine - assert captured.out == "Filled 3 NA values in column a\n" - - -@skip_pyarrow -def test_verbose_read2(all_parsers, capsys): - parser = all_parsers - data = """a,b,c,d -one,1,2,3 -two,1,2,3 -three,1,2,3 -four,1,2,3 -five,1,2,3 -,1,2,3 -seven,1,2,3 -eight,1,2,3""" - - parser.read_csv(StringIO(data), verbose=True, index_col=0) - captured = capsys.readouterr() - - # Engines are verbose in different ways. - if parser.engine == "c": - assert "Tokenization took:" in captured.out - assert "Parser memory cleanup took:" in captured.out - else: # Python engine - assert captured.out == "Filled 1 NA values in column a\n" - - -def test_iteration_open_handle(all_parsers): - parser = all_parsers - kwargs = {"squeeze": True, "header": None} - - with tm.ensure_clean() as path: - with open(path, "w") as f: - f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG") - - with open(path) as f: - for line in f: - if "CCC" in line: - break - - result = parser.read_csv(f, **kwargs) - expected = Series(["DDD", "EEE", "FFF", "GGG"], name=0) - tm.assert_series_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "data,thousands,decimal", - [ - ( - """A|B|C -1|2,334.01|5 -10|13|10. 
-""", - ",", - ".", - ), - ( - """A|B|C -1|2.334,01|5 -10|13|10, -""", - ".", - ",", - ), - ], -) -def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): - parser = all_parsers - expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) - - result = parser.read_csv( - StringIO(data), sep="|", thousands=thousands, decimal=decimal - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_euro_decimal_format(all_parsers): - parser = all_parsers - data = """Id;Number1;Number2;Text1;Text2;Number3 -1;1521,1541;187101,9543;ABC;poi;4,738797819 -2;121,12;14897,76;DEF;uyt;0,377320872 -3;878,158;108013,434;GHI;rez;2,735694704""" - - result = parser.read_csv(StringIO(data), sep=";", decimal=",") - expected = DataFrame( - [ - [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819], - [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872], - [3, 878.158, 108013.434, "GHI", "rez", 2.735694704], - ], - columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"], - ) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("na_filter", [True, False]) -def test_inf_parsing(all_parsers, na_filter): - parser = all_parsers - data = """\ -,A -a,inf -b,-inf -c,+Inf -d,-Inf -e,INF -f,-INF -g,+INf -h,-INf -i,inF -j,-inF""" - expected = DataFrame( - {"A": [float("inf"), float("-inf")] * 5}, - index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], - ) - result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize("na_filter", [True, False]) -def test_infinity_parsing(all_parsers, na_filter): - parser = all_parsers - data = """\ -,A -a,Infinity -b,-Infinity -c,+Infinity -""" - expected = DataFrame( - {"A": [float("infinity"), float("-infinity"), float("+infinity")]}, - index=["a", "b", "c"], - ) - result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) - tm.assert_frame_equal(result, expected) - 
- -@skip_pyarrow -@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) -def test_raise_on_no_columns(all_parsers, nrows): - parser = all_parsers - data = "\n" * nrows - - msg = "No columns to parse from file" - with pytest.raises(EmptyDataError, match=msg): - parser.read_csv(StringIO(data)) - - -@skip_pyarrow -@td.check_file_leaks -def test_memory_map(all_parsers, csv_dir_path): - mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") - parser = all_parsers - - expected = DataFrame( - {"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]} - ) - - result = parser.read_csv(mmap_file, memory_map=True) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_null_byte_char(all_parsers): - # see gh-2741 - data = "\x00,foo" - names = ["a", "b"] - parser = all_parsers - - if parser.engine == "c": - expected = DataFrame([[np.nan, "foo"]], columns=names) - out = parser.read_csv(StringIO(data), names=names) - tm.assert_frame_equal(out, expected) - else: - msg = "NULL byte detected" - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), names=names) - - -@skip_pyarrow -def test_temporary_file(all_parsers): - # see gh-13398 - parser = all_parsers - data = "0 0" - - with tm.ensure_clean(mode="w+", return_filelike=True) as new_file: - new_file.write(data) - new_file.flush() - new_file.seek(0) - - result = parser.read_csv(new_file, sep=r"\s+", header=None) - - expected = DataFrame([[0, 0]]) - tm.assert_frame_equal(result, expected) - - -def test_internal_eof_byte(all_parsers): - # see gh-5500 - parser = all_parsers - data = "a,b\n1\x1a,2" - - expected = DataFrame([["1\x1a", 2]], columns=["a", "b"]) - result = parser.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - -def test_internal_eof_byte_to_file(all_parsers): - # see gh-16559 - parser = all_parsers - data = b'c1,c2\r\n"test \x1a test", test\r\n' - expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"]) - path = 
f"__{tm.rands(10)}__.csv" - - with tm.ensure_clean(path) as path: - with open(path, "wb") as f: - f.write(data) - - result = parser.read_csv(path) - tm.assert_frame_equal(result, expected) - - -def test_sub_character(all_parsers, csv_dir_path): - # see gh-16893 - filename = os.path.join(csv_dir_path, "sub_char.csv") - expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"]) - - parser = all_parsers - result = parser.read_csv(filename) - tm.assert_frame_equal(result, expected) - - -def test_file_handle_string_io(all_parsers): - # gh-14418 - # - # Don't close user provided file handles. - parser = all_parsers - data = "a,b\n1,2" - - fh = StringIO(data) - parser.read_csv(fh) - assert not fh.closed - - -def test_file_handles_with_open(all_parsers, csv1): - # gh-14418 - # - # Don't close user provided file handles. - parser = all_parsers - - for mode in ["r", "rb"]: - with open(csv1, mode) as f: - parser.read_csv(f) - assert not f.closed - - -@skip_pyarrow -def test_invalid_file_buffer_class(all_parsers): - # see gh-15337 - class InvalidBuffer: - pass - - parser = all_parsers - msg = "Invalid file path or buffer object type" - - with pytest.raises(ValueError, match=msg): - parser.read_csv(InvalidBuffer()) - - -@skip_pyarrow -def test_invalid_file_buffer_mock(all_parsers): - # see gh-15337 - parser = all_parsers - msg = "Invalid file path or buffer object type" - - class Foo: - pass - - with pytest.raises(ValueError, match=msg): - parser.read_csv(Foo()) - - -def test_valid_file_buffer_seems_invalid(all_parsers): - # gh-16135: we want to ensure that "tell" and "seek" - # aren't actually being used when we call `read_csv` - # - # Thus, while the object may look "invalid" (these - # methods are attributes of the `StringIO` class), - # it is still a valid file-object for our purposes. 
- class NoSeekTellBuffer(StringIO): - def tell(self): - raise AttributeError("No tell method") - - def seek(self, pos, whence=0): - raise AttributeError("No seek method") - - data = "a\n1" - parser = all_parsers - expected = DataFrame({"a": [1]}) - - result = parser.read_csv(NoSeekTellBuffer(data)) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -@pytest.mark.parametrize( - "kwargs", - [{}, {"error_bad_lines": True}], # Default is True. # Explicitly pass in. -) -@pytest.mark.parametrize( - "warn_kwargs", [{}, {"warn_bad_lines": True}, {"warn_bad_lines": False}] -) -def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): - # see gh-15925 - parser = all_parsers - kwargs.update(**warn_kwargs) - data = "a\n1\n1,2,3\n4\n5,6,7" - - msg = "Expected 1 fields in line 3, saw 3" - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - - -@skip_pyarrow -def test_warn_bad_lines(all_parsers, capsys): - # see gh-15925 - parser = all_parsers - data = "a\n1\n1,2,3\n4\n5,6,7" - expected = DataFrame({"a": [1, 4]}) - - result = parser.read_csv(StringIO(data), error_bad_lines=False, warn_bad_lines=True) - tm.assert_frame_equal(result, expected) - - captured = capsys.readouterr() - assert "Skipping line 3" in captured.err - assert "Skipping line 5" in captured.err - - -@skip_pyarrow -def test_suppress_error_output(all_parsers, capsys): - # see gh-15925 - parser = all_parsers - data = "a\n1\n1,2,3\n4\n5,6,7" - expected = DataFrame({"a": [1, 4]}) - - result = parser.read_csv( - StringIO(data), error_bad_lines=False, warn_bad_lines=False - ) - tm.assert_frame_equal(result, expected) - - captured = capsys.readouterr() - assert captured.err == "" - - -@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"]) -def test_filename_with_special_chars(all_parsers, filename): - # see gh-15086. 
- parser = all_parsers - df = DataFrame({"a": [1, 2, 3]}) - - with tm.ensure_clean(filename) as path: - df.to_csv(path, index=False) - - result = parser.read_csv(path) - tm.assert_frame_equal(result, df) - - -@skip_pyarrow -def test_read_csv_memory_growth_chunksize(all_parsers): - # see gh-24805 - # - # Let's just make sure that we don't crash - # as we iteratively process all chunks. - parser = all_parsers - - with tm.ensure_clean() as path: - with open(path, "w") as f: - for i in range(1000): - f.write(str(i) + "\n") - - with parser.read_csv(path, chunksize=20) as result: - for _ in result: - pass - - -def test_read_csv_raises_on_header_prefix(all_parsers): - # gh-27394 - parser = all_parsers - msg = "Argument prefix must be None if argument header is not None" - - s = StringIO("0,1\n2,3") - - with pytest.raises(ValueError, match=msg): - parser.read_csv(s, header=0, prefix="_X") - - -def test_unexpected_keyword_parameter_exception(all_parsers): - # GH-34976 - parser = all_parsers - - msg = "{}\\(\\) got an unexpected keyword argument 'foo'" - with pytest.raises(TypeError, match=msg.format("read_csv")): - parser.read_csv("foo.csv", foo=1) - with pytest.raises(TypeError, match=msg.format("read_table")): - parser.read_table("foo.tsv", foo=1) - - -def test_read_table_same_signature_as_read_csv(all_parsers): - # GH-34976 - parser = all_parsers - - table_sign = signature(parser.read_table) - csv_sign = signature(parser.read_csv) - - assert table_sign.parameters.keys() == csv_sign.parameters.keys() - assert table_sign.return_annotation == csv_sign.return_annotation - - for key, csv_param in csv_sign.parameters.items(): - table_param = table_sign.parameters[key] - if key == "sep": - assert csv_param.default == "," - assert table_param.default == "\t" - assert table_param.annotation == csv_param.annotation - assert table_param.kind == csv_param.kind - continue - else: - assert table_param == csv_param - - -def test_read_table_equivalency_to_read_csv(all_parsers): - # see 
gh-21948 - # As of 0.25.0, read_table is undeprecated - parser = all_parsers - data = "a\tb\n1\t2\n3\t4" - expected = parser.read_csv(StringIO(data), sep="\t") - result = parser.read_table(StringIO(data)) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_first_row_bom(all_parsers): - # see gh-26545 - parser = all_parsers - data = '''\ufeff"Head1" "Head2" "Head3"''' - - result = parser.read_csv(StringIO(data), delimiter="\t") - expected = DataFrame(columns=["Head1", "Head2", "Head3"]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_first_row_bom_unquoted(all_parsers): - # see gh-36343 - parser = all_parsers - data = """\ufeffHead1 Head2 Head3""" - - result = parser.read_csv(StringIO(data), delimiter="\t") - expected = DataFrame(columns=["Head1", "Head2", "Head3"]) - tm.assert_frame_equal(result, expected) - - -def test_integer_precision(all_parsers): - # Gh 7072 - s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765 -5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389""" - parser = all_parsers - result = parser.read_csv(StringIO(s), header=None)[4] - expected = Series([4321583677327450765, 4321113141090630389], name=4) - tm.assert_series_equal(result, expected) - - -@skip_pyarrow -def test_file_descriptor_leak(all_parsers): - # GH 31488 - - parser = all_parsers - with tm.ensure_clean() as path: - - def test(): - with pytest.raises(EmptyDataError, match="No columns to parse from file"): - parser.read_csv(path) - - td.check_file_leaks(test)() - - -@skip_pyarrow -@pytest.mark.parametrize("nrows", range(1, 6)) -def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): - # GH 28071 - ref = DataFrame( - [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]], - columns=list("ab"), - ) - csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4" - parser = all_parsers - df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False) - 
tm.assert_frame_equal(df, ref[:nrows]) - - -@skip_pyarrow -def test_no_header_two_extra_columns(all_parsers): - # GH 26218 - column_names = ["one", "two", "three"] - ref = DataFrame([["foo", "bar", "baz"]], columns=column_names) - stream = StringIO("foo,bar,baz,bam,blah") - parser = all_parsers - df = parser.read_csv(stream, header=None, names=column_names, index_col=False) - tm.assert_frame_equal(df, ref) - - -def test_read_csv_names_not_accepting_sets(all_parsers): - # GH 34946 - data = """\ - 1,2,3 - 4,5,6\n""" - parser = all_parsers - with pytest.raises(ValueError, match="Names should be an ordered collection."): - parser.read_csv(StringIO(data), names=set("QAZ")) - - -def test_read_csv_with_use_inf_as_na(all_parsers): - # https://github.com/pandas-dev/pandas/issues/35493 - parser = all_parsers - data = "1.0\nNaN\n3.0" - with option_context("use_inf_as_na", True): - result = parser.read_csv(StringIO(data), header=None) - expected = DataFrame([1.0, np.nan, 3.0]) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_read_table_delim_whitespace_default_sep(all_parsers): - # GH: 35958 - f = StringIO("a b c\n1 -2 -3\n4 5 6") - parser = all_parsers - result = parser.read_table(f, delim_whitespace=True) - expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("delimiter", [",", "\t"]) -def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter): - # GH: 35958 - f = StringIO("a b c\n1 -2 -3\n4 5 6") - parser = all_parsers - msg = ( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." 
- ) - with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, sep=delimiter) - - with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, delimiter=delimiter) - - -@pytest.mark.parametrize("delimiter", [",", "\t"]) -def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): - # GH: 35958 - f = StringIO("a b c\n1 -2 -3\n4 5 6") - parser = all_parsers - msg = ( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, sep=delimiter) - - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, delimiter=delimiter) - - -@skip_pyarrow -def test_dict_keys_as_names(all_parsers): - # GH: 36928 - data = "1,2" - - keys = {"a": int, "b": int}.keys() - parser = all_parsers - - result = parser.read_csv(StringIO(data), names=keys) - expected = DataFrame({"a": [1], "b": [2]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("io_class", [StringIO, BytesIO]) -@pytest.mark.parametrize("encoding", [None, "utf-8"]) -def test_read_csv_file_handle(all_parsers, io_class, encoding): - """ - Test whether read_csv does not close user-provided file handles. - - GH 36980 - """ - parser = all_parsers - expected = DataFrame({"a": [1], "b": [2]}) - - content = "a,b\n1,2" - if io_class == BytesIO: - content = content.encode("utf-8") - handle = io_class(content) - - tm.assert_frame_equal(parser.read_csv(handle, encoding=encoding), expected) - assert not handle.closed - - -@skip_pyarrow -def test_memory_map_file_handle_silent_fallback(all_parsers, compression): - """ - Do not fail for buffers with memory_map=True (cannot memory map BytesIO). 
- - GH 37621 - """ - parser = all_parsers - expected = DataFrame({"a": [1], "b": [2]}) - - handle = BytesIO() - expected.to_csv(handle, index=False, compression=compression, mode="wb") - handle.seek(0) - - tm.assert_frame_equal( - parser.read_csv(handle, memory_map=True, compression=compression), - expected, - ) - - -@skip_pyarrow -def test_memory_map_compression(all_parsers, compression): - """ - Support memory map for compressed files. - - GH 37621 - """ - parser = all_parsers - expected = DataFrame({"a": [1], "b": [2]}) - - with tm.ensure_clean() as path: - expected.to_csv(path, index=False, compression=compression) - - tm.assert_frame_equal( - parser.read_csv(path, memory_map=True, compression=compression), - expected, - ) - - -@skip_pyarrow -def test_context_manager(all_parsers, datapath): - # make sure that opened files are closed - parser = all_parsers - - path = datapath("io", "data", "csv", "iris.csv") - - reader = parser.read_csv(path, chunksize=1) - assert not reader._engine.handles.handle.closed - try: - with reader: - next(reader) - assert False - except AssertionError: - assert reader._engine.handles.handle.closed - - -@skip_pyarrow -def test_context_manageri_user_provided(all_parsers, datapath): - # make sure that user-provided handles are not closed - parser = all_parsers - - with open(datapath("io", "data", "csv", "iris.csv"), mode="r") as path: - - reader = parser.read_csv(path, chunksize=1) - assert not reader._engine.handles.handle.closed - try: - with reader: - next(reader) - assert False - except AssertionError: - assert not reader._engine.handles.handle.closed