diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index e53c828fe30cb..497cf261fcece 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -38,7 +38,7 @@
 )
 from pandas.io.json._normalize import convert_to_line_delimits
 from pandas.io.json._table_schema import build_table_schema, parse_table_schema
-from pandas.io.parsers import validate_integer
+from pandas.io.parsers.readers import validate_integer
 
 loads = json.loads
 dumps = json.dumps
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
deleted file mode 100644
index e58e59a722b7a..0000000000000
--- a/pandas/io/parsers.py
+++ /dev/null
@@ -1,3997 +0,0 @@
-"""
-Module contains tools for processing files into DataFrames or other objects
-"""
-
-from collections import abc, defaultdict
-import csv
-import datetime
-from io import StringIO
-import itertools
-import re
-import sys
-from textwrap import fill
-from typing import (
-    Any,
-    Dict,
-    Iterable,
-    Iterator,
-    List,
-    Optional,
-    Sequence,
-    Set,
-    Type,
-    cast,
-)
-import warnings
-
-import numpy as np
-
-import pandas._libs.lib as lib
-import pandas._libs.ops as libops
-import pandas._libs.parsers as parsers
-from pandas._libs.parsers import STR_NA_VALUES
-from pandas._libs.tslibs import parsing
-from pandas._typing import DtypeArg, FilePathOrBuffer, StorageOptions, Union
-from pandas.errors import (
-    AbstractMethodError,
-    EmptyDataError,
-    ParserError,
-    ParserWarning,
-)
-from pandas.util._decorators import Appender
-
-from pandas.core.dtypes.cast import astype_nansafe
-from pandas.core.dtypes.common import (
-    ensure_object,
-    ensure_str,
-    is_bool_dtype,
-    is_categorical_dtype,
-    is_dict_like,
-    is_dtype_equal,
-    is_extension_array_dtype,
-    is_file_like,
-    is_float,
-    is_integer,
-    is_integer_dtype,
-    is_list_like,
-    is_object_dtype,
-    is_scalar,
-    is_string_dtype,
-    pandas_dtype,
-)
-from pandas.core.dtypes.dtypes import CategoricalDtype
-from pandas.core.dtypes.missing import isna
-
-from pandas.core import algorithms, generic
-from pandas.core.arrays import Categorical
-from pandas.core.frame import DataFrame
-from pandas.core.indexes.api import (
-    Index,
-    MultiIndex,
-    RangeIndex,
-    ensure_index_from_sequences,
-)
-from pandas.core.series import Series
-from pandas.core.tools import datetimes as tools
-
-from pandas.io.common import IOHandles, get_handle, validate_header_arg
-from pandas.io.date_converters import generic_parser
-
-# BOM character (byte order mark)
-# This exists at the beginning of a file to indicate endianness
-# of a file (stream). Unfortunately, this marker screws up parsing,
-# so we need to remove it if we see it.
-_BOM = "\ufeff"
-
-_doc_read_csv_and_table = (
-    r"""
-{summary}
-
-Also supports optionally iterating or breaking of the file
-into chunks.
-
-Additional help can be found in the online docs for
-`IO Tools `_.
-
-Parameters
-----------
-filepath_or_buffer : str, path object or file-like object
-    Any valid string path is acceptable. The string could be a URL. Valid
-    URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
-    expected. A local file could be: file://localhost/path/to/table.csv.
-
-    If you want to pass in a path object, pandas accepts any ``os.PathLike``.
-
-    By file-like object, we refer to objects with a ``read()`` method, such as
-    a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
-sep : str, default {_default_sep}
-    Delimiter to use.
If sep is None, the C engine cannot automatically detect - the separator, but the Python parsing engine can, meaning the latter will - be used and automatically detect the separator by Python's builtin sniffer - tool, ``csv.Sniffer``. In addition, separators longer than 1 character and - different from ``'\s+'`` will be interpreted as regular expressions and - will also force the use of the Python parsing engine. Note that regex - delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``. -delimiter : str, default ``None`` - Alias for sep. -header : int, list of int, default 'infer' - Row number(s) to use as the column names, and the start of the - data. Default behavior is to infer the column names: if no names - are passed the behavior is identical to ``header=0`` and column - names are inferred from the first line of the file, if column - names are passed explicitly then the behavior is identical to - ``header=None``. Explicitly pass ``header=0`` to be able to - replace existing names. The header can be a list of integers that - specify row locations for a multi-index on the columns - e.g. [0,1,3]. Intervening rows that are not specified will be - skipped (e.g. 2 in this example is skipped). Note that this - parameter ignores commented lines and empty lines if - ``skip_blank_lines=True``, so ``header=0`` denotes the first line of - data rather than the first line of the file. -names : array-like, optional - List of column names to use. If the file contains a header row, - then you should explicitly pass ``header=0`` to override the column names. - Duplicates in this list are not allowed. -index_col : int, str, sequence of int / str, or False, default ``None`` - Column(s) to use as the row labels of the ``DataFrame``, either given as - string name or column index. If a sequence of int / str is given, a - MultiIndex is used. - - Note: ``index_col=False`` can be used to force pandas to *not* use the first - column as the index, e.g. when you have a malformed file with delimiters at - the end of each line. -usecols : list-like or callable, optional - Return a subset of the columns. If list-like, all elements must either - be positional (i.e. integer indices into the document columns) or strings - that correspond to column names provided either by the user in `names` or - inferred from the document header row(s). For example, a valid list-like - `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. - Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. - To instantiate a DataFrame from ``data`` with element order preserved use - ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns - in ``['foo', 'bar']`` order or - ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` - for ``['bar', 'foo']`` order. - - If callable, the callable function will be evaluated against the column - names, returning names where the callable function evaluates to True. An - example of a valid callable argument would be ``lambda x: x.upper() in - ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster - parsing time and lower memory usage. -squeeze : bool, default False - If the parsed data only contains one column then return a Series. -prefix : str, optional - Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... -mangle_dupe_cols : bool, default True - Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than - 'X'...'X'. 
Passing in False will cause data to be overwritten if there - are duplicate names in the columns. -dtype : Type name or dict of column -> type, optional - Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, - 'c': 'Int64'}} - Use `str` or `object` together with suitable `na_values` settings - to preserve and not interpret dtype. - If converters are specified, they will be applied INSTEAD - of dtype conversion. -engine : {{'c', 'python'}}, optional - Parser engine to use. The C engine is faster while the python engine is - currently more feature-complete. -converters : dict, optional - Dict of functions for converting values in certain columns. Keys can either - be integers or column labels. -true_values : list, optional - Values to consider as True. -false_values : list, optional - Values to consider as False. -skipinitialspace : bool, default False - Skip spaces after delimiter. -skiprows : list-like, int or callable, optional - Line numbers to skip (0-indexed) or number of lines to skip (int) - at the start of the file. - - If callable, the callable function will be evaluated against the row - indices, returning True if the row should be skipped and False otherwise. - An example of a valid callable argument would be ``lambda x: x in [0, 2]``. -skipfooter : int, default 0 - Number of lines at bottom of file to skip (Unsupported with engine='c'). -nrows : int, optional - Number of rows of file to read. Useful for reading pieces of large files. -na_values : scalar, str, list-like, or dict, optional - Additional strings to recognize as NA/NaN. If dict passed, specific - per-column NA values. By default the following values are interpreted as - NaN: '""" - + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") - + """'. -keep_default_na : bool, default True - Whether or not to include the default NaN values when parsing the data. - Depending on whether `na_values` is passed in, the behavior is as follows: - - * If `keep_default_na` is True, and `na_values` are specified, `na_values` - is appended to the default NaN values used for parsing. - * If `keep_default_na` is True, and `na_values` are not specified, only - the default NaN values are used for parsing. - * If `keep_default_na` is False, and `na_values` are specified, only - the NaN values specified `na_values` are used for parsing. - * If `keep_default_na` is False, and `na_values` are not specified, no - strings will be parsed as NaN. - - Note that if `na_filter` is passed in as False, the `keep_default_na` and - `na_values` parameters will be ignored. -na_filter : bool, default True - Detect missing value markers (empty strings and the value of na_values). In - data without any NAs, passing na_filter=False can improve the performance - of reading a large file. -verbose : bool, default False - Indicate number of NA values placed in non-numeric columns. -skip_blank_lines : bool, default True - If True, skip over blank lines rather than interpreting as NaN values. -parse_dates : bool or list of int or names or list of lists or dict, \ -default False - The behavior is as follows: - - * boolean. If True -> try parsing the index. - * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 - each as a separate date column. - * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as - a single date column. - * dict, e.g. 
{{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call - result 'foo' - - If a column or index cannot be represented as an array of datetimes, - say because of an unparsable value or a mixture of timezones, the column - or index will be returned unaltered as an object data type. For - non-standard datetime parsing, use ``pd.to_datetime`` after - ``pd.read_csv``. To parse an index or column with a mixture of timezones, - specify ``date_parser`` to be a partially-applied - :func:`pandas.to_datetime` with ``utc=True``. See - :ref:`io.csv.mixed_timezones` for more. - - Note: A fast-path exists for iso8601-formatted dates. -infer_datetime_format : bool, default False - If True and `parse_dates` is enabled, pandas will attempt to infer the - format of the datetime strings in the columns, and if it can be inferred, - switch to a faster method of parsing them. In some cases this can increase - the parsing speed by 5-10x. -keep_date_col : bool, default False - If True and `parse_dates` specifies combining multiple columns then - keep the original columns. -date_parser : function, optional - Function to use for converting a sequence of string columns to an array of - datetime instances. The default uses ``dateutil.parser.parser`` to do the - conversion. Pandas will try to call `date_parser` in three different ways, - advancing to the next if an exception occurs: 1) Pass one or more arrays - (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the - string values from the columns defined by `parse_dates` into a single array - and pass that; and 3) call `date_parser` once for each row using one or - more strings (corresponding to the columns defined by `parse_dates`) as - arguments. -dayfirst : bool, default False - DD/MM format dates, international and European format. -cache_dates : bool, default True - If True, use a cache of unique, converted dates to apply the datetime - conversion. May produce significant speed-up when parsing duplicate - date strings, especially ones with timezone offsets. - - .. versionadded:: 0.25.0 -iterator : bool, default False - Return TextFileReader object for iteration or getting chunks with - ``get_chunk()``. - - .. versionchanged:: 1.2 - - ``TextFileReader`` is a context manager. -chunksize : int, optional - Return TextFileReader object for iteration. - See the `IO Tools docs - `_ - for more information on ``iterator`` and ``chunksize``. - - .. versionchanged:: 1.2 - - ``TextFileReader`` is a context manager. -compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' - For on-the-fly decompression of on-disk data. If 'infer' and - `filepath_or_buffer` is path-like, then detect compression from the - following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no - decompression). If using 'zip', the ZIP file must contain only one data - file to be read in. Set to None for no decompression. -thousands : str, optional - Thousands separator. -decimal : str, default '.' - Character to recognize as decimal point (e.g. use ',' for European data). -lineterminator : str (length 1), optional - Character to break file into lines. Only valid with C parser. -quotechar : str (length 1), optional - The character used to denote the start and end of a quoted item. Quoted - items can include the delimiter and it will be ignored. -quoting : int or csv.QUOTE_* instance, default 0 - Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of - QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). 
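
Note: the `parse_dates` forms listed above are easiest to see side by side. A minimal sketch (not part of the diff); the CSV content and column names are invented for illustration:

    import io

    import pandas as pd

    data = io.StringIO("year,month,day,value\n2020,12,31,1\n2021,1,2,2\n")

    # list-of-lists form: combine columns 0, 1 and 2 into a single parsed
    # date column (named "year_month_day" after the source columns)
    df = pd.read_csv(data, parse_dates=[[0, 1, 2]])

    # dict form: the same combination, but the result is named "date"
    data.seek(0)
    df2 = pd.read_csv(data, parse_dates={"date": ["year", "month", "day"]})

In both calls the component columns are consumed unless `keep_date_col=True` is also passed.
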
-doublequote : bool, default ``True`` - When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate - whether or not to interpret two consecutive quotechar elements INSIDE a - field as a single ``quotechar`` element. -escapechar : str (length 1), optional - One-character string used to escape other characters. -comment : str, optional - Indicates remainder of line should not be parsed. If found at the beginning - of a line, the line will be ignored altogether. This parameter must be a - single character. Like empty lines (as long as ``skip_blank_lines=True``), - fully commented lines are ignored by the parameter `header` but not by - `skiprows`. For example, if ``comment='#'``, parsing - ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being - treated as the header. -encoding : str, optional - Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python - standard encodings - `_ . -dialect : str or csv.Dialect, optional - If provided, this parameter will override values (default or not) for the - following parameters: `delimiter`, `doublequote`, `escapechar`, - `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to - override values, a ParserWarning will be issued. See csv.Dialect - documentation for more details. -error_bad_lines : bool, default True - Lines with too many fields (e.g. a csv line with too many commas) will by - default cause an exception to be raised, and no DataFrame will be returned. - If False, then these "bad lines" will dropped from the DataFrame that is - returned. -warn_bad_lines : bool, default True - If error_bad_lines is False, and warn_bad_lines is True, a warning for each - "bad line" will be output. -delim_whitespace : bool, default False - Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be - used as the sep. Equivalent to setting ``sep='\\s+'``. If this option - is set to True, nothing should be passed in for the ``delimiter`` - parameter. -low_memory : bool, default True - Internally process the file in chunks, resulting in lower memory use - while parsing, but possibly mixed type inference. To ensure no mixed - types either set False, or specify the type with the `dtype` parameter. - Note that the entire file is read into a single DataFrame regardless, - use the `chunksize` or `iterator` parameter to return the data in chunks. - (Only valid with C parser). -memory_map : bool, default False - If a filepath is provided for `filepath_or_buffer`, map the file object - directly onto memory and access the data directly from there. Using this - option can improve performance because there is no longer any I/O overhead. -float_precision : str, optional - Specifies which converter the C engine should use for floating-point - values. The options are ``None`` or 'high' for the ordinary converter, - 'legacy' for the original lower precision pandas converter, and - 'round_trip' for the round-trip converter. - - .. versionchanged:: 1.2 - -{storage_options} - - .. versionadded:: 1.2 - -Returns -------- -DataFrame or TextParser - A comma-separated values (csv) file is returned as two-dimensional - data structure with labeled axes. - -See Also --------- -DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. -read_csv : Read a comma-separated values (csv) file into DataFrame. -read_fwf : Read a table of fixed-width formatted lines into DataFrame. 
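
Note: the `comment`/`header` interaction described above can be checked directly with the docstring's own example data. A sketch (not part of the diff):

    import io

    import pandas as pd

    buf = io.StringIO("#empty\na,b,c\n1,2,3\n")
    # The fully commented first line is ignored by `header`, so "a,b,c" is
    # treated as header row 0 and "1,2,3" becomes the single data row.
    df = pd.read_csv(buf, comment="#", header=0)
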
- -Examples --------- ->>> pd.{func_name}('data.csv') # doctest: +SKIP -""" -) - - -def validate_integer(name, val, min_val=0): - """ - Checks whether the 'name' parameter for parsing is either - an integer OR float that can SAFELY be cast to an integer - without losing accuracy. Raises a ValueError if that is - not the case. - - Parameters - ---------- - name : string - Parameter name (used for error reporting) - val : int or float - The value to check - min_val : int - Minimum allowed value (val < min_val will result in a ValueError) - """ - msg = f"'{name:s}' must be an integer >={min_val:d}" - - if val is not None: - if is_float(val): - if int(val) != val: - raise ValueError(msg) - val = int(val) - elif not (is_integer(val) and val >= min_val): - raise ValueError(msg) - - return val - - -def _validate_names(names): - """ - Raise ValueError if the `names` parameter contains duplicates or has an - invalid data type. - - Parameters - ---------- - names : array-like or None - An array containing a list of the names used for the output DataFrame. - - Raises - ------ - ValueError - If names are not unique or are not ordered (e.g. set). - """ - if names is not None: - if len(names) != len(set(names)): - raise ValueError("Duplicate names are not allowed.") - if not ( - is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView) - ): - raise ValueError("Names should be an ordered collection.") - - -def _read(filepath_or_buffer: FilePathOrBuffer, kwds): - """Generic reader of line files.""" - if kwds.get("date_parser", None) is not None: - if isinstance(kwds["parse_dates"], bool): - kwds["parse_dates"] = True - - # Extract some of the arguments (pass chunksize on). - iterator = kwds.get("iterator", False) - chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1) - nrows = kwds.get("nrows", None) - - # Check for duplicates in names. - _validate_names(kwds.get("names", None)) - - # Create the parser. 
- parser = TextFileReader(filepath_or_buffer, **kwds) - - if chunksize or iterator: - return parser - - with parser: - return parser.read(nrows) - - -_parser_defaults = { - "delimiter": None, - "escapechar": None, - "quotechar": '"', - "quoting": csv.QUOTE_MINIMAL, - "doublequote": True, - "skipinitialspace": False, - "lineterminator": None, - "header": "infer", - "index_col": None, - "names": None, - "prefix": None, - "skiprows": None, - "skipfooter": 0, - "nrows": None, - "na_values": None, - "keep_default_na": True, - "true_values": None, - "false_values": None, - "converters": None, - "dtype": None, - "cache_dates": True, - "thousands": None, - "comment": None, - "decimal": ".", - # 'engine': 'c', - "parse_dates": False, - "keep_date_col": False, - "dayfirst": False, - "date_parser": None, - "usecols": None, - # 'iterator': False, - "chunksize": None, - "verbose": False, - "encoding": None, - "squeeze": False, - "compression": None, - "mangle_dupe_cols": True, - "infer_datetime_format": False, - "skip_blank_lines": True, -} - - -_c_parser_defaults = { - "delim_whitespace": False, - "na_filter": True, - "low_memory": True, - "memory_map": False, - "error_bad_lines": True, - "warn_bad_lines": True, - "float_precision": None, -} - -_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} - -_c_unsupported = {"skipfooter"} -_python_unsupported = {"low_memory", "float_precision"} - -_deprecated_defaults: Dict[str, Any] = {} -_deprecated_args: Set[str] = set() - - -@Appender( - _doc_read_csv_and_table.format( - func_name="read_csv", - summary="Read a comma-separated values (csv) file into DataFrame.", - _default_sep="','", - storage_options=generic._shared_docs["storage_options"], - ) -) -def read_csv( - filepath_or_buffer: FilePathOrBuffer, - sep=lib.no_default, - delimiter=None, - # Column and Index Locations and Names - header="infer", - names=None, - index_col=None, - usecols=None, - squeeze=False, - prefix=None, - mangle_dupe_cols=True, - # General Parsing Configuration - dtype: Optional[DtypeArg] = None, - engine=None, - converters=None, - true_values=None, - false_values=None, - skipinitialspace=False, - skiprows=None, - skipfooter=0, - nrows=None, - # NA and Missing Data Handling - na_values=None, - keep_default_na=True, - na_filter=True, - verbose=False, - skip_blank_lines=True, - # Datetime Handling - parse_dates=False, - infer_datetime_format=False, - keep_date_col=False, - date_parser=None, - dayfirst=False, - cache_dates=True, - # Iteration - iterator=False, - chunksize=None, - # Quoting, Compression, and File Format - compression="infer", - thousands=None, - decimal: str = ".", - lineterminator=None, - quotechar='"', - quoting=csv.QUOTE_MINIMAL, - doublequote=True, - escapechar=None, - comment=None, - encoding=None, - dialect=None, - # Error Handling - error_bad_lines=True, - warn_bad_lines=True, - # Internal - delim_whitespace=False, - low_memory=_c_parser_defaults["low_memory"], - memory_map=False, - float_precision=None, - storage_options: StorageOptions = None, -): - kwds = locals() - del kwds["filepath_or_buffer"] - del kwds["sep"] - - kwds_defaults = _refine_defaults_read( - dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": ","} - ) - kwds.update(kwds_defaults) - - return _read(filepath_or_buffer, kwds) - - -@Appender( - _doc_read_csv_and_table.format( - func_name="read_table", - summary="Read general delimited file into DataFrame.", - _default_sep=r"'\\t' (tab-stop)", - storage_options=generic._shared_docs["storage_options"], - ) 
-) -def read_table( - filepath_or_buffer: FilePathOrBuffer, - sep=lib.no_default, - delimiter=None, - # Column and Index Locations and Names - header="infer", - names=None, - index_col=None, - usecols=None, - squeeze=False, - prefix=None, - mangle_dupe_cols=True, - # General Parsing Configuration - dtype: Optional[DtypeArg] = None, - engine=None, - converters=None, - true_values=None, - false_values=None, - skipinitialspace=False, - skiprows=None, - skipfooter=0, - nrows=None, - # NA and Missing Data Handling - na_values=None, - keep_default_na=True, - na_filter=True, - verbose=False, - skip_blank_lines=True, - # Datetime Handling - parse_dates=False, - infer_datetime_format=False, - keep_date_col=False, - date_parser=None, - dayfirst=False, - cache_dates=True, - # Iteration - iterator=False, - chunksize=None, - # Quoting, Compression, and File Format - compression="infer", - thousands=None, - decimal: str = ".", - lineterminator=None, - quotechar='"', - quoting=csv.QUOTE_MINIMAL, - doublequote=True, - escapechar=None, - comment=None, - encoding=None, - dialect=None, - # Error Handling - error_bad_lines=True, - warn_bad_lines=True, - # Internal - delim_whitespace=False, - low_memory=_c_parser_defaults["low_memory"], - memory_map=False, - float_precision=None, -): - kwds = locals() - del kwds["filepath_or_buffer"] - del kwds["sep"] - - kwds_defaults = _refine_defaults_read( - dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": "\t"} - ) - kwds.update(kwds_defaults) - - return _read(filepath_or_buffer, kwds) - - -def read_fwf( - filepath_or_buffer: FilePathOrBuffer, - colspecs="infer", - widths=None, - infer_nrows=100, - **kwds, -): - r""" - Read a table of fixed-width formatted lines into DataFrame. - - Also supports optionally iterating or breaking of the file - into chunks. - - Additional help can be found in the `online docs for IO Tools - `_. - - Parameters - ---------- - filepath_or_buffer : str, path object or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, and file. For file URLs, a host is - expected. A local file could be: - ``file://localhost/path/to/table.csv``. - - If you want to pass in a path object, pandas accepts any - ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, - such as a file handle (e.g. via builtin ``open`` function) - or ``StringIO``. - colspecs : list of tuple (int, int) or 'infer'. optional - A list of tuples giving the extents of the fixed-width - fields of each line as half-open intervals (i.e., [from, to[ ). - String value 'infer' can be used to instruct the parser to try - detecting the column specifications from the first 100 rows of - the data which are not being skipped via skiprows (default='infer'). - widths : list of int, optional - A list of field widths which can be used instead of 'colspecs' if - the intervals are contiguous. - infer_nrows : int, default 100 - The number of rows to consider when letting the parser determine the - `colspecs`. - - .. versionadded:: 0.24.0 - **kwds : optional - Optional keyword arguments can be passed to ``TextFileReader``. - - Returns - ------- - DataFrame or TextParser - A comma-separated values (csv) file is returned as two-dimensional - data structure with labeled axes. - - See Also - -------- - DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. - read_csv : Read a comma-separated values (csv) file into DataFrame. 
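
Note: as documented above, `widths` is a convenience spelling of contiguous `colspecs`; the expansion is the small loop near the top of `read_fwf` below. A sketch with invented fixed-width data:

    import io

    import pandas as pd

    fixed = io.StringIO(
        "id    name      score\n"
        "1     alice     10.50\n"
        "2     bob        3.25\n"
    )
    # Explicit half-open column extents...
    df = pd.read_fwf(fixed, colspecs=[(0, 6), (6, 16), (16, 21)])
    # ...or equivalent contiguous widths, which read_fwf expands internally
    # to the same extents [(0, 6), (6, 16), (16, 21)].
    fixed.seek(0)
    df2 = pd.read_fwf(fixed, widths=[6, 10, 5])
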
- - Examples - -------- - >>> pd.read_fwf('data.csv') # doctest: +SKIP - """ - # Check input arguments. - if colspecs is None and widths is None: - raise ValueError("Must specify either colspecs or widths") - elif colspecs not in (None, "infer") and widths is not None: - raise ValueError("You must specify only one of 'widths' and 'colspecs'") - - # Compute 'colspecs' from 'widths', if specified. - if widths is not None: - colspecs, col = [], 0 - for w in widths: - colspecs.append((col, col + w)) - col += w - - kwds["colspecs"] = colspecs - kwds["infer_nrows"] = infer_nrows - kwds["engine"] = "python-fwf" - return _read(filepath_or_buffer, kwds) - - -class TextFileReader(abc.Iterator): - """ - - Passed dialect overrides any of the related parser options - - """ - - def __init__(self, f, engine=None, **kwds): - - self.f = f - - if engine is not None: - engine_specified = True - else: - engine = "python" - engine_specified = False - self.engine = engine - self._engine_specified = kwds.get("engine_specified", engine_specified) - - _validate_skipfooter(kwds) - - dialect = _extract_dialect(kwds) - if dialect is not None: - kwds = _merge_with_dialect_properties(dialect, kwds) - - if kwds.get("header", "infer") == "infer": - kwds["header"] = 0 if kwds.get("names") is None else None - - self.orig_options = kwds - - # miscellanea - self._currow = 0 - - options = self._get_options_with_defaults(engine) - options["storage_options"] = kwds.get("storage_options", None) - - self.chunksize = options.pop("chunksize", None) - self.nrows = options.pop("nrows", None) - self.squeeze = options.pop("squeeze", False) - - self._check_file_or_buffer(f, engine) - self.options, self.engine = self._clean_options(options, engine) - - if "has_index_names" in kwds: - self.options["has_index_names"] = kwds["has_index_names"] - - self._engine = self._make_engine(self.engine) - - def close(self): - self._engine.close() - - def _get_options_with_defaults(self, engine): - kwds = self.orig_options - - options = {} - - for argname, default in _parser_defaults.items(): - value = kwds.get(argname, default) - - # see gh-12935 - if argname == "mangle_dupe_cols" and not value: - raise ValueError("Setting mangle_dupe_cols=False is not supported yet") - else: - options[argname] = value - - for argname, default in _c_parser_defaults.items(): - if argname in kwds: - value = kwds[argname] - - if engine != "c" and value != default: - if "python" in engine and argname not in _python_unsupported: - pass - elif value == _deprecated_defaults.get(argname, default): - pass - else: - raise ValueError( - f"The {repr(argname)} option is not supported with the " - f"{repr(engine)} engine" - ) - else: - value = _deprecated_defaults.get(argname, default) - options[argname] = value - - if engine == "python-fwf": - # pandas\io\parsers.py:907: error: Incompatible types in assignment - # (expression has type "object", variable has type "Union[int, str, - # None]") [assignment] - for argname, default in _fwf_defaults.items(): # type: ignore[assignment] - options[argname] = kwds.get(argname, default) - - return options - - def _check_file_or_buffer(self, f, engine): - # see gh-16530 - if is_file_like(f) and engine != "c" and not hasattr(f, "__next__"): - # The C engine doesn't need the file-like to have the "__next__" - # attribute. 
However, the Python engine explicitly calls - # "__next__(...)" when iterating through such an object, meaning it - # needs to have that attribute - raise ValueError( - "The 'python' engine cannot iterate through this file buffer." - ) - - def _clean_options(self, options, engine): - result = options.copy() - - fallback_reason = None - - # C engine not supported yet - if engine == "c": - if options["skipfooter"] > 0: - fallback_reason = "the 'c' engine does not support skipfooter" - engine = "python" - - sep = options["delimiter"] - delim_whitespace = options["delim_whitespace"] - - if sep is None and not delim_whitespace: - if engine == "c": - fallback_reason = ( - "the 'c' engine does not support " - "sep=None with delim_whitespace=False" - ) - engine = "python" - elif sep is not None and len(sep) > 1: - if engine == "c" and sep == r"\s+": - result["delim_whitespace"] = True - del result["delimiter"] - elif engine not in ("python", "python-fwf"): - # wait until regex engine integrated - fallback_reason = ( - "the 'c' engine does not support " - "regex separators (separators > 1 char and " - r"different from '\s+' are interpreted as regex)" - ) - engine = "python" - elif delim_whitespace: - if "python" in engine: - result["delimiter"] = r"\s+" - elif sep is not None: - encodeable = True - encoding = sys.getfilesystemencoding() or "utf-8" - try: - if len(sep.encode(encoding)) > 1: - encodeable = False - except UnicodeDecodeError: - encodeable = False - if not encodeable and engine not in ("python", "python-fwf"): - fallback_reason = ( - f"the separator encoded in {encoding} " - "is > 1 char long, and the 'c' engine " - "does not support such separators" - ) - engine = "python" - - quotechar = options["quotechar"] - if quotechar is not None and isinstance(quotechar, (str, bytes)): - if ( - len(quotechar) == 1 - and ord(quotechar) > 127 - and engine not in ("python", "python-fwf") - ): - fallback_reason = ( - "ord(quotechar) > 127, meaning the " - "quotechar is larger than one byte, " - "and the 'c' engine does not support such quotechars" - ) - engine = "python" - - if fallback_reason and self._engine_specified: - raise ValueError(fallback_reason) - - if engine == "c": - for arg in _c_unsupported: - del result[arg] - - if "python" in engine: - for arg in _python_unsupported: - if fallback_reason and result[arg] != _c_parser_defaults[arg]: - raise ValueError( - "Falling back to the 'python' engine because " - f"{fallback_reason}, but this causes {repr(arg)} to be " - "ignored as it is not supported by the 'python' engine." - ) - del result[arg] - - if fallback_reason: - warnings.warn( - ( - "Falling back to the 'python' engine because " - f"{fallback_reason}; you can avoid this warning by specifying " - "engine='python'." 
- ), - ParserWarning, - stacklevel=5, - ) - - index_col = options["index_col"] - names = options["names"] - converters = options["converters"] - na_values = options["na_values"] - skiprows = options["skiprows"] - - validate_header_arg(options["header"]) - - for arg in _deprecated_args: - parser_default = _c_parser_defaults[arg] - depr_default = _deprecated_defaults[arg] - if result.get(arg, depr_default) != depr_default: - msg = ( - f"The {arg} argument has been deprecated and will be " - "removed in a future version.\n\n" - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - else: - result[arg] = parser_default - - if index_col is True: - raise ValueError("The value of index_col couldn't be 'True'") - if _is_index_col(index_col): - if not isinstance(index_col, (list, tuple, np.ndarray)): - index_col = [index_col] - result["index_col"] = index_col - - names = list(names) if names is not None else names - - # type conversion-related - if converters is not None: - if not isinstance(converters, dict): - raise TypeError( - "Type converters must be a dict or subclass, " - f"input was a {type(converters).__name__}" - ) - else: - converters = {} - - # Converting values to NA - keep_default_na = options["keep_default_na"] - na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) - - # handle skiprows; this is internally handled by the - # c-engine, so only need for python parsers - if engine != "c": - if is_integer(skiprows): - skiprows = list(range(skiprows)) - if skiprows is None: - skiprows = set() - elif not callable(skiprows): - skiprows = set(skiprows) - - # put stuff back - result["names"] = names - result["converters"] = converters - result["na_values"] = na_values - result["na_fvalues"] = na_fvalues - result["skiprows"] = skiprows - - return result, engine - - def __next__(self): - try: - return self.get_chunk() - except StopIteration: - self.close() - raise - - def _make_engine(self, engine="c"): - mapping: Dict[str, Type[ParserBase]] = { - "c": CParserWrapper, - "python": PythonParser, - "python-fwf": FixedWidthFieldParser, - } - if engine not in mapping: - raise ValueError( - f"Unknown engine: {engine} (valid options are {mapping.keys()})" - ) - # error: Too many arguments for "ParserBase" - return mapping[engine](self.f, **self.options) # type: ignore[call-arg] - - def _failover_to_python(self): - raise AbstractMethodError(self) - - def read(self, nrows=None): - nrows = validate_integer("nrows", nrows) - index, columns, col_dict = self._engine.read(nrows) - - if index is None: - if col_dict: - # Any column is actually fine: - new_rows = len(next(iter(col_dict.values()))) - index = RangeIndex(self._currow, self._currow + new_rows) - else: - new_rows = 0 - else: - new_rows = len(index) - - df = DataFrame(col_dict, columns=columns, index=index) - - self._currow += new_rows - - if self.squeeze and len(df.columns) == 1: - return df[df.columns[0]].copy() - return df - - def get_chunk(self, size=None): - if size is None: - size = self.chunksize - if self.nrows is not None: - if self._currow >= self.nrows: - raise StopIteration - size = min(size, self.nrows - self._currow) - return self.read(nrows=size) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.close() - - -def _is_index_col(col): - return col is not None and col is not False - - -def _is_potential_multi_index( - columns, index_col: Optional[Union[bool, Sequence[int]]] = None -): - """ - Check whether or not the `columns` parameter - could be converted into a 
MultiIndex. - - Parameters - ---------- - columns : array-like - Object which may or may not be convertible into a MultiIndex - index_col : None, bool or list, optional - Column or columns to use as the (possibly hierarchical) index - - Returns - ------- - boolean : Whether or not columns could become a MultiIndex - """ - if index_col is None or isinstance(index_col, bool): - index_col = [] - - return ( - len(columns) - and not isinstance(columns, MultiIndex) - and all(isinstance(c, tuple) for c in columns if c not in list(index_col)) - ) - - -def _evaluate_usecols(usecols, names): - """ - Check whether or not the 'usecols' parameter - is a callable. If so, enumerates the 'names' - parameter and returns a set of indices for - each entry in 'names' that evaluates to True. - If not a callable, returns 'usecols'. - """ - if callable(usecols): - return {i for i, name in enumerate(names) if usecols(name)} - return usecols - - -def _validate_usecols_names(usecols, names): - """ - Validates that all usecols are present in a given - list of names. If not, raise a ValueError that - shows what usecols are missing. - - Parameters - ---------- - usecols : iterable of usecols - The columns to validate are present in names. - names : iterable of names - The column names to check against. - - Returns - ------- - usecols : iterable of usecols - The `usecols` parameter if the validation succeeds. - - Raises - ------ - ValueError : Columns were missing. Error message will list them. - """ - missing = [c for c in usecols if c not in names] - if len(missing) > 0: - raise ValueError( - f"Usecols do not match columns, columns expected but not found: {missing}" - ) - - return usecols - - -def _validate_skipfooter_arg(skipfooter): - """ - Validate the 'skipfooter' parameter. - - Checks whether 'skipfooter' is a non-negative integer. - Raises a ValueError if that is not the case. - - Parameters - ---------- - skipfooter : non-negative integer - The number of rows to skip at the end of the file. - - Returns - ------- - validated_skipfooter : non-negative integer - The original input if the validation succeeds. - - Raises - ------ - ValueError : 'skipfooter' was not a non-negative integer. - """ - if not is_integer(skipfooter): - raise ValueError("skipfooter must be an integer") - - if skipfooter < 0: - raise ValueError("skipfooter cannot be negative") - - return skipfooter - - -def _validate_usecols_arg(usecols): - """ - Validate the 'usecols' parameter. - - Checks whether or not the 'usecols' parameter contains all integers - (column selection by index), strings (column by name) or is a callable. - Raises a ValueError if that is not the case. - - Parameters - ---------- - usecols : list-like, callable, or None - List of columns to use when parsing or a callable that can be used - to filter a list of table columns. - - Returns - ------- - usecols_tuple : tuple - A tuple of (verified_usecols, usecols_dtype). - - 'verified_usecols' is either a set if an array-like is passed in or - 'usecols' if a callable or None is passed in. - - 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like - is passed in or None if a callable or None is passed in. - """ - msg = ( - "'usecols' must either be list-like of all strings, all unicode, " - "all integers or a callable." - ) - if usecols is not None: - if callable(usecols): - return usecols, None - - if not is_list_like(usecols): - # see gh-20529 - # - # Ensure it is iterable container but not string. 
- raise ValueError(msg) - - usecols_dtype = lib.infer_dtype(usecols, skipna=False) - - if usecols_dtype not in ("empty", "integer", "string"): - raise ValueError(msg) - - usecols = set(usecols) - - return usecols, usecols_dtype - return usecols, None - - -def _validate_parse_dates_arg(parse_dates): - """ - Check whether or not the 'parse_dates' parameter - is a non-boolean scalar. Raises a ValueError if - that is the case. - """ - msg = ( - "Only booleans, lists, and dictionaries are accepted " - "for the 'parse_dates' parameter" - ) - - if parse_dates is not None: - if is_scalar(parse_dates): - if not lib.is_bool(parse_dates): - raise TypeError(msg) - - elif not isinstance(parse_dates, (list, dict)): - raise TypeError(msg) - - return parse_dates - - -class ParserBase: - def __init__(self, kwds): - - self.names = kwds.get("names") - self.orig_names: Optional[List] = None - self.prefix = kwds.pop("prefix", None) - - self.index_col = kwds.get("index_col", None) - self.unnamed_cols: Set = set() - self.index_names: Optional[List] = None - self.col_names = None - - self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False)) - self.date_parser = kwds.pop("date_parser", None) - self.dayfirst = kwds.pop("dayfirst", False) - self.keep_date_col = kwds.pop("keep_date_col", False) - - self.na_values = kwds.get("na_values") - self.na_fvalues = kwds.get("na_fvalues") - self.na_filter = kwds.get("na_filter", False) - self.keep_default_na = kwds.get("keep_default_na", True) - - self.true_values = kwds.get("true_values") - self.false_values = kwds.get("false_values") - self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True) - self.infer_datetime_format = kwds.pop("infer_datetime_format", False) - self.cache_dates = kwds.pop("cache_dates", True) - - self._date_conv = _make_date_converter( - date_parser=self.date_parser, - dayfirst=self.dayfirst, - infer_datetime_format=self.infer_datetime_format, - cache_dates=self.cache_dates, - ) - - # validate header options for mi - self.header = kwds.get("header") - if isinstance(self.header, (list, tuple, np.ndarray)): - if not all(map(is_integer, self.header)): - raise ValueError("header must be integer or list of integers") - if any(i < 0 for i in self.header): - raise ValueError( - "cannot specify multi-index header with negative integers" - ) - if kwds.get("usecols"): - raise ValueError( - "cannot specify usecols when specifying a multi-index header" - ) - if kwds.get("names"): - raise ValueError( - "cannot specify names when specifying a multi-index header" - ) - - # validate index_col that only contains integers - if self.index_col is not None: - is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray)) - if not ( - is_sequence - and all(map(is_integer, self.index_col)) - or is_integer(self.index_col) - ): - raise ValueError( - "index_col must only contain row numbers " - "when specifying a multi-index header" - ) - elif self.header is not None: - # GH 27394 - if self.prefix is not None: - raise ValueError( - "Argument prefix must be None if argument header is not None" - ) - # GH 16338 - elif not is_integer(self.header): - raise ValueError("header must be integer or list of integers") - # GH 27779 - elif self.header < 0: - raise ValueError( - "Passing negative integer to header is invalid. 
" - "For no header, use header=None instead" - ) - - self._name_processed = False - - self._first_chunk = True - - self.handles: Optional[IOHandles] = None - - def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None: - """ - Let the readers open IOHanldes after they are done with their potential raises. - """ - self.handles = get_handle( - src, - "r", - encoding=kwds.get("encoding", None), - compression=kwds.get("compression", None), - memory_map=kwds.get("memory_map", False), - storage_options=kwds.get("storage_options", None), - ) - - def _validate_parse_dates_presence(self, columns: List[str]) -> None: - """ - Check if parse_dates are in columns. - - If user has provided names for parse_dates, check if those columns - are available. - - Parameters - ---------- - columns : list - List of names of the dataframe. - - Raises - ------ - ValueError - If column to parse_date is not in dataframe. - - """ - cols_needed: Iterable - if is_dict_like(self.parse_dates): - cols_needed = itertools.chain(*self.parse_dates.values()) - elif is_list_like(self.parse_dates): - # a column in parse_dates could be represented - # ColReference = Union[int, str] - # DateGroups = List[ColReference] - # ParseDates = Union[DateGroups, List[DateGroups], - # Dict[ColReference, DateGroups]] - cols_needed = itertools.chain.from_iterable( - col if is_list_like(col) else [col] for col in self.parse_dates - ) - else: - cols_needed = [] - - # get only columns that are references using names (str), not by index - missing_cols = ", ".join( - sorted( - { - col - for col in cols_needed - if isinstance(col, str) and col not in columns - } - ) - ) - if missing_cols: - raise ValueError( - f"Missing column provided to 'parse_dates': '{missing_cols}'" - ) - - def close(self): - if self.handles is not None: - self.handles.close() - - @property - def _has_complex_date_col(self): - return isinstance(self.parse_dates, dict) or ( - isinstance(self.parse_dates, list) - and len(self.parse_dates) > 0 - and isinstance(self.parse_dates[0], list) - ) - - def _should_parse_dates(self, i): - if isinstance(self.parse_dates, bool): - return self.parse_dates - else: - if self.index_names is not None: - name = self.index_names[i] - else: - name = None - j = i if self.index_col is None else self.index_col[i] - - if is_scalar(self.parse_dates): - return (j == self.parse_dates) or ( - name is not None and name == self.parse_dates - ) - else: - return (j in self.parse_dates) or ( - name is not None and name in self.parse_dates - ) - - def _extract_multi_indexer_columns( - self, header, index_names, col_names, passed_names=False - ): - """ - extract and return the names, index_names, col_names - header is a list-of-lists returned from the parsers - """ - if len(header) < 2: - return header[0], index_names, col_names, passed_names - - # the names are the tuples of the header that are not the index cols - # 0 is the name of the index, assuming index_col is a list of column - # numbers - ic = self.index_col - if ic is None: - ic = [] - - if not isinstance(ic, (list, tuple, np.ndarray)): - ic = [ic] - sic = set(ic) - - # clean the index_names - index_names = header.pop(-1) - index_names, _, _ = _clean_index_names( - index_names, self.index_col, self.unnamed_cols - ) - - # extract the columns - field_count = len(header[0]) - - def extract(r): - return tuple(r[i] for i in range(field_count) if i not in sic) - - columns = list(zip(*(extract(r) for r in header))) - names = ic + columns - - # If we find unnamed columns all in a single - # 
level, then our header was too long. - for n in range(len(columns[0])): - if all(ensure_str(col[n]) in self.unnamed_cols for col in columns): - header = ",".join(str(x) for x in self.header) - raise ParserError( - f"Passed header=[{header}] are too many rows " - "for this multi_index of columns" - ) - - # Clean the column names (if we have an index_col). - if len(ic): - col_names = [ - r[0] if ((r[0] is not None) and r[0] not in self.unnamed_cols) else None - for r in header - ] - else: - col_names = [None] * len(header) - - passed_names = True - - return names, index_names, col_names, passed_names - - def _maybe_dedup_names(self, names): - # see gh-7160 and gh-9424: this helps to provide - # immediate alleviation of the duplicate names - # issue and appears to be satisfactory to users, - # but ultimately, not needing to butcher the names - # would be nice! - if self.mangle_dupe_cols: - names = list(names) # so we can index - # pandas\io\parsers.py:1559: error: Need type annotation for - # 'counts' [var-annotated] - counts = defaultdict(int) # type: ignore[var-annotated] - is_potential_mi = _is_potential_multi_index(names, self.index_col) - - for i, col in enumerate(names): - cur_count = counts[col] - - while cur_count > 0: - counts[col] = cur_count + 1 - - if is_potential_mi: - col = col[:-1] + (f"{col[-1]}.{cur_count}",) - else: - col = f"{col}.{cur_count}" - cur_count = counts[col] - - names[i] = col - counts[col] = cur_count + 1 - - return names - - def _maybe_make_multi_index_columns(self, columns, col_names=None): - # possibly create a column mi here - if _is_potential_multi_index(columns): - columns = MultiIndex.from_tuples(columns, names=col_names) - return columns - - def _make_index(self, data, alldata, columns, indexnamerow=False): - if not _is_index_col(self.index_col) or not self.index_col: - index = None - - elif not self._has_complex_date_col: - index = self._get_simple_index(alldata, columns) - index = self._agg_index(index) - elif self._has_complex_date_col: - if not self._name_processed: - (self.index_names, _, self.index_col) = _clean_index_names( - list(columns), self.index_col, self.unnamed_cols - ) - self._name_processed = True - index = self._get_complex_date_index(data, columns) - index = self._agg_index(index, try_parse_dates=False) - - # add names for the index - if indexnamerow: - coffset = len(indexnamerow) - len(columns) - # pandas\io\parsers.py:1604: error: Item "None" of "Optional[Any]" - # has no attribute "set_names" [union-attr] - index = index.set_names(indexnamerow[:coffset]) # type: ignore[union-attr] - - # maybe create a mi on the columns - columns = self._maybe_make_multi_index_columns(columns, self.col_names) - - return index, columns - - _implicit_index = False - - def _get_simple_index(self, data, columns): - def ix(col): - if not isinstance(col, str): - return col - raise ValueError(f"Index {col} invalid") - - to_remove = [] - index = [] - for idx in self.index_col: - i = ix(idx) - to_remove.append(i) - index.append(data[i]) - - # remove index items from content and columns, don't pop in - # loop - for i in sorted(to_remove, reverse=True): - data.pop(i) - if not self._implicit_index: - columns.pop(i) - - return index - - def _get_complex_date_index(self, data, col_names): - def _get_name(icol): - if isinstance(icol, str): - return icol - - if col_names is None: - raise ValueError(f"Must supply column order to use {icol!s} as index") - - for i, c in enumerate(col_names): - if i == icol: - return c - - to_remove = [] - index = [] - for idx in 
self.index_col: - name = _get_name(idx) - to_remove.append(name) - index.append(data[name]) - - # remove index items from content and columns, don't pop in - # loop - for c in sorted(to_remove, reverse=True): - data.pop(c) - col_names.remove(c) - - return index - - def _agg_index(self, index, try_parse_dates=True) -> Index: - arrays = [] - - for i, arr in enumerate(index): - - if try_parse_dates and self._should_parse_dates(i): - arr = self._date_conv(arr) - - if self.na_filter: - col_na_values = self.na_values - col_na_fvalues = self.na_fvalues - else: - col_na_values = set() - col_na_fvalues = set() - - if isinstance(self.na_values, dict): - # pandas\io\parsers.py:1678: error: Value of type - # "Optional[Any]" is not indexable [index] - col_name = self.index_names[i] # type: ignore[index] - if col_name is not None: - col_na_values, col_na_fvalues = _get_na_values( - col_name, self.na_values, self.na_fvalues, self.keep_default_na - ) - - arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues) - arrays.append(arr) - - names = self.index_names - index = ensure_index_from_sequences(arrays, names) - - return index - - def _convert_to_ndarrays( - self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None - ): - result = {} - for c, values in dct.items(): - conv_f = None if converters is None else converters.get(c, None) - if isinstance(dtypes, dict): - cast_type = dtypes.get(c, None) - else: - # single dtype or None - cast_type = dtypes - - if self.na_filter: - col_na_values, col_na_fvalues = _get_na_values( - c, na_values, na_fvalues, self.keep_default_na - ) - else: - col_na_values, col_na_fvalues = set(), set() - - if conv_f is not None: - # conv_f applied to data before inference - if cast_type is not None: - warnings.warn( - ( - "Both a converter and dtype were specified " - f"for column {c} - only the converter will be used" - ), - ParserWarning, - stacklevel=7, - ) - - try: - values = lib.map_infer(values, conv_f) - except ValueError: - mask = algorithms.isin(values, list(na_values)).view(np.uint8) - values = lib.map_infer_mask(values, conv_f, mask) - - cvals, na_count = self._infer_types( - values, set(col_na_values) | col_na_fvalues, try_num_bool=False - ) - else: - is_ea = is_extension_array_dtype(cast_type) - is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) - # skip inference if specified dtype is object - # or casting to an EA - try_num_bool = not (cast_type and is_str_or_ea_dtype) - - # general type inference and conversion - cvals, na_count = self._infer_types( - values, set(col_na_values) | col_na_fvalues, try_num_bool - ) - - # type specified in dtype param or cast_type is an EA - if cast_type and ( - not is_dtype_equal(cvals, cast_type) - or is_extension_array_dtype(cast_type) - ): - if not is_ea and na_count > 0: - try: - if is_bool_dtype(cast_type): - raise ValueError( - f"Bool column has NA values in column {c}" - ) - except (AttributeError, TypeError): - # invalid input to is_bool_dtype - pass - cast_type = pandas_dtype(cast_type) - cvals = self._cast_types(cvals, cast_type, c) - - result[c] = cvals - if verbose and na_count: - print(f"Filled {na_count} NA values in column {c!s}") - return result - - def _infer_types(self, values, na_values, try_num_bool=True): - """ - Infer types of values, possibly casting - - Parameters - ---------- - values : ndarray - na_values : set - try_num_bool : bool, default try - try to cast values to numeric (first preference) or boolean - - Returns - ------- - converted : ndarray - na_count : int - """ - 
na_count = 0 - if issubclass(values.dtype.type, (np.number, np.bool_)): - mask = algorithms.isin(values, list(na_values)) - na_count = mask.sum() - if na_count > 0: - if is_integer_dtype(values): - values = values.astype(np.float64) - np.putmask(values, mask, np.nan) - return values, na_count - - if try_num_bool and is_object_dtype(values.dtype): - # exclude e.g DatetimeIndex here - try: - result = lib.maybe_convert_numeric(values, na_values, False) - except (ValueError, TypeError): - # e.g. encountering datetime string gets ValueError - # TypeError can be raised in floatify - result = values - na_count = parsers.sanitize_objects(result, na_values, False) - else: - na_count = isna(result).sum() - else: - result = values - if values.dtype == np.object_: - na_count = parsers.sanitize_objects(values, na_values, False) - - if result.dtype == np.object_ and try_num_bool: - result = libops.maybe_convert_bool( - np.asarray(values), - true_values=self.true_values, - false_values=self.false_values, - ) - - return result, na_count - - def _cast_types(self, values, cast_type, column): - """ - Cast values to specified type - - Parameters - ---------- - values : ndarray - cast_type : string or np.dtype - dtype to cast values to - column : string - column name - used only for error reporting - - Returns - ------- - converted : ndarray - """ - if is_categorical_dtype(cast_type): - known_cats = ( - isinstance(cast_type, CategoricalDtype) - and cast_type.categories is not None - ) - - if not is_object_dtype(values) and not known_cats: - # TODO: this is for consistency with - # c-parser which parses all categories - # as strings - values = astype_nansafe(values, str) - - cats = Index(values).unique().dropna() - values = Categorical._from_inferred_categories( - cats, cats.get_indexer(values), cast_type, true_values=self.true_values - ) - - # use the EA's implementation of casting - elif is_extension_array_dtype(cast_type): - # ensure cast_type is an actual dtype and not a string - cast_type = pandas_dtype(cast_type) - array_type = cast_type.construct_array_type() - try: - if is_bool_dtype(cast_type): - return array_type._from_sequence_of_strings( - values, - dtype=cast_type, - true_values=self.true_values, - false_values=self.false_values, - ) - else: - return array_type._from_sequence_of_strings(values, dtype=cast_type) - except NotImplementedError as err: - raise NotImplementedError( - f"Extension Array: {array_type} must implement " - "_from_sequence_of_strings in order to be used in parser methods" - ) from err - - else: - try: - values = astype_nansafe(values, cast_type, copy=True, skipna=True) - except ValueError as err: - raise ValueError( - f"Unable to convert column {column} to type {cast_type}" - ) from err - return values - - def _do_date_conversions(self, names, data): - # returns data, columns - - if self.parse_dates is not None: - data, names = _process_date_conversion( - data, - self._date_conv, - self.parse_dates, - self.index_col, - self.index_names, - names, - keep_date_col=self.keep_date_col, - ) - - return names, data - - -class CParserWrapper(ParserBase): - def __init__(self, src: FilePathOrBuffer, **kwds): - self.kwds = kwds - kwds = kwds.copy() - - ParserBase.__init__(self, kwds) - - # #2442 - kwds["allow_leading_cols"] = self.index_col is not False - - # GH20529, validate usecol arg before TextReader - self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) - kwds["usecols"] = self.usecols - - # open handles - self._open_handles(src, kwds) - assert self.handles is 
not None - for key in ("storage_options", "encoding", "memory_map", "compression"): - kwds.pop(key, None) - if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"): - # pandas\io\parsers.py:1861: error: Item "IO[Any]" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] - - # pandas\io\parsers.py:1861: error: Item "RawIOBase" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] - - # pandas\io\parsers.py:1861: error: Item "BufferedIOBase" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] - - # pandas\io\parsers.py:1861: error: Item "TextIOBase" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] - - # pandas\io\parsers.py:1861: error: Item "TextIOWrapper" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] - - # pandas\io\parsers.py:1861: error: Item "mmap" of "Union[IO[Any], - # RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]" has - # no attribute "mmap" [union-attr] - self.handles.handle = self.handles.handle.mmap # type: ignore[union-attr] - - try: - self._reader = parsers.TextReader(self.handles.handle, **kwds) - except Exception: - self.handles.close() - raise - self.unnamed_cols = self._reader.unnamed_cols - - passed_names = self.names is None - - if self._reader.header is None: - self.names = None - else: - if len(self._reader.header) > 1: - # we have a multi index in the columns - ( - self.names, - self.index_names, - self.col_names, - passed_names, - ) = self._extract_multi_indexer_columns( - self._reader.header, self.index_names, self.col_names, passed_names - ) - else: - self.names = list(self._reader.header[0]) - - if self.names is None: - if self.prefix: - self.names = [ - f"{self.prefix}{i}" for i in range(self._reader.table_width) - ] - else: - self.names = list(range(self._reader.table_width)) - - # gh-9755 - # - # need to set orig_names here first - # so that proper indexing can be done - # with _set_noconvert_columns - # - # once names has been filtered, we will - # then set orig_names again to names - self.orig_names = self.names[:] - - if self.usecols: - usecols = _evaluate_usecols(self.usecols, self.orig_names) - - # GH 14671 - # assert for mypy, orig_names is List or None, None would error in issubset - assert self.orig_names is not None - if self.usecols_dtype == "string" and not set(usecols).issubset( - self.orig_names - ): - _validate_usecols_names(usecols, self.orig_names) - - if len(self.names) > len(usecols): - self.names = [ - n - for i, n in enumerate(self.names) - if (i in usecols or n in usecols) - ] - - if len(self.names) < len(usecols): - _validate_usecols_names(usecols, self.names) - - self._validate_parse_dates_presence(self.names) - self._set_noconvert_columns() - - self.orig_names = self.names - - if not self._has_complex_date_col: - if self._reader.leading_cols == 0 and _is_index_col(self.index_col): - - self._name_processed = True - (index_names, self.names, self.index_col) = _clean_index_names( - self.names, self.index_col, self.unnamed_cols - ) - - if self.index_names is None: - self.index_names = index_names - - if self._reader.header is None and not passed_names: - # pandas\io\parsers.py:1997: error: Argument 1 to "len" has - # incompatible type "Optional[Any]"; 
expected "Sized" - # [arg-type] - self.index_names = [None] * len( - self.index_names # type: ignore[arg-type] - ) - - self._implicit_index = self._reader.leading_cols > 0 - - def close(self) -> None: - super().close() - - # close additional handles opened by C parser - try: - self._reader.close() - except ValueError: - pass - - def _set_noconvert_columns(self): - """ - Set the columns that should not undergo dtype conversions. - - Currently, any column that is involved with date parsing will not - undergo such conversions. - """ - names = self.orig_names - if self.usecols_dtype == "integer": - # A set of integers will be converted to a list in - # the correct order every single time. - usecols = list(self.usecols) - usecols.sort() - elif callable(self.usecols) or self.usecols_dtype not in ("empty", None): - # The names attribute should have the correct columns - # in the proper order for indexing with parse_dates. - usecols = self.names[:] - else: - # Usecols is empty. - - # pandas\io\parsers.py:2030: error: Incompatible types in - # assignment (expression has type "None", variable has type - # "List[Any]") [assignment] - usecols = None # type: ignore[assignment] - - def _set(x): - if usecols is not None and is_integer(x): - x = usecols[x] - - if not is_integer(x): - # assert for mypy, names is List or None, None would error when calling - # .index() - assert names is not None - x = names.index(x) - - self._reader.set_noconvert(x) - - if isinstance(self.parse_dates, list): - for val in self.parse_dates: - if isinstance(val, list): - for k in val: - _set(k) - else: - _set(val) - - elif isinstance(self.parse_dates, dict): - for val in self.parse_dates.values(): - if isinstance(val, list): - for k in val: - _set(k) - else: - _set(val) - - elif self.parse_dates: - if isinstance(self.index_col, list): - for k in self.index_col: - _set(k) - elif self.index_col is not None: - _set(self.index_col) - - def set_error_bad_lines(self, status): - self._reader.set_error_bad_lines(int(status)) - - def read(self, nrows=None): - try: - data = self._reader.read(nrows) - except StopIteration: - if self._first_chunk: - self._first_chunk = False - names = self._maybe_dedup_names(self.orig_names) - index, columns, col_dict = _get_empty_meta( - names, - self.index_col, - self.index_names, - dtype=self.kwds.get("dtype"), - ) - columns = self._maybe_make_multi_index_columns(columns, self.col_names) - - if self.usecols is not None: - columns = self._filter_usecols(columns) - - col_dict = {k: v for k, v in col_dict.items() if k in columns} - - return index, columns, col_dict - - else: - self.close() - raise - - # Done with first read, next time raise StopIteration - self._first_chunk = False - - names = self.names - - if self._reader.leading_cols: - if self._has_complex_date_col: - raise NotImplementedError("file structure not yet supported") - - # implicit index, no index names - arrays = [] - - for i in range(self._reader.leading_cols): - if self.index_col is None: - values = data.pop(i) - else: - values = data.pop(self.index_col[i]) - - values = self._maybe_parse_dates(values, i, try_parse_dates=True) - arrays.append(values) - - index = ensure_index_from_sequences(arrays) - - if self.usecols is not None: - names = self._filter_usecols(names) - - names = self._maybe_dedup_names(names) - - # rename dict keys - data = sorted(data.items()) - data = {k: v for k, (i, v) in zip(names, data)} - - names, data = self._do_date_conversions(names, data) - - else: - # rename dict keys - data = sorted(data.items()) - - # ugh, 
mutation - - # assert for mypy, orig_names is List or None, None would error in list(...) - assert self.orig_names is not None - names = list(self.orig_names) - names = self._maybe_dedup_names(names) - - if self.usecols is not None: - names = self._filter_usecols(names) - - # columns as list - alldata = [x[1] for x in data] - - data = {k: v for k, (i, v) in zip(names, data)} - - names, data = self._do_date_conversions(names, data) - index, names = self._make_index(data, alldata, names) - - # maybe create a mi on the columns - names = self._maybe_make_multi_index_columns(names, self.col_names) - - return index, names, data - - def _filter_usecols(self, names): - # hackish - usecols = _evaluate_usecols(self.usecols, names) - if usecols is not None and len(names) != len(usecols): - names = [ - name for i, name in enumerate(names) if i in usecols or name in usecols - ] - return names - - def _get_index_names(self): - names = list(self._reader.header[0]) - idx_names = None - - if self._reader.leading_cols == 0 and self.index_col is not None: - (idx_names, names, self.index_col) = _clean_index_names( - names, self.index_col, self.unnamed_cols - ) - - return names, idx_names - - def _maybe_parse_dates(self, values, index, try_parse_dates=True): - if try_parse_dates and self._should_parse_dates(index): - values = self._date_conv(values) - return values - - -def TextParser(*args, **kwds): - """ - Converts lists of lists/tuples into DataFrames with proper type inference - and optional (e.g. string to datetime) conversion. Also enables iterating - lazily over chunks of large files - - Parameters - ---------- - data : file-like object or list - delimiter : separator character to use - dialect : str or csv.Dialect instance, optional - Ignored if delimiter is longer than 1 character - names : sequence, default - header : int, default 0 - Row to use to parse column labels. Defaults to the first row. Prior - rows will be discarded - index_col : int or list, optional - Column or columns to use as the (possibly hierarchical) index - has_index_names: bool, default False - True if the cols defined in index_col have an index name and are - not in the header. - na_values : scalar, str, list-like, or dict, optional - Additional strings to recognize as NA/NaN. - keep_default_na : bool, default True - thousands : str, optional - Thousands separator - comment : str, optional - Comment out remainder of line - parse_dates : bool, default False - keep_date_col : bool, default False - date_parser : function, optional - skiprows : list of integers - Row numbers to skip - skipfooter : int - Number of line at bottom of file to skip - converters : dict, optional - Dict of functions for converting values in certain columns. Keys can - either be integers or column labels, values are functions that take one - input argument, the cell (not column) content, and return the - transformed content. - encoding : str, optional - Encoding to use for UTF when reading/writing (ex. 'utf-8') - squeeze : bool, default False - returns Series if only one column. - infer_datetime_format: bool, default False - If True and `parse_dates` is True for a column, try to infer the - datetime format based on the first datetime string. If the format - can be inferred, there often will be a large parsing speed-up. - float_precision : str, optional - Specifies which converter the C engine should use for floating-point - values. 
The options are `None` or `high` for the ordinary converter, - `legacy` for the original lower precision pandas converter, and - `round_trip` for the round-trip converter. - - .. versionchanged:: 1.2 - """ - kwds["engine"] = "python" - return TextFileReader(*args, **kwds) - - -def count_empty_vals(vals) -> int: - return sum(1 for v in vals if v == "" or v is None) - - -class PythonParser(ParserBase): - def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): - """ - Workhorse function for processing nested list into DataFrame - """ - ParserBase.__init__(self, kwds) - - self.data: Optional[Iterator[str]] = None - self.buf: List = [] - self.pos = 0 - self.line_pos = 0 - - self.skiprows = kwds["skiprows"] - - if callable(self.skiprows): - self.skipfunc = self.skiprows - else: - self.skipfunc = lambda x: x in self.skiprows - - self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"]) - self.delimiter = kwds["delimiter"] - - self.quotechar = kwds["quotechar"] - if isinstance(self.quotechar, str): - self.quotechar = str(self.quotechar) - - self.escapechar = kwds["escapechar"] - self.doublequote = kwds["doublequote"] - self.skipinitialspace = kwds["skipinitialspace"] - self.lineterminator = kwds["lineterminator"] - self.quoting = kwds["quoting"] - self.usecols, _ = _validate_usecols_arg(kwds["usecols"]) - self.skip_blank_lines = kwds["skip_blank_lines"] - - self.warn_bad_lines = kwds["warn_bad_lines"] - self.error_bad_lines = kwds["error_bad_lines"] - - self.names_passed = kwds["names"] or None - - self.has_index_names = False - if "has_index_names" in kwds: - self.has_index_names = kwds["has_index_names"] - - self.verbose = kwds["verbose"] - self.converters = kwds["converters"] - - self.dtype = kwds["dtype"] - self.thousands = kwds["thousands"] - self.decimal = kwds["decimal"] - - self.comment = kwds["comment"] - - # Set self.data to something that can read lines. - if isinstance(f, list): - # read_excel: f is a list - self.data = cast(Iterator[str], f) - else: - self._open_handles(f, kwds) - assert self.handles is not None - assert hasattr(self.handles.handle, "readline") - try: - self._make_reader(self.handles.handle) - except (csv.Error, UnicodeDecodeError): - self.close() - raise - - # Get columns in two steps: infer from data, then - # infer column indices from self.usecols if it is specified. - self._col_indices: Optional[List[int]] = None - try: - ( - self.columns, - self.num_original_columns, - self.unnamed_cols, - ) = self._infer_columns() - except (TypeError, ValueError): - self.close() - raise - - # Now self.columns has the set of columns that we will process. - # The original set is stored in self.original_columns. - if len(self.columns) > 1: - # we are processing a multi index column - ( - self.columns, - self.index_names, - self.col_names, - _, - ) = self._extract_multi_indexer_columns( - self.columns, self.index_names, self.col_names - ) - # Update list of original names to include all indices. 
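To make the multi-index-columns branch concrete (the deleted code continues below), a hedged sketch against the public API with invented data; the behavior shown is that of recent 1.x releases.

    from io import StringIO

    import pandas as pd

    # Two header rows produce a MultiIndex on the columns, the case handled
    # by _extract_multi_indexer_columns.
    data = StringIO("A,A,B\nx,y,z\n1,2,3\n")
    df = pd.read_csv(data, header=[0, 1], engine="python")
    print(df.columns.tolist())  # [('A', 'x'), ('A', 'y'), ('B', 'z')]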
- self.num_original_columns = len(self.columns) - else: - self.columns = self.columns[0] - - # get popped off for index - self.orig_names = list(self.columns) - - # needs to be cleaned/refactored - # multiple date column thing turning into a real spaghetti factory - - if not self._has_complex_date_col: - (index_names, self.orig_names, self.columns) = self._get_index_name( - self.columns - ) - self._name_processed = True - if self.index_names is None: - self.index_names = index_names - - if self._col_indices is None: - self._col_indices = list(range(len(self.columns))) - - self._validate_parse_dates_presence(self.columns) - if self.parse_dates: - self._no_thousands_columns = self._set_no_thousands_columns() - else: - self._no_thousands_columns = None - - if len(self.decimal) != 1: - raise ValueError("Only length-1 decimal markers supported") - - decimal = re.escape(self.decimal) - if self.thousands is None: - regex = fr"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$" - else: - thousands = re.escape(self.thousands) - regex = ( - fr"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?" - fr"([0-9]?(E|e)\-?[0-9]+)?$" - ) - self.num = re.compile(regex) - - def _set_no_thousands_columns(self): - # Create a set of column ids that are not to be stripped of thousands - # operators. - noconvert_columns = set() - - def _set(x): - if is_integer(x): - noconvert_columns.add(x) - else: - assert self._col_indices is not None - col_indices = self._col_indices - noconvert_columns.add(col_indices[self.columns.index(x)]) - - if isinstance(self.parse_dates, list): - for val in self.parse_dates: - if isinstance(val, list): - for k in val: - _set(k) - else: - _set(val) - - elif isinstance(self.parse_dates, dict): - for val in self.parse_dates.values(): - if isinstance(val, list): - for k in val: - _set(k) - else: - _set(val) - - elif self.parse_dates: - if isinstance(self.index_col, list): - for k in self.index_col: - _set(k) - elif self.index_col is not None: - _set(self.index_col) - - return noconvert_columns - - def _make_reader(self, f): - sep = self.delimiter - - if sep is None or len(sep) == 1: - if self.lineterminator: - raise ValueError( - "Custom line terminators not supported in python parser (yet)" - ) - - class MyDialect(csv.Dialect): - delimiter = self.delimiter - quotechar = self.quotechar - escapechar = self.escapechar - doublequote = self.doublequote - skipinitialspace = self.skipinitialspace - quoting = self.quoting - lineterminator = "\n" - - dia = MyDialect - - if sep is not None: - dia.delimiter = sep - else: - # attempt to sniff the delimiter from the first valid line, - # i.e. 
no comment line and not in skiprows - line = f.readline() - lines = self._check_comments([[line]])[0] - while self.skipfunc(self.pos) or not lines: - self.pos += 1 - line = f.readline() - lines = self._check_comments([[line]])[0] - - # since `line` was a string, lines will be a list containing - # only a single string - line = lines[0] - - self.pos += 1 - self.line_pos += 1 - sniffed = csv.Sniffer().sniff(line) - dia.delimiter = sniffed.delimiter - - # Note: encoding is irrelevant here - line_rdr = csv.reader(StringIO(line), dialect=dia) - self.buf.extend(list(line_rdr)) - - # Note: encoding is irrelevant here - reader = csv.reader(f, dialect=dia, strict=True) - - else: - - def _read(): - line = f.readline() - pat = re.compile(sep) - - yield pat.split(line.strip()) - - for line in f: - yield pat.split(line.strip()) - - reader = _read() - - # pandas\io\parsers.py:2427: error: Incompatible types in assignment - # (expression has type "_reader", variable has type "Union[IO[Any], - # RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap, None]") - # [assignment] - self.data = reader # type: ignore[assignment] - - def read(self, rows=None): - try: - content = self._get_lines(rows) - except StopIteration: - if self._first_chunk: - content = [] - else: - self.close() - raise - - # done with first read, next time raise StopIteration - self._first_chunk = False - - # pandas\io\parsers.py:2480: error: Argument 1 to "list" has - # incompatible type "Optional[Any]"; expected "Iterable[Any]" - # [arg-type] - columns = list(self.orig_names) # type: ignore[arg-type] - if not len(content): # pragma: no cover - # DataFrame with the right metadata, even though it's length 0 - names = self._maybe_dedup_names(self.orig_names) - index, columns, col_dict = _get_empty_meta( - names, self.index_col, self.index_names, self.dtype - ) - columns = self._maybe_make_multi_index_columns(columns, self.col_names) - return index, columns, col_dict - - # handle new style for names in index - count_empty_content_vals = count_empty_vals(content[0]) - indexnamerow = None - if self.has_index_names and count_empty_content_vals == len(columns): - indexnamerow = content[0] - content = content[1:] - - alldata = self._rows_to_cols(content) - data, columns = self._exclude_implicit_index(alldata) - - columns, data = self._do_date_conversions(columns, data) - - data = self._convert_data(data) - index, columns = self._make_index(data, alldata, columns, indexnamerow) - - return index, columns, data - - def _exclude_implicit_index(self, alldata): - names = self._maybe_dedup_names(self.orig_names) - - offset = 0 - if self._implicit_index: - offset = len(self.index_col) - - if self._col_indices is not None and len(names) != len(self._col_indices): - names = [names[i] for i in sorted(self._col_indices)] - - return {name: alldata[i + offset] for i, name in enumerate(names)}, names - - # legacy - def get_chunk(self, size=None): - if size is None: - # pandas\io\parsers.py:2528: error: "PythonParser" has no attribute - # "chunksize" [attr-defined] - size = self.chunksize # type: ignore[attr-defined] - return self.read(rows=size) - - def _convert_data(self, data): - # apply converters - def _clean_mapping(mapping): - """converts col numbers to names""" - clean = {} - for col, v in mapping.items(): - # pandas\io\parsers.py:2537: error: Unsupported right operand - # type for in ("Optional[Any]") [operator] - if ( - isinstance(col, int) - and col not in self.orig_names # type: ignore[operator] - ): - # pandas\io\parsers.py:2538: error: Value of 
type - # "Optional[Any]" is not indexable [index] - col = self.orig_names[col] # type: ignore[index] - clean[col] = v - return clean - - clean_conv = _clean_mapping(self.converters) - if not isinstance(self.dtype, dict): - # handles single dtype applied to all columns - clean_dtypes = self.dtype - else: - clean_dtypes = _clean_mapping(self.dtype) - - # Apply NA values. - clean_na_values = {} - clean_na_fvalues = {} - - if isinstance(self.na_values, dict): - for col in self.na_values: - na_value = self.na_values[col] - na_fvalue = self.na_fvalues[col] - - # pandas\io\parsers.py:2558: error: Unsupported right operand - # type for in ("Optional[Any]") [operator] - if ( - isinstance(col, int) - and col not in self.orig_names # type: ignore[operator] - ): - # pandas\io\parsers.py:2559: error: Value of type - # "Optional[Any]" is not indexable [index] - col = self.orig_names[col] # type: ignore[index] - - clean_na_values[col] = na_value - clean_na_fvalues[col] = na_fvalue - else: - clean_na_values = self.na_values - clean_na_fvalues = self.na_fvalues - - return self._convert_to_ndarrays( - data, - clean_na_values, - clean_na_fvalues, - self.verbose, - clean_conv, - clean_dtypes, - ) - - def _infer_columns(self): - names = self.names - num_original_columns = 0 - clear_buffer = True - # pandas\io\parsers.py:2580: error: Need type annotation for - # 'unnamed_cols' (hint: "unnamed_cols: Set[] = ...") - # [var-annotated] - unnamed_cols = set() # type: ignore[var-annotated] - - if self.header is not None: - header = self.header - - if isinstance(header, (list, tuple, np.ndarray)): - have_mi_columns = len(header) > 1 - # we have a mi columns, so read an extra line - if have_mi_columns: - header = list(header) + [header[-1] + 1] - else: - have_mi_columns = False - header = [header] - - # pandas\io\parsers.py:2594: error: Need type annotation for - # 'columns' (hint: "columns: List[] = ...") [var-annotated] - columns = [] # type: ignore[var-annotated] - for level, hr in enumerate(header): - try: - line = self._buffered_line() - - while self.line_pos <= hr: - line = self._next_line() - - except StopIteration as err: - if self.line_pos < hr: - raise ValueError( - f"Passed header={hr} but only {self.line_pos + 1} lines in " - "file" - ) from err - - # We have an empty file, so check - # if columns are provided. 
That will - # serve as the 'line' for parsing - if have_mi_columns and hr > 0: - if clear_buffer: - self._clear_buffer() - columns.append([None] * len(columns[-1])) - return columns, num_original_columns, unnamed_cols - - if not self.names: - raise EmptyDataError("No columns to parse from file") from err - - line = self.names[:] - - this_columns = [] - this_unnamed_cols = [] - - for i, c in enumerate(line): - if c == "": - if have_mi_columns: - col_name = f"Unnamed: {i}_level_{level}" - else: - col_name = f"Unnamed: {i}" - - this_unnamed_cols.append(i) - this_columns.append(col_name) - else: - this_columns.append(c) - - if not have_mi_columns and self.mangle_dupe_cols: - # pandas\io\parsers.py:2639: error: Need type annotation - # for 'counts' [var-annotated] - counts = defaultdict(int) # type: ignore[var-annotated] - - for i, col in enumerate(this_columns): - cur_count = counts[col] - - while cur_count > 0: - counts[col] = cur_count + 1 - col = f"{col}.{cur_count}" - cur_count = counts[col] - - this_columns[i] = col - counts[col] = cur_count + 1 - elif have_mi_columns: - - # if we have grabbed an extra line, but its not in our - # format so save in the buffer, and create an blank extra - # line for the rest of the parsing code - if hr == header[-1]: - lc = len(this_columns) - ic = len(self.index_col) if self.index_col is not None else 0 - unnamed_count = len(this_unnamed_cols) - - if lc != unnamed_count and lc - ic > unnamed_count: - clear_buffer = False - # pandas\io\parsers.py:2663: error: List item 0 has - # incompatible type "None"; expected "str" - # [list-item] - this_columns = [None] * lc # type: ignore[list-item] - self.buf = [self.buf[-1]] - - # pandas\io\parsers.py:2666: error: Argument 1 to "append" of - # "list" has incompatible type "List[str]"; expected - # "List[None]" [arg-type] - columns.append(this_columns) # type: ignore[arg-type] - unnamed_cols.update({this_columns[i] for i in this_unnamed_cols}) - - if len(columns) == 1: - num_original_columns = len(this_columns) - - if clear_buffer: - self._clear_buffer() - - if names is not None: - if len(names) > len(columns[0]): - raise ValueError( - "Number of passed names did not match " - "number of header fields in the file" - ) - if len(columns) > 1: - raise TypeError("Cannot pass names with multi-index columns") - - if self.usecols is not None: - # Set _use_cols. We don't store columns because they are - # overwritten. 
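The usecols bookkeeping here is observable from the public API; a small sketch with invented data (the deleted call continues below).

    from io import StringIO

    import pandas as pd

    # With header=None the passed names label the file's fields first, then
    # the usecols subset is applied by name or by position.
    data = StringIO("1,2,3\n4,5,6\n")
    df = pd.read_csv(data, header=None, names=["a", "b", "c"], usecols=["a", "c"])
    print(df.columns.tolist())  # ['a', 'c']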
- self._handle_usecols(columns, names) - else: - num_original_columns = len(names) - columns = [names] - else: - columns = self._handle_usecols(columns, columns[0]) - else: - try: - line = self._buffered_line() - - except StopIteration as err: - if not names: - raise EmptyDataError("No columns to parse from file") from err - - line = names[:] - - ncols = len(line) - num_original_columns = ncols - - if not names: - if self.prefix: - # pandas\io\parsers.py:2711: error: List comprehension has - # incompatible type List[str]; expected List[None] [misc] - columns = [ - [ - f"{self.prefix}{i}" # type: ignore[misc] - for i in range(ncols) - ] - ] - else: - # pandas\io\parsers.py:2713: error: Argument 1 to "list" - # has incompatible type "range"; expected "Iterable[None]" - # [arg-type] - columns = [list(range(ncols))] # type: ignore[arg-type] - columns = self._handle_usecols(columns, columns[0]) - else: - if self.usecols is None or len(names) >= num_original_columns: - columns = self._handle_usecols([names], names) - num_original_columns = len(names) - else: - if not callable(self.usecols) and len(names) != len(self.usecols): - raise ValueError( - "Number of passed names did not match number of " - "header fields in the file" - ) - # Ignore output but set used columns. - self._handle_usecols([names], names) - columns = [names] - num_original_columns = ncols - - return columns, num_original_columns, unnamed_cols - - def _handle_usecols(self, columns, usecols_key): - """ - Sets self._col_indices - - usecols_key is used if there are string usecols. - """ - if self.usecols is not None: - if callable(self.usecols): - col_indices = _evaluate_usecols(self.usecols, usecols_key) - elif any(isinstance(u, str) for u in self.usecols): - if len(columns) > 1: - raise ValueError( - "If using multiple headers, usecols must be integers." - ) - col_indices = [] - - for col in self.usecols: - if isinstance(col, str): - try: - col_indices.append(usecols_key.index(col)) - except ValueError: - _validate_usecols_names(self.usecols, usecols_key) - else: - col_indices.append(col) - else: - col_indices = self.usecols - - columns = [ - [n for i, n in enumerate(column) if i in col_indices] - for column in columns - ] - self._col_indices = sorted(col_indices) - return columns - - def _buffered_line(self): - """ - Return a line from buffer, filling buffer if required. - """ - if len(self.buf) > 0: - return self.buf[0] - else: - return self._next_line() - - def _check_for_bom(self, first_row): - """ - Checks whether the file begins with the BOM character. - If it does, remove it. In addition, if there is quoting - in the field subsequent to the BOM, remove it as well - because it technically takes place at the beginning of - the name, not the middle of it. - """ - # first_row will be a list, so we need to check - # that that list is not empty before proceeding. - if not first_row: - return first_row - - # The first element of this row is the one that could have the - # BOM that we want to remove. Check that the first element is a - # string before proceeding. - if not isinstance(first_row[0], str): - return first_row - - # Check that the string is not empty, as that would - # obviously not have a BOM at the start of it. - if not first_row[0]: - return first_row - - # Since the string is non-empty, check that it does - # in fact begin with a BOM. 
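A short sketch of the BOM handling end to end (the check itself follows below); the stream contents are invented.

    from io import StringIO

    import pandas as pd

    # A BOM at the start of the stream is stripped rather than being glued
    # onto the first column name.
    data = StringIO("\ufeffa,b\n1,2\n")
    df = pd.read_csv(data, engine="python")
    print(df.columns.tolist())  # ['a', 'b'], not ['\ufeffa', 'b']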
- first_elt = first_row[0][0] - if first_elt != _BOM: - return first_row - - first_row_bom = first_row[0] - - if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar: - start = 2 - quote = first_row_bom[1] - end = first_row_bom[2:].index(quote) + 2 - - # Extract the data between the quotation marks - new_row = first_row_bom[start:end] - - # Extract any remaining data after the second - # quotation mark. - if len(first_row_bom) > end + 1: - new_row += first_row_bom[end + 1 :] - - else: - - # No quotation so just remove BOM from first element - new_row = first_row_bom[1:] - return [new_row] + first_row[1:] - - def _is_line_empty(self, line): - """ - Check if a line is empty or not. - - Parameters - ---------- - line : str, array-like - The line of data to check. - - Returns - ------- - boolean : Whether or not the line is empty. - """ - return not line or all(not x for x in line) - - def _next_line(self): - if isinstance(self.data, list): - while self.skipfunc(self.pos): - self.pos += 1 - - while True: - try: - line = self._check_comments([self.data[self.pos]])[0] - self.pos += 1 - # either uncommented or blank to begin with - if not self.skip_blank_lines and ( - self._is_line_empty(self.data[self.pos - 1]) or line - ): - break - elif self.skip_blank_lines: - ret = self._remove_empty_lines([line]) - if ret: - line = ret[0] - break - except IndexError: - raise StopIteration - else: - while self.skipfunc(self.pos): - self.pos += 1 - # assert for mypy, data is Iterator[str] or None, would error in next - assert self.data is not None - next(self.data) - - while True: - orig_line = self._next_iter_line(row_num=self.pos + 1) - self.pos += 1 - - if orig_line is not None: - line = self._check_comments([orig_line])[0] - - if self.skip_blank_lines: - ret = self._remove_empty_lines([line]) - - if ret: - line = ret[0] - break - elif self._is_line_empty(orig_line) or line: - break - - # This was the first line of the file, - # which could contain the BOM at the - # beginning of it. - if self.pos == 1: - line = self._check_for_bom(line) - - self.line_pos += 1 - self.buf.append(line) - return line - - def _alert_malformed(self, msg, row_num): - """ - Alert a user about a malformed row. - - If `self.error_bad_lines` is True, the alert will be `ParserError`. - If `self.warn_bad_lines` is True, the alert will be printed out. - - Parameters - ---------- - msg : The error message to display. - row_num : The row number where the parsing error occurred. - Because this row number is displayed, we 1-index, - even though we 0-index internally. - """ - if self.error_bad_lines: - raise ParserError(msg) - elif self.warn_bad_lines: - base = f"Skipping line {row_num}: " - sys.stderr.write(base + msg + "\n") - - def _next_iter_line(self, row_num): - """ - Wrapper around iterating through `self.data` (CSV source). - - When a CSV error is raised, we check for specific - error messages that allow us to customize the - error message displayed to the user. - - Parameters - ---------- - row_num : The row number of the line being parsed. - """ - try: - # assert for mypy, data is Iterator[str] or None, would error in next - assert self.data is not None - return next(self.data) - except csv.Error as e: - if self.warn_bad_lines or self.error_bad_lines: - msg = str(e) - - if "NULL byte" in msg or "line contains NUL" in msg: - msg = ( - "NULL byte detected. 
This byte " - "cannot be processed in Python's " - "native csv library at the moment, " - "so please pass in engine='c' instead" - ) - - if self.skipfooter > 0: - reason = ( - "Error could possibly be due to " - "parsing errors in the skipped footer rows " - "(the skipfooter keyword is only applied " - "after Python's csv library has parsed " - "all rows)." - ) - msg += ". " + reason - - self._alert_malformed(msg, row_num) - return None - - def _check_comments(self, lines): - if self.comment is None: - return lines - ret = [] - for line in lines: - rl = [] - for x in line: - if ( - not isinstance(x, str) - or self.comment not in x - or x in self.na_values - ): - rl.append(x) - else: - x = x[: x.find(self.comment)] - if len(x) > 0: - rl.append(x) - break - ret.append(rl) - return ret - - def _remove_empty_lines(self, lines): - """ - Iterate through the lines and remove any that are - either empty or contain only one whitespace value - - Parameters - ---------- - lines : array-like - The array of lines that we are to filter. - - Returns - ------- - filtered_lines : array-like - The same array of lines with the "empty" ones removed. - """ - ret = [] - for line in lines: - # Remove empty lines and lines with only one whitespace value - if ( - len(line) > 1 - or len(line) == 1 - and (not isinstance(line[0], str) or line[0].strip()) - ): - ret.append(line) - return ret - - def _check_thousands(self, lines): - if self.thousands is None: - return lines - - return self._search_replace_num_columns( - lines=lines, search=self.thousands, replace="" - ) - - def _search_replace_num_columns(self, lines, search, replace): - ret = [] - for line in lines: - rl = [] - for i, x in enumerate(line): - if ( - not isinstance(x, str) - or search not in x - or (self._no_thousands_columns and i in self._no_thousands_columns) - or not self.num.search(x.strip()) - ): - rl.append(x) - else: - rl.append(x.replace(search, replace)) - ret.append(rl) - return ret - - def _check_decimal(self, lines): - if self.decimal == _parser_defaults["decimal"]: - return lines - - return self._search_replace_num_columns( - lines=lines, search=self.decimal, replace="." - ) - - def _clear_buffer(self): - self.buf = [] - - _implicit_index = False - - def _get_index_name(self, columns): - """ - Try several cases to get lines: - - 0) There are headers on row 0 and row 1 and their - total summed lengths equals the length of the next line. - Treat row 0 as columns and row 1 as indices - 1) Look for implicit index: there are more columns - on row 1 than row 0. If this is true, assume that row - 1 lists index columns and row 0 lists normal columns. - 2) Get index from the columns if it was listed. - """ - orig_names = list(columns) - columns = list(columns) - - try: - line = self._next_line() - except StopIteration: - line = None - - try: - next_line = self._next_line() - except StopIteration: - next_line = None - - # implicitly index_col=0 b/c 1 fewer column names - implicit_first_cols = 0 - if line is not None: - # leave it 0, #2442 - # Case 1 - if self.index_col is not False: - implicit_first_cols = len(line) - self.num_original_columns - - # Case 0 - if next_line is not None: - if len(next_line) == len(line) + self.num_original_columns: - # column and index names on diff rows - self.index_col = list(range(len(line))) - self.buf = self.buf[1:] - - for c in reversed(line): - columns.insert(0, c) - - # Update list of original names to include all indices. 
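The implicit-index case that _get_index_name distinguishes shows up directly in the public API; a hedged sketch with invented data (the deleted branch continues below).

    from io import StringIO

    import pandas as pd

    # Case 1 ("implicit" index): the header row has one fewer field than the
    # data rows, so the leading column becomes the index.
    data = StringIO("a,b\nr1,1,2\nr2,3,4\n")
    df = pd.read_csv(data)
    print(df.index.tolist())    # ['r1', 'r2']
    print(df.columns.tolist())  # ['a', 'b']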
- orig_names = list(columns) - self.num_original_columns = len(columns) - return line, orig_names, columns - - if implicit_first_cols > 0: - # Case 1 - self._implicit_index = True - if self.index_col is None: - self.index_col = list(range(implicit_first_cols)) - - index_name = None - - else: - # Case 2 - (index_name, columns_, self.index_col) = _clean_index_names( - columns, self.index_col, self.unnamed_cols - ) - - return index_name, orig_names, columns - - def _rows_to_cols(self, content): - col_len = self.num_original_columns - - if self._implicit_index: - col_len += len(self.index_col) - - max_len = max(len(row) for row in content) - - # Check that there are no rows with too many - # elements in their row (rows with too few - # elements are padded with NaN). - if max_len > col_len and self.index_col is not False and self.usecols is None: - - footers = self.skipfooter if self.skipfooter else 0 - bad_lines = [] - - iter_content = enumerate(content) - content_len = len(content) - content = [] - - for (i, l) in iter_content: - actual_len = len(l) - - if actual_len > col_len: - if self.error_bad_lines or self.warn_bad_lines: - row_num = self.pos - (content_len - i + footers) - bad_lines.append((row_num, actual_len)) - - if self.error_bad_lines: - break - else: - content.append(l) - - for row_num, actual_len in bad_lines: - msg = ( - f"Expected {col_len} fields in line {row_num + 1}, saw " - f"{actual_len}" - ) - if ( - self.delimiter - and len(self.delimiter) > 1 - and self.quoting != csv.QUOTE_NONE - ): - # see gh-13374 - reason = ( - "Error could possibly be due to quotes being " - "ignored when a multi-char delimiter is used." - ) - msg += ". " + reason - - self._alert_malformed(msg, row_num + 1) - - # see gh-13320 - zipped_content = list(lib.to_object_array(content, min_width=col_len).T) - - if self.usecols: - assert self._col_indices is not None - col_indices = self._col_indices - - if self._implicit_index: - zipped_content = [ - a - for i, a in enumerate(zipped_content) - if ( - i < len(self.index_col) - or i - len(self.index_col) in col_indices - ) - ] - else: - zipped_content = [ - a for i, a in enumerate(zipped_content) if i in col_indices - ] - return zipped_content - - def _get_lines(self, rows=None): - lines = self.buf - new_rows = None - - # already fetched some number - if rows is not None: - # we already have the lines in the buffer - if len(self.buf) >= rows: - new_rows, self.buf = self.buf[:rows], self.buf[rows:] - - # need some lines - else: - rows -= len(self.buf) - - if new_rows is None: - if isinstance(self.data, list): - if self.pos > len(self.data): - raise StopIteration - if rows is None: - new_rows = self.data[self.pos :] - new_pos = len(self.data) - else: - new_rows = self.data[self.pos : self.pos + rows] - new_pos = self.pos + rows - - # Check for stop rows. n.b.: self.skiprows is a set. 
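Both forms of skiprows funnel through the same skip function noted above; a quick sketch with invented data (the skip logic continues below).

    from io import StringIO

    import pandas as pd

    # skiprows as a collection of row indices versus as a callable over the
    # zero-based row index.
    data = "a,b\n1,2\n3,4\n5,6\n"
    print(pd.read_csv(StringIO(data), skiprows=[1]))               # drops "1,2"
    print(pd.read_csv(StringIO(data), skiprows=lambda i: i == 2))  # drops "3,4"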
- if self.skiprows: - new_rows = [ - row - for i, row in enumerate(new_rows) - if not self.skipfunc(i + self.pos) - ] - - lines.extend(new_rows) - self.pos = new_pos - - else: - new_rows = [] - try: - if rows is not None: - for _ in range(rows): - # assert for mypy, data is Iterator[str] or None, would - # error in next - assert self.data is not None - new_rows.append(next(self.data)) - lines.extend(new_rows) - else: - rows = 0 - - while True: - new_row = self._next_iter_line(row_num=self.pos + rows + 1) - rows += 1 - - if new_row is not None: - new_rows.append(new_row) - - except StopIteration: - if self.skiprows: - new_rows = [ - row - for i, row in enumerate(new_rows) - if not self.skipfunc(i + self.pos) - ] - lines.extend(new_rows) - if len(lines) == 0: - raise - self.pos += len(new_rows) - - self.buf = [] - else: - lines = new_rows - - if self.skipfooter: - lines = lines[: -self.skipfooter] - - lines = self._check_comments(lines) - if self.skip_blank_lines: - lines = self._remove_empty_lines(lines) - lines = self._check_thousands(lines) - return self._check_decimal(lines) - - -def _make_date_converter( - date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True -): - def converter(*date_cols): - if date_parser is None: - strs = parsing.concat_date_cols(date_cols) - - try: - return tools.to_datetime( - ensure_object(strs), - utc=None, - dayfirst=dayfirst, - errors="ignore", - infer_datetime_format=infer_datetime_format, - cache=cache_dates, - ).to_numpy() - - except ValueError: - return tools.to_datetime( - parsing.try_parse_dates(strs, dayfirst=dayfirst), cache=cache_dates - ) - else: - try: - result = tools.to_datetime( - date_parser(*date_cols), errors="ignore", cache=cache_dates - ) - if isinstance(result, datetime.datetime): - raise Exception("scalar parser") - return result - except Exception: - try: - return tools.to_datetime( - parsing.try_parse_dates( - parsing.concat_date_cols(date_cols), - parser=date_parser, - dayfirst=dayfirst, - ), - errors="ignore", - ) - except Exception: - return generic_parser(date_parser, *date_cols) - - return converter - - -def _process_date_conversion( - data_dict, - converter, - parse_spec, - index_col, - index_names, - columns, - keep_date_col=False, -): - def _isindex(colspec): - return (isinstance(index_col, list) and colspec in index_col) or ( - isinstance(index_names, list) and colspec in index_names - ) - - new_cols = [] - new_data = {} - - orig_names = columns - columns = list(columns) - - date_cols = set() - - if parse_spec is None or isinstance(parse_spec, bool): - return data_dict, columns - - if isinstance(parse_spec, list): - # list of column lists - for colspec in parse_spec: - if is_scalar(colspec): - if isinstance(colspec, int) and colspec not in data_dict: - colspec = orig_names[colspec] - if _isindex(colspec): - continue - data_dict[colspec] = converter(data_dict[colspec]) - else: - new_name, col, old_names = _try_convert_dates( - converter, colspec, data_dict, orig_names - ) - if new_name in data_dict: - raise ValueError(f"New date column already in dict {new_name}") - new_data[new_name] = col - new_cols.append(new_name) - date_cols.update(old_names) - - elif isinstance(parse_spec, dict): - # dict of new name to column list - for new_name, colspec in parse_spec.items(): - if new_name in data_dict: - raise ValueError(f"Date column {new_name} already in dict") - - _, col, old_names = _try_convert_dates( - converter, colspec, data_dict, orig_names - ) - - new_data[new_name] = col - new_cols.append(new_name) - 
date_cols.update(old_names) - - data_dict.update(new_data) - new_cols.extend(columns) - - if not keep_date_col: - for c in list(date_cols): - data_dict.pop(c) - new_cols.remove(c) - - return data_dict, new_cols - - -def _try_convert_dates(parser, colspec, data_dict, columns): - colset = set(columns) - colnames = [] - - for c in colspec: - if c in colset: - colnames.append(c) - elif isinstance(c, int) and c not in columns: - colnames.append(columns[c]) - else: - colnames.append(c) - - new_name = "_".join(str(x) for x in colnames) - to_parse = [data_dict[c] for c in colnames if c in data_dict] - - new_col = parser(*to_parse) - return new_name, new_col, colnames - - -def _clean_na_values(na_values, keep_default_na=True): - - if na_values is None: - if keep_default_na: - na_values = STR_NA_VALUES - else: - na_values = set() - # pandas\io\parsers.py:3387: error: Need type annotation for - # 'na_fvalues' (hint: "na_fvalues: Set[] = ...") [var-annotated] - na_fvalues = set() # type: ignore[var-annotated] - elif isinstance(na_values, dict): - old_na_values = na_values.copy() - na_values = {} # Prevent aliasing. - - # Convert the values in the na_values dictionary - # into array-likes for further use. This is also - # where we append the default NaN values, provided - # that `keep_default_na=True`. - for k, v in old_na_values.items(): - if not is_list_like(v): - v = [v] - - if keep_default_na: - v = set(v) | STR_NA_VALUES - - na_values[k] = v - # pandas\io\parsers.py:3404: error: Incompatible types in assignment - # (expression has type "Dict[Any, Any]", variable has type "Set[Any]") - # [assignment] - na_fvalues = { # type: ignore[assignment] - k: _floatify_na_values(v) for k, v in na_values.items() - } - else: - if not is_list_like(na_values): - na_values = [na_values] - na_values = _stringify_na_values(na_values) - if keep_default_na: - na_values = na_values | STR_NA_VALUES - - na_fvalues = _floatify_na_values(na_values) - - return na_values, na_fvalues - - -def _clean_index_names(columns, index_col, unnamed_cols): - if not _is_index_col(index_col): - return None, columns, index_col - - columns = list(columns) - - # In case of no rows and multiindex columns we have to set index_names to - # list of Nones GH#38292 - if not columns: - return [None] * len(index_col), columns, index_col - - cp_cols = list(columns) - index_names = [] - - # don't mutate - index_col = list(index_col) - - for i, c in enumerate(index_col): - if isinstance(c, str): - index_names.append(c) - for j, name in enumerate(cp_cols): - if name == c: - index_col[i] = j - columns.remove(name) - break - else: - name = cp_cols[c] - columns.remove(name) - index_names.append(name) - - # Only clean index names that were placeholders. - for i, name in enumerate(index_names): - if isinstance(name, str) and name in unnamed_cols: - # pandas\io\parsers.py:3445: error: No overload variant of - # "__setitem__" of "list" matches argument types "int", "None" - # [call-overload] - index_names[i] = None # type: ignore[call-overload] - - return index_names, columns, index_col - - -def _get_empty_meta(columns, index_col, index_names, dtype: Optional[DtypeArg] = None): - columns = list(columns) - - # Convert `dtype` to a defaultdict of some kind. - # This will enable us to write `dtype[col_name]` - # without worrying about KeyError issues later on. - if not is_dict_like(dtype): - # if dtype == None, default will be object. 
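The empty-input path still honours a dtype mapping, with unmapped columns falling back to object as the comment above notes; a sketch with an invented header-only file (the deleted code continues below).

    from io import StringIO

    import pandas as pd

    # A header-only file yields a zero-row frame whose dtypes follow the
    # mapping; column "b" falls back to object.
    data = StringIO("a,b\n")
    df = pd.read_csv(data, dtype={"a": "int64"})
    print(len(df))    # 0
    print(df.dtypes)  # a: int64, b: object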
- default_dtype = dtype or object - dtype = defaultdict(lambda: default_dtype) - else: - dtype = cast(dict, dtype) - dtype = defaultdict( - lambda: object, - {columns[k] if is_integer(k) else k: v for k, v in dtype.items()}, - ) - - # Even though we have no data, the "index" of the empty DataFrame - # could for example still be an empty MultiIndex. Thus, we need to - # check whether we have any index columns specified, via either: - # - # 1) index_col (column indices) - # 2) index_names (column names) - # - # Both must be non-null to ensure a successful construction. Otherwise, - # we have to create a generic empty Index. - if (index_col is None or index_col is False) or index_names is None: - index = Index([]) - else: - data = [Series([], dtype=dtype[name]) for name in index_names] - index = ensure_index_from_sequences(data, names=index_names) - index_col.sort() - - for i, n in enumerate(index_col): - columns.pop(n - i) - - col_dict = {col_name: Series([], dtype=dtype[col_name]) for col_name in columns} - - return index, columns, col_dict - - -def _floatify_na_values(na_values): - # create float versions of the na_values - result = set() - for v in na_values: - try: - v = float(v) - if not np.isnan(v): - result.add(v) - except (TypeError, ValueError, OverflowError): - pass - return result - - -def _stringify_na_values(na_values): - """ return a stringified and numeric for these values """ - result = [] - for x in na_values: - result.append(str(x)) - result.append(x) - try: - v = float(x) - - # we are like 999 here - if v == int(v): - v = int(v) - result.append(f"{v}.0") - result.append(str(v)) - - # pandas\io\parsers.py:3522: error: Argument 1 to "append" of - # "list" has incompatible type "float"; expected "str" [arg-type] - result.append(v) # type: ignore[arg-type] - except (TypeError, ValueError, OverflowError): - pass - try: - # pandas\io\parsers.py:3526: error: Argument 1 to "append" of - # "list" has incompatible type "int"; expected "str" [arg-type] - result.append(int(x)) # type: ignore[arg-type] - except (TypeError, ValueError, OverflowError): - pass - return set(result) - - -def _get_na_values(col, na_values, na_fvalues, keep_default_na): - """ - Get the NaN values for a given column. - - Parameters - ---------- - col : str - The name of the column. - na_values : array-like, dict - The object listing the NaN values as strings. - na_fvalues : array-like, dict - The object listing the NaN values as floats. - keep_default_na : bool - If `na_values` is a dict, and the column is not mapped in the - dictionary, whether to return the default NaN values or the empty set. - - Returns - ------- - nan_tuple : A length-two tuple composed of - - 1) na_values : the string NaN values for that column. - 2) na_fvalues : the float NaN values for that column. - """ - if isinstance(na_values, dict): - if col in na_values: - return na_values[col], na_fvalues[col] - else: - if keep_default_na: - return STR_NA_VALUES, set() - - return set(), set() - else: - return na_values, na_fvalues - - -def _get_col_names(colspec, columns): - colset = set(columns) - colnames = [] - for c in colspec: - if c in colset: - colnames.append(c) - elif isinstance(c, int): - colnames.append(columns[c]) - return colnames - - -class FixedWidthReader(abc.Iterator): - """ - A reader of fixed-width lines. 
- """ - - def __init__(self, f, colspecs, delimiter, comment, skiprows=None, infer_nrows=100): - self.f = f - self.buffer = None - self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t " - self.comment = comment - if colspecs == "infer": - self.colspecs = self.detect_colspecs( - infer_nrows=infer_nrows, skiprows=skiprows - ) - else: - self.colspecs = colspecs - - if not isinstance(self.colspecs, (tuple, list)): - raise TypeError( - "column specifications must be a list or tuple, " - f"input was a {type(colspecs).__name__}" - ) - - for colspec in self.colspecs: - if not ( - isinstance(colspec, (tuple, list)) - and len(colspec) == 2 - and isinstance(colspec[0], (int, np.integer, type(None))) - and isinstance(colspec[1], (int, np.integer, type(None))) - ): - raise TypeError( - "Each column specification must be " - "2 element tuple or list of integers" - ) - - def get_rows(self, infer_nrows, skiprows=None): - """ - Read rows from self.f, skipping as specified. - - We distinguish buffer_rows (the first <= infer_nrows - lines) from the rows returned to detect_colspecs - because it's simpler to leave the other locations - with skiprows logic alone than to modify them to - deal with the fact we skipped some rows here as - well. - - Parameters - ---------- - infer_nrows : int - Number of rows to read from self.f, not counting - rows that are skipped. - skiprows: set, optional - Indices of rows to skip. - - Returns - ------- - detect_rows : list of str - A list containing the rows to read. - - """ - if skiprows is None: - skiprows = set() - buffer_rows = [] - detect_rows = [] - for i, row in enumerate(self.f): - if i not in skiprows: - detect_rows.append(row) - buffer_rows.append(row) - if len(detect_rows) >= infer_nrows: - break - self.buffer = iter(buffer_rows) - return detect_rows - - def detect_colspecs(self, infer_nrows=100, skiprows=None): - # Regex escape the delimiters - delimiters = "".join(fr"\{x}" for x in self.delimiter) - pattern = re.compile(f"([^{delimiters}]+)") - rows = self.get_rows(infer_nrows, skiprows) - if not rows: - raise EmptyDataError("No rows from which to infer column width") - max_len = max(map(len, rows)) - mask = np.zeros(max_len + 1, dtype=int) - if self.comment is not None: - rows = [row.partition(self.comment)[0] for row in rows] - for row in rows: - for m in pattern.finditer(row): - mask[m.start() : m.end()] = 1 - shifted = np.roll(mask, 1) - shifted[0] = 0 - edges = np.where((mask ^ shifted) == 1)[0] - edge_pairs = list(zip(edges[::2], edges[1::2])) - return edge_pairs - - def __next__(self): - if self.buffer is not None: - try: - line = next(self.buffer) - except StopIteration: - self.buffer = None - line = next(self.f) - else: - line = next(self.f) - # Note: 'colspecs' is a sequence of half-open intervals. - return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs] - - -class FixedWidthFieldParser(PythonParser): - """ - Specialization that Converts fixed-width fields into DataFrames. - See PythonParser for details. - """ - - def __init__(self, f, **kwds): - # Support iterators, convert to a list. - self.colspecs = kwds.pop("colspecs") - self.infer_nrows = kwds.pop("infer_nrows") - PythonParser.__init__(self, f, **kwds) - - def _make_reader(self, f): - self.data = FixedWidthReader( - f, - self.colspecs, - self.delimiter, - self.comment, - self.skiprows, - self.infer_nrows, - ) - - def _remove_empty_lines(self, lines) -> List: - """ - Returns the list of lines without the empty ones. 
With fixed-width - fields, empty lines become arrays of empty strings. - - See PythonParser._remove_empty_lines. - """ - return [ - line - for line in lines - if any(not isinstance(e, str) or e.strip() for e in line) - ] - - -def _refine_defaults_read( - dialect: Union[str, csv.Dialect], - delimiter: Union[str, object], - delim_whitespace: bool, - engine: str, - sep: Union[str, object], - defaults: Dict[str, Any], -): - """Validate/refine default values of input parameters of read_csv, read_table. - - Parameters - ---------- - dialect : str or csv.Dialect - If provided, this parameter will override values (default or not) for the - following parameters: `delimiter`, `doublequote`, `escapechar`, - `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to - override values, a ParserWarning will be issued. See csv.Dialect - documentation for more details. - delimiter : str or object - Alias for sep. - delim_whitespace : bool - Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be - used as the sep. Equivalent to setting ``sep='\\s+'``. If this option - is set to True, nothing should be passed in for the ``delimiter`` - parameter. - engine : {{'c', 'python'}} - Parser engine to use. The C engine is faster while the python engine is - currently more feature-complete. - sep : str or object - A delimiter provided by the user (str) or a sentinel value, i.e. - pandas._libs.lib.no_default. - defaults: dict - Default values of input parameters. - - Returns - ------- - kwds : dict - Input parameters with correct values. - - Raises - ------ - ValueError : If a delimiter was specified with ``sep`` (or ``delimiter``) and - ``delim_whitespace=True``. - """ - # fix types for sep, delimiter to Union(str, Any) - delim_default = defaults["delimiter"] - kwds: Dict[str, Any] = {} - # gh-23761 - # - # When a dialect is passed, it overrides any of the overlapping - # parameters passed in directly. We don't want to warn if the - # default parameters were passed in (since it probably means - # that the user didn't pass them in explicitly in the first place). - # - # "delimiter" is the annoying corner case because we alias it to - # "sep" before doing comparison to the dialect values later on. - # Thus, we need a flag to indicate that we need to "override" - # the comparison to dialect values by checking if default values - # for BOTH "delimiter" and "sep" were provided. - if dialect is not None: - kwds["sep_override"] = delimiter is None and ( - sep is lib.no_default or sep == delim_default - ) - - # Alias sep -> delimiter. - if delimiter is None: - delimiter = sep - - if delim_whitespace and (delimiter is not lib.no_default): - raise ValueError( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) - - if delimiter is lib.no_default: - # assign default separator value - kwds["delimiter"] = delim_default - else: - kwds["delimiter"] = delimiter - - if engine is not None: - kwds["engine_specified"] = True - else: - kwds["engine"] = "c" - kwds["engine_specified"] = False - - return kwds - - -def _extract_dialect(kwds: Dict[str, Any]) -> Optional[csv.Dialect]: - """ - Extract concrete csv dialect instance. 
- - Returns - ------- - csv.Dialect or None - """ - if kwds.get("dialect") is None: - return None - - dialect = kwds["dialect"] - if dialect in csv.list_dialects(): - dialect = csv.get_dialect(dialect) - - _validate_dialect(dialect) - - return dialect - - -MANDATORY_DIALECT_ATTRS = ( - "delimiter", - "doublequote", - "escapechar", - "skipinitialspace", - "quotechar", - "quoting", -) - - -def _validate_dialect(dialect: csv.Dialect) -> None: - """ - Validate csv dialect instance. - - Raises - ------ - ValueError - If incorrect dialect is provided. - """ - for param in MANDATORY_DIALECT_ATTRS: - if not hasattr(dialect, param): - raise ValueError(f"Invalid dialect {dialect} provided") - - -def _merge_with_dialect_properties( - dialect: csv.Dialect, - defaults: Dict[str, Any], -) -> Dict[str, Any]: - """ - Merge default kwargs in TextFileReader with dialect parameters. - - Parameters - ---------- - dialect : csv.Dialect - Concrete csv dialect. See csv.Dialect documentation for more details. - defaults : dict - Keyword arguments passed to TextFileReader. - - Returns - ------- - kwds : dict - Updated keyword arguments, merged with dialect parameters. - """ - kwds = defaults.copy() - - for param in MANDATORY_DIALECT_ATTRS: - dialect_val = getattr(dialect, param) - - parser_default = _parser_defaults[param] - provided = kwds.get(param, parser_default) - - # Messages for conflicting values between the dialect - # instance and the actual parameters provided. - conflict_msgs = [] - - # Don't warn if the default parameter was passed in, - # even if it conflicts with the dialect (gh-23761). - if provided != parser_default and provided != dialect_val: - msg = ( - f"Conflicting values for '{param}': '{provided}' was " - f"provided, but the dialect specifies '{dialect_val}'. " - "Using the dialect-specified value." - ) - - # Annoying corner case for not warning about - # conflicts between dialect and delimiter parameter. - # Refer to the outer "_read_" function for more info. - if not (param == "delimiter" and kwds.pop("sep_override", False)): - conflict_msgs.append(msg) - - if conflict_msgs: - warnings.warn("\n\n".join(conflict_msgs), ParserWarning, stacklevel=2) - kwds[param] = dialect_val - return kwds - - -def _validate_skipfooter(kwds: Dict[str, Any]) -> None: - """ - Check whether skipfooter is compatible with other kwargs in TextFileReader. - - Parameters - ---------- - kwds : dict - Keyword arguments passed to TextFileReader. - - Raises - ------ - ValueError - If skipfooter is not compatible with other parameters. 
- """ - if kwds.get("skipfooter"): - if kwds.get("iterator") or kwds.get("chunksize"): - raise ValueError("'skipfooter' not supported for iteration") - if kwds.get("nrows"): - raise ValueError("'skipfooter' not supported with 'nrows'") diff --git a/pandas/io/parsers/__init__.py b/pandas/io/parsers/__init__.py new file mode 100644 index 0000000000000..ff11968db15f0 --- /dev/null +++ b/pandas/io/parsers/__init__.py @@ -0,0 +1,9 @@ +from pandas.io.parsers.readers import ( + TextFileReader, + TextParser, + read_csv, + read_fwf, + read_table, +) + +__all__ = ["TextFileReader", "TextParser", "read_csv", "read_fwf", "read_table"] diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py new file mode 100644 index 0000000000000..64c3b1e64a659 --- /dev/null +++ b/pandas/io/parsers/base_parser.py @@ -0,0 +1,1085 @@ +from collections import defaultdict +import csv +import datetime +import itertools +from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Union, cast +import warnings + +import numpy as np + +import pandas._libs.lib as lib +import pandas._libs.ops as libops +import pandas._libs.parsers as parsers +from pandas._libs.parsers import STR_NA_VALUES +from pandas._libs.tslibs import parsing +from pandas._typing import DtypeArg, FilePathOrBuffer +from pandas.errors import ParserError, ParserWarning + +from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.common import ( + ensure_object, + ensure_str, + is_bool_dtype, + is_categorical_dtype, + is_dict_like, + is_dtype_equal, + is_extension_array_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_scalar, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.missing import isna + +from pandas.core import algorithms +from pandas.core.arrays import Categorical +from pandas.core.indexes.api import Index, MultiIndex, ensure_index_from_sequences +from pandas.core.series import Series +from pandas.core.tools import datetimes as tools + +from pandas.io.common import IOHandles, get_handle +from pandas.io.date_converters import generic_parser + +parser_defaults = { + "delimiter": None, + "escapechar": None, + "quotechar": '"', + "quoting": csv.QUOTE_MINIMAL, + "doublequote": True, + "skipinitialspace": False, + "lineterminator": None, + "header": "infer", + "index_col": None, + "names": None, + "prefix": None, + "skiprows": None, + "skipfooter": 0, + "nrows": None, + "na_values": None, + "keep_default_na": True, + "true_values": None, + "false_values": None, + "converters": None, + "dtype": None, + "cache_dates": True, + "thousands": None, + "comment": None, + "decimal": ".", + # 'engine': 'c', + "parse_dates": False, + "keep_date_col": False, + "dayfirst": False, + "date_parser": None, + "usecols": None, + # 'iterator': False, + "chunksize": None, + "verbose": False, + "encoding": None, + "squeeze": False, + "compression": None, + "mangle_dupe_cols": True, + "infer_datetime_format": False, + "skip_blank_lines": True, +} + + +class ParserBase: + def __init__(self, kwds): + + self.names = kwds.get("names") + self.orig_names: Optional[List] = None + self.prefix = kwds.pop("prefix", None) + + self.index_col = kwds.get("index_col", None) + self.unnamed_cols: Set = set() + self.index_names: Optional[List] = None + self.col_names = None + + self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False)) + self.date_parser = kwds.pop("date_parser", None) + self.dayfirst = kwds.pop("dayfirst", 
False)
+        self.keep_date_col = kwds.pop("keep_date_col", False)
+
+        self.na_values = kwds.get("na_values")
+        self.na_fvalues = kwds.get("na_fvalues")
+        self.na_filter = kwds.get("na_filter", False)
+        self.keep_default_na = kwds.get("keep_default_na", True)
+
+        self.true_values = kwds.get("true_values")
+        self.false_values = kwds.get("false_values")
+        self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
+        self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
+        self.cache_dates = kwds.pop("cache_dates", True)
+
+        self._date_conv = _make_date_converter(
+            date_parser=self.date_parser,
+            dayfirst=self.dayfirst,
+            infer_datetime_format=self.infer_datetime_format,
+            cache_dates=self.cache_dates,
+        )
+
+        # validate header options for mi
+        self.header = kwds.get("header")
+        if isinstance(self.header, (list, tuple, np.ndarray)):
+            if not all(map(is_integer, self.header)):
+                raise ValueError("header must be integer or list of integers")
+            if any(i < 0 for i in self.header):
+                raise ValueError(
+                    "cannot specify multi-index header with negative integers"
+                )
+            if kwds.get("usecols"):
+                raise ValueError(
+                    "cannot specify usecols when specifying a multi-index header"
+                )
+            if kwds.get("names"):
+                raise ValueError(
+                    "cannot specify names when specifying a multi-index header"
+                )
+
+            # validate index_col that only contains integers
+            if self.index_col is not None:
+                is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray))
+                if not (
+                    is_sequence
+                    and all(map(is_integer, self.index_col))
+                    or is_integer(self.index_col)
+                ):
+                    raise ValueError(
+                        "index_col must only contain row numbers "
+                        "when specifying a multi-index header"
+                    )
+        elif self.header is not None:
+            # GH 27394
+            if self.prefix is not None:
+                raise ValueError(
+                    "Argument prefix must be None if argument header is not None"
+                )
+            # GH 16338
+            elif not is_integer(self.header):
+                raise ValueError("header must be integer or list of integers")
+            # GH 27779
+            elif self.header < 0:
+                raise ValueError(
+                    "Passing negative integer to header is invalid. "
+                    "For no header, use header=None instead"
+                )
+
+        self._name_processed = False
+
+        self._first_chunk = True
+
+        self.handles: Optional[IOHandles] = None
+
+    def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None:
+        """
+        Let the readers open IOHandles after they are done with their potential raises.
+        """
+        self.handles = get_handle(
+            src,
+            "r",
+            encoding=kwds.get("encoding", None),
+            compression=kwds.get("compression", None),
+            memory_map=kwds.get("memory_map", False),
+            storage_options=kwds.get("storage_options", None),
+        )
+
+    def _validate_parse_dates_presence(self, columns: List[str]) -> None:
+        """
+        Check if parse_dates are in columns.
+
+        If the user has provided names for parse_dates, check if those columns
+        are available.
+
+        Parameters
+        ----------
+        columns : list
+            List of column names of the DataFrame.
+
+        Raises
+        ------
+        ValueError
+            If a column referenced in parse_dates is not in the DataFrame.
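+
+        Examples
+        --------
+        Illustrative only; assume ``parser`` is a ``ParserBase`` constructed
+        with ``parse_dates=["b"]``:
+
+        >>> parser._validate_parse_dates_presence(["a", "c"])
+        Traceback (most recent call last):
+            ...
+        ValueError: Missing column provided to 'parse_dates': 'b'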
+ + """ + cols_needed: Iterable + if is_dict_like(self.parse_dates): + cols_needed = itertools.chain(*self.parse_dates.values()) + elif is_list_like(self.parse_dates): + # a column in parse_dates could be represented + # ColReference = Union[int, str] + # DateGroups = List[ColReference] + # ParseDates = Union[DateGroups, List[DateGroups], + # Dict[ColReference, DateGroups]] + cols_needed = itertools.chain.from_iterable( + col if is_list_like(col) else [col] for col in self.parse_dates + ) + else: + cols_needed = [] + + # get only columns that are references using names (str), not by index + missing_cols = ", ".join( + sorted( + { + col + for col in cols_needed + if isinstance(col, str) and col not in columns + } + ) + ) + if missing_cols: + raise ValueError( + f"Missing column provided to 'parse_dates': '{missing_cols}'" + ) + + def close(self): + if self.handles is not None: + self.handles.close() + + @property + def _has_complex_date_col(self): + return isinstance(self.parse_dates, dict) or ( + isinstance(self.parse_dates, list) + and len(self.parse_dates) > 0 + and isinstance(self.parse_dates[0], list) + ) + + def _should_parse_dates(self, i): + if isinstance(self.parse_dates, bool): + return self.parse_dates + else: + if self.index_names is not None: + name = self.index_names[i] + else: + name = None + j = i if self.index_col is None else self.index_col[i] + + if is_scalar(self.parse_dates): + return (j == self.parse_dates) or ( + name is not None and name == self.parse_dates + ) + else: + return (j in self.parse_dates) or ( + name is not None and name in self.parse_dates + ) + + def _extract_multi_indexer_columns( + self, header, index_names, col_names, passed_names=False + ): + """ + extract and return the names, index_names, col_names + header is a list-of-lists returned from the parsers + """ + if len(header) < 2: + return header[0], index_names, col_names, passed_names + + # the names are the tuples of the header that are not the index cols + # 0 is the name of the index, assuming index_col is a list of column + # numbers + ic = self.index_col + if ic is None: + ic = [] + + if not isinstance(ic, (list, tuple, np.ndarray)): + ic = [ic] + sic = set(ic) + + # clean the index_names + index_names = header.pop(-1) + index_names, _, _ = self._clean_index_names( + index_names, self.index_col, self.unnamed_cols + ) + + # extract the columns + field_count = len(header[0]) + + def extract(r): + return tuple(r[i] for i in range(field_count) if i not in sic) + + columns = list(zip(*(extract(r) for r in header))) + names = ic + columns + + # If we find unnamed columns all in a single + # level, then our header was too long. + for n in range(len(columns[0])): + if all(ensure_str(col[n]) in self.unnamed_cols for col in columns): + header = ",".join(str(x) for x in self.header) + raise ParserError( + f"Passed header=[{header}] are too many rows " + "for this multi_index of columns" + ) + + # Clean the column names (if we have an index_col). + if len(ic): + col_names = [ + r[0] if ((r[0] is not None) and r[0] not in self.unnamed_cols) else None + for r in header + ] + else: + col_names = [None] * len(header) + + passed_names = True + + return names, index_names, col_names, passed_names + + def _maybe_dedup_names(self, names): + # see gh-7160 and gh-9424: this helps to provide + # immediate alleviation of the duplicate names + # issue and appears to be satisfactory to users, + # but ultimately, not needing to butcher the names + # would be nice! 
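+        # For example (illustrative): with mangle_dupe_cols=True, the names
+        # ["a", "a", "b"] come back as ["a", "a.1", "b"]; for tuple names
+        # that could form a MultiIndex, the counter is appended to the last
+        # element of the tuple instead.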
+ if self.mangle_dupe_cols: + names = list(names) # so we can index + # pandas\io\parsers.py:1559: error: Need type annotation for + # 'counts' [var-annotated] + counts = defaultdict(int) # type: ignore[var-annotated] + is_potential_mi = _is_potential_multi_index(names, self.index_col) + + for i, col in enumerate(names): + cur_count = counts[col] + + while cur_count > 0: + counts[col] = cur_count + 1 + + if is_potential_mi: + col = col[:-1] + (f"{col[-1]}.{cur_count}",) + else: + col = f"{col}.{cur_count}" + cur_count = counts[col] + + names[i] = col + counts[col] = cur_count + 1 + + return names + + def _maybe_make_multi_index_columns(self, columns, col_names=None): + # possibly create a column mi here + if _is_potential_multi_index(columns): + columns = MultiIndex.from_tuples(columns, names=col_names) + return columns + + def _make_index(self, data, alldata, columns, indexnamerow=False): + if not is_index_col(self.index_col) or not self.index_col: + index = None + + elif not self._has_complex_date_col: + index = self._get_simple_index(alldata, columns) + index = self._agg_index(index) + elif self._has_complex_date_col: + if not self._name_processed: + (self.index_names, _, self.index_col) = self._clean_index_names( + list(columns), self.index_col, self.unnamed_cols + ) + self._name_processed = True + index = self._get_complex_date_index(data, columns) + index = self._agg_index(index, try_parse_dates=False) + + # add names for the index + if indexnamerow: + coffset = len(indexnamerow) - len(columns) + # pandas\io\parsers.py:1604: error: Item "None" of "Optional[Any]" + # has no attribute "set_names" [union-attr] + index = index.set_names(indexnamerow[:coffset]) # type: ignore[union-attr] + + # maybe create a mi on the columns + columns = self._maybe_make_multi_index_columns(columns, self.col_names) + + return index, columns + + _implicit_index = False + + def _get_simple_index(self, data, columns): + def ix(col): + if not isinstance(col, str): + return col + raise ValueError(f"Index {col} invalid") + + to_remove = [] + index = [] + for idx in self.index_col: + i = ix(idx) + to_remove.append(i) + index.append(data[i]) + + # remove index items from content and columns, don't pop in + # loop + for i in sorted(to_remove, reverse=True): + data.pop(i) + if not self._implicit_index: + columns.pop(i) + + return index + + def _get_complex_date_index(self, data, col_names): + def _get_name(icol): + if isinstance(icol, str): + return icol + + if col_names is None: + raise ValueError(f"Must supply column order to use {icol!s} as index") + + for i, c in enumerate(col_names): + if i == icol: + return c + + to_remove = [] + index = [] + for idx in self.index_col: + name = _get_name(idx) + to_remove.append(name) + index.append(data[name]) + + # remove index items from content and columns, don't pop in + # loop + for c in sorted(to_remove, reverse=True): + data.pop(c) + col_names.remove(c) + + return index + + def _agg_index(self, index, try_parse_dates=True) -> Index: + arrays = [] + + for i, arr in enumerate(index): + + if try_parse_dates and self._should_parse_dates(i): + arr = self._date_conv(arr) + + if self.na_filter: + col_na_values = self.na_values + col_na_fvalues = self.na_fvalues + else: + col_na_values = set() + col_na_fvalues = set() + + if isinstance(self.na_values, dict): + # pandas\io\parsers.py:1678: error: Value of type + # "Optional[Any]" is not indexable [index] + col_name = self.index_names[i] # type: ignore[index] + if col_name is not None: + col_na_values, col_na_fvalues = 
_get_na_values(
+                        col_name, self.na_values, self.na_fvalues, self.keep_default_na
+                    )
+
+            arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
+            arrays.append(arr)
+
+        names = self.index_names
+        index = ensure_index_from_sequences(arrays, names)
+
+        return index
+
+    def _convert_to_ndarrays(
+        self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None
+    ):
+        result = {}
+        for c, values in dct.items():
+            conv_f = None if converters is None else converters.get(c, None)
+            if isinstance(dtypes, dict):
+                cast_type = dtypes.get(c, None)
+            else:
+                # single dtype or None
+                cast_type = dtypes
+
+            if self.na_filter:
+                col_na_values, col_na_fvalues = _get_na_values(
+                    c, na_values, na_fvalues, self.keep_default_na
+                )
+            else:
+                col_na_values, col_na_fvalues = set(), set()
+
+            if conv_f is not None:
+                # conv_f applied to data before inference
+                if cast_type is not None:
+                    warnings.warn(
+                        (
+                            "Both a converter and dtype were specified "
+                            f"for column {c} - only the converter will be used"
+                        ),
+                        ParserWarning,
+                        stacklevel=7,
+                    )
+
+                try:
+                    values = lib.map_infer(values, conv_f)
+                except ValueError:
+                    mask = algorithms.isin(values, list(na_values)).view(np.uint8)
+                    values = lib.map_infer_mask(values, conv_f, mask)
+
+                cvals, na_count = self._infer_types(
+                    values, set(col_na_values) | col_na_fvalues, try_num_bool=False
+                )
+            else:
+                is_ea = is_extension_array_dtype(cast_type)
+                is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
+                # skip inference if specified dtype is object
+                # or casting to an EA
+                try_num_bool = not (cast_type and is_str_or_ea_dtype)
+
+                # general type inference and conversion
+                cvals, na_count = self._infer_types(
+                    values, set(col_na_values) | col_na_fvalues, try_num_bool
+                )
+
+                # type specified in dtype param or cast_type is an EA
+                if cast_type and (
+                    not is_dtype_equal(cvals, cast_type)
+                    or is_extension_array_dtype(cast_type)
+                ):
+                    if not is_ea and na_count > 0:
+                        try:
+                            if is_bool_dtype(cast_type):
+                                raise ValueError(
+                                    f"Bool column has NA values in column {c}"
+                                )
+                        except (AttributeError, TypeError):
+                            # invalid input to is_bool_dtype
+                            pass
+                    cast_type = pandas_dtype(cast_type)
+                    cvals = self._cast_types(cvals, cast_type, c)
+
+            result[c] = cvals
+            if verbose and na_count:
+                print(f"Filled {na_count} NA values in column {c!s}")
+        return result
+
+    def _infer_types(self, values, na_values, try_num_bool=True):
+        """
+        Infer types of values, possibly casting.
+
+        Parameters
+        ----------
+        values : ndarray
+        na_values : set
+        try_num_bool : bool, default True
+            Try to cast values to numeric (first preference) or boolean.
+
+        Returns
+        -------
+        converted : ndarray
+        na_count : int
+        """
+        na_count = 0
+        if issubclass(values.dtype.type, (np.number, np.bool_)):
+            mask = algorithms.isin(values, list(na_values))
+            na_count = mask.sum()
+            if na_count > 0:
+                if is_integer_dtype(values):
+                    values = values.astype(np.float64)
+                np.putmask(values, mask, np.nan)
+            return values, na_count
+
+        if try_num_bool and is_object_dtype(values.dtype):
+            # exclude e.g. DatetimeIndex here
+            try:
+                result = lib.maybe_convert_numeric(values, na_values, False)
+            except (ValueError, TypeError):
+                # e.g.
encountering datetime string gets ValueError + # TypeError can be raised in floatify + result = values + na_count = parsers.sanitize_objects(result, na_values, False) + else: + na_count = isna(result).sum() + else: + result = values + if values.dtype == np.object_: + na_count = parsers.sanitize_objects(values, na_values, False) + + if result.dtype == np.object_ and try_num_bool: + result = libops.maybe_convert_bool( + np.asarray(values), + true_values=self.true_values, + false_values=self.false_values, + ) + + return result, na_count + + def _cast_types(self, values, cast_type, column): + """ + Cast values to specified type + + Parameters + ---------- + values : ndarray + cast_type : string or np.dtype + dtype to cast values to + column : string + column name - used only for error reporting + + Returns + ------- + converted : ndarray + """ + if is_categorical_dtype(cast_type): + known_cats = ( + isinstance(cast_type, CategoricalDtype) + and cast_type.categories is not None + ) + + if not is_object_dtype(values) and not known_cats: + # TODO: this is for consistency with + # c-parser which parses all categories + # as strings + values = astype_nansafe(values, str) + + cats = Index(values).unique().dropna() + values = Categorical._from_inferred_categories( + cats, cats.get_indexer(values), cast_type, true_values=self.true_values + ) + + # use the EA's implementation of casting + elif is_extension_array_dtype(cast_type): + # ensure cast_type is an actual dtype and not a string + cast_type = pandas_dtype(cast_type) + array_type = cast_type.construct_array_type() + try: + if is_bool_dtype(cast_type): + return array_type._from_sequence_of_strings( + values, + dtype=cast_type, + true_values=self.true_values, + false_values=self.false_values, + ) + else: + return array_type._from_sequence_of_strings(values, dtype=cast_type) + except NotImplementedError as err: + raise NotImplementedError( + f"Extension Array: {array_type} must implement " + "_from_sequence_of_strings in order to be used in parser methods" + ) from err + + else: + try: + values = astype_nansafe(values, cast_type, copy=True, skipna=True) + except ValueError as err: + raise ValueError( + f"Unable to convert column {column} to type {cast_type}" + ) from err + return values + + def _do_date_conversions(self, names, data): + # returns data, columns + + if self.parse_dates is not None: + data, names = _process_date_conversion( + data, + self._date_conv, + self.parse_dates, + self.index_col, + self.index_names, + names, + keep_date_col=self.keep_date_col, + ) + + return names, data + + def _evaluate_usecols(self, usecols, names): + """ + Check whether or not the 'usecols' parameter + is a callable. If so, enumerates the 'names' + parameter and returns a set of indices for + each entry in 'names' that evaluates to True. + If not a callable, returns 'usecols'. + """ + if callable(usecols): + return {i for i, name in enumerate(names) if usecols(name)} + return usecols + + def _validate_usecols_names(self, usecols, names): + """ + Validates that all usecols are present in a given + list of names. If not, raise a ValueError that + shows what usecols are missing. + + Parameters + ---------- + usecols : iterable of usecols + The columns to validate are present in names. + names : iterable of names + The column names to check against. + + Returns + ------- + usecols : iterable of usecols + The `usecols` parameter if the validation succeeds. + + Raises + ------ + ValueError : Columns were missing. Error message will list them. 
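+
+        Examples
+        --------
+        A sketch of the failure mode, with ``parser`` standing in for any
+        ``ParserBase`` instance:
+
+        >>> parser._validate_usecols_names(["a", "d"], ["a", "b", "c"])
+        Traceback (most recent call last):
+            ...
+        ValueError: Usecols do not match columns, columns expected but not found: ['d']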
+ """ + missing = [c for c in usecols if c not in names] + if len(missing) > 0: + raise ValueError( + f"Usecols do not match columns, columns expected but not found: " + f"{missing}" + ) + + return usecols + + def _validate_usecols_arg(self, usecols): + """ + Validate the 'usecols' parameter. + + Checks whether or not the 'usecols' parameter contains all integers + (column selection by index), strings (column by name) or is a callable. + Raises a ValueError if that is not the case. + + Parameters + ---------- + usecols : list-like, callable, or None + List of columns to use when parsing or a callable that can be used + to filter a list of table columns. + + Returns + ------- + usecols_tuple : tuple + A tuple of (verified_usecols, usecols_dtype). + + 'verified_usecols' is either a set if an array-like is passed in or + 'usecols' if a callable or None is passed in. + + 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like + is passed in or None if a callable or None is passed in. + """ + msg = ( + "'usecols' must either be list-like of all strings, all unicode, " + "all integers or a callable." + ) + if usecols is not None: + if callable(usecols): + return usecols, None + + if not is_list_like(usecols): + # see gh-20529 + # + # Ensure it is iterable container but not string. + raise ValueError(msg) + + usecols_dtype = lib.infer_dtype(usecols, skipna=False) + + if usecols_dtype not in ("empty", "integer", "string"): + raise ValueError(msg) + + usecols = set(usecols) + + return usecols, usecols_dtype + return usecols, None + + def _clean_index_names(self, columns, index_col, unnamed_cols): + if not is_index_col(index_col): + return None, columns, index_col + + columns = list(columns) + + # In case of no rows and multiindex columns we have to set index_names to + # list of Nones GH#38292 + if not columns: + return [None] * len(index_col), columns, index_col + + cp_cols = list(columns) + index_names = [] + + # don't mutate + index_col = list(index_col) + + for i, c in enumerate(index_col): + if isinstance(c, str): + index_names.append(c) + for j, name in enumerate(cp_cols): + if name == c: + index_col[i] = j + columns.remove(name) + break + else: + name = cp_cols[c] + columns.remove(name) + index_names.append(name) + + # Only clean index names that were placeholders. + for i, name in enumerate(index_names): + if isinstance(name, str) and name in unnamed_cols: + # pandas\io\parsers.py:3445: error: No overload variant of + # "__setitem__" of "list" matches argument types "int", "None" + # [call-overload] + index_names[i] = None # type: ignore[call-overload] + + return index_names, columns, index_col + + def _get_empty_meta( + self, columns, index_col, index_names, dtype: Optional[DtypeArg] = None + ): + columns = list(columns) + + # Convert `dtype` to a defaultdict of some kind. + # This will enable us to write `dtype[col_name]` + # without worrying about KeyError issues later on. + if not is_dict_like(dtype): + # if dtype == None, default will be object. + default_dtype = dtype or object + dtype = defaultdict(lambda: default_dtype) + else: + dtype = cast(dict, dtype) + dtype = defaultdict( + lambda: object, + {columns[k] if is_integer(k) else k: v for k, v in dtype.items()}, + ) + + # Even though we have no data, the "index" of the empty DataFrame + # could for example still be an empty MultiIndex. 
Thus, we need to + # check whether we have any index columns specified, via either: + # + # 1) index_col (column indices) + # 2) index_names (column names) + # + # Both must be non-null to ensure a successful construction. Otherwise, + # we have to create a generic empty Index. + if (index_col is None or index_col is False) or index_names is None: + index = Index([]) + else: + data = [Series([], dtype=dtype[name]) for name in index_names] + index = ensure_index_from_sequences(data, names=index_names) + index_col.sort() + + for i, n in enumerate(index_col): + columns.pop(n - i) + + col_dict = {col_name: Series([], dtype=dtype[col_name]) for col_name in columns} + + return index, columns, col_dict + + +def _make_date_converter( + date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True +): + def converter(*date_cols): + if date_parser is None: + strs = parsing.concat_date_cols(date_cols) + + try: + return tools.to_datetime( + ensure_object(strs), + utc=None, + dayfirst=dayfirst, + errors="ignore", + infer_datetime_format=infer_datetime_format, + cache=cache_dates, + ).to_numpy() + + except ValueError: + return tools.to_datetime( + parsing.try_parse_dates(strs, dayfirst=dayfirst), cache=cache_dates + ) + else: + try: + result = tools.to_datetime( + date_parser(*date_cols), errors="ignore", cache=cache_dates + ) + if isinstance(result, datetime.datetime): + raise Exception("scalar parser") + return result + except Exception: + try: + return tools.to_datetime( + parsing.try_parse_dates( + parsing.concat_date_cols(date_cols), + parser=date_parser, + dayfirst=dayfirst, + ), + errors="ignore", + ) + except Exception: + return generic_parser(date_parser, *date_cols) + + return converter + + +def _process_date_conversion( + data_dict, + converter, + parse_spec, + index_col, + index_names, + columns, + keep_date_col=False, +): + def _isindex(colspec): + return (isinstance(index_col, list) and colspec in index_col) or ( + isinstance(index_names, list) and colspec in index_names + ) + + new_cols = [] + new_data = {} + + orig_names = columns + columns = list(columns) + + date_cols = set() + + if parse_spec is None or isinstance(parse_spec, bool): + return data_dict, columns + + if isinstance(parse_spec, list): + # list of column lists + for colspec in parse_spec: + if is_scalar(colspec): + if isinstance(colspec, int) and colspec not in data_dict: + colspec = orig_names[colspec] + if _isindex(colspec): + continue + data_dict[colspec] = converter(data_dict[colspec]) + else: + new_name, col, old_names = _try_convert_dates( + converter, colspec, data_dict, orig_names + ) + if new_name in data_dict: + raise ValueError(f"New date column already in dict {new_name}") + new_data[new_name] = col + new_cols.append(new_name) + date_cols.update(old_names) + + elif isinstance(parse_spec, dict): + # dict of new name to column list + for new_name, colspec in parse_spec.items(): + if new_name in data_dict: + raise ValueError(f"Date column {new_name} already in dict") + + _, col, old_names = _try_convert_dates( + converter, colspec, data_dict, orig_names + ) + + new_data[new_name] = col + new_cols.append(new_name) + date_cols.update(old_names) + + data_dict.update(new_data) + new_cols.extend(columns) + + if not keep_date_col: + for c in list(date_cols): + data_dict.pop(c) + new_cols.remove(c) + + return data_dict, new_cols + + +def _try_convert_dates(parser, colspec, data_dict, columns): + colset = set(columns) + colnames = [] + + for c in colspec: + if c in colset: + colnames.append(c) + elif 
isinstance(c, int) and c not in columns:
+            colnames.append(columns[c])
+        else:
+            colnames.append(c)
+
+    new_name = "_".join(str(x) for x in colnames)
+    to_parse = [data_dict[c] for c in colnames if c in data_dict]
+
+    new_col = parser(*to_parse)
+    return new_name, new_col, colnames
+
+
+def _get_na_values(col, na_values, na_fvalues, keep_default_na):
+    """
+    Get the NaN values for a given column.
+
+    Parameters
+    ----------
+    col : str
+        The name of the column.
+    na_values : array-like, dict
+        The object listing the NaN values as strings.
+    na_fvalues : array-like, dict
+        The object listing the NaN values as floats.
+    keep_default_na : bool
+        If `na_values` is a dict, and the column is not mapped in the
+        dictionary, whether to return the default NaN values or the empty set.
+
+    Returns
+    -------
+    nan_tuple : A length-two tuple composed of
+
+        1) na_values : the string NaN values for that column.
+        2) na_fvalues : the float NaN values for that column.
+    """
+    if isinstance(na_values, dict):
+        if col in na_values:
+            return na_values[col], na_fvalues[col]
+        else:
+            if keep_default_na:
+                return STR_NA_VALUES, set()
+
+            return set(), set()
+    else:
+        return na_values, na_fvalues
+
+
+# Seems to be unused
+def _get_col_names(colspec, columns):
+    colset = set(columns)
+    colnames = []
+    for c in colspec:
+        if c in colset:
+            colnames.append(c)
+        elif isinstance(c, int):
+            colnames.append(columns[c])
+    return colnames
+
+
+def _is_potential_multi_index(
+    columns, index_col: Optional[Union[bool, Sequence[int]]] = None
+):
+    """
+    Check whether or not the `columns` parameter
+    could be converted into a MultiIndex.
+
+    Parameters
+    ----------
+    columns : array-like
+        Object which may or may not be convertible into a MultiIndex
+    index_col : None, bool or list, optional
+        Column or columns to use as the (possibly hierarchical) index
+
+    Returns
+    -------
+    boolean : Whether or not columns could become a MultiIndex
+    """
+    if index_col is None or isinstance(index_col, bool):
+        index_col = []
+
+    return (
+        len(columns)
+        and not isinstance(columns, MultiIndex)
+        and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
+    )
+
+
+def _validate_parse_dates_arg(parse_dates):
+    """
+    Check whether or not the 'parse_dates' parameter
+    is a non-boolean scalar. Raises a TypeError if
+    that is the case.
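+
+    Examples
+    --------
+    For illustration:
+
+    >>> _validate_parse_dates_arg(["a", "b"])
+    ['a', 'b']
+    >>> _validate_parse_dates_arg("a")
+    Traceback (most recent call last):
+        ...
+    TypeError: Only booleans, lists, and dictionaries are accepted for the 'parse_dates' parameter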
+ """ + msg = ( + "Only booleans, lists, and dictionaries are accepted " + "for the 'parse_dates' parameter" + ) + + if parse_dates is not None: + if is_scalar(parse_dates): + if not lib.is_bool(parse_dates): + raise TypeError(msg) + + elif not isinstance(parse_dates, (list, dict)): + raise TypeError(msg) + + return parse_dates + + +def is_index_col(col): + return col is not None and col is not False diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py new file mode 100644 index 0000000000000..b4c00dfe9b3e7 --- /dev/null +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -0,0 +1,328 @@ +import pandas._libs.parsers as parsers +from pandas._typing import FilePathOrBuffer + +from pandas.core.dtypes.common import is_integer + +from pandas.core.indexes.api import ensure_index_from_sequences + +from pandas.io.parsers.base_parser import ParserBase, is_index_col + + +class CParserWrapper(ParserBase): + def __init__(self, src: FilePathOrBuffer, **kwds): + self.kwds = kwds + kwds = kwds.copy() + + ParserBase.__init__(self, kwds) + + # #2442 + kwds["allow_leading_cols"] = self.index_col is not False + + # GH20529, validate usecol arg before TextReader + self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) + kwds["usecols"] = self.usecols + + # open handles + self._open_handles(src, kwds) + assert self.handles is not None + for key in ("storage_options", "encoding", "memory_map", "compression"): + kwds.pop(key, None) + if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"): + # pandas\io\parsers.py:1861: error: Item "IO[Any]" of + # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, + # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] + + # pandas\io\parsers.py:1861: error: Item "RawIOBase" of + # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, + # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] + + # pandas\io\parsers.py:1861: error: Item "BufferedIOBase" of + # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, + # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] + + # pandas\io\parsers.py:1861: error: Item "TextIOBase" of + # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, + # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] + + # pandas\io\parsers.py:1861: error: Item "TextIOWrapper" of + # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, + # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] + + # pandas\io\parsers.py:1861: error: Item "mmap" of "Union[IO[Any], + # RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]" has + # no attribute "mmap" [union-attr] + self.handles.handle = self.handles.handle.mmap # type: ignore[union-attr] + + try: + self._reader = parsers.TextReader(self.handles.handle, **kwds) + except Exception: + self.handles.close() + raise + self.unnamed_cols = self._reader.unnamed_cols + + passed_names = self.names is None + + if self._reader.header is None: + self.names = None + else: + if len(self._reader.header) > 1: + # we have a multi index in the columns + ( + self.names, + self.index_names, + self.col_names, + passed_names, + ) = self._extract_multi_indexer_columns( + self._reader.header, self.index_names, self.col_names, passed_names + ) + else: + self.names = list(self._reader.header[0]) + + if self.names is None: + if self.prefix: + self.names = [ + f"{self.prefix}{i}" for i in range(self._reader.table_width) + ] + else: + self.names = list(range(self._reader.table_width)) + + # gh-9755 + # + # need to set orig_names 
here first + # so that proper indexing can be done + # with _set_noconvert_columns + # + # once names has been filtered, we will + # then set orig_names again to names + self.orig_names = self.names[:] + + if self.usecols: + usecols = self._evaluate_usecols(self.usecols, self.orig_names) + + # GH 14671 + # assert for mypy, orig_names is List or None, None would error in issubset + assert self.orig_names is not None + if self.usecols_dtype == "string" and not set(usecols).issubset( + self.orig_names + ): + self._validate_usecols_names(usecols, self.orig_names) + + if len(self.names) > len(usecols): + self.names = [ + n + for i, n in enumerate(self.names) + if (i in usecols or n in usecols) + ] + + if len(self.names) < len(usecols): + self._validate_usecols_names(usecols, self.names) + + self._validate_parse_dates_presence(self.names) + self._set_noconvert_columns() + + self.orig_names = self.names + + if not self._has_complex_date_col: + if self._reader.leading_cols == 0 and is_index_col(self.index_col): + + self._name_processed = True + (index_names, self.names, self.index_col) = self._clean_index_names( + self.names, self.index_col, self.unnamed_cols + ) + + if self.index_names is None: + self.index_names = index_names + + if self._reader.header is None and not passed_names: + # pandas\io\parsers.py:1997: error: Argument 1 to "len" has + # incompatible type "Optional[Any]"; expected "Sized" + # [arg-type] + self.index_names = [None] * len( + self.index_names # type: ignore[arg-type] + ) + + self._implicit_index = self._reader.leading_cols > 0 + + def close(self) -> None: + super().close() + + # close additional handles opened by C parser + try: + self._reader.close() + except ValueError: + pass + + def _set_noconvert_columns(self): + """ + Set the columns that should not undergo dtype conversions. + + Currently, any column that is involved with date parsing will not + undergo such conversions. + """ + names = self.orig_names + if self.usecols_dtype == "integer": + # A set of integers will be converted to a list in + # the correct order every single time. + usecols = list(self.usecols) + usecols.sort() + elif callable(self.usecols) or self.usecols_dtype not in ("empty", None): + # The names attribute should have the correct columns + # in the proper order for indexing with parse_dates. + usecols = self.names[:] + else: + # Usecols is empty. 
+ + # pandas\io\parsers.py:2030: error: Incompatible types in + # assignment (expression has type "None", variable has type + # "List[Any]") [assignment] + usecols = None # type: ignore[assignment] + + def _set(x): + if usecols is not None and is_integer(x): + x = usecols[x] + + if not is_integer(x): + # assert for mypy, names is List or None, None would error when calling + # .index() + assert names is not None + x = names.index(x) + + self._reader.set_noconvert(x) + + if isinstance(self.parse_dates, list): + for val in self.parse_dates: + if isinstance(val, list): + for k in val: + _set(k) + else: + _set(val) + + elif isinstance(self.parse_dates, dict): + for val in self.parse_dates.values(): + if isinstance(val, list): + for k in val: + _set(k) + else: + _set(val) + + elif self.parse_dates: + if isinstance(self.index_col, list): + for k in self.index_col: + _set(k) + elif self.index_col is not None: + _set(self.index_col) + + def set_error_bad_lines(self, status): + self._reader.set_error_bad_lines(int(status)) + + def read(self, nrows=None): + try: + data = self._reader.read(nrows) + except StopIteration: + if self._first_chunk: + self._first_chunk = False + names = self._maybe_dedup_names(self.orig_names) + index, columns, col_dict = self._get_empty_meta( + names, + self.index_col, + self.index_names, + dtype=self.kwds.get("dtype"), + ) + columns = self._maybe_make_multi_index_columns(columns, self.col_names) + + if self.usecols is not None: + columns = self._filter_usecols(columns) + + col_dict = {k: v for k, v in col_dict.items() if k in columns} + + return index, columns, col_dict + + else: + self.close() + raise + + # Done with first read, next time raise StopIteration + self._first_chunk = False + + names = self.names + + if self._reader.leading_cols: + if self._has_complex_date_col: + raise NotImplementedError("file structure not yet supported") + + # implicit index, no index names + arrays = [] + + for i in range(self._reader.leading_cols): + if self.index_col is None: + values = data.pop(i) + else: + values = data.pop(self.index_col[i]) + + values = self._maybe_parse_dates(values, i, try_parse_dates=True) + arrays.append(values) + + index = ensure_index_from_sequences(arrays) + + if self.usecols is not None: + names = self._filter_usecols(names) + + names = self._maybe_dedup_names(names) + + # rename dict keys + data = sorted(data.items()) + data = {k: v for k, (i, v) in zip(names, data)} + + names, data = self._do_date_conversions(names, data) + + else: + # rename dict keys + data = sorted(data.items()) + + # ugh, mutation + + # assert for mypy, orig_names is List or None, None would error in list(...) 
+ assert self.orig_names is not None + names = list(self.orig_names) + names = self._maybe_dedup_names(names) + + if self.usecols is not None: + names = self._filter_usecols(names) + + # columns as list + alldata = [x[1] for x in data] + + data = {k: v for k, (i, v) in zip(names, data)} + + names, data = self._do_date_conversions(names, data) + index, names = self._make_index(data, alldata, names) + + # maybe create a mi on the columns + names = self._maybe_make_multi_index_columns(names, self.col_names) + + return index, names, data + + def _filter_usecols(self, names): + # hackish + usecols = self._evaluate_usecols(self.usecols, names) + if usecols is not None and len(names) != len(usecols): + names = [ + name for i, name in enumerate(names) if i in usecols or name in usecols + ] + return names + + def _get_index_names(self): + names = list(self._reader.header[0]) + idx_names = None + + if self._reader.leading_cols == 0 and self.index_col is not None: + (idx_names, names, self.index_col) = self._clean_index_names( + names, self.index_col, self.unnamed_cols + ) + + return names, idx_names + + def _maybe_parse_dates(self, values, index, try_parse_dates=True): + if try_parse_dates and self._should_parse_dates(index): + values = self._date_conv(values) + return values diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py new file mode 100644 index 0000000000000..ba05eb4a6599f --- /dev/null +++ b/pandas/io/parsers/python_parser.py @@ -0,0 +1,1263 @@ +from collections import abc, defaultdict +import csv +from io import StringIO +import re +import sys +from typing import Iterator, List, Optional, cast + +import numpy as np + +import pandas._libs.lib as lib +from pandas._typing import FilePathOrBuffer, Union +from pandas.errors import EmptyDataError, ParserError + +from pandas.core.dtypes.common import is_integer + +from pandas.io.parsers.base_parser import ParserBase, parser_defaults + +# BOM character (byte order mark) +# This exists at the beginning of a file to indicate endianness +# of a file (stream). Unfortunately, this marker screws up parsing, +# so we need to remove it if we see it. 
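+# (Note: decoding with encoding="utf-8" leaves the marker in the text --
+# only "utf-8-sig" strips it -- so the parser removes it manually below.)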
+_BOM = "\ufeff" + + +class PythonParser(ParserBase): + def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): + """ + Workhorse function for processing nested list into DataFrame + """ + ParserBase.__init__(self, kwds) + + self.data: Optional[Iterator[str]] = None + self.buf: List = [] + self.pos = 0 + self.line_pos = 0 + + self.skiprows = kwds["skiprows"] + + if callable(self.skiprows): + self.skipfunc = self.skiprows + else: + self.skipfunc = lambda x: x in self.skiprows + + self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"]) + self.delimiter = kwds["delimiter"] + + self.quotechar = kwds["quotechar"] + if isinstance(self.quotechar, str): + self.quotechar = str(self.quotechar) + + self.escapechar = kwds["escapechar"] + self.doublequote = kwds["doublequote"] + self.skipinitialspace = kwds["skipinitialspace"] + self.lineterminator = kwds["lineterminator"] + self.quoting = kwds["quoting"] + self.usecols, _ = self._validate_usecols_arg(kwds["usecols"]) + self.skip_blank_lines = kwds["skip_blank_lines"] + + self.warn_bad_lines = kwds["warn_bad_lines"] + self.error_bad_lines = kwds["error_bad_lines"] + + self.names_passed = kwds["names"] or None + + self.has_index_names = False + if "has_index_names" in kwds: + self.has_index_names = kwds["has_index_names"] + + self.verbose = kwds["verbose"] + self.converters = kwds["converters"] + + self.dtype = kwds["dtype"] + self.thousands = kwds["thousands"] + self.decimal = kwds["decimal"] + + self.comment = kwds["comment"] + + # Set self.data to something that can read lines. + if isinstance(f, list): + # read_excel: f is a list + self.data = cast(Iterator[str], f) + else: + self._open_handles(f, kwds) + assert self.handles is not None + assert hasattr(self.handles.handle, "readline") + try: + self._make_reader(self.handles.handle) + except (csv.Error, UnicodeDecodeError): + self.close() + raise + + # Get columns in two steps: infer from data, then + # infer column indices from self.usecols if it is specified. + self._col_indices: Optional[List[int]] = None + try: + ( + self.columns, + self.num_original_columns, + self.unnamed_cols, + ) = self._infer_columns() + except (TypeError, ValueError): + self.close() + raise + + # Now self.columns has the set of columns that we will process. + # The original set is stored in self.original_columns. + if len(self.columns) > 1: + # we are processing a multi index column + ( + self.columns, + self.index_names, + self.col_names, + _, + ) = self._extract_multi_indexer_columns( + self.columns, self.index_names, self.col_names + ) + # Update list of original names to include all indices. 
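+            # (E.g. with header=[0, 1], two header rows such as ["a", "a"]
+            # and ["one", "two"] have just been folded into the tuple names
+            # ("a", "one") and ("a", "two").)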
+ self.num_original_columns = len(self.columns) + else: + self.columns = self.columns[0] + + # get popped off for index + self.orig_names = list(self.columns) + + # needs to be cleaned/refactored + # multiple date column thing turning into a real spaghetti factory + + if not self._has_complex_date_col: + (index_names, self.orig_names, self.columns) = self._get_index_name( + self.columns + ) + self._name_processed = True + if self.index_names is None: + self.index_names = index_names + + if self._col_indices is None: + self._col_indices = list(range(len(self.columns))) + + self._validate_parse_dates_presence(self.columns) + if self.parse_dates: + self._no_thousands_columns = self._set_no_thousands_columns() + else: + self._no_thousands_columns = None + + if len(self.decimal) != 1: + raise ValueError("Only length-1 decimal markers supported") + + decimal = re.escape(self.decimal) + if self.thousands is None: + regex = fr"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$" + else: + thousands = re.escape(self.thousands) + regex = ( + fr"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?" + fr"([0-9]?(E|e)\-?[0-9]+)?$" + ) + self.num = re.compile(regex) + + def _set_no_thousands_columns(self): + # Create a set of column ids that are not to be stripped of thousands + # operators. + noconvert_columns = set() + + def _set(x): + if is_integer(x): + noconvert_columns.add(x) + else: + assert self._col_indices is not None + col_indices = self._col_indices + noconvert_columns.add(col_indices[self.columns.index(x)]) + + if isinstance(self.parse_dates, list): + for val in self.parse_dates: + if isinstance(val, list): + for k in val: + _set(k) + else: + _set(val) + + elif isinstance(self.parse_dates, dict): + for val in self.parse_dates.values(): + if isinstance(val, list): + for k in val: + _set(k) + else: + _set(val) + + elif self.parse_dates: + if isinstance(self.index_col, list): + for k in self.index_col: + _set(k) + elif self.index_col is not None: + _set(self.index_col) + + return noconvert_columns + + def _make_reader(self, f): + sep = self.delimiter + + if sep is None or len(sep) == 1: + if self.lineterminator: + raise ValueError( + "Custom line terminators not supported in python parser (yet)" + ) + + class MyDialect(csv.Dialect): + delimiter = self.delimiter + quotechar = self.quotechar + escapechar = self.escapechar + doublequote = self.doublequote + skipinitialspace = self.skipinitialspace + quoting = self.quoting + lineterminator = "\n" + + dia = MyDialect + + if sep is not None: + dia.delimiter = sep + else: + # attempt to sniff the delimiter from the first valid line, + # i.e. 
no comment line and not in skiprows + line = f.readline() + lines = self._check_comments([[line]])[0] + while self.skipfunc(self.pos) or not lines: + self.pos += 1 + line = f.readline() + lines = self._check_comments([[line]])[0] + + # since `line` was a string, lines will be a list containing + # only a single string + line = lines[0] + + self.pos += 1 + self.line_pos += 1 + sniffed = csv.Sniffer().sniff(line) + dia.delimiter = sniffed.delimiter + + # Note: encoding is irrelevant here + line_rdr = csv.reader(StringIO(line), dialect=dia) + self.buf.extend(list(line_rdr)) + + # Note: encoding is irrelevant here + reader = csv.reader(f, dialect=dia, strict=True) + + else: + + def _read(): + line = f.readline() + pat = re.compile(sep) + + yield pat.split(line.strip()) + + for line in f: + yield pat.split(line.strip()) + + reader = _read() + + # pandas\io\parsers.py:2427: error: Incompatible types in assignment + # (expression has type "_reader", variable has type "Union[IO[Any], + # RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap, None]") + # [assignment] + self.data = reader # type: ignore[assignment] + + def read(self, rows=None): + try: + content = self._get_lines(rows) + except StopIteration: + if self._first_chunk: + content = [] + else: + self.close() + raise + + # done with first read, next time raise StopIteration + self._first_chunk = False + + # pandas\io\parsers.py:2480: error: Argument 1 to "list" has + # incompatible type "Optional[Any]"; expected "Iterable[Any]" + # [arg-type] + columns = list(self.orig_names) # type: ignore[arg-type] + if not len(content): # pragma: no cover + # DataFrame with the right metadata, even though it's length 0 + names = self._maybe_dedup_names(self.orig_names) + index, columns, col_dict = self._get_empty_meta( + names, self.index_col, self.index_names, self.dtype + ) + columns = self._maybe_make_multi_index_columns(columns, self.col_names) + return index, columns, col_dict + + # handle new style for names in index + count_empty_content_vals = count_empty_vals(content[0]) + indexnamerow = None + if self.has_index_names and count_empty_content_vals == len(columns): + indexnamerow = content[0] + content = content[1:] + + alldata = self._rows_to_cols(content) + data, columns = self._exclude_implicit_index(alldata) + + columns, data = self._do_date_conversions(columns, data) + + data = self._convert_data(data) + index, columns = self._make_index(data, alldata, columns, indexnamerow) + + return index, columns, data + + def _exclude_implicit_index(self, alldata): + names = self._maybe_dedup_names(self.orig_names) + + offset = 0 + if self._implicit_index: + offset = len(self.index_col) + + if self._col_indices is not None and len(names) != len(self._col_indices): + names = [names[i] for i in sorted(self._col_indices)] + + return {name: alldata[i + offset] for i, name in enumerate(names)}, names + + # legacy + def get_chunk(self, size=None): + if size is None: + # pandas\io\parsers.py:2528: error: "PythonParser" has no attribute + # "chunksize" [attr-defined] + size = self.chunksize # type: ignore[attr-defined] + return self.read(rows=size) + + def _convert_data(self, data): + # apply converters + def _clean_mapping(mapping): + """converts col numbers to names""" + clean = {} + for col, v in mapping.items(): + # pandas\io\parsers.py:2537: error: Unsupported right operand + # type for in ("Optional[Any]") [operator] + if ( + isinstance(col, int) + and col not in self.orig_names # type: ignore[operator] + ): + # pandas\io\parsers.py:2538: error: 
Value of type + # "Optional[Any]" is not indexable [index] + col = self.orig_names[col] # type: ignore[index] + clean[col] = v + return clean + + clean_conv = _clean_mapping(self.converters) + if not isinstance(self.dtype, dict): + # handles single dtype applied to all columns + clean_dtypes = self.dtype + else: + clean_dtypes = _clean_mapping(self.dtype) + + # Apply NA values. + clean_na_values = {} + clean_na_fvalues = {} + + if isinstance(self.na_values, dict): + for col in self.na_values: + na_value = self.na_values[col] + na_fvalue = self.na_fvalues[col] + + # pandas\io\parsers.py:2558: error: Unsupported right operand + # type for in ("Optional[Any]") [operator] + if ( + isinstance(col, int) + and col not in self.orig_names # type: ignore[operator] + ): + # pandas\io\parsers.py:2559: error: Value of type + # "Optional[Any]" is not indexable [index] + col = self.orig_names[col] # type: ignore[index] + + clean_na_values[col] = na_value + clean_na_fvalues[col] = na_fvalue + else: + clean_na_values = self.na_values + clean_na_fvalues = self.na_fvalues + + return self._convert_to_ndarrays( + data, + clean_na_values, + clean_na_fvalues, + self.verbose, + clean_conv, + clean_dtypes, + ) + + def _infer_columns(self): + names = self.names + num_original_columns = 0 + clear_buffer = True + # pandas\io\parsers.py:2580: error: Need type annotation for + # 'unnamed_cols' (hint: "unnamed_cols: Set[] = ...") + # [var-annotated] + unnamed_cols = set() # type: ignore[var-annotated] + + if self.header is not None: + header = self.header + + if isinstance(header, (list, tuple, np.ndarray)): + have_mi_columns = len(header) > 1 + # we have a mi columns, so read an extra line + if have_mi_columns: + header = list(header) + [header[-1] + 1] + else: + have_mi_columns = False + header = [header] + + # pandas\io\parsers.py:2594: error: Need type annotation for + # 'columns' (hint: "columns: List[] = ...") [var-annotated] + columns = [] # type: ignore[var-annotated] + for level, hr in enumerate(header): + try: + line = self._buffered_line() + + while self.line_pos <= hr: + line = self._next_line() + + except StopIteration as err: + if self.line_pos < hr: + raise ValueError( + f"Passed header={hr} but only {self.line_pos + 1} lines in " + "file" + ) from err + + # We have an empty file, so check + # if columns are provided. 
That will
+                    # serve as the 'line' for parsing
+                    if have_mi_columns and hr > 0:
+                        if clear_buffer:
+                            self._clear_buffer()
+                        columns.append([None] * len(columns[-1]))
+                        return columns, num_original_columns, unnamed_cols
+
+                    if not self.names:
+                        raise EmptyDataError("No columns to parse from file") from err
+
+                    line = self.names[:]
+
+                this_columns = []
+                this_unnamed_cols = []
+
+                for i, c in enumerate(line):
+                    if c == "":
+                        if have_mi_columns:
+                            col_name = f"Unnamed: {i}_level_{level}"
+                        else:
+                            col_name = f"Unnamed: {i}"
+
+                        this_unnamed_cols.append(i)
+                        this_columns.append(col_name)
+                    else:
+                        this_columns.append(c)
+
+                if not have_mi_columns and self.mangle_dupe_cols:
+                    # pandas\io\parsers.py:2639: error: Need type annotation
+                    # for 'counts' [var-annotated]
+                    counts = defaultdict(int)  # type: ignore[var-annotated]
+
+                    for i, col in enumerate(this_columns):
+                        cur_count = counts[col]
+
+                        while cur_count > 0:
+                            counts[col] = cur_count + 1
+                            col = f"{col}.{cur_count}"
+                            cur_count = counts[col]
+
+                        this_columns[i] = col
+                        counts[col] = cur_count + 1
+                elif have_mi_columns:
+
+                    # If we have grabbed an extra line, but it's not in our
+                    # format, save it in the buffer and create a blank extra
+                    # line for the rest of the parsing code.
+                    if hr == header[-1]:
+                        lc = len(this_columns)
+                        ic = len(self.index_col) if self.index_col is not None else 0
+                        unnamed_count = len(this_unnamed_cols)
+
+                        if lc != unnamed_count and lc - ic > unnamed_count:
+                            clear_buffer = False
+                            # pandas\io\parsers.py:2663: error: List item 0 has
+                            # incompatible type "None"; expected "str"
+                            # [list-item]
+                            this_columns = [None] * lc  # type: ignore[list-item]
+                            self.buf = [self.buf[-1]]
+
+                # pandas\io\parsers.py:2666: error: Argument 1 to "append" of
+                # "list" has incompatible type "List[str]"; expected
+                # "List[None]" [arg-type]
+                columns.append(this_columns)  # type: ignore[arg-type]
+                unnamed_cols.update({this_columns[i] for i in this_unnamed_cols})
+
+                if len(columns) == 1:
+                    num_original_columns = len(this_columns)
+
+            if clear_buffer:
+                self._clear_buffer()
+
+            if names is not None:
+                if len(names) > len(columns[0]):
+                    raise ValueError(
+                        "Number of passed names did not match "
+                        "number of header fields in the file"
+                    )
+                if len(columns) > 1:
+                    raise TypeError("Cannot pass names with multi-index columns")
+
+                if self.usecols is not None:
+                    # Set _use_cols. We don't store columns because they are
+                    # overwritten.
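+                    # (Illustrative: names=["a", "b"] with usecols=[0]
+                    # resolves _col_indices to [0], keeping only "a".)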
+ self._handle_usecols(columns, names) + else: + num_original_columns = len(names) + columns = [names] + else: + columns = self._handle_usecols(columns, columns[0]) + else: + try: + line = self._buffered_line() + + except StopIteration as err: + if not names: + raise EmptyDataError("No columns to parse from file") from err + + line = names[:] + + ncols = len(line) + num_original_columns = ncols + + if not names: + if self.prefix: + # pandas\io\parsers.py:2711: error: List comprehension has + # incompatible type List[str]; expected List[None] [misc] + columns = [ + [ + f"{self.prefix}{i}" # type: ignore[misc] + for i in range(ncols) + ] + ] + else: + # pandas\io\parsers.py:2713: error: Argument 1 to "list" + # has incompatible type "range"; expected "Iterable[None]" + # [arg-type] + columns = [list(range(ncols))] # type: ignore[arg-type] + columns = self._handle_usecols(columns, columns[0]) + else: + if self.usecols is None or len(names) >= num_original_columns: + columns = self._handle_usecols([names], names) + num_original_columns = len(names) + else: + if not callable(self.usecols) and len(names) != len(self.usecols): + raise ValueError( + "Number of passed names did not match number of " + "header fields in the file" + ) + # Ignore output but set used columns. + self._handle_usecols([names], names) + columns = [names] + num_original_columns = ncols + + return columns, num_original_columns, unnamed_cols + + def _handle_usecols(self, columns, usecols_key): + """ + Sets self._col_indices + + usecols_key is used if there are string usecols. + """ + if self.usecols is not None: + if callable(self.usecols): + col_indices = self._evaluate_usecols(self.usecols, usecols_key) + elif any(isinstance(u, str) for u in self.usecols): + if len(columns) > 1: + raise ValueError( + "If using multiple headers, usecols must be integers." + ) + col_indices = [] + + for col in self.usecols: + if isinstance(col, str): + try: + col_indices.append(usecols_key.index(col)) + except ValueError: + self._validate_usecols_names(self.usecols, usecols_key) + else: + col_indices.append(col) + else: + col_indices = self.usecols + + columns = [ + [n for i, n in enumerate(column) if i in col_indices] + for column in columns + ] + self._col_indices = sorted(col_indices) + return columns + + def _buffered_line(self): + """ + Return a line from buffer, filling buffer if required. + """ + if len(self.buf) > 0: + return self.buf[0] + else: + return self._next_line() + + def _check_for_bom(self, first_row): + """ + Checks whether the file begins with the BOM character. + If it does, remove it. In addition, if there is quoting + in the field subsequent to the BOM, remove it as well + because it technically takes place at the beginning of + the name, not the middle of it. + """ + # first_row will be a list, so we need to check + # that that list is not empty before proceeding. + if not first_row: + return first_row + + # The first element of this row is the one that could have the + # BOM that we want to remove. Check that the first element is a + # string before proceeding. + if not isinstance(first_row[0], str): + return first_row + + # Check that the string is not empty, as that would + # obviously not have a BOM at the start of it. + if not first_row[0]: + return first_row + + # Since the string is non-empty, check that it does + # in fact begin with a BOM. 
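+        # (Illustrative: with quotechar '"', a first cell of '\ufeff"a"'
+        # comes back as 'a'; an unquoted '\ufeffa' just loses the marker.)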
+ first_elt = first_row[0][0] + if first_elt != _BOM: + return first_row + + first_row_bom = first_row[0] + + if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar: + start = 2 + quote = first_row_bom[1] + end = first_row_bom[2:].index(quote) + 2 + + # Extract the data between the quotation marks + new_row = first_row_bom[start:end] + + # Extract any remaining data after the second + # quotation mark. + if len(first_row_bom) > end + 1: + new_row += first_row_bom[end + 1 :] + + else: + + # No quotation so just remove BOM from first element + new_row = first_row_bom[1:] + return [new_row] + first_row[1:] + + def _is_line_empty(self, line): + """ + Check if a line is empty or not. + + Parameters + ---------- + line : str, array-like + The line of data to check. + + Returns + ------- + boolean : Whether or not the line is empty. + """ + return not line or all(not x for x in line) + + def _next_line(self): + if isinstance(self.data, list): + while self.skipfunc(self.pos): + self.pos += 1 + + while True: + try: + line = self._check_comments([self.data[self.pos]])[0] + self.pos += 1 + # either uncommented or blank to begin with + if not self.skip_blank_lines and ( + self._is_line_empty(self.data[self.pos - 1]) or line + ): + break + elif self.skip_blank_lines: + ret = self._remove_empty_lines([line]) + if ret: + line = ret[0] + break + except IndexError: + raise StopIteration + else: + while self.skipfunc(self.pos): + self.pos += 1 + # assert for mypy, data is Iterator[str] or None, would error in next + assert self.data is not None + next(self.data) + + while True: + orig_line = self._next_iter_line(row_num=self.pos + 1) + self.pos += 1 + + if orig_line is not None: + line = self._check_comments([orig_line])[0] + + if self.skip_blank_lines: + ret = self._remove_empty_lines([line]) + + if ret: + line = ret[0] + break + elif self._is_line_empty(orig_line) or line: + break + + # This was the first line of the file, + # which could contain the BOM at the + # beginning of it. + if self.pos == 1: + line = self._check_for_bom(line) + + self.line_pos += 1 + self.buf.append(line) + return line + + def _alert_malformed(self, msg, row_num): + """ + Alert a user about a malformed row. + + If `self.error_bad_lines` is True, the alert will be `ParserError`. + If `self.warn_bad_lines` is True, the alert will be printed out. + + Parameters + ---------- + msg : The error message to display. + row_num : The row number where the parsing error occurred. + Because this row number is displayed, we 1-index, + even though we 0-index internally. + """ + if self.error_bad_lines: + raise ParserError(msg) + elif self.warn_bad_lines: + base = f"Skipping line {row_num}: " + sys.stderr.write(base + msg + "\n") + + def _next_iter_line(self, row_num): + """ + Wrapper around iterating through `self.data` (CSV source). + + When a CSV error is raised, we check for specific + error messages that allow us to customize the + error message displayed to the user. + + Parameters + ---------- + row_num : The row number of the line being parsed. + """ + try: + # assert for mypy, data is Iterator[str] or None, would error in next + assert self.data is not None + return next(self.data) + except csv.Error as e: + if self.warn_bad_lines or self.error_bad_lines: + msg = str(e) + + if "NULL byte" in msg or "line contains NUL" in msg: + msg = ( + "NULL byte detected. 
This byte " + "cannot be processed in Python's " + "native csv library at the moment, " + "so please pass in engine='c' instead" + ) + + if self.skipfooter > 0: + reason = ( + "Error could possibly be due to " + "parsing errors in the skipped footer rows " + "(the skipfooter keyword is only applied " + "after Python's csv library has parsed " + "all rows)." + ) + msg += ". " + reason + + self._alert_malformed(msg, row_num) + return None + + def _check_comments(self, lines): + if self.comment is None: + return lines + ret = [] + for line in lines: + rl = [] + for x in line: + if ( + not isinstance(x, str) + or self.comment not in x + or x in self.na_values + ): + rl.append(x) + else: + x = x[: x.find(self.comment)] + if len(x) > 0: + rl.append(x) + break + ret.append(rl) + return ret + + def _remove_empty_lines(self, lines): + """ + Iterate through the lines and remove any that are + either empty or contain only one whitespace value + + Parameters + ---------- + lines : array-like + The array of lines that we are to filter. + + Returns + ------- + filtered_lines : array-like + The same array of lines with the "empty" ones removed. + """ + ret = [] + for line in lines: + # Remove empty lines and lines with only one whitespace value + if ( + len(line) > 1 + or len(line) == 1 + and (not isinstance(line[0], str) or line[0].strip()) + ): + ret.append(line) + return ret + + def _check_thousands(self, lines): + if self.thousands is None: + return lines + + return self._search_replace_num_columns( + lines=lines, search=self.thousands, replace="" + ) + + def _search_replace_num_columns(self, lines, search, replace): + ret = [] + for line in lines: + rl = [] + for i, x in enumerate(line): + if ( + not isinstance(x, str) + or search not in x + or (self._no_thousands_columns and i in self._no_thousands_columns) + or not self.num.search(x.strip()) + ): + rl.append(x) + else: + rl.append(x.replace(search, replace)) + ret.append(rl) + return ret + + def _check_decimal(self, lines): + if self.decimal == parser_defaults["decimal"]: + return lines + + return self._search_replace_num_columns( + lines=lines, search=self.decimal, replace="." + ) + + def _clear_buffer(self): + self.buf = [] + + _implicit_index = False + + def _get_index_name(self, columns): + """ + Try several cases to get lines: + + 0) There are headers on row 0 and row 1 and their + total summed lengths equals the length of the next line. + Treat row 0 as columns and row 1 as indices + 1) Look for implicit index: there are more columns + on row 1 than row 0. If this is true, assume that row + 1 lists index columns and row 0 lists normal columns. + 2) Get index from the columns if it was listed. + """ + orig_names = list(columns) + columns = list(columns) + + try: + line = self._next_line() + except StopIteration: + line = None + + try: + next_line = self._next_line() + except StopIteration: + next_line = None + + # implicitly index_col=0 b/c 1 fewer column names + implicit_first_cols = 0 + if line is not None: + # leave it 0, #2442 + # Case 1 + if self.index_col is not False: + implicit_first_cols = len(line) - self.num_original_columns + + # Case 0 + if next_line is not None: + if len(next_line) == len(line) + self.num_original_columns: + # column and index names on diff rows + self.index_col = list(range(len(line))) + self.buf = self.buf[1:] + + for c in reversed(line): + columns.insert(0, c) + + # Update list of original names to include all indices. 
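`_check_thousands` and `_check_decimal` above both delegate to `_search_replace_num_columns`, which only rewrites fields that look numeric (`self.num.search`) and are not excluded by `_no_thousands_columns`. End to end, that is what makes European-style numbers parse (sketch, not part of this change):

from io import StringIO

import pandas as pd

# '1.234,56' -> thousands separator stripped, ',' mapped to '.' -> 1234.56
df = pd.read_csv(StringIO("price\n1.234,56\n"), thousands=".", decimal=",")
print(df["price"].iloc[0])  # 1234.56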
+ orig_names = list(columns) + self.num_original_columns = len(columns) + return line, orig_names, columns + + if implicit_first_cols > 0: + # Case 1 + self._implicit_index = True + if self.index_col is None: + self.index_col = list(range(implicit_first_cols)) + + index_name = None + + else: + # Case 2 + (index_name, columns_, self.index_col) = self._clean_index_names( + columns, self.index_col, self.unnamed_cols + ) + + return index_name, orig_names, columns + + def _rows_to_cols(self, content): + col_len = self.num_original_columns + + if self._implicit_index: + col_len += len(self.index_col) + + max_len = max(len(row) for row in content) + + # Check that there are no rows with too many + # elements in their row (rows with too few + # elements are padded with NaN). + if max_len > col_len and self.index_col is not False and self.usecols is None: + + footers = self.skipfooter if self.skipfooter else 0 + bad_lines = [] + + iter_content = enumerate(content) + content_len = len(content) + content = [] + + for (i, l) in iter_content: + actual_len = len(l) + + if actual_len > col_len: + if self.error_bad_lines or self.warn_bad_lines: + row_num = self.pos - (content_len - i + footers) + bad_lines.append((row_num, actual_len)) + + if self.error_bad_lines: + break + else: + content.append(l) + + for row_num, actual_len in bad_lines: + msg = ( + f"Expected {col_len} fields in line {row_num + 1}, saw " + f"{actual_len}" + ) + if ( + self.delimiter + and len(self.delimiter) > 1 + and self.quoting != csv.QUOTE_NONE + ): + # see gh-13374 + reason = ( + "Error could possibly be due to quotes being " + "ignored when a multi-char delimiter is used." + ) + msg += ". " + reason + + self._alert_malformed(msg, row_num + 1) + + # see gh-13320 + zipped_content = list(lib.to_object_array(content, min_width=col_len).T) + + if self.usecols: + assert self._col_indices is not None + col_indices = self._col_indices + + if self._implicit_index: + zipped_content = [ + a + for i, a in enumerate(zipped_content) + if ( + i < len(self.index_col) + or i - len(self.index_col) in col_indices + ) + ] + else: + zipped_content = [ + a for i, a in enumerate(zipped_content) if i in col_indices + ] + return zipped_content + + def _get_lines(self, rows=None): + lines = self.buf + new_rows = None + + # already fetched some number + if rows is not None: + # we already have the lines in the buffer + if len(self.buf) >= rows: + new_rows, self.buf = self.buf[:rows], self.buf[rows:] + + # need some lines + else: + rows -= len(self.buf) + + if new_rows is None: + if isinstance(self.data, list): + if self.pos > len(self.data): + raise StopIteration + if rows is None: + new_rows = self.data[self.pos :] + new_pos = len(self.data) + else: + new_rows = self.data[self.pos : self.pos + rows] + new_pos = self.pos + rows + + # Check for stop rows. n.b.: self.skiprows is a set. 
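`_rows_to_cols` above is where the Python engine catches over-long rows: with `error_bad_lines=True` the first offender raises `ParserError`, otherwise offenders are dropped and, if `warn_bad_lines` is set, reported through `_alert_malformed` on stderr. For example (sketch; these flags were the current API at the time of this change):

from io import StringIO

import pandas as pd

data = "a,b\n1,2\n1,2,3\n"  # last row has one field too many
df = pd.read_csv(StringIO(data), error_bad_lines=False, warn_bad_lines=True)
# stderr: "Skipping line 3: expected 2 fields, saw 3" (wording varies by engine)
print(len(df))  # 1 -- the bad row was dropped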
+ if self.skiprows: + new_rows = [ + row + for i, row in enumerate(new_rows) + if not self.skipfunc(i + self.pos) + ] + + lines.extend(new_rows) + self.pos = new_pos + + else: + new_rows = [] + try: + if rows is not None: + for _ in range(rows): + # assert for mypy, data is Iterator[str] or None, would + # error in next + assert self.data is not None + new_rows.append(next(self.data)) + lines.extend(new_rows) + else: + rows = 0 + + while True: + new_row = self._next_iter_line(row_num=self.pos + rows + 1) + rows += 1 + + if new_row is not None: + new_rows.append(new_row) + + except StopIteration: + if self.skiprows: + new_rows = [ + row + for i, row in enumerate(new_rows) + if not self.skipfunc(i + self.pos) + ] + lines.extend(new_rows) + if len(lines) == 0: + raise + self.pos += len(new_rows) + + self.buf = [] + else: + lines = new_rows + + if self.skipfooter: + lines = lines[: -self.skipfooter] + + lines = self._check_comments(lines) + if self.skip_blank_lines: + lines = self._remove_empty_lines(lines) + lines = self._check_thousands(lines) + return self._check_decimal(lines) + + +class FixedWidthReader(abc.Iterator): + """ + A reader of fixed-width lines. + """ + + def __init__(self, f, colspecs, delimiter, comment, skiprows=None, infer_nrows=100): + self.f = f + self.buffer = None + self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t " + self.comment = comment + if colspecs == "infer": + self.colspecs = self.detect_colspecs( + infer_nrows=infer_nrows, skiprows=skiprows + ) + else: + self.colspecs = colspecs + + if not isinstance(self.colspecs, (tuple, list)): + raise TypeError( + "column specifications must be a list or tuple, " + f"input was a {type(colspecs).__name__}" + ) + + for colspec in self.colspecs: + if not ( + isinstance(colspec, (tuple, list)) + and len(colspec) == 2 + and isinstance(colspec[0], (int, np.integer, type(None))) + and isinstance(colspec[1], (int, np.integer, type(None))) + ): + raise TypeError( + "Each column specification must be " + "2 element tuple or list of integers" + ) + + def get_rows(self, infer_nrows, skiprows=None): + """ + Read rows from self.f, skipping as specified. + + We distinguish buffer_rows (the first <= infer_nrows + lines) from the rows returned to detect_colspecs + because it's simpler to leave the other locations + with skiprows logic alone than to modify them to + deal with the fact we skipped some rows here as + well. + + Parameters + ---------- + infer_nrows : int + Number of rows to read from self.f, not counting + rows that are skipped. + skiprows: set, optional + Indices of rows to skip. + + Returns + ------- + detect_rows : list of str + A list containing the rows to read. 
+ + """ + if skiprows is None: + skiprows = set() + buffer_rows = [] + detect_rows = [] + for i, row in enumerate(self.f): + if i not in skiprows: + detect_rows.append(row) + buffer_rows.append(row) + if len(detect_rows) >= infer_nrows: + break + self.buffer = iter(buffer_rows) + return detect_rows + + def detect_colspecs(self, infer_nrows=100, skiprows=None): + # Regex escape the delimiters + delimiters = "".join(fr"\{x}" for x in self.delimiter) + pattern = re.compile(f"([^{delimiters}]+)") + rows = self.get_rows(infer_nrows, skiprows) + if not rows: + raise EmptyDataError("No rows from which to infer column width") + max_len = max(map(len, rows)) + mask = np.zeros(max_len + 1, dtype=int) + if self.comment is not None: + rows = [row.partition(self.comment)[0] for row in rows] + for row in rows: + for m in pattern.finditer(row): + mask[m.start() : m.end()] = 1 + shifted = np.roll(mask, 1) + shifted[0] = 0 + edges = np.where((mask ^ shifted) == 1)[0] + edge_pairs = list(zip(edges[::2], edges[1::2])) + return edge_pairs + + def __next__(self): + if self.buffer is not None: + try: + line = next(self.buffer) + except StopIteration: + self.buffer = None + line = next(self.f) + else: + line = next(self.f) + # Note: 'colspecs' is a sequence of half-open intervals. + return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs] + + +class FixedWidthFieldParser(PythonParser): + """ + Specialization that Converts fixed-width fields into DataFrames. + See PythonParser for details. + """ + + def __init__(self, f, **kwds): + # Support iterators, convert to a list. + self.colspecs = kwds.pop("colspecs") + self.infer_nrows = kwds.pop("infer_nrows") + PythonParser.__init__(self, f, **kwds) + + def _make_reader(self, f): + self.data = FixedWidthReader( + f, + self.colspecs, + self.delimiter, + self.comment, + self.skiprows, + self.infer_nrows, + ) + + def _remove_empty_lines(self, lines) -> List: + """ + Returns the list of lines without the empty ones. With fixed-width + fields, empty lines become arrays of empty strings. + + See PythonParser._remove_empty_lines. + """ + return [ + line + for line in lines + if any(not isinstance(e, str) or e.strip() for e in line) + ] + + +def count_empty_vals(vals) -> int: + return sum(1 for v in vals if v == "" or v is None) + + +def _validate_skipfooter_arg(skipfooter): + """ + Validate the 'skipfooter' parameter. + + Checks whether 'skipfooter' is a non-negative integer. + Raises a ValueError if that is not the case. + + Parameters + ---------- + skipfooter : non-negative integer + The number of rows to skip at the end of the file. + + Returns + ------- + validated_skipfooter : non-negative integer + The original input if the validation succeeds. + + Raises + ------ + ValueError : 'skipfooter' was not a non-negative integer. 
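The column inference in `detect_colspecs` above is worth unpacking: it ORs a character-coverage mask over the sample rows, XORs the mask against a copy of itself shifted by one position, and reads the surviving 1s as the edges of the half-open column intervals. A standalone re-creation of the idea on made-up sample rows:

import re

import numpy as np

rows = ["col1   col2 ", "a      bb   ", "ccc    d    "]  # hypothetical sample
pattern = re.compile(r"([^ ]+)")  # runs of non-delimiter characters
mask = np.zeros(max(map(len, rows)) + 1, dtype=int)
for row in rows:
    for m in pattern.finditer(row):
        mask[m.start() : m.end()] = 1  # mark covered character positions
shifted = np.roll(mask, 1)
shifted[0] = 0
edges = np.where((mask ^ shifted) == 1)[0]  # 0->1 and 1->0 transitions
print(list(zip(edges[::2], edges[1::2])))  # [(0, 4), (7, 11)]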
+ """ + if not is_integer(skipfooter): + raise ValueError("skipfooter must be an integer") + + if skipfooter < 0: + raise ValueError("skipfooter cannot be negative") + + return skipfooter diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py new file mode 100644 index 0000000000000..37814a5debf2e --- /dev/null +++ b/pandas/io/parsers/readers.py @@ -0,0 +1,1339 @@ +""" +Module contains tools for processing files into DataFrames or other objects +""" + +from collections import abc +import csv +import sys +from textwrap import fill +from typing import Any, Dict, Optional, Set, Type +import warnings + +import numpy as np + +import pandas._libs.lib as lib +from pandas._libs.parsers import STR_NA_VALUES +from pandas._typing import DtypeArg, FilePathOrBuffer, StorageOptions, Union +from pandas.errors import AbstractMethodError, ParserWarning +from pandas.util._decorators import Appender + +from pandas.core.dtypes.common import is_file_like, is_float, is_integer, is_list_like + +from pandas.core import generic +from pandas.core.frame import DataFrame +from pandas.core.indexes.api import RangeIndex + +from pandas.io.common import validate_header_arg +from pandas.io.parsers.base_parser import ParserBase, is_index_col, parser_defaults +from pandas.io.parsers.c_parser_wrapper import CParserWrapper +from pandas.io.parsers.python_parser import FixedWidthFieldParser, PythonParser + +_doc_read_csv_and_table = ( + r""" +{summary} + +Also supports optionally iterating or breaking of the file +into chunks. + +Additional help can be found in the online docs for +`IO Tools `_. + +Parameters +---------- +filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is + expected. A local file could be: file://localhost/path/to/table.csv. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, such as + a file handle (e.g. via builtin ``open`` function) or ``StringIO``. +sep : str, default {_default_sep} + Delimiter to use. If sep is None, the C engine cannot automatically detect + the separator, but the Python parsing engine can, meaning the latter will + be used and automatically detect the separator by Python's builtin sniffer + tool, ``csv.Sniffer``. In addition, separators longer than 1 character and + different from ``'\s+'`` will be interpreted as regular expressions and + will also force the use of the Python parsing engine. Note that regex + delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``. +delimiter : str, default ``None`` + Alias for sep. +header : int, list of int, default 'infer' + Row number(s) to use as the column names, and the start of the + data. Default behavior is to infer the column names: if no names + are passed the behavior is identical to ``header=0`` and column + names are inferred from the first line of the file, if column + names are passed explicitly then the behavior is identical to + ``header=None``. Explicitly pass ``header=0`` to be able to + replace existing names. The header can be a list of integers that + specify row locations for a multi-index on the columns + e.g. [0,1,3]. Intervening rows that are not specified will be + skipped (e.g. 2 in this example is skipped). 
Note that this + parameter ignores commented lines and empty lines if + ``skip_blank_lines=True``, so ``header=0`` denotes the first line of + data rather than the first line of the file. +names : array-like, optional + List of column names to use. If the file contains a header row, + then you should explicitly pass ``header=0`` to override the column names. + Duplicates in this list are not allowed. +index_col : int, str, sequence of int / str, or False, default ``None`` + Column(s) to use as the row labels of the ``DataFrame``, either given as + string name or column index. If a sequence of int / str is given, a + MultiIndex is used. + + Note: ``index_col=False`` can be used to force pandas to *not* use the first + column as the index, e.g. when you have a malformed file with delimiters at + the end of each line. +usecols : list-like or callable, optional + Return a subset of the columns. If list-like, all elements must either + be positional (i.e. integer indices into the document columns) or strings + that correspond to column names provided either by the user in `names` or + inferred from the document header row(s). For example, a valid list-like + `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. + Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. + To instantiate a DataFrame from ``data`` with element order preserved use + ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns + in ``['foo', 'bar']`` order or + ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` + for ``['bar', 'foo']`` order. + + If callable, the callable function will be evaluated against the column + names, returning names where the callable function evaluates to True. An + example of a valid callable argument would be ``lambda x: x.upper() in + ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster + parsing time and lower memory usage. +squeeze : bool, default False + If the parsed data only contains one column then return a Series. +prefix : str, optional + Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... +mangle_dupe_cols : bool, default True + Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than + 'X'...'X'. Passing in False will cause data to be overwritten if there + are duplicate names in the columns. +dtype : Type name or dict of column -> type, optional + Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, + 'c': 'Int64'}} + Use `str` or `object` together with suitable `na_values` settings + to preserve and not interpret dtype. + If converters are specified, they will be applied INSTEAD + of dtype conversion. +engine : {{'c', 'python'}}, optional + Parser engine to use. The C engine is faster while the python engine is + currently more feature-complete. +converters : dict, optional + Dict of functions for converting values in certain columns. Keys can either + be integers or column labels. +true_values : list, optional + Values to consider as True. +false_values : list, optional + Values to consider as False. +skipinitialspace : bool, default False + Skip spaces after delimiter. +skiprows : list-like, int or callable, optional + Line numbers to skip (0-indexed) or number of lines to skip (int) + at the start of the file. + + If callable, the callable function will be evaluated against the row + indices, returning True if the row should be skipped and False otherwise. + An example of a valid callable argument would be ``lambda x: x in [0, 2]``. 
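The callable form of `skiprows` described above is evaluated per row index before header inference, so it can strip leading junk lines (sketch, not part of this change):

from io import StringIO

import pandas as pd

data = "exported 2021-01-01\na,b\n1,2\n3,4\n"
# Row 0 is skipped; the header is then inferred from the next line.
df = pd.read_csv(StringIO(data), skiprows=lambda x: x == 0)
print(df.columns.tolist())  # ['a', 'b']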
+skipfooter : int, default 0 + Number of lines at bottom of file to skip (Unsupported with engine='c'). +nrows : int, optional + Number of rows of file to read. Useful for reading pieces of large files. +na_values : scalar, str, list-like, or dict, optional + Additional strings to recognize as NA/NaN. If dict passed, specific + per-column NA values. By default the following values are interpreted as + NaN: '""" + + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") + + """'. +keep_default_na : bool, default True + Whether or not to include the default NaN values when parsing the data. + Depending on whether `na_values` is passed in, the behavior is as follows: + + * If `keep_default_na` is True, and `na_values` are specified, `na_values` + is appended to the default NaN values used for parsing. + * If `keep_default_na` is True, and `na_values` are not specified, only + the default NaN values are used for parsing. + * If `keep_default_na` is False, and `na_values` are specified, only + the NaN values specified `na_values` are used for parsing. + * If `keep_default_na` is False, and `na_values` are not specified, no + strings will be parsed as NaN. + + Note that if `na_filter` is passed in as False, the `keep_default_na` and + `na_values` parameters will be ignored. +na_filter : bool, default True + Detect missing value markers (empty strings and the value of na_values). In + data without any NAs, passing na_filter=False can improve the performance + of reading a large file. +verbose : bool, default False + Indicate number of NA values placed in non-numeric columns. +skip_blank_lines : bool, default True + If True, skip over blank lines rather than interpreting as NaN values. +parse_dates : bool or list of int or names or list of lists or dict, \ +default False + The behavior is as follows: + + * boolean. If True -> try parsing the index. + * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + each as a separate date column. + * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as + a single date column. + * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call + result 'foo' + + If a column or index cannot be represented as an array of datetimes, + say because of an unparsable value or a mixture of timezones, the column + or index will be returned unaltered as an object data type. For + non-standard datetime parsing, use ``pd.to_datetime`` after + ``pd.read_csv``. To parse an index or column with a mixture of timezones, + specify ``date_parser`` to be a partially-applied + :func:`pandas.to_datetime` with ``utc=True``. See + :ref:`io.csv.mixed_timezones` for more. + + Note: A fast-path exists for iso8601-formatted dates. +infer_datetime_format : bool, default False + If True and `parse_dates` is enabled, pandas will attempt to infer the + format of the datetime strings in the columns, and if it can be inferred, + switch to a faster method of parsing them. In some cases this can increase + the parsing speed by 5-10x. +keep_date_col : bool, default False + If True and `parse_dates` specifies combining multiple columns then + keep the original columns. +date_parser : function, optional + Function to use for converting a sequence of string columns to an array of + datetime instances. The default uses ``dateutil.parser.parser`` to do the + conversion. 
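The list-of-lists form of `parse_dates` described above concatenates the named columns row-wise before parsing, producing a single combined datetime column (sketch):

from io import StringIO

import pandas as pd

data = "date,time,value\n2021-01-01,12:30,7\n"
df = pd.read_csv(StringIO(data), parse_dates=[["date", "time"]])
print(df.columns.tolist())  # ['date_time', 'value'] -- combined and parsed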
Pandas will try to call `date_parser` in three different ways, + advancing to the next if an exception occurs: 1) Pass one or more arrays + (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the + string values from the columns defined by `parse_dates` into a single array + and pass that; and 3) call `date_parser` once for each row using one or + more strings (corresponding to the columns defined by `parse_dates`) as + arguments. +dayfirst : bool, default False + DD/MM format dates, international and European format. +cache_dates : bool, default True + If True, use a cache of unique, converted dates to apply the datetime + conversion. May produce significant speed-up when parsing duplicate + date strings, especially ones with timezone offsets. + + .. versionadded:: 0.25.0 +iterator : bool, default False + Return TextFileReader object for iteration or getting chunks with + ``get_chunk()``. + + .. versionchanged:: 1.2 + + ``TextFileReader`` is a context manager. +chunksize : int, optional + Return TextFileReader object for iteration. + See the `IO Tools docs + `_ + for more information on ``iterator`` and ``chunksize``. + + .. versionchanged:: 1.2 + + ``TextFileReader`` is a context manager. +compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' + For on-the-fly decompression of on-disk data. If 'infer' and + `filepath_or_buffer` is path-like, then detect compression from the + following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + decompression). If using 'zip', the ZIP file must contain only one data + file to be read in. Set to None for no decompression. +thousands : str, optional + Thousands separator. +decimal : str, default '.' + Character to recognize as decimal point (e.g. use ',' for European data). +lineterminator : str (length 1), optional + Character to break file into lines. Only valid with C parser. +quotechar : str (length 1), optional + The character used to denote the start and end of a quoted item. Quoted + items can include the delimiter and it will be ignored. +quoting : int or csv.QUOTE_* instance, default 0 + Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of + QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). +doublequote : bool, default ``True`` + When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate + whether or not to interpret two consecutive quotechar elements INSIDE a + field as a single ``quotechar`` element. +escapechar : str (length 1), optional + One-character string used to escape other characters. +comment : str, optional + Indicates remainder of line should not be parsed. If found at the beginning + of a line, the line will be ignored altogether. This parameter must be a + single character. Like empty lines (as long as ``skip_blank_lines=True``), + fully commented lines are ignored by the parameter `header` but not by + `skiprows`. For example, if ``comment='#'``, parsing + ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being + treated as the header. +encoding : str, optional + Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python + standard encodings + `_ . +dialect : str or csv.Dialect, optional + If provided, this parameter will override values (default or not) for the + following parameters: `delimiter`, `doublequote`, `escapechar`, + `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to + override values, a ParserWarning will be issued. 
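The `comment` behavior documented above can be checked directly with the docstring's own example input (sketch):

from io import StringIO

import pandas as pd

df = pd.read_csv(StringIO("#empty\na,b,c\n1,2,3"), comment="#", header=0)
print(df.columns.tolist())  # ['a', 'b', 'c'] -- the fully commented line is ignored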
See csv.Dialect
+    documentation for more details.
+error_bad_lines : bool, default True
+    Lines with too many fields (e.g. a csv line with too many commas) will by
+    default cause an exception to be raised, and no DataFrame will be returned.
+    If False, then these "bad lines" will be dropped from the DataFrame that is
+    returned.
+warn_bad_lines : bool, default True
+    If error_bad_lines is False, and warn_bad_lines is True, a warning for each
+    "bad line" will be output.
+delim_whitespace : bool, default False
+    Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
+    used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
+    is set to True, nothing should be passed in for the ``delimiter``
+    parameter.
+low_memory : bool, default True
+    Internally process the file in chunks, resulting in lower memory use
+    while parsing, but possibly mixed type inference. To ensure no mixed
+    types either set False, or specify the type with the `dtype` parameter.
+    Note that the entire file is read into a single DataFrame regardless;
+    use the `chunksize` or `iterator` parameter to return the data in chunks.
+    (Only valid with C parser).
+memory_map : bool, default False
+    If a filepath is provided for `filepath_or_buffer`, map the file object
+    directly onto memory and access the data directly from there. Using this
+    option can improve performance because there is no longer any I/O overhead.
+float_precision : str, optional
+    Specifies which converter the C engine should use for floating-point
+    values. The options are ``None`` or 'high' for the ordinary converter,
+    'legacy' for the original lower precision pandas converter, and
+    'round_trip' for the round-trip converter.
+
+    .. versionchanged:: 1.2
+
+{storage_options}
+
+    .. versionadded:: 1.2
+
+Returns
+-------
+DataFrame or TextParser
+    A comma-separated values (csv) file is returned as a two-dimensional
+    data structure with labeled axes.
+
+See Also
+--------
+DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
+read_csv : Read a comma-separated values (csv) file into DataFrame.
+read_fwf : Read a table of fixed-width formatted lines into DataFrame.
+
+Examples
+--------
+>>> pd.{func_name}('data.csv')  # doctest: +SKIP
+"""
+)
+
+
+_c_parser_defaults = {
+    "delim_whitespace": False,
+    "na_filter": True,
+    "low_memory": True,
+    "memory_map": False,
+    "error_bad_lines": True,
+    "warn_bad_lines": True,
+    "float_precision": None,
+}
+
+_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
+
+_c_unsupported = {"skipfooter"}
+_python_unsupported = {"low_memory", "float_precision"}
+
+_deprecated_defaults: Dict[str, Any] = {}
+_deprecated_args: Set[str] = set()
+
+
+def validate_integer(name, val, min_val=0):
+    """
+    Checks whether the 'name' parameter for parsing is either
+    an integer OR float that can SAFELY be cast to an integer
+    without losing accuracy. Raises a ValueError if that is
+    not the case.
+ + Parameters + ---------- + name : string + Parameter name (used for error reporting) + val : int or float + The value to check + min_val : int + Minimum allowed value (val < min_val will result in a ValueError) + """ + msg = f"'{name:s}' must be an integer >={min_val:d}" + + if val is not None: + if is_float(val): + if int(val) != val: + raise ValueError(msg) + val = int(val) + elif not (is_integer(val) and val >= min_val): + raise ValueError(msg) + + return val + + +def _validate_names(names): + """ + Raise ValueError if the `names` parameter contains duplicates or has an + invalid data type. + + Parameters + ---------- + names : array-like or None + An array containing a list of the names used for the output DataFrame. + + Raises + ------ + ValueError + If names are not unique or are not ordered (e.g. set). + """ + if names is not None: + if len(names) != len(set(names)): + raise ValueError("Duplicate names are not allowed.") + if not ( + is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView) + ): + raise ValueError("Names should be an ordered collection.") + + +def _read(filepath_or_buffer: FilePathOrBuffer, kwds): + """Generic reader of line files.""" + if kwds.get("date_parser", None) is not None: + if isinstance(kwds["parse_dates"], bool): + kwds["parse_dates"] = True + + # Extract some of the arguments (pass chunksize on). + iterator = kwds.get("iterator", False) + chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1) + nrows = kwds.get("nrows", None) + + # Check for duplicates in names. + _validate_names(kwds.get("names", None)) + + # Create the parser. + parser = TextFileReader(filepath_or_buffer, **kwds) + + if chunksize or iterator: + return parser + + with parser: + return parser.read(nrows) + + +@Appender( + _doc_read_csv_and_table.format( + func_name="read_csv", + summary="Read a comma-separated values (csv) file into DataFrame.", + _default_sep="','", + storage_options=generic._shared_docs["storage_options"], + ) +) +def read_csv( + filepath_or_buffer: FilePathOrBuffer, + sep=lib.no_default, + delimiter=None, + # Column and Index Locations and Names + header="infer", + names=None, + index_col=None, + usecols=None, + squeeze=False, + prefix=None, + mangle_dupe_cols=True, + # General Parsing Configuration + dtype: Optional[DtypeArg] = None, + engine=None, + converters=None, + true_values=None, + false_values=None, + skipinitialspace=False, + skiprows=None, + skipfooter=0, + nrows=None, + # NA and Missing Data Handling + na_values=None, + keep_default_na=True, + na_filter=True, + verbose=False, + skip_blank_lines=True, + # Datetime Handling + parse_dates=False, + infer_datetime_format=False, + keep_date_col=False, + date_parser=None, + dayfirst=False, + cache_dates=True, + # Iteration + iterator=False, + chunksize=None, + # Quoting, Compression, and File Format + compression="infer", + thousands=None, + decimal: str = ".", + lineterminator=None, + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + doublequote=True, + escapechar=None, + comment=None, + encoding=None, + dialect=None, + # Error Handling + error_bad_lines=True, + warn_bad_lines=True, + # Internal + delim_whitespace=False, + low_memory=_c_parser_defaults["low_memory"], + memory_map=False, + float_precision=None, + storage_options: StorageOptions = None, +): + kwds = locals() + del kwds["filepath_or_buffer"] + del kwds["sep"] + + kwds_defaults = _refine_defaults_read( + dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": ","} + ) + kwds.update(kwds_defaults) + 
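In `_read` above, a truthy `chunksize` or `iterator` short-circuits the eager read: the `TextFileReader` itself is returned, and since 1.2 it is usable as a context manager. Typical use (sketch; the file path is hypothetical):

import pandas as pd

with pd.read_csv("data.csv", chunksize=1000) as reader:  # hypothetical file
    for chunk in reader:
        print(len(chunk))  # each chunk is a DataFrame of up to 1000 rows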
+ return _read(filepath_or_buffer, kwds) + + +@Appender( + _doc_read_csv_and_table.format( + func_name="read_table", + summary="Read general delimited file into DataFrame.", + _default_sep=r"'\\t' (tab-stop)", + storage_options=generic._shared_docs["storage_options"], + ) +) +def read_table( + filepath_or_buffer: FilePathOrBuffer, + sep=lib.no_default, + delimiter=None, + # Column and Index Locations and Names + header="infer", + names=None, + index_col=None, + usecols=None, + squeeze=False, + prefix=None, + mangle_dupe_cols=True, + # General Parsing Configuration + dtype: Optional[DtypeArg] = None, + engine=None, + converters=None, + true_values=None, + false_values=None, + skipinitialspace=False, + skiprows=None, + skipfooter=0, + nrows=None, + # NA and Missing Data Handling + na_values=None, + keep_default_na=True, + na_filter=True, + verbose=False, + skip_blank_lines=True, + # Datetime Handling + parse_dates=False, + infer_datetime_format=False, + keep_date_col=False, + date_parser=None, + dayfirst=False, + cache_dates=True, + # Iteration + iterator=False, + chunksize=None, + # Quoting, Compression, and File Format + compression="infer", + thousands=None, + decimal: str = ".", + lineterminator=None, + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + doublequote=True, + escapechar=None, + comment=None, + encoding=None, + dialect=None, + # Error Handling + error_bad_lines=True, + warn_bad_lines=True, + # Internal + delim_whitespace=False, + low_memory=_c_parser_defaults["low_memory"], + memory_map=False, + float_precision=None, +): + kwds = locals() + del kwds["filepath_or_buffer"] + del kwds["sep"] + + kwds_defaults = _refine_defaults_read( + dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": "\t"} + ) + kwds.update(kwds_defaults) + + return _read(filepath_or_buffer, kwds) + + +def read_fwf( + filepath_or_buffer: FilePathOrBuffer, + colspecs="infer", + widths=None, + infer_nrows=100, + **kwds, +): + r""" + Read a table of fixed-width formatted lines into DataFrame. + + Also supports optionally iterating or breaking of the file + into chunks. + + Additional help can be found in the `online docs for IO Tools + `_. + + Parameters + ---------- + filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.csv``. + + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handle (e.g. via builtin ``open`` function) + or ``StringIO``. + colspecs : list of tuple (int, int) or 'infer'. optional + A list of tuples giving the extents of the fixed-width + fields of each line as half-open intervals (i.e., [from, to[ ). + String value 'infer' can be used to instruct the parser to try + detecting the column specifications from the first 100 rows of + the data which are not being skipped via skiprows (default='infer'). + widths : list of int, optional + A list of field widths which can be used instead of 'colspecs' if + the intervals are contiguous. + infer_nrows : int, default 100 + The number of rows to consider when letting the parser determine the + `colspecs`. + + .. versionadded:: 0.24.0 + **kwds : optional + Optional keyword arguments can be passed to ``TextFileReader``. 
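As the `read_fwf` docstring above notes, `widths` is sugar for contiguous `colspecs`; the function converts one to the other before dispatching to the 'python-fwf' engine. For example (sketch):

from io import StringIO

import pandas as pd

data = "id  name\n1   ann \n2   bob \n"
# widths=[4, 4] becomes colspecs=[(0, 4), (4, 8)] internally.
df = pd.read_fwf(StringIO(data), widths=[4, 4])
print(df.columns.tolist())  # ['id', 'name']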
+ + Returns + ------- + DataFrame or TextParser + A comma-separated values (csv) file is returned as two-dimensional + data structure with labeled axes. + + See Also + -------- + DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. + read_csv : Read a comma-separated values (csv) file into DataFrame. + + Examples + -------- + >>> pd.read_fwf('data.csv') # doctest: +SKIP + """ + # Check input arguments. + if colspecs is None and widths is None: + raise ValueError("Must specify either colspecs or widths") + elif colspecs not in (None, "infer") and widths is not None: + raise ValueError("You must specify only one of 'widths' and 'colspecs'") + + # Compute 'colspecs' from 'widths', if specified. + if widths is not None: + colspecs, col = [], 0 + for w in widths: + colspecs.append((col, col + w)) + col += w + + kwds["colspecs"] = colspecs + kwds["infer_nrows"] = infer_nrows + kwds["engine"] = "python-fwf" + return _read(filepath_or_buffer, kwds) + + +class TextFileReader(abc.Iterator): + """ + + Passed dialect overrides any of the related parser options + + """ + + def __init__(self, f, engine=None, **kwds): + + self.f = f + + if engine is not None: + engine_specified = True + else: + engine = "python" + engine_specified = False + self.engine = engine + self._engine_specified = kwds.get("engine_specified", engine_specified) + + _validate_skipfooter(kwds) + + dialect = _extract_dialect(kwds) + if dialect is not None: + kwds = _merge_with_dialect_properties(dialect, kwds) + + if kwds.get("header", "infer") == "infer": + kwds["header"] = 0 if kwds.get("names") is None else None + + self.orig_options = kwds + + # miscellanea + self._currow = 0 + + options = self._get_options_with_defaults(engine) + options["storage_options"] = kwds.get("storage_options", None) + + self.chunksize = options.pop("chunksize", None) + self.nrows = options.pop("nrows", None) + self.squeeze = options.pop("squeeze", False) + + self._check_file_or_buffer(f, engine) + self.options, self.engine = self._clean_options(options, engine) + + if "has_index_names" in kwds: + self.options["has_index_names"] = kwds["has_index_names"] + + self._engine = self._make_engine(self.engine) + + def close(self): + self._engine.close() + + def _get_options_with_defaults(self, engine): + kwds = self.orig_options + + options = {} + + for argname, default in parser_defaults.items(): + value = kwds.get(argname, default) + + # see gh-12935 + if argname == "mangle_dupe_cols" and not value: + raise ValueError("Setting mangle_dupe_cols=False is not supported yet") + else: + options[argname] = value + + for argname, default in _c_parser_defaults.items(): + if argname in kwds: + value = kwds[argname] + + if engine != "c" and value != default: + if "python" in engine and argname not in _python_unsupported: + pass + elif value == _deprecated_defaults.get(argname, default): + pass + else: + raise ValueError( + f"The {repr(argname)} option is not supported with the " + f"{repr(engine)} engine" + ) + else: + value = _deprecated_defaults.get(argname, default) + options[argname] = value + + if engine == "python-fwf": + # pandas\io\parsers.py:907: error: Incompatible types in assignment + # (expression has type "object", variable has type "Union[int, str, + # None]") [assignment] + for argname, default in _fwf_defaults.items(): # type: ignore[assignment] + options[argname] = kwds.get(argname, default) + + return options + + def _check_file_or_buffer(self, f, engine): + # see gh-16530 + if is_file_like(f) and engine != "c" and not hasattr(f, 
"__next__"): + # The C engine doesn't need the file-like to have the "__next__" + # attribute. However, the Python engine explicitly calls + # "__next__(...)" when iterating through such an object, meaning it + # needs to have that attribute + raise ValueError( + "The 'python' engine cannot iterate through this file buffer." + ) + + def _clean_options(self, options, engine): + result = options.copy() + + fallback_reason = None + + # C engine not supported yet + if engine == "c": + if options["skipfooter"] > 0: + fallback_reason = "the 'c' engine does not support skipfooter" + engine = "python" + + sep = options["delimiter"] + delim_whitespace = options["delim_whitespace"] + + if sep is None and not delim_whitespace: + if engine == "c": + fallback_reason = ( + "the 'c' engine does not support " + "sep=None with delim_whitespace=False" + ) + engine = "python" + elif sep is not None and len(sep) > 1: + if engine == "c" and sep == r"\s+": + result["delim_whitespace"] = True + del result["delimiter"] + elif engine not in ("python", "python-fwf"): + # wait until regex engine integrated + fallback_reason = ( + "the 'c' engine does not support " + "regex separators (separators > 1 char and " + r"different from '\s+' are interpreted as regex)" + ) + engine = "python" + elif delim_whitespace: + if "python" in engine: + result["delimiter"] = r"\s+" + elif sep is not None: + encodeable = True + encoding = sys.getfilesystemencoding() or "utf-8" + try: + if len(sep.encode(encoding)) > 1: + encodeable = False + except UnicodeDecodeError: + encodeable = False + if not encodeable and engine not in ("python", "python-fwf"): + fallback_reason = ( + f"the separator encoded in {encoding} " + "is > 1 char long, and the 'c' engine " + "does not support such separators" + ) + engine = "python" + + quotechar = options["quotechar"] + if quotechar is not None and isinstance(quotechar, (str, bytes)): + if ( + len(quotechar) == 1 + and ord(quotechar) > 127 + and engine not in ("python", "python-fwf") + ): + fallback_reason = ( + "ord(quotechar) > 127, meaning the " + "quotechar is larger than one byte, " + "and the 'c' engine does not support such quotechars" + ) + engine = "python" + + if fallback_reason and self._engine_specified: + raise ValueError(fallback_reason) + + if engine == "c": + for arg in _c_unsupported: + del result[arg] + + if "python" in engine: + for arg in _python_unsupported: + if fallback_reason and result[arg] != _c_parser_defaults[arg]: + raise ValueError( + "Falling back to the 'python' engine because " + f"{fallback_reason}, but this causes {repr(arg)} to be " + "ignored as it is not supported by the 'python' engine." + ) + del result[arg] + + if fallback_reason: + warnings.warn( + ( + "Falling back to the 'python' engine because " + f"{fallback_reason}; you can avoid this warning by specifying " + "engine='python'." 
+ ), + ParserWarning, + stacklevel=5, + ) + + index_col = options["index_col"] + names = options["names"] + converters = options["converters"] + na_values = options["na_values"] + skiprows = options["skiprows"] + + validate_header_arg(options["header"]) + + for arg in _deprecated_args: + parser_default = _c_parser_defaults[arg] + depr_default = _deprecated_defaults[arg] + if result.get(arg, depr_default) != depr_default: + msg = ( + f"The {arg} argument has been deprecated and will be " + "removed in a future version.\n\n" + ) + warnings.warn(msg, FutureWarning, stacklevel=2) + else: + result[arg] = parser_default + + if index_col is True: + raise ValueError("The value of index_col couldn't be 'True'") + if is_index_col(index_col): + if not isinstance(index_col, (list, tuple, np.ndarray)): + index_col = [index_col] + result["index_col"] = index_col + + names = list(names) if names is not None else names + + # type conversion-related + if converters is not None: + if not isinstance(converters, dict): + raise TypeError( + "Type converters must be a dict or subclass, " + f"input was a {type(converters).__name__}" + ) + else: + converters = {} + + # Converting values to NA + keep_default_na = options["keep_default_na"] + na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) + + # handle skiprows; this is internally handled by the + # c-engine, so only need for python parsers + if engine != "c": + if is_integer(skiprows): + skiprows = list(range(skiprows)) + if skiprows is None: + skiprows = set() + elif not callable(skiprows): + skiprows = set(skiprows) + + # put stuff back + result["names"] = names + result["converters"] = converters + result["na_values"] = na_values + result["na_fvalues"] = na_fvalues + result["skiprows"] = skiprows + + return result, engine + + def __next__(self): + try: + return self.get_chunk() + except StopIteration: + self.close() + raise + + def _make_engine(self, engine="c"): + mapping: Dict[str, Type[ParserBase]] = { + "c": CParserWrapper, + "python": PythonParser, + "python-fwf": FixedWidthFieldParser, + } + if engine not in mapping: + raise ValueError( + f"Unknown engine: {engine} (valid options are {mapping.keys()})" + ) + # error: Too many arguments for "ParserBase" + return mapping[engine](self.f, **self.options) # type: ignore[call-arg] + + def _failover_to_python(self): + raise AbstractMethodError(self) + + def read(self, nrows=None): + nrows = validate_integer("nrows", nrows) + index, columns, col_dict = self._engine.read(nrows) + + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) + else: + new_rows = 0 + else: + new_rows = len(index) + + df = DataFrame(col_dict, columns=columns, index=index) + + self._currow += new_rows + + if self.squeeze and len(df.columns) == 1: + return df[df.columns[0]].copy() + return df + + def get_chunk(self, size=None): + if size is None: + size = self.chunksize + if self.nrows is not None: + if self._currow >= self.nrows: + raise StopIteration + size = min(size, self.nrows - self._currow) + return self.read(nrows=size) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + + +def TextParser(*args, **kwds): + """ + Converts lists of lists/tuples into DataFrames with proper type inference + and optional (e.g. string to datetime) conversion. 
Also enables iterating
+    lazily over chunks of large files.
+
+    Parameters
+    ----------
+    data : file-like object or list
+    delimiter : separator character to use
+    dialect : str or csv.Dialect instance, optional
+        Ignored if delimiter is longer than 1 character
+    names : sequence, default
+    header : int, default 0
+        Row to use to parse column labels. Defaults to the first row. Prior
+        rows will be discarded
+    index_col : int or list, optional
+        Column or columns to use as the (possibly hierarchical) index
+    has_index_names: bool, default False
+        True if the cols defined in index_col have an index name and are
+        not in the header.
+    na_values : scalar, str, list-like, or dict, optional
+        Additional strings to recognize as NA/NaN.
+    keep_default_na : bool, default True
+    thousands : str, optional
+        Thousands separator
+    comment : str, optional
+        Comment out remainder of line
+    parse_dates : bool, default False
+    keep_date_col : bool, default False
+    date_parser : function, optional
+    skiprows : list of integers
+        Row numbers to skip
+    skipfooter : int
+        Number of lines at the bottom of the file to skip
+    converters : dict, optional
+        Dict of functions for converting values in certain columns. Keys can
+        either be integers or column labels, values are functions that take one
+        input argument, the cell (not column) content, and return the
+        transformed content.
+    encoding : str, optional
+        Encoding to use for UTF when reading/writing (ex. 'utf-8')
+    squeeze : bool, default False
+        returns Series if only one column.
+    infer_datetime_format: bool, default False
+        If True and `parse_dates` is True for a column, try to infer the
+        datetime format based on the first datetime string. If the format
+        can be inferred, there often will be a large parsing speed-up.
+    float_precision : str, optional
+        Specifies which converter the C engine should use for floating-point
+        values. The options are `None` or `high` for the ordinary converter,
+        `legacy` for the original lower precision pandas converter, and
+        `round_trip` for the round-trip converter.
+
+        .. versionchanged:: 1.2
+    """
+    kwds["engine"] = "python"
+    return TextFileReader(*args, **kwds)
+
+
+def _clean_na_values(na_values, keep_default_na=True):
+
+    if na_values is None:
+        if keep_default_na:
+            na_values = STR_NA_VALUES
+        else:
+            na_values = set()
+        # pandas\io\parsers.py:3387: error: Need type annotation for
+        # 'na_fvalues'  (hint: "na_fvalues: Set[<type>] = ...")  [var-annotated]
+        na_fvalues = set()  # type: ignore[var-annotated]
+    elif isinstance(na_values, dict):
+        old_na_values = na_values.copy()
+        na_values = {}  # Prevent aliasing.
+
+        # Convert the values in the na_values dictionary
+        # into array-likes for further use. This is also
+        # where we append the default NaN values, provided
+        # that `keep_default_na=True`.
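The dict branch of `_clean_na_values` above (continued just below) normalizes each per-column value to a set and, when `keep_default_na=True`, unions it with `STR_NA_VALUES`. The observable effect (sketch, not part of this change):

from io import StringIO

import pandas as pd

data = "a,b\nmissing,-1\nx,2\n"
# 'missing' is treated as NA only in column 'a'; -1 only in column 'b'.
df = pd.read_csv(StringIO(data), na_values={"a": ["missing"], "b": [-1]})
print(df.isna().iloc[0].tolist())  # [True, True]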
+ for k, v in old_na_values.items(): + if not is_list_like(v): + v = [v] + + if keep_default_na: + v = set(v) | STR_NA_VALUES + + na_values[k] = v + # pandas\io\parsers.py:3404: error: Incompatible types in assignment + # (expression has type "Dict[Any, Any]", variable has type "Set[Any]") + # [assignment] + na_fvalues = { # type: ignore[assignment] + k: _floatify_na_values(v) for k, v in na_values.items() + } + else: + if not is_list_like(na_values): + na_values = [na_values] + na_values = _stringify_na_values(na_values) + if keep_default_na: + na_values = na_values | STR_NA_VALUES + + na_fvalues = _floatify_na_values(na_values) + + return na_values, na_fvalues + + +def _floatify_na_values(na_values): + # create float versions of the na_values + result = set() + for v in na_values: + try: + v = float(v) + if not np.isnan(v): + result.add(v) + except (TypeError, ValueError, OverflowError): + pass + return result + + +def _stringify_na_values(na_values): + """ return a stringified and numeric for these values """ + result = [] + for x in na_values: + result.append(str(x)) + result.append(x) + try: + v = float(x) + + # we are like 999 here + if v == int(v): + v = int(v) + result.append(f"{v}.0") + result.append(str(v)) + + # pandas\io\parsers.py:3522: error: Argument 1 to "append" of + # "list" has incompatible type "float"; expected "str" [arg-type] + result.append(v) # type: ignore[arg-type] + except (TypeError, ValueError, OverflowError): + pass + try: + # pandas\io\parsers.py:3526: error: Argument 1 to "append" of + # "list" has incompatible type "int"; expected "str" [arg-type] + result.append(int(x)) # type: ignore[arg-type] + except (TypeError, ValueError, OverflowError): + pass + return set(result) + + +def _refine_defaults_read( + dialect: Union[str, csv.Dialect], + delimiter: Union[str, object], + delim_whitespace: bool, + engine: str, + sep: Union[str, object], + defaults: Dict[str, Any], +): + """Validate/refine default values of input parameters of read_csv, read_table. + + Parameters + ---------- + dialect : str or csv.Dialect + If provided, this parameter will override values (default or not) for the + following parameters: `delimiter`, `doublequote`, `escapechar`, + `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to + override values, a ParserWarning will be issued. See csv.Dialect + documentation for more details. + delimiter : str or object + Alias for sep. + delim_whitespace : bool + Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be + used as the sep. Equivalent to setting ``sep='\\s+'``. If this option + is set to True, nothing should be passed in for the ``delimiter`` + parameter. + engine : {{'c', 'python'}} + Parser engine to use. The C engine is faster while the python engine is + currently more feature-complete. + sep : str or object + A delimiter provided by the user (str) or a sentinel value, i.e. + pandas._libs.lib.no_default. + defaults: dict + Default values of input parameters. + + Returns + ------- + kwds : dict + Input parameters with correct values. + + Raises + ------ + ValueError : If a delimiter was specified with ``sep`` (or ``delimiter``) and + ``delim_whitespace=True``. + """ + # fix types for sep, delimiter to Union(str, Any) + delim_default = defaults["delimiter"] + kwds: Dict[str, Any] = {} + # gh-23761 + # + # When a dialect is passed, it overrides any of the overlapping + # parameters passed in directly. 
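`_stringify_na_values` above expands each NA marker into its string and numeric spellings, so that e.g. 999, '999', 999.0 and '999.0' all match the same marker. Sketch of the effect:

from io import StringIO

import pandas as pd

data = "a\n999\n999.0\n1\n"
df = pd.read_csv(StringIO(data), na_values=[999])
print(df["a"].isna().tolist())  # [True, True, False]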
We don't want to warn if the + # default parameters were passed in (since it probably means + # that the user didn't pass them in explicitly in the first place). + # + # "delimiter" is the annoying corner case because we alias it to + # "sep" before doing comparison to the dialect values later on. + # Thus, we need a flag to indicate that we need to "override" + # the comparison to dialect values by checking if default values + # for BOTH "delimiter" and "sep" were provided. + if dialect is not None: + kwds["sep_override"] = delimiter is None and ( + sep is lib.no_default or sep == delim_default + ) + + # Alias sep -> delimiter. + if delimiter is None: + delimiter = sep + + if delim_whitespace and (delimiter is not lib.no_default): + raise ValueError( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." + ) + + if delimiter is lib.no_default: + # assign default separator value + kwds["delimiter"] = delim_default + else: + kwds["delimiter"] = delimiter + + if engine is not None: + kwds["engine_specified"] = True + else: + kwds["engine"] = "c" + kwds["engine_specified"] = False + + return kwds + + +def _extract_dialect(kwds: Dict[str, Any]) -> Optional[csv.Dialect]: + """ + Extract concrete csv dialect instance. + + Returns + ------- + csv.Dialect or None + """ + if kwds.get("dialect") is None: + return None + + dialect = kwds["dialect"] + if dialect in csv.list_dialects(): + dialect = csv.get_dialect(dialect) + + _validate_dialect(dialect) + + return dialect + + +MANDATORY_DIALECT_ATTRS = ( + "delimiter", + "doublequote", + "escapechar", + "skipinitialspace", + "quotechar", + "quoting", +) + + +def _validate_dialect(dialect: csv.Dialect) -> None: + """ + Validate csv dialect instance. + + Raises + ------ + ValueError + If incorrect dialect is provided. + """ + for param in MANDATORY_DIALECT_ATTRS: + if not hasattr(dialect, param): + raise ValueError(f"Invalid dialect {dialect} provided") + + +def _merge_with_dialect_properties( + dialect: csv.Dialect, + defaults: Dict[str, Any], +) -> Dict[str, Any]: + """ + Merge default kwargs in TextFileReader with dialect parameters. + + Parameters + ---------- + dialect : csv.Dialect + Concrete csv dialect. See csv.Dialect documentation for more details. + defaults : dict + Keyword arguments passed to TextFileReader. + + Returns + ------- + kwds : dict + Updated keyword arguments, merged with dialect parameters. + """ + kwds = defaults.copy() + + for param in MANDATORY_DIALECT_ATTRS: + dialect_val = getattr(dialect, param) + + parser_default = parser_defaults[param] + provided = kwds.get(param, parser_default) + + # Messages for conflicting values between the dialect + # instance and the actual parameters provided. + conflict_msgs = [] + + # Don't warn if the default parameter was passed in, + # even if it conflicts with the dialect (gh-23761). + if provided != parser_default and provided != dialect_val: + msg = ( + f"Conflicting values for '{param}': '{provided}' was " + f"provided, but the dialect specifies '{dialect_val}'. " + "Using the dialect-specified value." + ) + + # Annoying corner case for not warning about + # conflicts between dialect and delimiter parameter. + # Refer to the outer "_read_" function for more info. 
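The merge logic around this comment lets a dialect win over an explicitly passed, non-default parameter, warning as it does so, while defaults pass silently thanks to the `sep_override` flag (gh-23761). Sketch of the user-visible behavior:

import csv
from io import StringIO

import pandas as pd

csv.register_dialect("pipes", delimiter="|")
# sep=';' conflicts with the dialect's '|': a ParserWarning is emitted and
# the dialect-specified delimiter is used.
df = pd.read_csv(StringIO("a|b\n1|2\n"), dialect="pipes", sep=";")
print(df.columns.tolist())  # ['a', 'b']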
+ if not (param == "delimiter" and kwds.pop("sep_override", False)): + conflict_msgs.append(msg) + + if conflict_msgs: + warnings.warn("\n\n".join(conflict_msgs), ParserWarning, stacklevel=2) + kwds[param] = dialect_val + return kwds + + +def _validate_skipfooter(kwds: Dict[str, Any]) -> None: + """ + Check whether skipfooter is compatible with other kwargs in TextFileReader. + + Parameters + ---------- + kwds : dict + Keyword arguments passed to TextFileReader. + + Raises + ------ + ValueError + If skipfooter is not compatible with other parameters. + """ + if kwds.get("skipfooter"): + if kwds.get("iterator") or kwds.get("chunksize"): + raise ValueError("'skipfooter' not supported for iteration") + if kwds.get("nrows"): + raise ValueError("'skipfooter' not supported with 'nrows'") diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 4fd754bf79ba2..9fd6e48cf8689 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -16,7 +16,8 @@ from pandas import DataFrame, Index, Series, compat import pandas._testing as tm -from pandas.io.parsers import CParserWrapper, TextFileReader +from pandas.io.parsers import TextFileReader +from pandas.io.parsers.c_parser_wrapper import CParserWrapper def test_override_set_noconvert_columns(): diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index c12eb5ec873b2..d0ee6add9ca92 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -98,9 +98,9 @@ def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, val if "value" == "dialect": # No conflict --> no warning. kwds[arg] = dialect_kwargs[arg] elif "value" == "default": # Default --> no warning. - from pandas.io.parsers import _parser_defaults + from pandas.io.parsers.base_parser import parser_defaults - kwds[arg] = _parser_defaults[arg] + kwds[arg] = parser_defaults[arg] else: # Non-default + conflict with dialect --> warning. warning_klass = ParserWarning kwds[arg] = "blah" diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 47dc543c61bd0..5322c19a3ae50 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -11,11 +11,13 @@ import numpy as np import pytest +from pandas.errors import EmptyDataError + import pandas as pd from pandas import DataFrame, DatetimeIndex import pandas._testing as tm -from pandas.io.parsers import EmptyDataError, read_csv, read_fwf +from pandas.io.parsers import read_csv, read_fwf def test_basic(): diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 267fae760398a..2cf3d959acb48 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -14,8 +14,8 @@ import pandas._testing as tm -import pandas.io.parsers as parsers from pandas.io.parsers import read_csv +import pandas.io.parsers.readers as parsers @pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val) @@ -86,7 +86,7 @@ def test_c_engine(self): read_csv(StringIO(data), lineterminator="~~") def test_python_engine(self, python_engine): - from pandas.io.parsers import _python_unsupported as py_unsupported + from pandas.io.parsers.readers import _python_unsupported as py_unsupported data = """1,2,3,, 1,2,3,4,