From a775f9cb40a6cd41d52c44261cbb1594705ffcee Mon Sep 17 00:00:00 2001
From: barnargh
Date: Sat, 20 Apr 2024 20:19:38 -0500
Subject: [PATCH 1/5] fixed implicit conversion of 1-arrays inside data frames

---
 pandas/core/internals/managers.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 8fda9cd23b508..d09e0fb74463a 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -2170,8 +2170,10 @@ def setitem_inplace(self, indexer, value) -> None:
             # Note: checking for ndarray instead of np.dtype means we exclude
             #  dt64/td64, which do their own validation.
             value = np_can_hold_element(arr.dtype, value)
-
-        if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1:
+
+        # only unwrap 1-element arrays implicitly when the block dtype is not object
+        implicit_convert = arr.dtype != "object"
+        if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1 and implicit_convert:
             # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
             value = value[0, ...]
 

From 1d04d6f959239e709404a4519518f651e9c32d49 Mon Sep 17 00:00:00 2001
From: barnargh
Date: Sat, 20 Apr 2024 22:54:45 -0500
Subject: [PATCH 2/5] fixed issue #57944

---
 pandas/io/parsers/readers.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 70f9a68244164..1eb736880c369 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -716,6 +716,19 @@ def read_csv(
 ) -> DataFrame | TextFileReader: ...
 
 
+# a helper function for read_csv() below.
+# ensures that all keys in a dtype mapping are of type str,
+# which keeps them compatible with the csv parsing machinery.
+def parse_dtype(dtype) -> DtypeArg:
+    if not isinstance(dtype, dict):
+        # a single dtype (or None) has no keys to normalize
+        return dtype
+    temp = {}
+    for key in dtype:
+        temp[key if isinstance(key, str) else str(key)] = dtype[key]
+    return temp
+
+
 @Appender(
     _doc_read_csv_and_table.format(
         func_name="read_csv",
         summary="Read a comma-separated values (csv) file into DataFrame.",
         see_also_func_name="read_table",
         see_also_func_summary="Read general delimited file into DataFrame.",
         _default_sep="','",
         storage_options=_shared_docs["storage_options"],
         decompression_options=_shared_docs["decompression_options"]
         % "filepath_or_buffer",
     )
 )
@@ -790,6 +803,9 @@ def read_csv(
     storage_options: StorageOptions | None = None,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
 ) -> DataFrame | TextFileReader:
+    # ensure that all keys in a dtype mapping are strings for compatibility with csv
+    dtype = parse_dtype(dtype)
+
     if keep_date_col is not lib.no_default:
         # GH#55569
         warnings.warn(

From 9458f3e2ca33bb304f32c4d354c29f558d80c0a4 Mon Sep 17 00:00:00 2001
From: Gabe Barnard
Date: Sat, 20 Apr 2024 23:47:04 -0500
Subject: [PATCH 3/5] restored to og

---
 pandas/core/internals/managers.py |    6 +-
 pandas/io/parsers/readers.py      | 4209 ++++++++++++++++-------------
 2 files changed, 2343 insertions(+), 1872 deletions(-)

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index d09e0fb74463a..8fda9cd23b508 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -2170,10 +2170,8 @@ def setitem_inplace(self, indexer, value) -> None:
             # Note: checking for ndarray instead of np.dtype means we exclude
             #  dt64/td64, which do their own validation.
             value = np_can_hold_element(arr.dtype, value)
-
-        # only unwrap 1-element arrays implicitly when the block dtype is not object
-        implicit_convert = arr.dtype != "object"
-        if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1 and implicit_convert:
+
+        if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1:
             # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
             value = value[0, ...]
 
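The parse_dtype helper introduced in PATCH 2/5 only needs to rewrite the keys of a dtype mapping; the default dtype=None and single scalar dtypes should pass through untouched. The snippet below is a standalone sketch of that normalization, outside the patch series and with made-up example values, assuming the goal is simply to coerce non-string keys to their string form.

    # standalone sketch mirroring the intent of the patched parse_dtype helper
    def parse_dtype(dtype):
        if not isinstance(dtype, dict):
            # None or a single dtype applies to all columns; nothing to rewrite
            return dtype
        # coerce non-string keys (e.g. positional ints) to their string form
        return {key if isinstance(key, str) else str(key): value for key, value in dtype.items()}

    print(parse_dtype({0: "Int64", "name": "string"}))  # {'0': 'Int64', 'name': 'string'}
    print(parse_dtype("boolean"))  # 'boolean', returned unchanged
    print(parse_dtype(None))  # None, returned unchanged
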
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 1eb736880c369..0c2332f24de1e 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1,2045 +1,2518 @@ -""" -Module contains tools for processing files into DataFrames or other objects - -GH#48849 provides a convenient way of deprecating keyword arguments -""" - from __future__ import annotations -from collections import ( - abc, - defaultdict, +from collections.abc import ( + Hashable, + Sequence, ) -import csv -import sys -from textwrap import fill +import itertools from typing import ( - IO, TYPE_CHECKING, Any, Callable, - Generic, Literal, - TypedDict, - overload, + NoReturn, + cast, + final, ) import warnings import numpy as np -from pandas._libs import lib -from pandas._libs.parsers import STR_NA_VALUES +from pandas._config.config import get_option + +from pandas._libs import ( + algos as libalgos, + internals as libinternals, + lib, +) +from pandas._libs.internals import ( + BlockPlacement, + BlockValuesRefs, +) +from pandas._libs.tslibs import Timestamp from pandas.errors import ( AbstractMethodError, - ParserWarning, + PerformanceWarning, ) -from pandas.util._decorators import Appender +from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level -from pandas.util._validators import check_dtype_backend +from pandas.util._validators import validate_bool_kwarg +from pandas.core.dtypes.cast import ( + find_common_type, + infer_dtype_from_scalar, + np_can_hold_element, +) from pandas.core.dtypes.common import ( - is_file_like, - is_float, - is_hashable, - is_integer, + ensure_platform_int, + is_1d_only_ea_dtype, is_list_like, - pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + ExtensionDtype, + SparseDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) +from pandas.core.dtypes.missing import ( + array_equals, + isna, ) -from pandas import Series -from pandas.core.frame import DataFrame -from pandas.core.indexes.api import RangeIndex -from pandas.core.shared_docs import _shared_docs - -from pandas.io.common import ( - IOHandles, - get_handle, - stringify_path, - validate_header_arg, +import pandas.core.algorithms as algos +from pandas.core.arrays import DatetimeArray +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.base import PandasObject +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + extract_array, +) +from pandas.core.indexers import maybe_convert_indices +from pandas.core.indexes.api import ( + Index, + default_index, + ensure_index, ) -from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper -from pandas.io.parsers.base_parser import ( - ParserBase, - is_index_col, - parser_defaults, +from pandas.core.internals.blocks import ( + Block, + NumpyBlock, + ensure_block_shape, + extend_blocks, + get_block_type, + maybe_coerce_values, + new_block, + new_block_2d, ) -from pandas.io.parsers.c_parser_wrapper import CParserWrapper -from pandas.io.parsers.python_parser import ( - FixedWidthFieldParser, - PythonParser, +from pandas.core.internals.ops import ( + blockwise_all, + operate_blockwise, ) if TYPE_CHECKING: - from collections.abc import ( - Hashable, - Iterable, - Mapping, - Sequence, - ) - from types import TracebackType - from pandas._typing import ( - CompressionOptions, - CSVEngine, - DtypeArg, - DtypeBackend, - FilePath, - HashableT, - IndexLabel, - ReadCsvBuffer, + ArrayLike, + AxisInt, + DtypeObj, 
+ QuantileInterpolation, Self, - StorageOptions, - Unpack, - UsecolsArgType, + Shape, + npt, ) - class _read_shared(TypedDict, Generic[HashableT], total=False): - # annotations shared between read_csv/fwf/table's overloads - # NOTE: Keep in sync with the annotations of the implementation - sep: str | None | lib.NoDefault - delimiter: str | None | lib.NoDefault - header: int | Sequence[int] | None | Literal["infer"] - names: Sequence[Hashable] | None | lib.NoDefault - index_col: IndexLabel | Literal[False] | None - usecols: UsecolsArgType - dtype: DtypeArg | None - engine: CSVEngine | None - converters: Mapping[HashableT, Callable] | None - true_values: list | None - false_values: list | None - skipinitialspace: bool - skiprows: list[int] | int | Callable[[Hashable], bool] | None - skipfooter: int - nrows: int | None - na_values: ( - Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None - ) - keep_default_na: bool - na_filter: bool - skip_blank_lines: bool - parse_dates: bool | Sequence[Hashable] | None - infer_datetime_format: bool | lib.NoDefault - keep_date_col: bool | lib.NoDefault - date_parser: Callable | lib.NoDefault - date_format: str | dict[Hashable, str] | None - dayfirst: bool - cache_dates: bool - compression: CompressionOptions - thousands: str | None - decimal: str - lineterminator: str | None - quotechar: str - quoting: int - doublequote: bool - escapechar: str | None - comment: str | None - encoding: str | None - encoding_errors: str | None - dialect: str | csv.Dialect | None - on_bad_lines: str - delim_whitespace: bool | lib.NoDefault - low_memory: bool - memory_map: bool - float_precision: Literal["high", "legacy", "round_trip"] | None - storage_options: StorageOptions | None - dtype_backend: DtypeBackend | lib.NoDefault -else: - _read_shared = dict - - -_doc_read_csv_and_table = ( - r""" -{summary} - -Also supports optionally iterating or breaking of the file -into chunks. - -Additional help can be found in the online docs for -`IO Tools `_. - -Parameters ----------- -filepath_or_buffer : str, path object or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is - expected. A local file could be: file://localhost/path/to/table.csv. - - If you want to pass in a path object, pandas accepts any ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, such as - a file handle (e.g. via builtin ``open`` function) or ``StringIO``. -sep : str, default {_default_sep} - Character or regex pattern to treat as the delimiter. If ``sep=None``, the - C engine cannot automatically detect - the separator, but the Python parsing engine can, meaning the latter will - be used and automatically detect the separator from only the first valid - row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. - In addition, separators longer than 1 character and different from - ``'\s+'`` will be interpreted as regular expressions and will also force - the use of the Python parsing engine. Note that regex delimiters are prone - to ignoring quoted data. Regex example: ``'\r\t'``. -delimiter : str, optional - Alias for ``sep``. -header : int, Sequence of int, 'infer' or None, default 'infer' - Row number(s) containing column labels and marking the start of the - data (zero-indexed). 
Default behavior is to infer the column names: if no ``names`` - are passed the behavior is identical to ``header=0`` and column - names are inferred from the first line of the file, if column - names are passed explicitly to ``names`` then the behavior is identical to - ``header=None``. Explicitly pass ``header=0`` to be able to - replace existing names. The header can be a list of integers that - specify row locations for a :class:`~pandas.MultiIndex` on the columns - e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be - skipped (e.g. 2 in this example is skipped). Note that this - parameter ignores commented lines and empty lines if - ``skip_blank_lines=True``, so ``header=0`` denotes the first line of - data rather than the first line of the file. - - When inferred from the file contents, headers are kept distinct from - each other by renaming duplicate names with a numeric suffix of the form - ``".{{count}}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``. - Empty headers are named ``"Unnamed: {{i}}"`` or ``"Unnamed: {{i}}_level_{{level}}"`` - in the case of MultiIndex columns. -names : Sequence of Hashable, optional - Sequence of column labels to apply. If the file contains a header row, - then you should explicitly pass ``header=0`` to override the column names. - Duplicates in this list are not allowed. -index_col : Hashable, Sequence of Hashable or False, optional - Column(s) to use as row label(s), denoted either by column labels or column - indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex` - will be formed for the row labels. - - Note: ``index_col=False`` can be used to force pandas to *not* use the first - column as the index, e.g., when you have a malformed file with delimiters at - the end of each line. -usecols : Sequence of Hashable or Callable, optional - Subset of columns to select, denoted either by column labels or column indices. - If list-like, all elements must either - be positional (i.e. integer indices into the document columns) or strings - that correspond to column names provided either by the user in ``names`` or - inferred from the document header row(s). If ``names`` are given, the document - header row(s) are not taken into account. For example, a valid list-like - ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. - Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. - To instantiate a :class:`~pandas.DataFrame` from ``data`` with element order - preserved use ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` - for columns in ``['foo', 'bar']`` order or - ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` - for ``['bar', 'foo']`` order. - - If callable, the callable function will be evaluated against the column - names, returning names where the callable function evaluates to ``True``. An - example of a valid callable argument would be ``lambda x: x.upper() in - ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster - parsing time and lower memory usage. -dtype : dtype or dict of {{Hashable : dtype}}, optional - Data type(s) to apply to either the whole dataset or individual columns. - E.g., ``{{'a': np.float64, 'b': np.int32, 'c': 'Int64'}}`` - Use ``str`` or ``object`` together with suitable ``na_values`` settings - to preserve and not interpret ``dtype``. - If ``converters`` are specified, they will be applied INSTEAD - of ``dtype`` conversion. - - .. versionadded:: 1.5.0 - - Support for ``defaultdict`` was added. 
Specify a ``defaultdict`` as input where - the default determines the ``dtype`` of the columns which are not explicitly - listed. -engine : {{'c', 'python', 'pyarrow'}}, optional - Parser engine to use. The C and pyarrow engines are faster, while the python engine - is currently more feature-complete. Multithreading is currently only supported by - the pyarrow engine. - - .. versionadded:: 1.4.0 - - The 'pyarrow' engine was added as an *experimental* engine, and some features - are unsupported, or may not work correctly, with this engine. -converters : dict of {{Hashable : Callable}}, optional - Functions for converting values in specified columns. Keys can either - be column labels or column indices. -true_values : list, optional - Values to consider as ``True`` in addition to case-insensitive variants of 'True'. -false_values : list, optional - Values to consider as ``False`` in addition to case-insensitive variants of 'False'. -skipinitialspace : bool, default False - Skip spaces after delimiter. -skiprows : int, list of int or Callable, optional - Line numbers to skip (0-indexed) or number of lines to skip (``int``) - at the start of the file. - - If callable, the callable function will be evaluated against the row - indices, returning ``True`` if the row should be skipped and ``False`` otherwise. - An example of a valid callable argument would be ``lambda x: x in [0, 2]``. -skipfooter : int, default 0 - Number of lines at bottom of file to skip (Unsupported with ``engine='c'``). -nrows : int, optional - Number of rows of file to read. Useful for reading pieces of large files. -na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional - Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific - per-column ``NA`` values. By default the following values are interpreted as - ``NaN``: " """ - + fill('", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") - + """ ". - -keep_default_na : bool, default True - Whether or not to include the default ``NaN`` values when parsing the data. - Depending on whether ``na_values`` is passed in, the behavior is as follows: - - * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values`` - is appended to the default ``NaN`` values used for parsing. - * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only - the default ``NaN`` values are used for parsing. - * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only - the ``NaN`` values specified ``na_values`` are used for parsing. - * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no - strings will be parsed as ``NaN``. - - Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and - ``na_values`` parameters will be ignored. -na_filter : bool, default True - Detect missing value markers (empty strings and the value of ``na_values``). In - data without any ``NA`` values, passing ``na_filter=False`` can improve the - performance of reading a large file. -skip_blank_lines : bool, default True - If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. -parse_dates : bool, None, list of Hashable, list of lists or dict of {{Hashable : \ -list}}, default None - The behavior is as follows: - - * ``bool``. If ``True`` -> try parsing the index. - * ``None``. Behaves like ``True`` if ``date_parser`` or ``date_format`` are - specified. - * ``list`` of ``int`` or names. e.g. 
If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 - each as a separate date column. - * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse - as a single date column. Values are joined with a space before parsing. - * ``dict``, e.g. ``{{'foo' : [1, 3]}}`` -> parse columns 1, 3 as date and call - result 'foo'. Values are joined with a space before parsing. - - If a column or index cannot be represented as an array of ``datetime``, - say because of an unparsable value or a mixture of timezones, the column - or index will be returned unaltered as an ``object`` data type. For - non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after - :func:`~pandas.read_csv`. - - Note: A fast-path exists for iso8601-formatted dates. -infer_datetime_format : bool, default False - If ``True`` and ``parse_dates`` is enabled, pandas will attempt to infer the - format of the ``datetime`` strings in the columns, and if it can be inferred, - switch to a faster method of parsing them. In some cases this can increase - the parsing speed by 5-10x. - - .. deprecated:: 2.0.0 - A strict version of this argument is now the default, passing it has no effect. - -keep_date_col : bool, default False - If ``True`` and ``parse_dates`` specifies combining multiple columns then - keep the original columns. -date_parser : Callable, optional - Function to use for converting a sequence of string columns to an array of - ``datetime`` instances. The default uses ``dateutil.parser.parser`` to do the - conversion. pandas will try to call ``date_parser`` in three different ways, - advancing to the next if an exception occurs: 1) Pass one or more arrays - (as defined by ``parse_dates``) as arguments; 2) concatenate (row-wise) the - string values from the columns defined by ``parse_dates`` into a single array - and pass that; and 3) call ``date_parser`` once for each row using one or - more strings (corresponding to the columns defined by ``parse_dates``) as - arguments. - - .. deprecated:: 2.0.0 - Use ``date_format`` instead, or read in as ``object`` and then apply - :func:`~pandas.to_datetime` as-needed. -date_format : str or dict of column -> format, optional - Format to use for parsing dates when used in conjunction with ``parse_dates``. - The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See - `strftime documentation - `_ for more information on choices, though - note that :const:`"%f"` will parse all the way up to nanoseconds. - You can also pass: - - - "ISO8601", to parse any `ISO8601 `_ - time string (not necessarily in exactly the same format); - - "mixed", to infer the format for each element individually. This is risky, - and you should probably use it along with `dayfirst`. - - .. versionadded:: 2.0.0 -dayfirst : bool, default False - DD/MM format dates, international and European format. -cache_dates : bool, default True - If ``True``, use a cache of unique, converted dates to apply the ``datetime`` - conversion. May produce significant speed-up when parsing duplicate - date strings, especially ones with timezone offsets. - -iterator : bool, default False - Return ``TextFileReader`` object for iteration or getting chunks with - ``get_chunk()``. -chunksize : int, optional - Number of lines to read from the file per chunk. Passing a value will cause the - function to return a ``TextFileReader`` object for iteration. - See the `IO Tools docs - `_ - for more information on ``iterator`` and ``chunksize``. - -{decompression_options} - - .. versionchanged:: 1.4.0 Zstandard support. 
- -thousands : str (length 1), optional - Character acting as the thousands separator in numerical values. -decimal : str (length 1), default '.' - Character to recognize as decimal point (e.g., use ',' for European data). -lineterminator : str (length 1), optional - Character used to denote a line break. Only valid with C parser. -quotechar : str (length 1), optional - Character used to denote the start and end of a quoted item. Quoted - items can include the ``delimiter`` and it will be ignored. -quoting : {{0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, \ -3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL - Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is - ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special - characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``, - or ``lineterminator``. -doublequote : bool, default True - When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate - whether or not to interpret two consecutive ``quotechar`` elements INSIDE a - field as a single ``quotechar`` element. -escapechar : str (length 1), optional - Character used to escape other characters. -comment : str (length 1), optional - Character indicating that the remainder of line should not be parsed. - If found at the beginning - of a line, the line will be ignored altogether. This parameter must be a - single character. Like empty lines (as long as ``skip_blank_lines=True``), - fully commented lines are ignored by the parameter ``header`` but not by - ``skiprows``. For example, if ``comment='#'``, parsing - ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in ``'a,b,c'`` being - treated as the header. -encoding : str, optional, default 'utf-8' - Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). `List of Python - standard encodings - `_ . - -encoding_errors : str, optional, default 'strict' - How encoding errors are treated. `List of possible values - `_ . - - .. versionadded:: 1.3.0 - -dialect : str or csv.Dialect, optional - If provided, this parameter will override values (default or not) for the - following parameters: ``delimiter``, ``doublequote``, ``escapechar``, - ``skipinitialspace``, ``quotechar``, and ``quoting``. If it is necessary to - override values, a ``ParserWarning`` will be issued. See ``csv.Dialect`` - documentation for more details. -on_bad_lines : {{'error', 'warn', 'skip'}} or Callable, default 'error' - Specifies what to do upon encountering a bad line (a line with too many fields). - Allowed values are : - - - ``'error'``, raise an Exception when a bad line is encountered. - - ``'warn'``, raise a warning when a bad line is encountered and skip that line. - - ``'skip'``, skip bad lines without raising or warning when they are encountered. - - .. versionadded:: 1.3.0 - - .. versionadded:: 1.4.0 - - - Callable, function with signature - ``(bad_line: list[str]) -> list[str] | None`` that will process a single - bad line. ``bad_line`` is a list of strings split by the ``sep``. - If the function returns ``None``, the bad line will be ignored. - If the function returns a new ``list`` of strings with more elements than - expected, a ``ParserWarning`` will be emitted while dropping extra elements. - Only supported when ``engine='python'`` - - .. 
versionchanged:: 2.2.0 - - - Callable, function with signature - as described in `pyarrow documentation - `_ when ``engine='pyarrow'`` - -delim_whitespace : bool, default False - Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be - used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option - is set to ``True``, nothing should be passed in for the ``delimiter`` - parameter. - - .. deprecated:: 2.2.0 - Use ``sep="\\s+"`` instead. -low_memory : bool, default True - Internally process the file in chunks, resulting in lower memory use - while parsing, but possibly mixed type inference. To ensure no mixed - types either set ``False``, or specify the type with the ``dtype`` parameter. - Note that the entire file is read into a single :class:`~pandas.DataFrame` - regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in - chunks. (Only valid with C parser). -memory_map : bool, default False - If a filepath is provided for ``filepath_or_buffer``, map the file object - directly onto memory and access the data directly from there. Using this - option can improve performance because there is no longer any I/O overhead. -float_precision : {{'high', 'legacy', 'round_trip'}}, optional - Specifies which converter the C engine should use for floating-point - values. The options are ``None`` or ``'high'`` for the ordinary converter, - ``'legacy'`` for the original lower precision pandas converter, and - ``'round_trip'`` for the round-trip converter. - -{storage_options} - -dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' - Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: - - * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. - - .. versionadded:: 2.0 - -Returns -------- -DataFrame or TextFileReader - A comma-separated values (csv) file is returned as two-dimensional - data structure with labeled axes. - -See Also --------- -DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. -{see_also_func_name} : {see_also_func_summary} -read_fwf : Read a table of fixed-width formatted lines into DataFrame. - -Examples --------- ->>> pd.{func_name}('data.csv') # doctest: +SKIP -""" -) + from pandas.api.extensions import ExtensionArray -class _C_Parser_Defaults(TypedDict): - delim_whitespace: Literal[False] - na_filter: Literal[True] - low_memory: Literal[True] - memory_map: Literal[False] - float_precision: None +def interleaved_dtype(dtypes: list[DtypeObj]) -> DtypeObj | None: + """ + Find the common dtype for `blocks`. + Parameters + ---------- + blocks : List[DtypeObj] -_c_parser_defaults: _C_Parser_Defaults = { - "delim_whitespace": False, - "na_filter": True, - "low_memory": True, - "memory_map": False, - "float_precision": None, -} + Returns + ------- + dtype : np.dtype, ExtensionDtype, or None + None is returned when `blocks` is empty. + """ + if not len(dtypes): + return None + return find_common_type(dtypes) -class _Fwf_Defaults(TypedDict): - colspecs: Literal["infer"] - infer_nrows: Literal[100] - widths: None +def ensure_np_dtype(dtype: DtypeObj) -> np.dtype: + # TODO: https://github.com/pandas-dev/pandas/issues/22791 + # Give EAs some input on what happens here. Sparse needs this. 
+ if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + dtype = cast(np.dtype, dtype) + elif isinstance(dtype, ExtensionDtype): + dtype = np.dtype("object") + elif dtype == np.dtype(str): + dtype = np.dtype("object") + return dtype -_fwf_defaults: _Fwf_Defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} -_c_unsupported = {"skipfooter"} -_python_unsupported = {"low_memory", "float_precision"} -_pyarrow_unsupported = { - "skipfooter", - "float_precision", - "chunksize", - "comment", - "nrows", - "thousands", - "memory_map", - "dialect", - "delim_whitespace", - "quoting", - "lineterminator", - "converters", - "iterator", - "dayfirst", - "skipinitialspace", - "low_memory", -} +class BaseBlockManager(PandasObject): + """ + Core internal data structure to implement DataFrame, Series, etc. -@overload -def validate_integer(name: str, val: None, min_val: int = ...) -> None: ... + Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a + lightweight blocked set of labeled data to be manipulated by the DataFrame + public API class + Attributes + ---------- + shape + ndim + axes + values + items + + Methods + ------- + set_axis(axis, new_labels) + copy(deep=True) -@overload -def validate_integer(name: str, val: float, min_val: int = ...) -> int: ... + get_dtypes + apply(func, axes, block_filter_fn) -@overload -def validate_integer(name: str, val: int | None, min_val: int = ...) -> int | None: ... + get_bool_data + get_numeric_data + get_slice(slice_like, axis) + get(label) + iget(loc) -def validate_integer( - name: str, val: int | float | None, min_val: int = 0 -) -> int | None: - """ - Checks whether the 'name' parameter for parsing is either - an integer OR float that can SAFELY be cast to an integer - without losing accuracy. Raises a ValueError if that is - not the case. + take(indexer, axis) + reindex_axis(new_labels, axis) + reindex_indexer(new_labels, indexer, axis) + + delete(label) + insert(loc, label, value) + set(label, value) Parameters ---------- - name : str - Parameter name (used for error reporting) - val : int or float - The value to check - min_val : int - Minimum allowed value (val < min_val will result in a ValueError) + blocks: Sequence of Block + axes: Sequence of Index + verify_integrity: bool, default True + + Notes + ----- + This is *not* a public API class """ - if val is None: - return val - msg = f"'{name:s}' must be an integer >={min_val:d}" - if is_float(val): - if int(val) != val: - raise ValueError(msg) - val = int(val) - elif not (is_integer(val) and val >= min_val): - raise ValueError(msg) + __slots__ = () + + _blknos: npt.NDArray[np.intp] + _blklocs: npt.NDArray[np.intp] + blocks: tuple[Block, ...] + axes: list[Index] + + @property + def ndim(self) -> int: + raise NotImplementedError + + _known_consolidated: bool + _is_consolidated: bool + + def __init__(self, blocks, axes, verify_integrity: bool = True) -> None: + raise NotImplementedError + + @final + def __len__(self) -> int: + return len(self.items) + + @property + def shape(self) -> Shape: + return tuple(len(ax) for ax in self.axes) + + @classmethod + def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> Self: + raise NotImplementedError + + @property + def blknos(self) -> npt.NDArray[np.intp]: + """ + Suppose we want to find the array corresponding to our i'th column. + + blknos[i] identifies the block from self.blocks that contains this column. 
+ + blklocs[i] identifies the column of interest within + self.blocks[self.blknos[i]] + """ + if self._blknos is None: + # Note: these can be altered by other BlockManager methods. + self._rebuild_blknos_and_blklocs() + + return self._blknos + + @property + def blklocs(self) -> npt.NDArray[np.intp]: + """ + See blknos.__doc__ + """ + if self._blklocs is None: + # Note: these can be altered by other BlockManager methods. + self._rebuild_blknos_and_blklocs() + + return self._blklocs + + def make_empty(self, axes=None) -> Self: + """return an empty BlockManager with the items axis of len 0""" + if axes is None: + axes = [Index([])] + self.axes[1:] + + # preserve dtype if possible + if self.ndim == 1: + assert isinstance(self, SingleBlockManager) # for mypy + blk = self.blocks[0] + arr = blk.values[:0] + bp = BlockPlacement(slice(0, 0)) + nb = blk.make_block_same_class(arr, placement=bp) + blocks = [nb] + else: + blocks = [] + return type(self).from_blocks(blocks, axes) - return int(val) + def __nonzero__(self) -> bool: + return True + # Python3 compat + __bool__ = __nonzero__ -def _validate_names(names: Sequence[Hashable] | None) -> None: - """ - Raise ValueError if the `names` parameter contains duplicates or has an - invalid data type. + def set_axis(self, axis: AxisInt, new_labels: Index) -> None: + # Caller is responsible for ensuring we have an Index object. + self._validate_set_axis(axis, new_labels) + self.axes[axis] = new_labels - Parameters - ---------- - names : array-like or None - An array containing a list of the names used for the output DataFrame. + @final + def _validate_set_axis(self, axis: AxisInt, new_labels: Index) -> None: + # Caller is responsible for ensuring we have an Index object. + old_len = len(self.axes[axis]) + new_len = len(new_labels) - Raises - ------ - ValueError - If names are not unique or are not ordered (e.g. set). - """ - if names is not None: - if len(names) != len(set(names)): - raise ValueError("Duplicate names are not allowed.") - if not ( - is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView) - ): - raise ValueError("Names should be an ordered collection.") - - -def _read( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds -) -> DataFrame | TextFileReader: - """Generic reader of line files.""" - # if we pass a date_parser and parse_dates=False, we should not parse the - # dates GH#44366 - if kwds.get("parse_dates", None) is None: - if ( - kwds.get("date_parser", lib.no_default) is lib.no_default - and kwds.get("date_format", None) is None - ): - kwds["parse_dates"] = False - else: - kwds["parse_dates"] = True + if axis == 1 and len(self.items) == 0: + # If we are setting the index on a DataFrame with no columns, + # it is OK to change the length. + pass - # Extract some of the arguments (pass chunksize on). - iterator = kwds.get("iterator", False) - chunksize = kwds.get("chunksize", None) - if kwds.get("engine") == "pyarrow": - if iterator: + elif new_len != old_len: raise ValueError( - "The 'iterator' option is not supported with the 'pyarrow' engine" + f"Length mismatch: Expected axis has {old_len} elements, new " + f"values have {new_len} elements" ) - if chunksize is not None: - raise ValueError( - "The 'chunksize' option is not supported with the 'pyarrow' engine" - ) - else: - chunksize = validate_integer("chunksize", chunksize, 1) - - nrows = kwds.get("nrows", None) - - # Check for duplicates in names. - _validate_names(kwds.get("names", None)) - - # Create the parser. 
- parser = TextFileReader(filepath_or_buffer, **kwds) - - if chunksize or iterator: - return parser - - with parser: - return parser.read(nrows) - - -@overload -def read_csv( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - iterator: Literal[True], - chunksize: int | None = ..., - **kwds: Unpack[_read_shared[HashableT]], -) -> TextFileReader: ... - - -@overload -def read_csv( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - iterator: bool = ..., - chunksize: int, - **kwds: Unpack[_read_shared[HashableT]], -) -> TextFileReader: ... - - -@overload -def read_csv( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - iterator: Literal[False] = ..., - chunksize: None = ..., - **kwds: Unpack[_read_shared[HashableT]], -) -> DataFrame: ... - - -@overload -def read_csv( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - iterator: bool = ..., - chunksize: int | None = ..., - **kwds: Unpack[_read_shared[HashableT]], -) -> DataFrame | TextFileReader: ... - - -# a helper function for the read_csv(...) below). -# ensures that all keys in dtype are of type str. -# this allows for compatibility with the csv library -def parse_dtype(dtype) -> DtypeArg: - temp = {} - for key in dtype: - if isinstance(key, str): - temp[f"{key}"] = dtype[key] + @property + def is_single_block(self) -> bool: + # Assumes we are 2D; overridden by SingleBlockManager + return len(self.blocks) == 1 + + @property + def items(self) -> Index: + return self.axes[0] + + def _has_no_reference(self, i: int) -> bool: + """ + Check for column `i` if it has references. + (whether it references another array or is itself being referenced) + Returns True if the column has no references. + """ + blkno = self.blknos[i] + return self._has_no_reference_block(blkno) + + def _has_no_reference_block(self, blkno: int) -> bool: + """ + Check for block `i` if it has references. + (whether it references another array or is itself being referenced) + Returns True if the block has no references. + """ + return not self.blocks[blkno].refs.has_reference() + + def add_references(self, mgr: BaseBlockManager) -> None: + """ + Adds the references from one manager to another. We assume that both + managers have the same block structure. + """ + if len(self.blocks) != len(mgr.blocks): + # If block structure changes, then we made a copy + return + for i, blk in enumerate(self.blocks): + blk.refs = mgr.blocks[i].refs + blk.refs.add_reference(blk) + + def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool: + """ + Checks if two blocks from two different block managers reference the + same underlying values. + """ + blk = self.blocks[blkno] + return any(blk is ref() for ref in mgr.blocks[blkno].refs.referenced_blocks) + + def get_dtypes(self) -> npt.NDArray[np.object_]: + dtypes = np.array([blk.dtype for blk in self.blocks], dtype=object) + return dtypes.take(self.blknos) + + @property + def arrays(self) -> list[ArrayLike]: + """ + Quick access to the backing arrays of the Blocks. + + Only for compatibility with ArrayManager for testing convenience. + Not to be used in actual code, and return value is not the same as the + ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs). + + Warning! The returned arrays don't handle Copy-on-Write, so this should + be used with caution (only in read-mode). 
+ """ + return [blk.values for blk in self.blocks] + + def __repr__(self) -> str: + output = type(self).__name__ + for i, ax in enumerate(self.axes): + if i == 0: + output += f"\nItems: {ax}" + else: + output += f"\nAxis {i}: {ax}" + + for block in self.blocks: + output += f"\n{block}" + return output + + def _equal_values(self, other: Self) -> bool: + """ + To be implemented by the subclasses. Only check the column values + assuming shape and indexes have already been checked. + """ + raise AbstractMethodError(self) + + @final + def equals(self, other: object) -> bool: + """ + Implementation for DataFrame.equals + """ + if not isinstance(other, type(self)): + return False + + self_axes, other_axes = self.axes, other.axes + if len(self_axes) != len(other_axes): + return False + if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): + return False + + return self._equal_values(other) + + def apply( + self, + f, + align_keys: list[str] | None = None, + **kwargs, + ) -> Self: + """ + Iterate over the blocks, collect and create a new BlockManager. + + Parameters + ---------- + f : str or callable + Name of the Block method to apply. + align_keys: List[str] or None, default None + **kwargs + Keywords to pass to `f` + + Returns + ------- + BlockManager + """ + assert "filter" not in kwargs + + align_keys = align_keys or [] + result_blocks: list[Block] = [] + # fillna: Series/DataFrame is responsible for making sure value is aligned + + aligned_args = {k: kwargs[k] for k in align_keys} + + for b in self.blocks: + if aligned_args: + for k, obj in aligned_args.items(): + if isinstance(obj, (ABCSeries, ABCDataFrame)): + # The caller is responsible for ensuring that + # obj.axes[-1].equals(self.items) + if obj.ndim == 1: + kwargs[k] = obj.iloc[b.mgr_locs.indexer]._values + else: + kwargs[k] = obj.iloc[:, b.mgr_locs.indexer]._values + else: + # otherwise we have an ndarray + kwargs[k] = obj[b.mgr_locs.indexer] + + if callable(f): + applied = b.apply(f, **kwargs) + else: + applied = getattr(b, f)(**kwargs) + result_blocks = extend_blocks(applied, result_blocks) + + out = type(self).from_blocks(result_blocks, self.axes) + return out + + @final + def isna(self, func) -> Self: + return self.apply("apply", func=func) + + @final + def fillna(self, value, limit: int | None, inplace: bool) -> Self: + if limit is not None: + # Do this validation even if we go through one of the no-op paths + limit = libalgos.validate_limit(None, limit=limit) + + return self.apply( + "fillna", + value=value, + limit=limit, + inplace=inplace, + ) + + @final + def where(self, other, cond, align: bool) -> Self: + if align: + align_keys = ["other", "cond"] else: - temp[key] = dtype[key] - return temp - - -@Appender( - _doc_read_csv_and_table.format( - func_name="read_csv", - summary="Read a comma-separated values (csv) file into DataFrame.", - see_also_func_name="read_table", - see_also_func_summary="Read general delimited file into DataFrame.", - _default_sep="','", - storage_options=_shared_docs["storage_options"], - decompression_options=_shared_docs["decompression_options"] - % "filepath_or_buffer", - ) -) -def read_csv( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - sep: str | None | lib.NoDefault = lib.no_default, - delimiter: str | None | lib.NoDefault = None, - # Column and Index Locations and Names - header: int | Sequence[int] | None | Literal["infer"] = "infer", - names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, - index_col: IndexLabel | 
Literal[False] | None = None, - usecols: UsecolsArgType = None, - # General Parsing Configuration - dtype: DtypeArg | None = None, - engine: CSVEngine | None = None, - converters: Mapping[HashableT, Callable] | None = None, - true_values: list | None = None, - false_values: list | None = None, - skipinitialspace: bool = False, - skiprows: list[int] | int | Callable[[Hashable], bool] | None = None, - skipfooter: int = 0, - nrows: int | None = None, - # NA and Missing Data Handling - na_values: Hashable - | Iterable[Hashable] - | Mapping[Hashable, Iterable[Hashable]] - | None = None, - keep_default_na: bool = True, - na_filter: bool = True, - skip_blank_lines: bool = True, - # Datetime Handling - parse_dates: bool | Sequence[Hashable] | None = None, - infer_datetime_format: bool | lib.NoDefault = lib.no_default, - keep_date_col: bool | lib.NoDefault = lib.no_default, - date_parser: Callable | lib.NoDefault = lib.no_default, - date_format: str | dict[Hashable, str] | None = None, - dayfirst: bool = False, - cache_dates: bool = True, - # Iteration - iterator: bool = False, - chunksize: int | None = None, - # Quoting, Compression, and File Format - compression: CompressionOptions = "infer", - thousands: str | None = None, - decimal: str = ".", - lineterminator: str | None = None, - quotechar: str = '"', - quoting: int = csv.QUOTE_MINIMAL, - doublequote: bool = True, - escapechar: str | None = None, - comment: str | None = None, - encoding: str | None = None, - encoding_errors: str | None = "strict", - dialect: str | csv.Dialect | None = None, - # Error Handling - on_bad_lines: str = "error", - # Internal - delim_whitespace: bool | lib.NoDefault = lib.no_default, - low_memory: bool = _c_parser_defaults["low_memory"], - memory_map: bool = False, - float_precision: Literal["high", "legacy", "round_trip"] | None = None, - storage_options: StorageOptions | None = None, - dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, -) -> DataFrame | TextFileReader: - # ensures that all keys in dtype are a string for compatibility with csv - dtype = parse_dtype(dtype) - - if keep_date_col is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'keep_date_col' keyword in pd.read_csv is deprecated and " - "will be removed in a future version. Explicitly remove unwanted " - "columns after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), + align_keys = ["cond"] + other = extract_array(other, extract_numpy=True) + + return self.apply( + "where", + align_keys=align_keys, + other=other, + cond=cond, ) - else: - keep_date_col = False - - if lib.is_list_like(parse_dates): - # GH#55569 - depr = False - # error: Item "bool" of "bool | Sequence[Hashable] | None" has no - # attribute "__iter__" (not iterable) - if not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] - depr = True - elif isinstance(parse_dates, dict) and any( - lib.is_list_like(x) for x in parse_dates.values() - ): - depr = True - if depr: - warnings.warn( - "Support for nested sequences for 'parse_dates' in pd.read_csv " - "is deprecated. Combine the desired columns with pd.to_datetime " - "after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if infer_datetime_format is not lib.no_default: - warnings.warn( - "The argument 'infer_datetime_format' is deprecated and will " - "be removed in a future version. " - "A strict version of it is now the default, see " - "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. 
" - "You can safely remove this argument.", - FutureWarning, - stacklevel=find_stack_level(), + @final + def putmask(self, mask, new, align: bool = True) -> Self: + if align: + align_keys = ["new", "mask"] + else: + align_keys = ["mask"] + new = extract_array(new, extract_numpy=True) + + return self.apply( + "putmask", + align_keys=align_keys, + mask=mask, + new=new, ) - if delim_whitespace is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'delim_whitespace' keyword in pd.read_csv is deprecated and " - "will be removed in a future version. Use ``sep='\\s+'`` instead", - FutureWarning, - stacklevel=find_stack_level(), + @final + def round(self, decimals: int) -> Self: + return self.apply("round", decimals=decimals) + + @final + def replace(self, to_replace, value, inplace: bool) -> Self: + inplace = validate_bool_kwarg(inplace, "inplace") + # NDFrame.replace ensures the not-is_list_likes here + assert not lib.is_list_like(to_replace) + assert not lib.is_list_like(value) + return self.apply( + "replace", + to_replace=to_replace, + value=value, + inplace=inplace, ) - else: - delim_whitespace = False - - # locals() should never be modified - kwds = locals().copy() - del kwds["filepath_or_buffer"] - del kwds["sep"] - - kwds_defaults = _refine_defaults_read( - dialect, - delimiter, - delim_whitespace, - engine, - sep, - on_bad_lines, - names, - defaults={"delimiter": ","}, - dtype_backend=dtype_backend, - ) - kwds.update(kwds_defaults) - - return _read(filepath_or_buffer, kwds) - - -@overload -def read_table( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - iterator: Literal[True], - chunksize: int | None = ..., - **kwds: Unpack[_read_shared[HashableT]], -) -> TextFileReader: ... - - -@overload -def read_table( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - iterator: bool = ..., - chunksize: int, - **kwds: Unpack[_read_shared[HashableT]], -) -> TextFileReader: ... - - -@overload -def read_table( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - iterator: Literal[False] = ..., - chunksize: None = ..., - **kwds: Unpack[_read_shared[HashableT]], -) -> DataFrame: ... - - -@overload -def read_table( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - iterator: bool = ..., - chunksize: int | None = ..., - **kwds: Unpack[_read_shared[HashableT]], -) -> DataFrame | TextFileReader: ... - - -@Appender( - _doc_read_csv_and_table.format( - func_name="read_table", - summary="Read general delimited file into DataFrame.", - see_also_func_name="read_csv", - see_also_func_summary=( - "Read a comma-separated values (csv) file into DataFrame." 
- ), - _default_sep=r"'\\t' (tab-stop)", - storage_options=_shared_docs["storage_options"], - decompression_options=_shared_docs["decompression_options"] - % "filepath_or_buffer", - ) -) -def read_table( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - sep: str | None | lib.NoDefault = lib.no_default, - delimiter: str | None | lib.NoDefault = None, - # Column and Index Locations and Names - header: int | Sequence[int] | None | Literal["infer"] = "infer", - names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, - index_col: IndexLabel | Literal[False] | None = None, - usecols: UsecolsArgType = None, - # General Parsing Configuration - dtype: DtypeArg | None = None, - engine: CSVEngine | None = None, - converters: Mapping[HashableT, Callable] | None = None, - true_values: list | None = None, - false_values: list | None = None, - skipinitialspace: bool = False, - skiprows: list[int] | int | Callable[[Hashable], bool] | None = None, - skipfooter: int = 0, - nrows: int | None = None, - # NA and Missing Data Handling - na_values: Hashable - | Iterable[Hashable] - | Mapping[Hashable, Iterable[Hashable]] - | None = None, - keep_default_na: bool = True, - na_filter: bool = True, - skip_blank_lines: bool = True, - # Datetime Handling - parse_dates: bool | Sequence[Hashable] | None = None, - infer_datetime_format: bool | lib.NoDefault = lib.no_default, - keep_date_col: bool | lib.NoDefault = lib.no_default, - date_parser: Callable | lib.NoDefault = lib.no_default, - date_format: str | dict[Hashable, str] | None = None, - dayfirst: bool = False, - cache_dates: bool = True, - # Iteration - iterator: bool = False, - chunksize: int | None = None, - # Quoting, Compression, and File Format - compression: CompressionOptions = "infer", - thousands: str | None = None, - decimal: str = ".", - lineterminator: str | None = None, - quotechar: str = '"', - quoting: int = csv.QUOTE_MINIMAL, - doublequote: bool = True, - escapechar: str | None = None, - comment: str | None = None, - encoding: str | None = None, - encoding_errors: str | None = "strict", - dialect: str | csv.Dialect | None = None, - # Error Handling - on_bad_lines: str = "error", - # Internal - delim_whitespace: bool | lib.NoDefault = lib.no_default, - low_memory: bool = _c_parser_defaults["low_memory"], - memory_map: bool = False, - float_precision: Literal["high", "legacy", "round_trip"] | None = None, - storage_options: StorageOptions | None = None, - dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, -) -> DataFrame | TextFileReader: - if keep_date_col is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'keep_date_col' keyword in pd.read_table is deprecated and " - "will be removed in a future version. 
Explicitly remove unwanted " - "columns after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), + + @final + def replace_regex(self, **kwargs) -> Self: + return self.apply("_replace_regex", **kwargs) + + @final + def replace_list( + self, + src_list: list[Any], + dest_list: list[Any], + inplace: bool = False, + regex: bool = False, + ) -> Self: + """do a list replace""" + inplace = validate_bool_kwarg(inplace, "inplace") + + bm = self.apply( + "replace_list", + src_list=src_list, + dest_list=dest_list, + inplace=inplace, + regex=regex, ) - else: - keep_date_col = False - - # error: Item "bool" of "bool | Sequence[Hashable]" has no attribute "__iter__" - if lib.is_list_like(parse_dates) and not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] - # GH#55569 - warnings.warn( - "Support for nested sequences for 'parse_dates' in pd.read_table " - "is deprecated. Combine the desired columns with pd.to_datetime " - "after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), + bm._consolidate_inplace() + return bm + + def interpolate(self, inplace: bool, **kwargs) -> Self: + return self.apply("interpolate", inplace=inplace, **kwargs) + + def pad_or_backfill(self, inplace: bool, **kwargs) -> Self: + return self.apply("pad_or_backfill", inplace=inplace, **kwargs) + + def shift(self, periods: int, fill_value) -> Self: + if fill_value is lib.no_default: + fill_value = None + + return self.apply("shift", periods=periods, fill_value=fill_value) + + def setitem(self, indexer, value) -> Self: + """ + Set values with indexer. + + For SingleBlockManager, this backs s[indexer] = value + """ + if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim: + raise ValueError(f"Cannot set values with ndim > {self.ndim}") + + if not self._has_no_reference(0): + # this method is only called if there is a single block -> hardcoded 0 + # Split blocks to only copy the columns we want to modify + if self.ndim == 2 and isinstance(indexer, tuple): + blk_loc = self.blklocs[indexer[1]] + if is_list_like(blk_loc) and blk_loc.ndim == 2: + blk_loc = np.squeeze(blk_loc, axis=0) + elif not is_list_like(blk_loc): + # Keep dimension and copy data later + blk_loc = [blk_loc] # type: ignore[assignment] + if len(blk_loc) == 0: + return self.copy(deep=False) + + values = self.blocks[0].values + if values.ndim == 2: + values = values[blk_loc] + # "T" has no attribute "_iset_split_block" + self._iset_split_block( # type: ignore[attr-defined] + 0, blk_loc, values + ) + # first block equals values + self.blocks[0].setitem((indexer[0], np.arange(len(blk_loc))), value) + return self + # No need to split if we either set all columns or on a single block + # manager + self = self.copy() + + return self.apply("setitem", indexer=indexer, value=value) + + def diff(self, n: int) -> Self: + # only reached with self.ndim == 2 + return self.apply("diff", n=n) + + def astype(self, dtype, errors: str = "raise") -> Self: + return self.apply("astype", dtype=dtype, errors=errors) + + def convert(self) -> Self: + return self.apply("convert") + + def convert_dtypes(self, **kwargs): + return self.apply("convert_dtypes", **kwargs) + + def get_values_for_csv( + self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None + ) -> Self: + """ + Convert values to native types (strings / python objects) that are used + in formatting (repr / csv). 
+ """ + return self.apply( + "get_values_for_csv", + na_rep=na_rep, + quoting=quoting, + float_format=float_format, + date_format=date_format, + decimal=decimal, ) - if infer_datetime_format is not lib.no_default: - warnings.warn( - "The argument 'infer_datetime_format' is deprecated and will " - "be removed in a future version. " - "A strict version of it is now the default, see " - "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " - "You can safely remove this argument.", - FutureWarning, - stacklevel=find_stack_level(), + @property + def any_extension_types(self) -> bool: + """Whether any of the blocks in this manager are extension blocks""" + return any(block.is_extension for block in self.blocks) + + @property + def is_view(self) -> bool: + """return a boolean if we are a single block and are a view""" + if len(self.blocks) == 1: + return self.blocks[0].is_view + + # It is technically possible to figure out which blocks are views + # e.g. [ b.values.base is not None for b in self.blocks ] + # but then we have the case of possibly some blocks being a view + # and some blocks not. setting in theory is possible on the non-view + # blocks. But this is a bit + # complicated + + return False + + def _get_data_subset(self, predicate: Callable) -> Self: + blocks = [blk for blk in self.blocks if predicate(blk.values)] + return self._combine(blocks) + + def get_bool_data(self) -> Self: + """ + Select blocks that are bool-dtype and columns from object-dtype blocks + that are all-bool. + """ + + new_blocks = [] + + for blk in self.blocks: + if blk.dtype == bool: + new_blocks.append(blk) + + elif blk.is_object: + nbs = blk._split() + new_blocks.extend(nb for nb in nbs if nb.is_bool) + + return self._combine(new_blocks) + + def get_numeric_data(self) -> Self: + numeric_blocks = [blk for blk in self.blocks if blk.is_numeric] + if len(numeric_blocks) == len(self.blocks): + # Avoid somewhat expensive _combine + return self + return self._combine(numeric_blocks) + + def _combine(self, blocks: list[Block], index: Index | None = None) -> Self: + """return a new manager with the blocks""" + if len(blocks) == 0: + if self.ndim == 2: + # retain our own Index dtype + if index is not None: + axes = [self.items[:0], index] + else: + axes = [self.items[:0]] + self.axes[1:] + return self.make_empty(axes) + return self.make_empty() + + # FIXME: optimization potential + indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) + inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) + + new_blocks: list[Block] = [] + for b in blocks: + nb = b.copy(deep=False) + nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer]) + new_blocks.append(nb) + + axes = list(self.axes) + if index is not None: + axes[-1] = index + axes[0] = self.items.take(indexer) + + return type(self).from_blocks(new_blocks, axes) + + @property + def nblocks(self) -> int: + return len(self.blocks) + + def copy(self, deep: bool | Literal["all"] = True) -> Self: + """ + Make deep or shallow copy of BlockManager + + Parameters + ---------- + deep : bool, string or None, default True + If False or None, return a shallow copy (do not copy data) + If 'all', copy data and a deep copy of the index + + Returns + ------- + BlockManager + """ + # this preserves the notion of view copying of axes + if deep: + # hit in e.g. 
tests.io.json.test_pandas + + def copy_func(ax): + return ax.copy(deep=True) if deep == "all" else ax.view() + + new_axes = [copy_func(ax) for ax in self.axes] + else: + new_axes = [ax.view() for ax in self.axes] + + res = self.apply("copy", deep=deep) + res.axes = new_axes + + if self.ndim > 1: + # Avoid needing to re-compute these + blknos = self._blknos + if blknos is not None: + res._blknos = blknos.copy() + res._blklocs = self._blklocs.copy() + + if deep: + res._consolidate_inplace() + return res + + def is_consolidated(self) -> bool: + return True + + def consolidate(self) -> Self: + """ + Join together blocks having same dtype + + Returns + ------- + y : BlockManager + """ + if self.is_consolidated(): + return self + + bm = type(self)(self.blocks, self.axes, verify_integrity=False) + bm._is_consolidated = False + bm._consolidate_inplace() + return bm + + def _consolidate_inplace(self) -> None: + return + + @final + def reindex_axis( + self, + new_index: Index, + axis: AxisInt, + fill_value=None, + only_slice: bool = False, + ) -> Self: + """ + Conform data manager to new index. + """ + new_index, indexer = self.axes[axis].reindex(new_index) + + return self.reindex_indexer( + new_index, + indexer, + axis=axis, + fill_value=fill_value, + only_slice=only_slice, ) - if delim_whitespace is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'delim_whitespace' keyword in pd.read_table is deprecated and " - "will be removed in a future version. Use ``sep='\\s+'`` instead", - FutureWarning, - stacklevel=find_stack_level(), + def reindex_indexer( + self, + new_axis: Index, + indexer: npt.NDArray[np.intp] | None, + axis: AxisInt, + fill_value=None, + allow_dups: bool = False, + only_slice: bool = False, + *, + use_na_proxy: bool = False, + ) -> Self: + """ + Parameters + ---------- + new_axis : Index + indexer : ndarray[intp] or None + axis : int + fill_value : object, default None + allow_dups : bool, default False + only_slice : bool, default False + Whether to take views, not copies, along columns. + use_na_proxy : bool, default False + Whether to use a np.void ndarray for newly introduced columns. + + pandas-indexer with -1's only. + """ + if indexer is None: + if new_axis is self.axes[axis]: + return self + + result = self.copy(deep=False) + result.axes = list(self.axes) + result.axes[axis] = new_axis + return result + + # Should be intp, but in some cases we get int64 on 32bit builds + assert isinstance(indexer, np.ndarray) + + # some axes don't allow reindexing with dups + if not allow_dups: + self.axes[axis]._validate_can_reindex(indexer) + + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + if axis == 0: + new_blocks = self._slice_take_blocks_ax0( + indexer, + fill_value=fill_value, + only_slice=only_slice, + use_na_proxy=use_na_proxy, + ) + else: + new_blocks = [ + blk.take_nd( + indexer, + axis=1, + fill_value=( + fill_value if fill_value is not None else blk.fill_value + ), + ) + for blk in self.blocks + ] + + new_axes = list(self.axes) + new_axes[axis] = new_axis + + new_mgr = type(self).from_blocks(new_blocks, new_axes) + if axis == 1: + # We can avoid the need to rebuild these + new_mgr._blknos = self.blknos.copy() + new_mgr._blklocs = self.blklocs.copy() + return new_mgr + + def _slice_take_blocks_ax0( + self, + slice_or_indexer: slice | np.ndarray, + fill_value=lib.no_default, + only_slice: bool = False, + *, + use_na_proxy: bool = False, + ref_inplace_op: bool = False, + ) -> list[Block]: + """ + Slice/take blocks along axis=0. 
+ + Overloaded for SingleBlock + + Parameters + ---------- + slice_or_indexer : slice or np.ndarray[int64] + fill_value : scalar, default lib.no_default + only_slice : bool, default False + If True, we always return views on existing arrays, never copies. + This is used when called from ops.blockwise.operate_blockwise. + use_na_proxy : bool, default False + Whether to use a np.void ndarray for newly introduced columns. + ref_inplace_op: bool, default False + Don't track refs if True because we operate inplace + + Returns + ------- + new_blocks : list of Block + """ + allow_fill = fill_value is not lib.no_default + + sl_type, slobj, sllen = _preprocess_slice_or_indexer( + slice_or_indexer, self.shape[0], allow_fill=allow_fill ) - else: - delim_whitespace = False - - # locals() should never be modified - kwds = locals().copy() - del kwds["filepath_or_buffer"] - del kwds["sep"] - - kwds_defaults = _refine_defaults_read( - dialect, - delimiter, - delim_whitespace, - engine, - sep, - on_bad_lines, - names, - defaults={"delimiter": "\t"}, - dtype_backend=dtype_backend, - ) - kwds.update(kwds_defaults) - - return _read(filepath_or_buffer, kwds) - - -@overload -def read_fwf( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - colspecs: Sequence[tuple[int, int]] | str | None = ..., - widths: Sequence[int] | None = ..., - infer_nrows: int = ..., - iterator: Literal[True], - chunksize: int | None = ..., - **kwds: Unpack[_read_shared[HashableT]], -) -> TextFileReader: ... - - -@overload -def read_fwf( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - colspecs: Sequence[tuple[int, int]] | str | None = ..., - widths: Sequence[int] | None = ..., - infer_nrows: int = ..., - iterator: bool = ..., - chunksize: int, - **kwds: Unpack[_read_shared[HashableT]], -) -> TextFileReader: ... - - -@overload -def read_fwf( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - colspecs: Sequence[tuple[int, int]] | str | None = ..., - widths: Sequence[int] | None = ..., - infer_nrows: int = ..., - iterator: Literal[False] = ..., - chunksize: None = ..., - **kwds: Unpack[_read_shared[HashableT]], -) -> DataFrame: ... - - -def read_fwf( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - colspecs: Sequence[tuple[int, int]] | str | None = "infer", - widths: Sequence[int] | None = None, - infer_nrows: int = 100, - iterator: bool = False, - chunksize: int | None = None, - **kwds: Unpack[_read_shared[HashableT]], -) -> DataFrame | TextFileReader: - r""" - Read a table of fixed-width formatted lines into DataFrame. - - Also supports optionally iterating or breaking of the file - into chunks. - - Additional help can be found in the `online docs for IO Tools - `_. - Parameters - ---------- - filepath_or_buffer : str, path object, or file-like object - String, path object (implementing ``os.PathLike[str]``), or file-like - object implementing a text ``read()`` function.The string could be a URL. - Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is - expected. A local file could be: - ``file://localhost/path/to/table.csv``. - colspecs : list of tuple (int, int) or 'infer'. optional - A list of tuples giving the extents of the fixed-width - fields of each line as half-open intervals (i.e., [from, to] ). 
- String value 'infer' can be used to instruct the parser to try - detecting the column specifications from the first 100 rows of - the data which are not being skipped via skiprows (default='infer'). - widths : list of int, optional - A list of field widths which can be used instead of 'colspecs' if - the intervals are contiguous. - infer_nrows : int, default 100 - The number of rows to consider when letting the parser determine the - `colspecs`. - iterator : bool, default False - Return ``TextFileReader`` object for iteration or getting chunks with - ``get_chunk()``. - chunksize : int, optional - Number of lines to read from the file per chunk. - **kwds : optional - Optional keyword arguments can be passed to ``TextFileReader``. + if self.is_single_block: + blk = self.blocks[0] + + if sl_type == "slice": + # GH#32959 EABlock would fail since we can't make 0-width + # TODO(EA2D): special casing unnecessary with 2D EAs + if sllen == 0: + return [] + bp = BlockPlacement(slice(0, sllen)) + return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)] + elif not allow_fill or self.ndim == 1: + if allow_fill and fill_value is None: + fill_value = blk.fill_value + + if not allow_fill and only_slice: + # GH#33597 slice instead of take, so we get + # views instead of copies + blocks = [ + blk.getitem_block_columns( + slice(ml, ml + 1), + new_mgr_locs=BlockPlacement(i), + ref_inplace_op=ref_inplace_op, + ) + for i, ml in enumerate(slobj) + ] + return blocks + else: + bp = BlockPlacement(slice(0, sllen)) + return [ + blk.take_nd( + slobj, + axis=0, + new_mgr_locs=bp, + fill_value=fill_value, + ) + ] - Returns - ------- - DataFrame or TextFileReader - A comma-separated values (csv) file is returned as two-dimensional - data structure with labeled axes. - - See Also - -------- - DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. - read_csv : Read a comma-separated values (csv) file into DataFrame. - - Examples - -------- - >>> pd.read_fwf("data.csv") # doctest: +SKIP - """ - # Check input arguments. - if colspecs is None and widths is None: - raise ValueError("Must specify either colspecs or widths") - if colspecs not in (None, "infer") and widths is not None: - raise ValueError("You must specify only one of 'widths' and 'colspecs'") - - # Compute 'colspecs' from 'widths', if specified. - if widths is not None: - colspecs, col = [], 0 - for w in widths: - colspecs.append((col, col + w)) - col += w - - # for mypy - assert colspecs is not None - - # GH#40830 - # Ensure length of `colspecs` matches length of `names` - names = kwds.get("names") - if names is not None and names is not lib.no_default: - if len(names) != len(colspecs) and colspecs != "infer": - # need to check len(index_col) as it might contain - # unnamed indices, in which case it's name is not required - len_index = 0 - if kwds.get("index_col") is not None: - index_col: Any = kwds.get("index_col") - if index_col is not False: - if not is_list_like(index_col): - len_index = 1 + if sl_type == "slice": + blknos = self.blknos[slobj] + blklocs = self.blklocs[slobj] + else: + blknos = algos.take_nd( + self.blknos, slobj, fill_value=-1, allow_fill=allow_fill + ) + blklocs = algos.take_nd( + self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill + ) + + # When filling blknos, make sure blknos is updated before appending to + # blocks list, that way new blkno is exactly len(blocks). 
+ blocks = [] + group = not only_slice + for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group): + if blkno == -1: + # If we've got here, fill_value was not lib.no_default + + blocks.append( + self._make_na_block( + placement=mgr_locs, + fill_value=fill_value, + use_na_proxy=use_na_proxy, + ) + ) + else: + blk = self.blocks[blkno] + + # Otherwise, slicing along items axis is necessary. + if not blk._can_consolidate and not blk._validate_ndim: + # i.e. we dont go through here for DatetimeTZBlock + # A non-consolidatable block, it's easy, because there's + # only one item and each mgr loc is a copy of that single + # item. + deep = False + for mgr_loc in mgr_locs: + newblk = blk.copy(deep=deep) + newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1)) + blocks.append(newblk) + + else: + # GH#32779 to avoid the performance penalty of copying, + # we may try to only slice + taker = blklocs[mgr_locs.indexer] + max_len = max(len(mgr_locs), taker.max() + 1) + taker = lib.maybe_indices_to_slice(taker, max_len) + + if isinstance(taker, slice): + nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs) + blocks.append(nb) + elif only_slice: + # GH#33597 slice instead of take, so we get + # views instead of copies + for i, ml in zip(taker, mgr_locs): + slc = slice(i, i + 1) + bp = BlockPlacement(ml) + nb = blk.getitem_block_columns(slc, new_mgr_locs=bp) + # We have np.shares_memory(nb.values, blk.values) + blocks.append(nb) else: - # for mypy: handled in the if-branch - assert index_col is not lib.no_default - - len_index = len(index_col) - if kwds.get("usecols") is None and len(names) + len_index != len(colspecs): - # If usecols is used colspec may be longer than names - raise ValueError("Length of colspecs must match length of names") - - check_dtype_backend(kwds.setdefault("dtype_backend", lib.no_default)) - return _read( - filepath_or_buffer, - kwds - | { - "colspecs": colspecs, - "infer_nrows": infer_nrows, - "engine": "python-fwf", - "iterator": iterator, - "chunksize": chunksize, - }, - ) + nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) + blocks.append(nb) + return blocks -class TextFileReader(abc.Iterator): - """ + def _make_na_block( + self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False + ) -> Block: + # Note: we only get here with self.ndim == 2 + + if use_na_proxy: + assert fill_value is None + shape = (len(placement), self.shape[1]) + vals = np.empty(shape, dtype=np.void) + nb = NumpyBlock(vals, placement, ndim=2) + return nb + + if fill_value is None or fill_value is np.nan: + fill_value = np.nan + # GH45857 avoid unnecessary upcasting + dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) + if dtype is not None and np.issubdtype(dtype.type, np.floating): + fill_value = dtype.type(fill_value) - Passed dialect overrides any of the related parser options + shape = (len(placement), self.shape[1]) + + dtype, fill_value = infer_dtype_from_scalar(fill_value) + block_values = make_na_array(dtype, shape, fill_value) + return new_block_2d(block_values, placement=placement) + + def take( + self, + indexer: npt.NDArray[np.intp], + axis: AxisInt = 1, + verify: bool = True, + ) -> Self: + """ + Take items along any axis. + + indexer : np.ndarray[np.intp] + axis : int, default 1 + verify : bool, default True + Check that all entries are between 0 and len(self) - 1, inclusive. + Pass verify=False if this check has been done by the caller. 
+ + Returns + ------- + BlockManager + """ + # Caller is responsible for ensuring indexer annotation is accurate + + n = self.shape[axis] + indexer = maybe_convert_indices(indexer, n, verify=verify) + + new_labels = self.axes[axis].take(indexer) + return self.reindex_indexer( + new_axis=new_labels, + indexer=indexer, + axis=axis, + allow_dups=True, + ) + +class BlockManager(libinternals.BlockManager, BaseBlockManager): + """ + BaseBlockManager that holds 2D blocks. """ + ndim = 2 + + # ---------------------------------------------------------------- + # Constructors + def __init__( self, - f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list, - engine: CSVEngine | None = None, - **kwds, + blocks: Sequence[Block], + axes: Sequence[Index], + verify_integrity: bool = True, ) -> None: - if engine is not None: - engine_specified = True - else: - engine = "python" - engine_specified = False - self.engine = engine - self._engine_specified = kwds.get("engine_specified", engine_specified) + if verify_integrity: + # Assertion disabled for performance + # assert all(isinstance(x, Index) for x in axes) + + for block in blocks: + if self.ndim != block.ndim: + raise AssertionError( + f"Number of Block dimensions ({block.ndim}) must equal " + f"number of axes ({self.ndim})" + ) + # As of 2.0, the caller is responsible for ensuring that + # DatetimeTZBlock with block.ndim == 2 has block.values.ndim ==2; + # previously there was a special check for fastparquet compat. + + self._verify_integrity() + + def _verify_integrity(self) -> None: + mgr_shape = self.shape + tot_items = sum(len(x.mgr_locs) for x in self.blocks) + for block in self.blocks: + if block.shape[1:] != mgr_shape[1:]: + raise_construction_error(tot_items, block.shape[1:], self.axes) + if len(self.items) != tot_items: + raise AssertionError( + "Number of manager items must equal union of " + f"block items\n# manager items: {len(self.items)}, # " + f"tot_items: {tot_items}" + ) - _validate_skipfooter(kwds) + @classmethod + def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> Self: + """ + Constructor for BlockManager and SingleBlockManager with same signature. + """ + return cls(blocks, axes, verify_integrity=False) + + # ---------------------------------------------------------------- + # Indexing + + def fast_xs(self, loc: int) -> SingleBlockManager: + """ + Return the array corresponding to `frame.iloc[loc]`. + + Parameters + ---------- + loc : int + + Returns + ------- + np.ndarray or ExtensionArray + """ + if len(self.blocks) == 1: + # TODO: this could be wrong if blk.mgr_locs is not slice(None)-like; + # is this ruled out in the general case? 
+ result: np.ndarray | ExtensionArray = self.blocks[0].iget( + (slice(None), loc) + ) + # in the case of a single block, the new block is a view + bp = BlockPlacement(slice(0, len(result))) + block = new_block( + result, + placement=bp, + ndim=1, + refs=self.blocks[0].refs, + ) + return SingleBlockManager(block, self.axes[0]) - dialect = _extract_dialect(kwds) - if dialect is not None: - if engine == "pyarrow": - raise ValueError( - "The 'dialect' option is not supported with the 'pyarrow' engine" - ) - kwds = _merge_with_dialect_properties(dialect, kwds) + dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) - if kwds.get("header", "infer") == "infer": - kwds["header"] = 0 if kwds.get("names") is None else None + n = len(self) - self.orig_options = kwds + if isinstance(dtype, ExtensionDtype): + # TODO: use object dtype as workaround for non-performant + # EA.__setitem__ methods. (primarily ArrowExtensionArray.__setitem__ + # when iteratively setting individual values) + # https://github.com/pandas-dev/pandas/pull/54508#issuecomment-1675827918 + result = np.empty(n, dtype=object) + else: + result = np.empty(n, dtype=dtype) + result = ensure_wrapped_if_datetimelike(result) + + for blk in self.blocks: + # Such assignment may incorrectly coerce NaT to None + # result[blk.mgr_locs] = blk._slice((slice(None), loc)) + for i, rl in enumerate(blk.mgr_locs): + result[rl] = blk.iget((i, loc)) + + if isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + result = cls._from_sequence(result, dtype=dtype) + + bp = BlockPlacement(slice(0, len(result))) + block = new_block(result, placement=bp, ndim=1) + return SingleBlockManager(block, self.axes[0]) + + def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager: + """ + Return the data as a SingleBlockManager. + """ + block = self.blocks[self.blknos[i]] + values = block.iget(self.blklocs[i]) + + # shortcut for select a single-dim from a 2-dim BM + bp = BlockPlacement(slice(0, len(values))) + nb = type(block)( + values, placement=bp, ndim=1, refs=block.refs if track_ref else None + ) + return SingleBlockManager(nb, self.axes[1]) + + def iget_values(self, i: int) -> ArrayLike: + """ + Return the data for column i as the values (ndarray or ExtensionArray). + + Warning! The returned array is a view but doesn't handle Copy-on-Write, + so this should be used with caution. + """ + # TODO(CoW) making the arrays read-only might make this safer to use? + block = self.blocks[self.blknos[i]] + values = block.iget(self.blklocs[i]) + return values + + @property + def column_arrays(self) -> list[np.ndarray]: + """ + Used in the JSON C code to access column arrays. + This optimizes compared to using `iget_values` by converting each + + Warning! This doesn't handle Copy-on-Write, so should be used with + caution (current use case of consuming this in the JSON code is fine). 
+ """ + # This is an optimized equivalent to + # result = [self.iget_values(i) for i in range(len(self.items))] + result: list[np.ndarray | None] = [None] * len(self.items) + + for blk in self.blocks: + mgr_locs = blk._mgr_locs + values = blk.array_values._values_for_json() + if values.ndim == 1: + # TODO(EA2D): special casing not needed with 2D EAs + result[mgr_locs[0]] = values - # miscellanea - self._currow = 0 + else: + for i, loc in enumerate(mgr_locs): + result[loc] = values[i] - options = self._get_options_with_defaults(engine) - options["storage_options"] = kwds.get("storage_options", None) + # error: Incompatible return value type (got "List[None]", + # expected "List[ndarray[Any, Any]]") + return result # type: ignore[return-value] - self.chunksize = options.pop("chunksize", None) - self.nrows = options.pop("nrows", None) + def iset( + self, + loc: int | slice | np.ndarray, + value: ArrayLike, + inplace: bool = False, + refs: BlockValuesRefs | None = None, + ) -> None: + """ + Set new item in-place. Does not consolidate. Adds new Block if not + contained in the current set of items + """ + + # FIXME: refactor, clearly separate broadcasting & zip-like assignment + # can prob also fix the various if tests for sparse/categorical + if self._blklocs is None and self.ndim > 1: + self._rebuild_blknos_and_blklocs() + + # Note: we exclude DTA/TDA here + value_is_extension_type = is_1d_only_ea_dtype(value.dtype) + if not value_is_extension_type: + if value.ndim == 2: + value = value.T + else: + value = ensure_block_shape(value, ndim=2) - self._check_file_or_buffer(f, engine) - self.options, self.engine = self._clean_options(options, engine) + if value.shape[1:] != self.shape[1:]: + raise AssertionError( + "Shape of new values must be compatible with manager shape" + ) - if "has_index_names" in kwds: - self.options["has_index_names"] = kwds["has_index_names"] + if lib.is_integer(loc): + # We have 6 tests where loc is _not_ an int. + # In this case, get_blkno_placements will yield only one tuple, + # containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1))) + + # Check if we can use _iset_single fastpath + loc = cast(int, loc) + blkno = self.blknos[loc] + blk = self.blocks[blkno] + if len(blk._mgr_locs) == 1: # TODO: fastest way to check this? 
+ return self._iset_single( + loc, + value, + inplace=inplace, + blkno=blkno, + blk=blk, + refs=refs, + ) - self.handles: IOHandles | None = None - self._engine = self._make_engine(f, self.engine) + # error: Incompatible types in assignment (expression has type + # "List[Union[int, slice, ndarray]]", variable has type "Union[int, + # slice, ndarray]") + loc = [loc] # type: ignore[assignment] - def close(self) -> None: - if self.handles is not None: - self.handles.close() - self._engine.close() + # categorical/sparse/datetimetz + if value_is_extension_type: - def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: - kwds = self.orig_options + def value_getitem(placement): + return value - options = {} - default: object | None + else: - for argname, default in parser_defaults.items(): - value = kwds.get(argname, default) + def value_getitem(placement): + return value[placement.indexer] + + # Accessing public blknos ensures the public versions are initialized + blknos = self.blknos[loc] + blklocs = self.blklocs[loc].copy() + + unfit_mgr_locs = [] + unfit_val_locs = [] + removed_blknos = [] + for blkno_l, val_locs in libinternals.get_blkno_placements(blknos, group=True): + blk = self.blocks[blkno_l] + blk_locs = blklocs[val_locs.indexer] + if inplace and blk.should_store(value): + # Updating inplace -> check if we need to do Copy-on-Write + if not self._has_no_reference_block(blkno_l): + self._iset_split_block( + blkno_l, blk_locs, value_getitem(val_locs), refs=refs + ) + else: + blk.set_inplace(blk_locs, value_getitem(val_locs)) + continue + else: + unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) + unfit_val_locs.append(val_locs) - # see gh-12935 - if ( - engine == "pyarrow" - and argname in _pyarrow_unsupported - and value != default - and value != getattr(value, "value", default) - ): - raise ValueError( - f"The {argname!r} option is not supported with the " - f"'pyarrow' engine" + # If all block items are unfit, schedule the block for removal. + if len(val_locs) == len(blk.mgr_locs): + removed_blknos.append(blkno_l) + continue + else: + # Defer setting the new values to enable consolidation + self._iset_split_block(blkno_l, blk_locs, refs=refs) + + if len(removed_blknos): + # Remove blocks & update blknos accordingly + is_deleted = np.zeros(self.nblocks, dtype=np.bool_) + is_deleted[removed_blknos] = True + + new_blknos = np.empty(self.nblocks, dtype=np.intp) + new_blknos.fill(-1) + new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos)) + self._blknos = new_blknos[self._blknos] + self.blocks = tuple( + blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos) + ) + + if unfit_val_locs: + unfit_idxr = np.concatenate(unfit_mgr_locs) + unfit_count = len(unfit_idxr) + + new_blocks: list[Block] = [] + if value_is_extension_type: + # This code (ab-)uses the fact that EA blocks contain only + # one item. 
+ # TODO(EA2D): special casing unnecessary with 2D EAs + new_blocks.extend( + new_block_2d( + values=value, + placement=BlockPlacement(slice(mgr_loc, mgr_loc + 1)), + refs=refs, + ) + for mgr_loc in unfit_idxr ) - options[argname] = value - - for argname, default in _c_parser_defaults.items(): - if argname in kwds: - value = kwds[argname] - - if engine != "c" and value != default: - # TODO: Refactor this logic, its pretty convoluted - if "python" in engine and argname not in _python_unsupported: - pass - elif "pyarrow" in engine and argname not in _pyarrow_unsupported: - pass - else: - raise ValueError( - f"The {argname!r} option is not supported with the " - f"{engine!r} engine" - ) + + self._blknos[unfit_idxr] = np.arange(unfit_count) + len(self.blocks) + self._blklocs[unfit_idxr] = 0 + else: - value = default - options[argname] = value - - if engine == "python-fwf": - for argname, default in _fwf_defaults.items(): - options[argname] = kwds.get(argname, default) - - return options - - def _check_file_or_buffer(self, f, engine: CSVEngine) -> None: - # see gh-16530 - if is_file_like(f) and engine != "c" and not hasattr(f, "__iter__"): - # The C engine doesn't need the file-like to have the "__iter__" - # attribute. However, the Python engine needs "__iter__(...)" - # when iterating through such an object, meaning it - # needs to have that attribute - raise ValueError( - "The 'python' engine cannot iterate through this file buffer." - ) - if hasattr(f, "encoding"): - file_encoding = f.encoding - orig_reader_enc = self.orig_options.get("encoding", None) - any_none = file_encoding is None or orig_reader_enc is None - if file_encoding != orig_reader_enc and not any_none: - file_path = getattr(f, "name", None) - raise ValueError( - f"The specified reader encoding {orig_reader_enc} is different " - f"from the encoding {file_encoding} of file {file_path}." + # unfit_val_locs contains BlockPlacement objects + unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:]) + + new_blocks.append( + new_block_2d( + values=value_getitem(unfit_val_items), + placement=BlockPlacement(unfit_idxr), + refs=refs, + ) ) - def _clean_options( - self, options: dict[str, Any], engine: CSVEngine - ) -> tuple[dict[str, Any], CSVEngine]: - result = options.copy() + self._blknos[unfit_idxr] = len(self.blocks) + self._blklocs[unfit_idxr] = np.arange(unfit_count) - fallback_reason = None + self.blocks += tuple(new_blocks) - # C engine not supported yet - if engine == "c": - if options["skipfooter"] > 0: - fallback_reason = "the 'c' engine does not support skipfooter" - engine = "python" + # Newly created block's dtype may already be present. + self._known_consolidated = False - sep = options["delimiter"] - delim_whitespace = options["delim_whitespace"] + def _iset_split_block( + self, + blkno_l: int, + blk_locs: np.ndarray | list[int], + value: ArrayLike | None = None, + refs: BlockValuesRefs | None = None, + ) -> None: + """Removes columns from a block by splitting the block. + + Avoids copying the whole block through slicing and updates the manager + after determinint the new block structure. Optionally adds a new block, + otherwise has to be done by the caller. + + Parameters + ---------- + blkno_l: The block number to operate on, relevant for updating the manager + blk_locs: The locations of our block that should be deleted. + value: The value to set as a replacement. + refs: The reference tracking object of the value to set. 
+ """ + blk = self.blocks[blkno_l] + + if self._blklocs is None: + self._rebuild_blknos_and_blklocs() + + nbs_tup = tuple(blk.delete(blk_locs)) + if value is not None: + locs = blk.mgr_locs.as_array[blk_locs] + first_nb = new_block_2d(value, BlockPlacement(locs), refs=refs) + else: + first_nb = nbs_tup[0] + nbs_tup = tuple(nbs_tup[1:]) - if sep is None and not delim_whitespace: - if engine in ("c", "pyarrow"): - fallback_reason = ( - f"the '{engine}' engine does not support " - "sep=None with delim_whitespace=False" - ) - engine = "python" - elif sep is not None and len(sep) > 1: - if engine == "c" and sep == r"\s+": - result["delim_whitespace"] = True - del result["delimiter"] - elif engine not in ("python", "python-fwf"): - # wait until regex engine integrated - fallback_reason = ( - f"the '{engine}' engine does not support " - "regex separators (separators > 1 char and " - r"different from '\s+' are interpreted as regex)" - ) - engine = "python" - elif delim_whitespace: - if "python" in engine: - result["delimiter"] = r"\s+" - elif sep is not None: - encodeable = True - encoding = sys.getfilesystemencoding() or "utf-8" - try: - if len(sep.encode(encoding)) > 1: - encodeable = False - except UnicodeDecodeError: - encodeable = False - if not encodeable and engine not in ("python", "python-fwf"): - fallback_reason = ( - f"the separator encoded in {encoding} " - f"is > 1 char long, and the '{engine}' engine " - "does not support such separators" - ) - engine = "python" - - quotechar = options["quotechar"] - if quotechar is not None and isinstance(quotechar, (str, bytes)): - if ( - len(quotechar) == 1 - and ord(quotechar) > 127 - and engine not in ("python", "python-fwf") - ): - fallback_reason = ( - "ord(quotechar) > 127, meaning the " - "quotechar is larger than one byte, " - f"and the '{engine}' engine does not support such quotechars" + nr_blocks = len(self.blocks) + blocks_tup = ( + self.blocks[:blkno_l] + (first_nb,) + self.blocks[blkno_l + 1 :] + nbs_tup + ) + self.blocks = blocks_tup + + if not nbs_tup and value is not None: + # No need to update anything if split did not happen + return + + self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb)) + + for i, nb in enumerate(nbs_tup): + self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) + self._blknos[nb.mgr_locs.indexer] = i + nr_blocks + + def _iset_single( + self, + loc: int, + value: ArrayLike, + inplace: bool, + blkno: int, + blk: Block, + refs: BlockValuesRefs | None = None, + ) -> None: + """ + Fastpath for iset when we are only setting a single position and + the Block currently in that position is itself single-column. + + In this case we can swap out the entire Block and blklocs and blknos + are unaffected. + """ + # Caller is responsible for verifying value.shape + + if inplace and blk.should_store(value): + copy = not self._has_no_reference_block(blkno) + iloc = self.blklocs[loc] + blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy) + return + + nb = new_block_2d(value, placement=blk._mgr_locs, refs=refs) + old_blocks = self.blocks + new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :] + self.blocks = new_blocks + return + + def column_setitem( + self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False + ) -> None: + """ + Set values ("setitem") into a single column (not setting the full column). 
+ + This is a method on the BlockManager level, to avoid creating an + intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`) + """ + if not self._has_no_reference(loc): + blkno = self.blknos[loc] + # Split blocks to only copy the column we want to modify + blk_loc = self.blklocs[loc] + # Copy our values + values = self.blocks[blkno].values + if values.ndim == 1: + values = values.copy() + else: + # Use [blk_loc] as indexer to keep ndim=2, this already results in a + # copy + values = values[[blk_loc]] + self._iset_split_block(blkno, [blk_loc], values) + + # this manager is only created temporarily to mutate the values in place + # so don't track references, otherwise the `setitem` would perform CoW again + col_mgr = self.iget(loc, track_ref=False) + if inplace_only: + col_mgr.setitem_inplace(idx, value) + else: + new_mgr = col_mgr.setitem((idx,), value) + self.iset(loc, new_mgr._block.values, inplace=True) + + def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: + """ + Insert item at selected position. + + Parameters + ---------- + loc : int + item : hashable + value : np.ndarray or ExtensionArray + refs : The reference tracking object of the value to set. + """ + new_axis = self.items.insert(loc, item) + + if value.ndim == 2: + value = value.T + if len(value) > 1: + raise ValueError( + f"Expected a 1D array, got an array with shape {value.T.shape}" ) - engine = "python" - - if fallback_reason and self._engine_specified: - raise ValueError(fallback_reason) - - if engine == "c": - for arg in _c_unsupported: - del result[arg] - - if "python" in engine: - for arg in _python_unsupported: - if fallback_reason and result[arg] != _c_parser_defaults.get(arg): - raise ValueError( - "Falling back to the 'python' engine because " - f"{fallback_reason}, but this causes {arg!r} to be " - "ignored as it is not supported by the 'python' engine." - ) - del result[arg] + else: + value = ensure_block_shape(value, ndim=self.ndim) + + bp = BlockPlacement(slice(loc, loc + 1)) + block = new_block_2d(values=value, placement=bp, refs=refs) + + if not len(self.blocks): + # Fastpath + self._blklocs = np.array([0], dtype=np.intp) + self._blknos = np.array([0], dtype=np.intp) + else: + self._insert_update_mgr_locs(loc) + self._insert_update_blklocs_and_blknos(loc) + + self.axes[0] = new_axis + self.blocks += (block,) + + self._known_consolidated = False - if fallback_reason: + if ( + get_option("performance_warnings") + and sum(not block.is_extension for block in self.blocks) > 100 + ): warnings.warn( - ( - "Falling back to the 'python' engine because " - f"{fallback_reason}; you can avoid this warning by specifying " - "engine='python'." - ), - ParserWarning, + "DataFrame is highly fragmented. This is usually the result " + "of calling `frame.insert` many times, which has poor performance. " + "Consider joining all columns at once using pd.concat(axis=1) " + "instead. 
To get a de-fragmented frame, use `newframe = frame.copy()`", + PerformanceWarning, stacklevel=find_stack_level(), ) - index_col = options["index_col"] - names = options["names"] - converters = options["converters"] - na_values = options["na_values"] - skiprows = options["skiprows"] - - validate_header_arg(options["header"]) - - if index_col is True: - raise ValueError("The value of index_col couldn't be 'True'") - if is_index_col(index_col): - if not isinstance(index_col, (list, tuple, np.ndarray)): - index_col = [index_col] - result["index_col"] = index_col - - names = list(names) if names is not None else names - - # type conversion-related - if converters is not None: - if not isinstance(converters, dict): - raise TypeError( - "Type converters must be a dict or subclass, " - f"input was a {type(converters).__name__}" - ) + def _insert_update_mgr_locs(self, loc) -> None: + """ + When inserting a new Block at location 'loc', we increment + all of the mgr_locs of blocks above that by one. + """ + for blkno, count in _fast_count_smallints(self.blknos[loc:]): + # .620 this way, .326 of which is in increment_above + blk = self.blocks[blkno] + blk._mgr_locs = blk._mgr_locs.increment_above(loc) + + def _insert_update_blklocs_and_blknos(self, loc) -> None: + """ + When inserting a new Block at location 'loc', we update our + _blklocs and _blknos. + """ + + # Accessing public blklocs ensures the public versions are initialized + if loc == self.blklocs.shape[0]: + # np.append is a lot faster, let's use it if we can. + self._blklocs = np.append(self._blklocs, 0) + self._blknos = np.append(self._blknos, len(self.blocks)) + elif loc == 0: + # As of numpy 1.26.4, np.concatenate faster than np.append + self._blklocs = np.concatenate([[0], self._blklocs]) + self._blknos = np.concatenate([[len(self.blocks)], self._blknos]) else: - converters = {} - - # Converting values to NA - keep_default_na = options["keep_default_na"] - floatify = engine != "pyarrow" - na_values, na_fvalues = _clean_na_values( - na_values, keep_default_na, floatify=floatify - ) + new_blklocs, new_blknos = libinternals.update_blklocs_and_blknos( + self.blklocs, self.blknos, loc, len(self.blocks) + ) + self._blklocs = new_blklocs + self._blknos = new_blknos + + def idelete(self, indexer) -> BlockManager: + """ + Delete selected locations, returning a new BlockManager. + """ + is_deleted = np.zeros(self.shape[0], dtype=np.bool_) + is_deleted[indexer] = True + taker = (~is_deleted).nonzero()[0] + + nbs = self._slice_take_blocks_ax0(taker, only_slice=True, ref_inplace_op=True) + new_columns = self.items[~is_deleted] + axes = [new_columns, self.axes[1]] + return type(self)(tuple(nbs), axes, verify_integrity=False) + + # ---------------------------------------------------------------- + # Block-wise Operation + + def grouped_reduce(self, func: Callable) -> Self: + """ + Apply grouped reduction function blockwise, returning a new BlockManager. + + Parameters + ---------- + func : grouped reduction function + + Returns + ------- + BlockManager + """ + result_blocks: list[Block] = [] + + for blk in self.blocks: + if blk.is_object: + # split on object-dtype blocks bc some columns may raise + # while others do not. 
+ for sb in blk._split(): + applied = sb.apply(func) + result_blocks = extend_blocks(applied, result_blocks) + else: + applied = blk.apply(func) + result_blocks = extend_blocks(applied, result_blocks) - # handle skiprows; this is internally handled by the - # c-engine, so only need for python and pyarrow parsers - if engine == "pyarrow": - if not is_integer(skiprows) and skiprows is not None: - # pyarrow expects skiprows to be passed as an integer - raise ValueError( - "skiprows argument must be an integer when using " - "engine='pyarrow'" - ) + if len(result_blocks) == 0: + nrows = 0 else: - if is_integer(skiprows): - skiprows = range(skiprows) - if skiprows is None: - skiprows = set() - elif not callable(skiprows): - skiprows = set(skiprows) - - # put stuff back - result["names"] = names - result["converters"] = converters - result["na_values"] = na_values - result["na_fvalues"] = na_fvalues - result["skiprows"] = skiprows - - return result, engine - - def __next__(self) -> DataFrame: - try: - return self.get_chunk() - except StopIteration: - self.close() - raise - - def _make_engine( + nrows = result_blocks[0].values.shape[-1] + index = Index(range(nrows)) + + return type(self).from_blocks(result_blocks, [self.axes[0], index]) + + def reduce(self, func: Callable) -> Self: + """ + Apply reduction function blockwise, returning a single-row BlockManager. + + Parameters + ---------- + func : reduction function + + Returns + ------- + BlockManager + """ + # If 2D, we assume that we're operating column-wise + assert self.ndim == 2 + + res_blocks: list[Block] = [] + for blk in self.blocks: + nbs = blk.reduce(func) + res_blocks.extend(nbs) + + index = Index([None]) # placeholder + new_mgr = type(self).from_blocks(res_blocks, [self.items, index]) + return new_mgr + + def operate_blockwise(self, other: BlockManager, array_op) -> BlockManager: + """ + Apply array_op blockwise with another (aligned) BlockManager. + """ + return operate_blockwise(self, other, array_op) + + def _equal_values(self: BlockManager, other: BlockManager) -> bool: + """ + Used in .equals defined in base class. Only check the column values + assuming shape and indexes have already been checked. + """ + return blockwise_all(self, other, array_equals) + + def quantile( self, - f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list | IO, - engine: CSVEngine = "c", - ) -> ParserBase: - mapping: dict[str, type[ParserBase]] = { - "c": CParserWrapper, - "python": PythonParser, - "pyarrow": ArrowParserWrapper, - "python-fwf": FixedWidthFieldParser, - } + *, + qs: Index, # with dtype float 64 + interpolation: QuantileInterpolation = "linear", + ) -> Self: + """ + Iterate over blocks applying quantile reduction. + This routine is intended for reduction type operations and + will do inference on the generated blocks. 
+ + Parameters + ---------- + interpolation : type of interpolation, default 'linear' + qs : list of the quantiles to be computed + + Returns + ------- + BlockManager + """ + # Series dispatches to DataFrame for quantile, which allows us to + # simplify some of the code here and in the blocks + assert self.ndim >= 2 + assert is_list_like(qs) # caller is responsible for this + + new_axes = list(self.axes) + new_axes[1] = Index(qs, dtype=np.float64) + + blocks = [ + blk.quantile(qs=qs, interpolation=interpolation) for blk in self.blocks + ] + + return type(self)(blocks, new_axes) + + # ---------------------------------------------------------------- + + def unstack(self, unstacker, fill_value) -> BlockManager: + """ + Return a BlockManager with all blocks unstacked. + + Parameters + ---------- + unstacker : reshape._Unstacker + fill_value : Any + fill_value for newly introduced missing values. + + Returns + ------- + unstacked : BlockManager + """ + new_columns = unstacker.get_new_columns(self.items) + new_index = unstacker.new_index + + allow_fill = not unstacker.mask_all + if allow_fill: + # calculating the full mask once and passing it to Block._unstack is + # faster than letting calculating it in each repeated call + new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape) + needs_masking = new_mask2D.any(axis=0) + else: + needs_masking = np.zeros(unstacker.full_shape[1], dtype=bool) - if engine not in mapping: - raise ValueError( - f"Unknown engine: {engine} (valid options are {mapping.keys()})" - ) - if not isinstance(f, list): - # open file here - is_text = True - mode = "r" - if engine == "pyarrow": - is_text = False - mode = "rb" - elif ( - engine == "c" - and self.options.get("encoding", "utf-8") == "utf-8" - and isinstance(stringify_path(f), str) - ): - # c engine can decode utf-8 bytes, adding TextIOWrapper makes - # the c-engine especially for memory_map=True far slower - is_text = False - if "b" not in mode: - mode += "b" - self.handles = get_handle( - f, - mode, - encoding=self.options.get("encoding", None), - compression=self.options.get("compression", None), - memory_map=self.options.get("memory_map", False), - is_text=is_text, - errors=self.options.get("encoding_errors", "strict"), - storage_options=self.options.get("storage_options", None), + new_blocks: list[Block] = [] + columns_mask: list[np.ndarray] = [] + + if len(self.items) == 0: + factor = 1 + else: + fac = len(new_columns) / len(self.items) + assert fac == int(fac) + factor = int(fac) + + for blk in self.blocks: + mgr_locs = blk.mgr_locs + new_placement = mgr_locs.tile_for_unstack(factor) + + blocks, mask = blk._unstack( + unstacker, + fill_value, + new_placement=new_placement, + needs_masking=needs_masking, ) - assert self.handles is not None - f = self.handles.handle - elif engine != "python": - msg = f"Invalid file path or buffer object type: {type(f)}" - raise ValueError(msg) + new_blocks.extend(blocks) + columns_mask.extend(mask) - try: - return mapping[engine](f, **self.options) - except Exception: - if self.handles is not None: - self.handles.close() - raise + # Block._unstack should ensure this holds, + assert mask.sum() == sum(len(nb._mgr_locs) for nb in blocks) + # In turn this ensures that in the BlockManager call below + # we have len(new_columns) == sum(x.shape[0] for x in new_blocks) + # which suffices to allow us to pass verify_inegrity=False - def _failover_to_python(self) -> None: - raise AbstractMethodError(self) + new_columns = new_columns[columns_mask] - def read(self, nrows: int | None = 
None) -> DataFrame: - if self.engine == "pyarrow": - try: - # error: "ParserBase" has no attribute "read" - df = self._engine.read() # type: ignore[attr-defined] - except Exception: - self.close() - raise - else: - nrows = validate_integer("nrows", nrows) - try: - # error: "ParserBase" has no attribute "read" - ( - index, - columns, - col_dict, - ) = self._engine.read( # type: ignore[attr-defined] - nrows - ) - except Exception: - self.close() - raise - - if index is None: - if col_dict: - # Any column is actually fine: - new_rows = len(next(iter(col_dict.values()))) - index = RangeIndex(self._currow, self._currow + new_rows) + bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False) + return bm + + def to_dict(self) -> dict[str, Self]: + """ + Return a dict of str(dtype) -> BlockManager + + Returns + ------- + values : a dict of dtype -> BlockManager + """ + + bd: dict[str, list[Block]] = {} + for b in self.blocks: + bd.setdefault(str(b.dtype), []).append(b) + + # TODO(EA2D): the combine will be unnecessary with 2D EAs + return {dtype: self._combine(blocks) for dtype, blocks in bd.items()} + + def as_array( + self, + dtype: np.dtype | None = None, + copy: bool = False, + na_value: object = lib.no_default, + ) -> np.ndarray: + """ + Convert the blockmanager data into an numpy array. + + Parameters + ---------- + dtype : np.dtype or None, default None + Data type of the return array. + copy : bool, default False + If True then guarantee that a copy is returned. A value of + False does not guarantee that the underlying data is not + copied. + na_value : object, default lib.no_default + Value to be used as the missing value sentinel. + + Returns + ------- + arr : ndarray + """ + passed_nan = lib.is_float(na_value) and isna(na_value) + + if len(self.blocks) == 0: + arr = np.empty(self.shape, dtype=float) + return arr.transpose() + + if self.is_single_block: + blk = self.blocks[0] + + if na_value is not lib.no_default: + # We want to copy when na_value is provided to avoid + # mutating the original object + if lib.is_np_dtype(blk.dtype, "f") and passed_nan: + # We are already numpy-float and na_value=np.nan + pass else: - new_rows = 0 + copy = True + + if blk.is_extension: + # Avoid implicit conversion of extension blocks to object + + # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no + # attribute "to_numpy" + arr = blk.values.to_numpy( # type: ignore[union-attr] + dtype=dtype, + na_value=na_value, + copy=copy, + ).reshape(blk.shape) + elif not copy: + arr = np.asarray(blk.values, dtype=dtype) else: - new_rows = len(index) + arr = np.array(blk.values, dtype=dtype, copy=copy) - if hasattr(self, "orig_options"): - dtype_arg = self.orig_options.get("dtype", None) - else: - dtype_arg = None - - if isinstance(dtype_arg, dict): - dtype = defaultdict(lambda: None) # type: ignore[var-annotated] - dtype.update(dtype_arg) - elif dtype_arg is not None and pandas_dtype(dtype_arg) in ( - np.str_, - np.object_, - ): - dtype = defaultdict(lambda: dtype_arg) - else: - dtype = None - - if dtype is not None: - new_col_dict = {} - for k, v in col_dict.items(): - d = ( - dtype[k] - if pandas_dtype(dtype[k]) in (np.str_, np.object_) - else None - ) - new_col_dict[k] = Series(v, index=index, dtype=d, copy=False) - else: - new_col_dict = col_dict + if not copy: + arr = arr.view() + arr.flags.writeable = False + else: + arr = self._interleave(dtype=dtype, na_value=na_value) + # The underlying data was copied within _interleave, so no need + # to further copy if copy=True or 
setting na_value - df = DataFrame( - new_col_dict, - columns=columns, - index=index, - copy=False, - ) + if na_value is lib.no_default: + pass + elif arr.dtype.kind == "f" and passed_nan: + pass + else: + arr[isna(arr)] = na_value - self._currow += new_rows - return df + return arr.transpose() - def get_chunk(self, size: int | None = None) -> DataFrame: - if size is None: - size = self.chunksize - if self.nrows is not None: - if self._currow >= self.nrows: - raise StopIteration - size = min(size, self.nrows - self._currow) - return self.read(nrows=size) + def _interleave( + self, + dtype: np.dtype | None = None, + na_value: object = lib.no_default, + ) -> np.ndarray: + """ + Return ndarray from blocks with specified item order + Items must be contained in the blocks + """ + if not dtype: + # Incompatible types in assignment (expression has type + # "Optional[Union[dtype[Any], ExtensionDtype]]", variable has + # type "Optional[dtype[Any]]") + dtype = interleaved_dtype( # type: ignore[assignment] + [blk.dtype for blk in self.blocks] + ) - def __enter__(self) -> Self: - return self + # error: Argument 1 to "ensure_np_dtype" has incompatible type + # "Optional[dtype[Any]]"; expected "Union[dtype[Any], ExtensionDtype]" + dtype = ensure_np_dtype(dtype) # type: ignore[arg-type] + result = np.empty(self.shape, dtype=dtype) + + itemmask = np.zeros(self.shape[0]) + + if dtype == np.dtype("object") and na_value is lib.no_default: + # much more performant than using to_numpy below + for blk in self.blocks: + rl = blk.mgr_locs + arr = blk.get_values(dtype) + result[rl.indexer] = arr + itemmask[rl.indexer] = 1 + return result + + for blk in self.blocks: + rl = blk.mgr_locs + if blk.is_extension: + # Avoid implicit conversion of extension blocks to object + + # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no + # attribute "to_numpy" + arr = blk.values.to_numpy( # type: ignore[union-attr] + dtype=dtype, + na_value=na_value, + ) + else: + arr = blk.get_values(dtype) + result[rl.indexer] = arr + itemmask[rl.indexer] = 1 + + if not itemmask.all(): + raise AssertionError("Some items were not contained in blocks") + + return result + + # ---------------------------------------------------------------- + # Consolidation + + def is_consolidated(self) -> bool: + """ + Return True if more than one block with the same dtype + """ + if not self._known_consolidated: + self._consolidate_check() + return self._is_consolidated + + def _consolidate_check(self) -> None: + if len(self.blocks) == 1: + # fastpath + self._is_consolidated = True + self._known_consolidated = True + return + dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate] + self._is_consolidated = len(dtypes) == len(set(dtypes)) + self._known_consolidated = True + + def _consolidate_inplace(self) -> None: + # In general, _consolidate_inplace should only be called via + # DataFrame._consolidate_inplace, otherwise we will fail to invalidate + # the DataFrame's _item_cache. The exception is for newly-created + # BlockManager objects not yet attached to a DataFrame. + if not self.is_consolidated(): + self.blocks = _consolidate(self.blocks) + self._is_consolidated = True + self._known_consolidated = True + self._rebuild_blknos_and_blklocs() + + # ---------------------------------------------------------------- + # Concatenation + + @classmethod + def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self: + """ + Concatenate uniformly-indexed BlockManagers horizontally. 
+ """ + offset = 0 + blocks: list[Block] = [] + for mgr in mgrs: + for blk in mgr.blocks: + # We need to do getitem_block here otherwise we would be altering + # blk.mgr_locs in place, which would render it invalid. This is only + # relevant in the copy=False case. + nb = blk.slice_block_columns(slice(None)) + nb._mgr_locs = nb._mgr_locs.add(offset) + blocks.append(nb) + + offset += len(mgr.items) + + new_mgr = cls(tuple(blocks), axes) + return new_mgr + + @classmethod + def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self: + """ + Concatenate uniformly-indexed BlockManagers vertically. + """ + raise NotImplementedError("This logic lives (for now) in internals.concat") + + +class SingleBlockManager(BaseBlockManager): + """manage a single block with""" + + @property + def ndim(self) -> Literal[1]: + return 1 + + _is_consolidated = True + _known_consolidated = True + __slots__ = () + is_single_block = True - def __exit__( + def __init__( self, - exc_type: type[BaseException] | None, - exc_value: BaseException | None, - traceback: TracebackType | None, + block: Block, + axis: Index, + verify_integrity: bool = False, ) -> None: - self.close() + # Assertions disabled for performance + # assert isinstance(block, Block), type(block) + # assert isinstance(axis, Index), type(axis) + + self.axes = [axis] + self.blocks = (block,) + + @classmethod + def from_blocks( + cls, + blocks: list[Block], + axes: list[Index], + ) -> Self: + """ + Constructor for BlockManager and SingleBlockManager with same signature. + """ + assert len(blocks) == 1 + assert len(axes) == 1 + return cls(blocks[0], axes[0], verify_integrity=False) + + @classmethod + def from_array( + cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None + ) -> SingleBlockManager: + """ + Constructor for if we have an array that is not yet a Block. + """ + array = maybe_coerce_values(array) + bp = BlockPlacement(slice(0, len(index))) + block = new_block(array, placement=bp, ndim=1, refs=refs) + return cls(block, index) + + def to_2d_mgr(self, columns: Index) -> BlockManager: + """ + Manager analogue of Series.to_frame + """ + blk = self.blocks[0] + arr = ensure_block_shape(blk.values, ndim=2) + bp = BlockPlacement(0) + new_blk = type(blk)(arr, placement=bp, ndim=2, refs=blk.refs) + axes = [columns, self.axes[0]] + return BlockManager([new_blk], axes=axes, verify_integrity=False) + + def _has_no_reference(self, i: int = 0) -> bool: + """ + Check for column `i` if it has references. + (whether it references another array or is itself being referenced) + Returns True if the column has no references. + """ + return not self.blocks[0].refs.has_reference() + + def __getstate__(self): + block_values = [b.values for b in self.blocks] + block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] + axes_array = list(self.axes) + + extra_state = { + "0.14.1": { + "axes": axes_array, + "blocks": [ + {"values": b.values, "mgr_locs": b.mgr_locs.indexer} + for b in self.blocks + ], + } + } + # First three elements of the state are to maintain forward + # compatibility with 0.13.1. + return axes_array, block_values, block_items, extra_state + + def __setstate__(self, state) -> None: + def unpickle_block(values, mgr_locs, ndim: int) -> Block: + # TODO(EA2D): ndim would be unnecessary with 2D EAs + # older pickles may store e.g. 
DatetimeIndex instead of DatetimeArray + values = extract_array(values, extract_numpy=True) + if not isinstance(mgr_locs, BlockPlacement): + mgr_locs = BlockPlacement(mgr_locs) + + values = maybe_coerce_values(values) + return new_block(values, placement=mgr_locs, ndim=ndim) + + if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]: + state = state[3]["0.14.1"] + self.axes = [ensure_index(ax) for ax in state["axes"]] + ndim = len(self.axes) + self.blocks = tuple( + unpickle_block(b["values"], b["mgr_locs"], ndim=ndim) + for b in state["blocks"] + ) + else: + raise NotImplementedError("pre-0.14.1 pickles are no longer supported") -def TextParser(*args, **kwds) -> TextFileReader: - """ - Converts lists of lists/tuples into DataFrames with proper type inference - and optional (e.g. string to datetime) conversion. Also enables iterating - lazily over chunks of large files + self._post_setstate() - Parameters - ---------- - data : file-like object or list - delimiter : separator character to use - dialect : str or csv.Dialect instance, optional - Ignored if delimiter is longer than 1 character - names : sequence, default - header : int, default 0 - Row to use to parse column labels. Defaults to the first row. Prior - rows will be discarded - index_col : int or list, optional - Column or columns to use as the (possibly hierarchical) index - has_index_names: bool, default False - True if the cols defined in index_col have an index name and are - not in the header. - na_values : scalar, str, list-like, or dict, optional - Additional strings to recognize as NA/NaN. - keep_default_na : bool, default True - thousands : str, optional - Thousands separator - comment : str, optional - Comment out remainder of line - parse_dates : bool, default False - keep_date_col : bool, default False - date_parser : function, optional - - .. deprecated:: 2.0.0 - date_format : str or dict of column -> format, default ``None`` - - .. versionadded:: 2.0.0 - skiprows : list of integers - Row numbers to skip - skipfooter : int - Number of line at bottom of file to skip - converters : dict, optional - Dict of functions for converting values in certain columns. Keys can - either be integers or column labels, values are functions that take one - input argument, the cell (not column) content, and return the - transformed content. - encoding : str, optional - Encoding to use for UTF when reading/writing (ex. 'utf-8') - float_precision : str, optional - Specifies which converter the C engine should use for floating-point - values. The options are `None` or `high` for the ordinary converter, - `legacy` for the original lower precision pandas converter, and - `round_trip` for the round-trip converter. - """ - kwds["engine"] = "python" - return TextFileReader(*args, **kwds) + def _post_setstate(self) -> None: + pass + @cache_readonly + def _block(self) -> Block: + return self.blocks[0] -def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = True): - na_fvalues: set | dict - if na_values is None: - if keep_default_na: - na_values = STR_NA_VALUES - else: - na_values = set() - na_fvalues = set() - elif isinstance(na_values, dict): - old_na_values = na_values.copy() - na_values = {} # Prevent aliasing. - - # Convert the values in the na_values dictionary - # into array-likes for further use. This is also - # where we append the default NaN values, provided - # that `keep_default_na=True`. 
- for k, v in old_na_values.items(): - if not is_list_like(v): - v = [v] - - if keep_default_na: - v = set(v) | STR_NA_VALUES - - na_values[k] = v - na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} - else: - if not is_list_like(na_values): - na_values = [na_values] - na_values = _stringify_na_values(na_values, floatify) - if keep_default_na: - na_values = na_values | STR_NA_VALUES - - na_fvalues = _floatify_na_values(na_values) - - return na_values, na_fvalues - - -def _floatify_na_values(na_values): - # create float versions of the na_values - result = set() - for v in na_values: - try: - v = float(v) - if not np.isnan(v): - result.add(v) - except (TypeError, ValueError, OverflowError): - pass - return result - - -def _stringify_na_values(na_values, floatify: bool) -> set[str | float]: - """return a stringified and numeric for these values""" - result: list[str | float] = [] - for x in na_values: - result.append(str(x)) - result.append(x) - try: - v = float(x) - - # we are like 999 here - if v == int(v): - v = int(v) - result.append(f"{v}.0") - result.append(str(v)) - - if floatify: - result.append(v) - except (TypeError, ValueError, OverflowError): - pass - if floatify: - try: - result.append(int(x)) - except (TypeError, ValueError, OverflowError): - pass - return set(result) - - -def _refine_defaults_read( - dialect: str | csv.Dialect | None, - delimiter: str | None | lib.NoDefault, - delim_whitespace: bool, - engine: CSVEngine | None, - sep: str | None | lib.NoDefault, - on_bad_lines: str | Callable, - names: Sequence[Hashable] | None | lib.NoDefault, - defaults: dict[str, Any], - dtype_backend: DtypeBackend | lib.NoDefault, -): - """Validate/refine default values of input parameters of read_csv, read_table. + @final + @property + def array(self) -> ArrayLike: + """ + Quick access to the backing array of the Block. + """ + return self.arrays[0] - Parameters - ---------- - dialect : str or csv.Dialect - If provided, this parameter will override values (default or not) for the - following parameters: `delimiter`, `doublequote`, `escapechar`, - `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to - override values, a ParserWarning will be issued. See csv.Dialect - documentation for more details. - delimiter : str or object - Alias for sep. - delim_whitespace : bool - Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be - used as the sep. Equivalent to setting ``sep='\\s+'``. If this option - is set to True, nothing should be passed in for the ``delimiter`` - parameter. - - .. deprecated:: 2.2.0 - Use ``sep="\\s+"`` instead. - engine : {{'c', 'python'}} - Parser engine to use. The C engine is faster while the python engine is - currently more feature-complete. - sep : str or object - A delimiter provided by the user (str) or a sentinel value, i.e. - pandas._libs.lib.no_default. - on_bad_lines : str, callable - An option for handling bad lines or a sentinel value(None). - names : array-like, optional - List of column names to use. If the file contains a header row, - then you should explicitly pass ``header=0`` to override the column names. - Duplicates in this list are not allowed. - defaults: dict - Default values of input parameters. + # error: Cannot override writeable attribute with read-only property + @property + def _blknos(self) -> None: # type: ignore[override] + """compat with BlockManager""" + return None - Returns - ------- - kwds : dict - Input parameters with correct values. 
- - Raises - ------ - ValueError : - If a delimiter was specified with ``sep`` (or ``delimiter``) and - ``delim_whitespace=True``. - """ - # fix types for sep, delimiter to Union(str, Any) - delim_default = defaults["delimiter"] - kwds: dict[str, Any] = {} - # gh-23761 - # - # When a dialect is passed, it overrides any of the overlapping - # parameters passed in directly. We don't want to warn if the - # default parameters were passed in (since it probably means - # that the user didn't pass them in explicitly in the first place). - # - # "delimiter" is the annoying corner case because we alias it to - # "sep" before doing comparison to the dialect values later on. - # Thus, we need a flag to indicate that we need to "override" - # the comparison to dialect values by checking if default values - # for BOTH "delimiter" and "sep" were provided. - if dialect is not None: - kwds["sep_override"] = delimiter is None and ( - sep is lib.no_default or sep == delim_default - ) + # error: Cannot override writeable attribute with read-only property + @property + def _blklocs(self) -> None: # type: ignore[override] + """compat with BlockManager""" + return None - if delimiter and (sep is not lib.no_default): - raise ValueError("Specified a sep and a delimiter; you can only specify one.") + def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> Self: + # similar to get_slice, but not restricted to slice indexer + blk = self._block + if len(indexer) > 0 and indexer.all(): + return type(self)(blk.copy(deep=False), self.index) + array = blk.values[indexer] - kwds["names"] = None if names is lib.no_default else names + if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "b": + # boolean indexing always gives a copy with numpy + refs = None + else: + # TODO(CoW) in theory only need to track reference if new_array is a view + refs = blk.refs + + bp = BlockPlacement(slice(0, len(array))) + block = type(blk)(array, placement=bp, ndim=1, refs=refs) + + new_idx = self.index[indexer] + return type(self)(block, new_idx) + + def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager: + # Assertion disabled for performance + # assert isinstance(slobj, slice), type(slobj) + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + blk = self._block + array = blk.values[slobj] + bp = BlockPlacement(slice(0, len(array))) + # TODO this method is only used in groupby SeriesSplitter at the moment, + # so passing refs is not yet covered by the tests + block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs) + new_index = self.index._getitem_slice(slobj) + return type(self)(block, new_index) + + @property + def index(self) -> Index: + return self.axes[0] + + @property + def dtype(self) -> DtypeObj: + return self._block.dtype + + def get_dtypes(self) -> npt.NDArray[np.object_]: + return np.array([self._block.dtype], dtype=object) + + def external_values(self): + """The array that Series.values returns""" + return self._block.external_values() + + def internal_values(self): + """The array that Series._values returns""" + return self._block.values + + def array_values(self) -> ExtensionArray: + """The array that Series.array returns""" + return self._block.array_values + + def get_numeric_data(self) -> Self: + if self._block.is_numeric: + return self.copy(deep=False) + return self.make_empty() + + @property + def _can_hold_na(self) -> bool: + return self._block._can_hold_na + + def setitem_inplace(self, indexer, value) -> None: + """ + Set values with indexer. 
+ + For SingleBlockManager, this backs s[indexer] = value + + This is an inplace version of `setitem()`, mutating the manager/values + in place, not returning a new Manager (and Block), and thus never changing + the dtype. + """ + if not self._has_no_reference(0): + self.blocks = (self._block.copy(),) + self._cache.clear() + + arr = self.array + + # EAs will do this validation in their own __setitem__ methods. + if isinstance(arr, np.ndarray): + # Note: checking for ndarray instead of np.dtype means we exclude + # dt64/td64, which do their own validation. + value = np_can_hold_element(arr.dtype, value) + + if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1: + # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 + value = value[0, ...] + + arr[indexer] = value + + def idelete(self, indexer) -> SingleBlockManager: + """ + Delete single location from SingleBlockManager. + + Ensures that self.blocks doesn't become empty. + """ + nb = self._block.delete(indexer)[0] + self.blocks = (nb,) + self.axes[0] = self.axes[0].delete(indexer) + self._cache.clear() + return self - # Alias sep -> delimiter. - if delimiter is None: - delimiter = sep + def fast_xs(self, loc): + """ + fast path for getting a cross-section + return a view of the data + """ + raise NotImplementedError("Use series._values[loc] instead") + + def set_values(self, values: ArrayLike) -> None: + """ + Set the values of the single block in place. + + Use at your own risk! This does not check if the passed values are + valid for the current Block/SingleBlockManager (length, dtype, etc), + and this does not properly keep track of references. + """ + # NOTE(CoW) Currently this is only used for FrameColumnApply.series_generator + # which handles CoW by setting the refs manually if necessary + self.blocks[0].values = values + self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values))) + + def _equal_values(self, other: Self) -> bool: + """ + Used in .equals defined in base class. Only check the column values + assuming shape and indexes have already been checked. 
+ """ + # For SingleBlockManager (i.e.Series) + if other.ndim != 1: + return False + left = self.blocks[0].values + right = other.blocks[0].values + return array_equals(left, right) + + def grouped_reduce(self, func): + arr = self.array + res = func(arr) + index = default_index(len(res)) + + mgr = type(self).from_array(res, index) + return mgr + + +# -------------------------------------------------------------------- +# Constructor Helpers + + +def create_block_manager_from_blocks( + blocks: list[Block], + axes: list[Index], + consolidate: bool = True, + verify_integrity: bool = True, +) -> BlockManager: + # If verify_integrity=False, then caller is responsible for checking + # all(x.shape[-1] == len(axes[1]) for x in blocks) + # sum(x.shape[0] for x in blocks) == len(axes[0]) + # set(x for blk in blocks for x in blk.mgr_locs) == set(range(len(axes[0]))) + # all(blk.ndim == 2 for blk in blocks) + # This allows us to safely pass verify_integrity=False + + try: + mgr = BlockManager(blocks, axes, verify_integrity=verify_integrity) + + except ValueError as err: + arrays = [blk.values for blk in blocks] + tot_items = sum(arr.shape[0] for arr in arrays) + raise_construction_error(tot_items, arrays[0].shape[1:], axes, err) + + if consolidate: + mgr._consolidate_inplace() + return mgr + + +def create_block_manager_from_column_arrays( + arrays: list[ArrayLike], + axes: list[Index], + consolidate: bool, + refs: list, +) -> BlockManager: + # Assertions disabled for performance (caller is responsible for verifying) + # assert isinstance(axes, list) + # assert all(isinstance(x, Index) for x in axes) + # assert all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays) + # assert all(type(x) is not NumpyExtensionArray for x in arrays) + # assert all(x.ndim == 1 for x in arrays) + # assert all(len(x) == len(axes[1]) for x in arrays) + # assert len(arrays) == len(axes[0]) + # These last three are sufficient to allow us to safely pass + # verify_integrity=False below. + + try: + blocks = _form_blocks(arrays, consolidate, refs) + mgr = BlockManager(blocks, axes, verify_integrity=False) + except ValueError as e: + raise_construction_error(len(arrays), arrays[0].shape, axes, e) + if consolidate: + mgr._consolidate_inplace() + return mgr + + +def raise_construction_error( + tot_items: int, + block_shape: Shape, + axes: list[Index], + e: ValueError | None = None, +) -> NoReturn: + """raise a helpful message about our construction""" + passed = tuple(map(int, [tot_items] + list(block_shape))) + # Correcting the user facing error message during dataframe construction + if len(passed) <= 2: + passed = passed[::-1] + + implied = tuple(len(ax) for ax in axes) + # Correcting the user facing error message during dataframe construction + if len(implied) <= 2: + implied = implied[::-1] + + # We return the exception object instead of raising it so that we + # can raise it in the caller; mypy plays better with that + if passed == implied and e is not None: + raise e + if block_shape[0] == 0: + raise ValueError("Empty data passed with indices specified.") + raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}") + + +# ----------------------------------------------------------------------- + + +def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, DtypeObj]: + dtype = tup[1].dtype + + if is_1d_only_ea_dtype(dtype): + # We know these won't be consolidated, so don't need to group these. 
+ # This avoids expensive comparisons of CategoricalDtype objects + sep = id(dtype) + else: + sep = 0 - if delim_whitespace and (delimiter is not lib.no_default): - raise ValueError( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) + return sep, dtype - if delimiter == "\n": - raise ValueError( - r"Specified \n as separator or delimiter. This forces the python engine " - "which does not accept a line terminator. Hence it is not allowed to use " - "the line terminator as separator.", - ) - if delimiter is lib.no_default: - # assign default separator value - kwds["delimiter"] = delim_default - else: - kwds["delimiter"] = delimiter +def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]: + tuples = list(enumerate(arrays)) - if engine is not None: - kwds["engine_specified"] = True - else: - kwds["engine"] = "c" - kwds["engine_specified"] = False - - if on_bad_lines == "error": - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR - elif on_bad_lines == "warn": - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN - elif on_bad_lines == "skip": - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP - elif callable(on_bad_lines): - if engine not in ["python", "pyarrow"]: - raise ValueError( - "on_bad_line can only be a callable function " - "if engine='python' or 'pyarrow'" - ) - kwds["on_bad_lines"] = on_bad_lines - else: - raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") + if not consolidate: + return _tuples_to_blocks_no_consolidate(tuples, refs) - check_dtype_backend(dtype_backend) + # when consolidating, we can ignore refs (either stacking always copies, + # or the EA is already copied in the calling dict_to_mgr) - kwds["dtype_backend"] = dtype_backend + # group by dtype + grouper = itertools.groupby(tuples, _grouping_func) - return kwds + nbs: list[Block] = [] + for (_, dtype), tup_block in grouper: + block_type = get_block_type(dtype) + if isinstance(dtype, np.dtype): + is_dtlike = dtype.kind in "mM" -def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None: - """ - Extract concrete csv dialect instance. 
+ if issubclass(dtype.type, (str, bytes)): + dtype = np.dtype(object) - Returns - ------- - csv.Dialect or None - """ - if kwds.get("dialect") is None: - return None + values, placement = _stack_arrays(list(tup_block), dtype) + if is_dtlike: + values = ensure_wrapped_if_datetimelike(values) + blk = block_type(values, placement=BlockPlacement(placement), ndim=2) + nbs.append(blk) - dialect = kwds["dialect"] - if dialect in csv.list_dialects(): - dialect = csv.get_dialect(dialect) + elif is_1d_only_ea_dtype(dtype): + dtype_blocks = [ + block_type(x[1], placement=BlockPlacement(x[0]), ndim=2) + for x in tup_block + ] + nbs.extend(dtype_blocks) - _validate_dialect(dialect) + else: + dtype_blocks = [ + block_type( + ensure_block_shape(x[1], 2), placement=BlockPlacement(x[0]), ndim=2 + ) + for x in tup_block + ] + nbs.extend(dtype_blocks) + return nbs - return dialect +def _tuples_to_blocks_no_consolidate(tuples, refs) -> list[Block]: + # tuples produced within _form_blocks are of the form (placement, array) + return [ + new_block_2d( + ensure_block_shape(arr, ndim=2), placement=BlockPlacement(i), refs=ref + ) + for ((i, arr), ref) in zip(tuples, refs) + ] -MANDATORY_DIALECT_ATTRS = ( - "delimiter", - "doublequote", - "escapechar", - "skipinitialspace", - "quotechar", - "quoting", -) +def _stack_arrays(tuples, dtype: np.dtype): + placement, arrays = zip(*tuples) -def _validate_dialect(dialect: csv.Dialect) -> None: - """ - Validate csv dialect instance. + first = arrays[0] + shape = (len(arrays),) + first.shape - Raises - ------ - ValueError - If incorrect dialect is provided. - """ - for param in MANDATORY_DIALECT_ATTRS: - if not hasattr(dialect, param): - raise ValueError(f"Invalid dialect {dialect} provided") + stacked = np.empty(shape, dtype=dtype) + for i, arr in enumerate(arrays): + stacked[i] = arr + + return stacked, placement -def _merge_with_dialect_properties( - dialect: csv.Dialect, - defaults: dict[str, Any], -) -> dict[str, Any]: +def _consolidate(blocks: tuple[Block, ...]) -> tuple[Block, ...]: """ - Merge default kwargs in TextFileReader with dialect parameters. + Merge blocks having same dtype, exclude non-consolidating blocks + """ + # sort by _can_consolidate, dtype + gkey = lambda x: x._consolidate_key + grouper = itertools.groupby(sorted(blocks, key=gkey), gkey) + + new_blocks: list[Block] = [] + for (_can_consolidate, dtype), group_blocks in grouper: + merged_blocks, _ = _merge_blocks( + list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate + ) + new_blocks = extend_blocks(merged_blocks, new_blocks) + return tuple(new_blocks) - Parameters - ---------- - dialect : csv.Dialect - Concrete csv dialect. See csv.Dialect documentation for more details. - defaults : dict - Keyword arguments passed to TextFileReader. - Returns - ------- - kwds : dict - Updated keyword arguments, merged with dialect parameters. - """ - kwds = defaults.copy() +def _merge_blocks( + blocks: list[Block], dtype: DtypeObj, can_consolidate: bool +) -> tuple[list[Block], bool]: + if len(blocks) == 1: + return blocks, False - for param in MANDATORY_DIALECT_ATTRS: - dialect_val = getattr(dialect, param) + if can_consolidate: + # TODO: optimization potential in case all mgrs contain slices and + # combination of those slices is a slice, too. 
+ new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks]) - parser_default = parser_defaults[param] - provided = kwds.get(param, parser_default) + new_values: ArrayLike - # Messages for conflicting values between the dialect - # instance and the actual parameters provided. - conflict_msgs = [] + if isinstance(blocks[0].dtype, np.dtype): + # error: List comprehension has incompatible type List[Union[ndarray, + # ExtensionArray]]; expected List[Union[complex, generic, + # Sequence[Union[int, float, complex, str, bytes, generic]], + # Sequence[Sequence[Any]], SupportsArray]] + new_values = np.vstack([b.values for b in blocks]) # type: ignore[misc] + else: + bvals = [blk.values for blk in blocks] + bvals2 = cast(Sequence[NDArrayBackedExtensionArray], bvals) + new_values = bvals2[0]._concat_same_type(bvals2, axis=0) - # Don't warn if the default parameter was passed in, - # even if it conflicts with the dialect (gh-23761). - if provided not in (parser_default, dialect_val): - msg = ( - f"Conflicting values for '{param}': '{provided}' was " - f"provided, but the dialect specifies '{dialect_val}'. " - "Using the dialect-specified value." - ) + argsort = np.argsort(new_mgr_locs) + new_values = new_values[argsort] + new_mgr_locs = new_mgr_locs[argsort] - # Annoying corner case for not warning about - # conflicts between dialect and delimiter parameter. - # Refer to the outer "_read_" function for more info. - if not (param == "delimiter" and kwds.pop("sep_override", False)): - conflict_msgs.append(msg) + bp = BlockPlacement(new_mgr_locs) + return [new_block_2d(new_values, placement=bp)], True - if conflict_msgs: - warnings.warn( - "\n\n".join(conflict_msgs), ParserWarning, stacklevel=find_stack_level() - ) - kwds[param] = dialect_val - return kwds + # can't consolidate --> no merge + return blocks, False -def _validate_skipfooter(kwds: dict[str, Any]) -> None: - """ - Check whether skipfooter is compatible with other kwargs in TextFileReader. +def _fast_count_smallints(arr: npt.NDArray[np.intp]): + """Faster version of set(arr) for sequences of small numbers.""" + counts = np.bincount(arr) + nz = counts.nonzero()[0] + # Note: list(zip(...) outperforms list(np.c_[nz, counts[nz]]) here, + # in one benchmark by a factor of 11 + return zip(nz, counts[nz]) - Parameters - ---------- - kwds : dict - Keyword arguments passed to TextFileReader. - Raises - ------ - ValueError - If skipfooter is not compatible with other parameters. - """ - if kwds.get("skipfooter"): - if kwds.get("iterator") or kwds.get("chunksize"): - raise ValueError("'skipfooter' not supported for iteration") - if kwds.get("nrows"): - raise ValueError("'skipfooter' not supported with 'nrows'") +def _preprocess_slice_or_indexer( + slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool +): + if isinstance(slice_or_indexer, slice): + return ( + "slice", + slice_or_indexer, + libinternals.slice_len(slice_or_indexer, length), + ) + else: + if ( + not isinstance(slice_or_indexer, np.ndarray) + or slice_or_indexer.dtype.kind != "i" + ): + dtype = getattr(slice_or_indexer, "dtype", None) + raise TypeError(type(slice_or_indexer), dtype) + + indexer = ensure_platform_int(slice_or_indexer) + if not allow_fill: + indexer = maybe_convert_indices(indexer, length) + return "fancy", indexer, len(indexer) + + +def make_na_array(dtype: DtypeObj, shape: Shape, fill_value) -> ArrayLike: + if isinstance(dtype, DatetimeTZDtype): + # NB: exclude e.g. 
pyarrow[dt64tz] dtypes
+        ts = Timestamp(fill_value).as_unit(dtype.unit)
+        i8values = np.full(shape, ts._value)
+        dt64values = i8values.view(f"M8[{dtype.unit}]")
+        return DatetimeArray._simple_new(dt64values, dtype=dtype)
+
+    elif is_1d_only_ea_dtype(dtype):
+        dtype = cast(ExtensionDtype, dtype)
+        cls = dtype.construct_array_type()
+
+        missing_arr = cls._from_sequence([], dtype=dtype)
+        ncols, nrows = shape
+        assert ncols == 1, ncols
+        empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
+        return missing_arr.take(empty_arr, allow_fill=True, fill_value=fill_value)
+    elif isinstance(dtype, ExtensionDtype):
+        # TODO: no tests get here, a handful would if we disabled
+        #  the dt64tz special-case above (which is faster)
+        cls = dtype.construct_array_type()
+        missing_arr = cls._empty(shape=shape, dtype=dtype)
+        missing_arr[:] = fill_value
+        return missing_arr
+    else:
+        # NB: we should never get here with dtype integer or bool;
+        #  if we did, the missing_arr.fill would cast to gibberish
+        missing_arr_np = np.empty(shape, dtype=dtype)
+        missing_arr_np.fill(fill_value)
+
+        if dtype.kind in "mM":
+            missing_arr_np = ensure_wrapped_if_datetimelike(missing_arr_np)
+        return missing_arr_np
+ 
\ No newline at end of file

From 4059ff670b5163e24616e3df8ebd0d297b019d85 Mon Sep 17 00:00:00 2001
From: Gabe Barnard
Date: Sat, 20 Apr 2024 23:54:51 -0500
Subject: [PATCH 4/5] fixed implicit conversion of 1-arrays inside data frames

---
 pandas/core/internals/managers.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 8fda9cd23b508..6b04f04262b13 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -2171,7 +2171,10 @@ def setitem_inplace(self, indexer, value) -> None:
             #  dt64/td64, which do their own validation.
             value = np_can_hold_element(arr.dtype, value)
 
-        if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1:
+        # only squeeze length-1 ndarrays when the block dtype is not object
+        implicit_convert = arr.dtype != 'object'
+        if (isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1
+            and implicit_convert):
             # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
             value = value[0, ...]
 
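
The guard in this patch makes ``implicit_convert`` False exactly when the block holds
object dtype, so the one-element squeeze introduced for the NumPy 1.25 deprecation no
longer unpacks a length-1 ndarray that is being stored into an object column. A minimal
sketch of the scenario this targets, as an illustration only (it is not part of the
patch and assumes the Series setitem path reaches SingleBlockManager.setitem_inplace):

    import numpy as np
    import pandas as pd

    # An object-dtype Series can hold arbitrary Python objects per cell.
    ser = pd.Series([None, None], dtype="object")

    # Store a length-1 ndarray into a single cell.
    ser[0] = np.array([7])

    # With the guard in place the expectation is that the array itself is kept,
    # rather than being squeezed to the scalar 7 on assignment.
    print(type(ser[0]))
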
From 31574580a24c24cd095ef21d7f1ec3db4d0b8898 Mon Sep 17 00:00:00 2001 From: Gabe Barnard Date: Sun, 21 Apr 2024 00:07:24 -0500 Subject: [PATCH 5/5] fixed issue pandas-dev#57944 --- pandas/io/parsers/readers.py | 4209 +++++++++++++++------------------- 1 file changed, 1868 insertions(+), 2341 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 0c2332f24de1e..e0060048dcae2 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1,2518 +1,2045 @@ +""" +Module contains tools for processing files into DataFrames or other objects + +GH#48849 provides a convenient way of deprecating keyword arguments +""" + from __future__ import annotations -from collections.abc import ( - Hashable, - Sequence, +from collections import ( + abc, + defaultdict, ) -import itertools +import csv +import sys +from textwrap import fill from typing import ( + IO, TYPE_CHECKING, Any, Callable, + Generic, Literal, - NoReturn, - cast, - final, + TypedDict, + overload, ) import warnings import numpy as np -from pandas._config.config import get_option - -from pandas._libs import ( - algos as libalgos, - internals as libinternals, - lib, -) -from pandas._libs.internals import ( - BlockPlacement, - BlockValuesRefs, -) -from pandas._libs.tslibs import Timestamp +from pandas._libs import lib +from pandas._libs.parsers import STR_NA_VALUES from pandas.errors import ( AbstractMethodError, - PerformanceWarning, + ParserWarning, ) -from pandas.util._decorators import cache_readonly +from pandas.util._decorators import Appender from pandas.util._exceptions import find_stack_level -from pandas.util._validators import validate_bool_kwarg +from pandas.util._validators import check_dtype_backend -from pandas.core.dtypes.cast import ( - find_common_type, - infer_dtype_from_scalar, - np_can_hold_element, -) from pandas.core.dtypes.common import ( - ensure_platform_int, - is_1d_only_ea_dtype, + is_file_like, + is_float, + is_hashable, + is_integer, is_list_like, -) -from pandas.core.dtypes.dtypes import ( - DatetimeTZDtype, - ExtensionDtype, - SparseDtype, -) -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCSeries, -) -from pandas.core.dtypes.missing import ( - array_equals, - isna, + pandas_dtype, ) -import pandas.core.algorithms as algos -from pandas.core.arrays import DatetimeArray -from pandas.core.arrays._mixins import NDArrayBackedExtensionArray -from pandas.core.base import PandasObject -from pandas.core.construction import ( - ensure_wrapped_if_datetimelike, - extract_array, -) -from pandas.core.indexers import maybe_convert_indices -from pandas.core.indexes.api import ( - Index, - default_index, - ensure_index, +from pandas import Series +from pandas.core.frame import DataFrame +from pandas.core.indexes.api import RangeIndex +from pandas.core.shared_docs import _shared_docs + +from pandas.io.common import ( + IOHandles, + get_handle, + stringify_path, + validate_header_arg, ) -from pandas.core.internals.blocks import ( - Block, - NumpyBlock, - ensure_block_shape, - extend_blocks, - get_block_type, - maybe_coerce_values, - new_block, - new_block_2d, +from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper +from pandas.io.parsers.base_parser import ( + ParserBase, + is_index_col, + parser_defaults, ) -from pandas.core.internals.ops import ( - blockwise_all, - operate_blockwise, +from pandas.io.parsers.c_parser_wrapper import CParserWrapper +from pandas.io.parsers.python_parser import ( + FixedWidthFieldParser, + PythonParser, ) if 
TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterable, + Mapping, + Sequence, + ) + from types import TracebackType + from pandas._typing import ( - ArrayLike, - AxisInt, - DtypeObj, - QuantileInterpolation, + CompressionOptions, + CSVEngine, + DtypeArg, + DtypeBackend, + FilePath, + HashableT, + IndexLabel, + ReadCsvBuffer, Self, - Shape, - npt, + StorageOptions, + Unpack, + UsecolsArgType, ) - from pandas.api.extensions import ExtensionArray + class _read_shared(TypedDict, Generic[HashableT], total=False): + # annotations shared between read_csv/fwf/table's overloads + # NOTE: Keep in sync with the annotations of the implementation + sep: str | None | lib.NoDefault + delimiter: str | None | lib.NoDefault + header: int | Sequence[int] | None | Literal["infer"] + names: Sequence[Hashable] | None | lib.NoDefault + index_col: IndexLabel | Literal[False] | None + usecols: UsecolsArgType + dtype: DtypeArg | None + engine: CSVEngine | None + converters: Mapping[HashableT, Callable] | None + true_values: list | None + false_values: list | None + skipinitialspace: bool + skiprows: list[int] | int | Callable[[Hashable], bool] | None + skipfooter: int + nrows: int | None + na_values: ( + Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None + ) + keep_default_na: bool + na_filter: bool + skip_blank_lines: bool + parse_dates: bool | Sequence[Hashable] | None + infer_datetime_format: bool | lib.NoDefault + keep_date_col: bool | lib.NoDefault + date_parser: Callable | lib.NoDefault + date_format: str | dict[Hashable, str] | None + dayfirst: bool + cache_dates: bool + compression: CompressionOptions + thousands: str | None + decimal: str + lineterminator: str | None + quotechar: str + quoting: int + doublequote: bool + escapechar: str | None + comment: str | None + encoding: str | None + encoding_errors: str | None + dialect: str | csv.Dialect | None + on_bad_lines: str + delim_whitespace: bool | lib.NoDefault + low_memory: bool + memory_map: bool + float_precision: Literal["high", "legacy", "round_trip"] | None + storage_options: StorageOptions | None + dtype_backend: DtypeBackend | lib.NoDefault +else: + _read_shared = dict + + +_doc_read_csv_and_table = ( + r""" +{summary} + +Also supports optionally iterating or breaking of the file +into chunks. + +Additional help can be found in the online docs for +`IO Tools `_. + +Parameters +---------- +filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is + expected. A local file could be: file://localhost/path/to/table.csv. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, such as + a file handle (e.g. via builtin ``open`` function) or ``StringIO``. +sep : str, default {_default_sep} + Character or regex pattern to treat as the delimiter. If ``sep=None``, the + C engine cannot automatically detect + the separator, but the Python parsing engine can, meaning the latter will + be used and automatically detect the separator from only the first valid + row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. + In addition, separators longer than 1 character and different from + ``'\s+'`` will be interpreted as regular expressions and will also force + the use of the Python parsing engine. Note that regex delimiters are prone + to ignoring quoted data. 
Regex example: ``'\r\t'``. +delimiter : str, optional + Alias for ``sep``. +header : int, Sequence of int, 'infer' or None, default 'infer' + Row number(s) containing column labels and marking the start of the + data (zero-indexed). Default behavior is to infer the column names: if no ``names`` + are passed the behavior is identical to ``header=0`` and column + names are inferred from the first line of the file, if column + names are passed explicitly to ``names`` then the behavior is identical to + ``header=None``. Explicitly pass ``header=0`` to be able to + replace existing names. The header can be a list of integers that + specify row locations for a :class:`~pandas.MultiIndex` on the columns + e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be + skipped (e.g. 2 in this example is skipped). Note that this + parameter ignores commented lines and empty lines if + ``skip_blank_lines=True``, so ``header=0`` denotes the first line of + data rather than the first line of the file. + + When inferred from the file contents, headers are kept distinct from + each other by renaming duplicate names with a numeric suffix of the form + ``".{{count}}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``. + Empty headers are named ``"Unnamed: {{i}}"`` or ``"Unnamed: {{i}}_level_{{level}}"`` + in the case of MultiIndex columns. +names : Sequence of Hashable, optional + Sequence of column labels to apply. If the file contains a header row, + then you should explicitly pass ``header=0`` to override the column names. + Duplicates in this list are not allowed. +index_col : Hashable, Sequence of Hashable or False, optional + Column(s) to use as row label(s), denoted either by column labels or column + indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex` + will be formed for the row labels. + + Note: ``index_col=False`` can be used to force pandas to *not* use the first + column as the index, e.g., when you have a malformed file with delimiters at + the end of each line. +usecols : Sequence of Hashable or Callable, optional + Subset of columns to select, denoted either by column labels or column indices. + If list-like, all elements must either + be positional (i.e. integer indices into the document columns) or strings + that correspond to column names provided either by the user in ``names`` or + inferred from the document header row(s). If ``names`` are given, the document + header row(s) are not taken into account. For example, a valid list-like + ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. + Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. + To instantiate a :class:`~pandas.DataFrame` from ``data`` with element order + preserved use ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` + for columns in ``['foo', 'bar']`` order or + ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` + for ``['bar', 'foo']`` order. + + If callable, the callable function will be evaluated against the column + names, returning names where the callable function evaluates to ``True``. An + example of a valid callable argument would be ``lambda x: x.upper() in + ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster + parsing time and lower memory usage. +dtype : dtype or dict of {{Hashable : dtype}}, optional + Data type(s) to apply to either the whole dataset or individual columns. 
+ E.g., ``{{'a': np.float64, 'b': np.int32, 'c': 'Int64'}}`` + Use ``str`` or ``object`` together with suitable ``na_values`` settings + to preserve and not interpret ``dtype``. + If ``converters`` are specified, they will be applied INSTEAD + of ``dtype`` conversion. + + .. versionadded:: 1.5.0 + + Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input where + the default determines the ``dtype`` of the columns which are not explicitly + listed. +engine : {{'c', 'python', 'pyarrow'}}, optional + Parser engine to use. The C and pyarrow engines are faster, while the python engine + is currently more feature-complete. Multithreading is currently only supported by + the pyarrow engine. + + .. versionadded:: 1.4.0 + + The 'pyarrow' engine was added as an *experimental* engine, and some features + are unsupported, or may not work correctly, with this engine. +converters : dict of {{Hashable : Callable}}, optional + Functions for converting values in specified columns. Keys can either + be column labels or column indices. +true_values : list, optional + Values to consider as ``True`` in addition to case-insensitive variants of 'True'. +false_values : list, optional + Values to consider as ``False`` in addition to case-insensitive variants of 'False'. +skipinitialspace : bool, default False + Skip spaces after delimiter. +skiprows : int, list of int or Callable, optional + Line numbers to skip (0-indexed) or number of lines to skip (``int``) + at the start of the file. + + If callable, the callable function will be evaluated against the row + indices, returning ``True`` if the row should be skipped and ``False`` otherwise. + An example of a valid callable argument would be ``lambda x: x in [0, 2]``. +skipfooter : int, default 0 + Number of lines at bottom of file to skip (Unsupported with ``engine='c'``). +nrows : int, optional + Number of rows of file to read. Useful for reading pieces of large files. +na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional + Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific + per-column ``NA`` values. By default the following values are interpreted as + ``NaN``: " """ + + fill('", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") + + """ ". + +keep_default_na : bool, default True + Whether or not to include the default ``NaN`` values when parsing the data. + Depending on whether ``na_values`` is passed in, the behavior is as follows: + + * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values`` + is appended to the default ``NaN`` values used for parsing. + * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only + the default ``NaN`` values are used for parsing. + * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only + the ``NaN`` values specified ``na_values`` are used for parsing. + * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no + strings will be parsed as ``NaN``. + + Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and + ``na_values`` parameters will be ignored. +na_filter : bool, default True + Detect missing value markers (empty strings and the value of ``na_values``). In + data without any ``NA`` values, passing ``na_filter=False`` can improve the + performance of reading a large file. +skip_blank_lines : bool, default True + If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. 
+parse_dates : bool, None, list of Hashable, list of lists or dict of {{Hashable : \ +list}}, default None + The behavior is as follows: + + * ``bool``. If ``True`` -> try parsing the index. + * ``None``. Behaves like ``True`` if ``date_parser`` or ``date_format`` are + specified. + * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 + each as a separate date column. + * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse + as a single date column. Values are joined with a space before parsing. + * ``dict``, e.g. ``{{'foo' : [1, 3]}}`` -> parse columns 1, 3 as date and call + result 'foo'. Values are joined with a space before parsing. + + If a column or index cannot be represented as an array of ``datetime``, + say because of an unparsable value or a mixture of timezones, the column + or index will be returned unaltered as an ``object`` data type. For + non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after + :func:`~pandas.read_csv`. + + Note: A fast-path exists for iso8601-formatted dates. +infer_datetime_format : bool, default False + If ``True`` and ``parse_dates`` is enabled, pandas will attempt to infer the + format of the ``datetime`` strings in the columns, and if it can be inferred, + switch to a faster method of parsing them. In some cases this can increase + the parsing speed by 5-10x. + + .. deprecated:: 2.0.0 + A strict version of this argument is now the default, passing it has no effect. + +keep_date_col : bool, default False + If ``True`` and ``parse_dates`` specifies combining multiple columns then + keep the original columns. +date_parser : Callable, optional + Function to use for converting a sequence of string columns to an array of + ``datetime`` instances. The default uses ``dateutil.parser.parser`` to do the + conversion. pandas will try to call ``date_parser`` in three different ways, + advancing to the next if an exception occurs: 1) Pass one or more arrays + (as defined by ``parse_dates``) as arguments; 2) concatenate (row-wise) the + string values from the columns defined by ``parse_dates`` into a single array + and pass that; and 3) call ``date_parser`` once for each row using one or + more strings (corresponding to the columns defined by ``parse_dates``) as + arguments. + + .. deprecated:: 2.0.0 + Use ``date_format`` instead, or read in as ``object`` and then apply + :func:`~pandas.to_datetime` as-needed. +date_format : str or dict of column -> format, optional + Format to use for parsing dates when used in conjunction with ``parse_dates``. + The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See + `strftime documentation + `_ for more information on choices, though + note that :const:`"%f"` will parse all the way up to nanoseconds. + You can also pass: + + - "ISO8601", to parse any `ISO8601 `_ + time string (not necessarily in exactly the same format); + - "mixed", to infer the format for each element individually. This is risky, + and you should probably use it along with `dayfirst`. + + .. versionadded:: 2.0.0 +dayfirst : bool, default False + DD/MM format dates, international and European format. +cache_dates : bool, default True + If ``True``, use a cache of unique, converted dates to apply the ``datetime`` + conversion. May produce significant speed-up when parsing duplicate + date strings, especially ones with timezone offsets. + +iterator : bool, default False + Return ``TextFileReader`` object for iteration or getting chunks with + ``get_chunk()``. 
+chunksize : int, optional + Number of lines to read from the file per chunk. Passing a value will cause the + function to return a ``TextFileReader`` object for iteration. + See the `IO Tools docs + `_ + for more information on ``iterator`` and ``chunksize``. + +{decompression_options} + + .. versionchanged:: 1.4.0 Zstandard support. + +thousands : str (length 1), optional + Character acting as the thousands separator in numerical values. +decimal : str (length 1), default '.' + Character to recognize as decimal point (e.g., use ',' for European data). +lineterminator : str (length 1), optional + Character used to denote a line break. Only valid with C parser. +quotechar : str (length 1), optional + Character used to denote the start and end of a quoted item. Quoted + items can include the ``delimiter`` and it will be ignored. +quoting : {{0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, \ +3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL + Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is + ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special + characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``, + or ``lineterminator``. +doublequote : bool, default True + When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate + whether or not to interpret two consecutive ``quotechar`` elements INSIDE a + field as a single ``quotechar`` element. +escapechar : str (length 1), optional + Character used to escape other characters. +comment : str (length 1), optional + Character indicating that the remainder of line should not be parsed. + If found at the beginning + of a line, the line will be ignored altogether. This parameter must be a + single character. Like empty lines (as long as ``skip_blank_lines=True``), + fully commented lines are ignored by the parameter ``header`` but not by + ``skiprows``. For example, if ``comment='#'``, parsing + ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in ``'a,b,c'`` being + treated as the header. +encoding : str, optional, default 'utf-8' + Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). `List of Python + standard encodings + `_ . + +encoding_errors : str, optional, default 'strict' + How encoding errors are treated. `List of possible values + `_ . + + .. versionadded:: 1.3.0 + +dialect : str or csv.Dialect, optional + If provided, this parameter will override values (default or not) for the + following parameters: ``delimiter``, ``doublequote``, ``escapechar``, + ``skipinitialspace``, ``quotechar``, and ``quoting``. If it is necessary to + override values, a ``ParserWarning`` will be issued. See ``csv.Dialect`` + documentation for more details. +on_bad_lines : {{'error', 'warn', 'skip'}} or Callable, default 'error' + Specifies what to do upon encountering a bad line (a line with too many fields). + Allowed values are : + + - ``'error'``, raise an Exception when a bad line is encountered. + - ``'warn'``, raise a warning when a bad line is encountered and skip that line. + - ``'skip'``, skip bad lines without raising or warning when they are encountered. + + .. versionadded:: 1.3.0 + + .. versionadded:: 1.4.0 + + - Callable, function with signature + ``(bad_line: list[str]) -> list[str] | None`` that will process a single + bad line. ``bad_line`` is a list of strings split by the ``sep``. + If the function returns ``None``, the bad line will be ignored. 
+ If the function returns a new ``list`` of strings with more elements than + expected, a ``ParserWarning`` will be emitted while dropping extra elements. + Only supported when ``engine='python'`` + + .. versionchanged:: 2.2.0 + + - Callable, function with signature + as described in `pyarrow documentation + `_ when ``engine='pyarrow'`` + +delim_whitespace : bool, default False + Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be + used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option + is set to ``True``, nothing should be passed in for the ``delimiter`` + parameter. + + .. deprecated:: 2.2.0 + Use ``sep="\\s+"`` instead. +low_memory : bool, default True + Internally process the file in chunks, resulting in lower memory use + while parsing, but possibly mixed type inference. To ensure no mixed + types either set ``False``, or specify the type with the ``dtype`` parameter. + Note that the entire file is read into a single :class:`~pandas.DataFrame` + regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in + chunks. (Only valid with C parser). +memory_map : bool, default False + If a filepath is provided for ``filepath_or_buffer``, map the file object + directly onto memory and access the data directly from there. Using this + option can improve performance because there is no longer any I/O overhead. +float_precision : {{'high', 'legacy', 'round_trip'}}, optional + Specifies which converter the C engine should use for floating-point + values. The options are ``None`` or ``'high'`` for the ordinary converter, + ``'legacy'`` for the original lower precision pandas converter, and + ``'round_trip'`` for the round-trip converter. + +{storage_options} + +dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + +Returns +------- +DataFrame or TextFileReader + A comma-separated values (csv) file is returned as two-dimensional + data structure with labeled axes. + +See Also +-------- +DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. +{see_also_func_name} : {see_also_func_summary} +read_fwf : Read a table of fixed-width formatted lines into DataFrame. + +Examples +-------- +>>> pd.{func_name}('data.csv') # doctest: +SKIP +""" +) -def interleaved_dtype(dtypes: list[DtypeObj]) -> DtypeObj | None: - """ - Find the common dtype for `blocks`. +class _C_Parser_Defaults(TypedDict): + delim_whitespace: Literal[False] + na_filter: Literal[True] + low_memory: Literal[True] + memory_map: Literal[False] + float_precision: None - Parameters - ---------- - blocks : List[DtypeObj] - Returns - ------- - dtype : np.dtype, ExtensionDtype, or None - None is returned when `blocks` is empty. 
- """ - if not len(dtypes): - return None +_c_parser_defaults: _C_Parser_Defaults = { + "delim_whitespace": False, + "na_filter": True, + "low_memory": True, + "memory_map": False, + "float_precision": None, +} - return find_common_type(dtypes) +class _Fwf_Defaults(TypedDict): + colspecs: Literal["infer"] + infer_nrows: Literal[100] + widths: None -def ensure_np_dtype(dtype: DtypeObj) -> np.dtype: - # TODO: https://github.com/pandas-dev/pandas/issues/22791 - # Give EAs some input on what happens here. Sparse needs this. - if isinstance(dtype, SparseDtype): - dtype = dtype.subtype - dtype = cast(np.dtype, dtype) - elif isinstance(dtype, ExtensionDtype): - dtype = np.dtype("object") - elif dtype == np.dtype(str): - dtype = np.dtype("object") - return dtype +_fwf_defaults: _Fwf_Defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} +_c_unsupported = {"skipfooter"} +_python_unsupported = {"low_memory", "float_precision"} +_pyarrow_unsupported = { + "skipfooter", + "float_precision", + "chunksize", + "comment", + "nrows", + "thousands", + "memory_map", + "dialect", + "delim_whitespace", + "quoting", + "lineterminator", + "converters", + "iterator", + "dayfirst", + "skipinitialspace", + "low_memory", +} -class BaseBlockManager(PandasObject): - """ - Core internal data structure to implement DataFrame, Series, etc. - Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a - lightweight blocked set of labeled data to be manipulated by the DataFrame - public API class - - Attributes - ---------- - shape - ndim - axes - values - items - - Methods - ------- - set_axis(axis, new_labels) - copy(deep=True) +@overload +def validate_integer(name: str, val: None, min_val: int = ...) -> None: ... - get_dtypes - apply(func, axes, block_filter_fn) +@overload +def validate_integer(name: str, val: float, min_val: int = ...) -> int: ... - get_bool_data - get_numeric_data - get_slice(slice_like, axis) - get(label) - iget(loc) +@overload +def validate_integer(name: str, val: int | None, min_val: int = ...) -> int | None: ... - take(indexer, axis) - reindex_axis(new_labels, axis) - reindex_indexer(new_labels, indexer, axis) - delete(label) - insert(loc, label, value) - set(label, value) +def validate_integer( + name: str, val: int | float | None, min_val: int = 0 +) -> int | None: + """ + Checks whether the 'name' parameter for parsing is either + an integer OR float that can SAFELY be cast to an integer + without losing accuracy. Raises a ValueError if that is + not the case. Parameters ---------- - blocks: Sequence of Block - axes: Sequence of Index - verify_integrity: bool, default True - - Notes - ----- - This is *not* a public API class + name : str + Parameter name (used for error reporting) + val : int or float + The value to check + min_val : int + Minimum allowed value (val < min_val will result in a ValueError) """ + if val is None: + return val - __slots__ = () - - _blknos: npt.NDArray[np.intp] - _blklocs: npt.NDArray[np.intp] - blocks: tuple[Block, ...] 
- axes: list[Index] - - @property - def ndim(self) -> int: - raise NotImplementedError - - _known_consolidated: bool - _is_consolidated: bool - - def __init__(self, blocks, axes, verify_integrity: bool = True) -> None: - raise NotImplementedError - - @final - def __len__(self) -> int: - return len(self.items) - - @property - def shape(self) -> Shape: - return tuple(len(ax) for ax in self.axes) - - @classmethod - def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> Self: - raise NotImplementedError - - @property - def blknos(self) -> npt.NDArray[np.intp]: - """ - Suppose we want to find the array corresponding to our i'th column. - - blknos[i] identifies the block from self.blocks that contains this column. - - blklocs[i] identifies the column of interest within - self.blocks[self.blknos[i]] - """ - if self._blknos is None: - # Note: these can be altered by other BlockManager methods. - self._rebuild_blknos_and_blklocs() - - return self._blknos - - @property - def blklocs(self) -> npt.NDArray[np.intp]: - """ - See blknos.__doc__ - """ - if self._blklocs is None: - # Note: these can be altered by other BlockManager methods. - self._rebuild_blknos_and_blklocs() - - return self._blklocs - - def make_empty(self, axes=None) -> Self: - """return an empty BlockManager with the items axis of len 0""" - if axes is None: - axes = [Index([])] + self.axes[1:] - - # preserve dtype if possible - if self.ndim == 1: - assert isinstance(self, SingleBlockManager) # for mypy - blk = self.blocks[0] - arr = blk.values[:0] - bp = BlockPlacement(slice(0, 0)) - nb = blk.make_block_same_class(arr, placement=bp) - blocks = [nb] - else: - blocks = [] - return type(self).from_blocks(blocks, axes) - - def __nonzero__(self) -> bool: - return True + msg = f"'{name:s}' must be an integer >={min_val:d}" + if is_float(val): + if int(val) != val: + raise ValueError(msg) + val = int(val) + elif not (is_integer(val) and val >= min_val): + raise ValueError(msg) - # Python3 compat - __bool__ = __nonzero__ + return int(val) - def set_axis(self, axis: AxisInt, new_labels: Index) -> None: - # Caller is responsible for ensuring we have an Index object. - self._validate_set_axis(axis, new_labels) - self.axes[axis] = new_labels - @final - def _validate_set_axis(self, axis: AxisInt, new_labels: Index) -> None: - # Caller is responsible for ensuring we have an Index object. - old_len = len(self.axes[axis]) - new_len = len(new_labels) - - if axis == 1 and len(self.items) == 0: - # If we are setting the index on a DataFrame with no columns, - # it is OK to change the length. - pass - - elif new_len != old_len: - raise ValueError( - f"Length mismatch: Expected axis has {old_len} elements, new " - f"values have {new_len} elements" - ) - - @property - def is_single_block(self) -> bool: - # Assumes we are 2D; overridden by SingleBlockManager - return len(self.blocks) == 1 - - @property - def items(self) -> Index: - return self.axes[0] - - def _has_no_reference(self, i: int) -> bool: - """ - Check for column `i` if it has references. - (whether it references another array or is itself being referenced) - Returns True if the column has no references. - """ - blkno = self.blknos[i] - return self._has_no_reference_block(blkno) - - def _has_no_reference_block(self, blkno: int) -> bool: - """ - Check for block `i` if it has references. - (whether it references another array or is itself being referenced) - Returns True if the block has no references. 
- """ - return not self.blocks[blkno].refs.has_reference() - - def add_references(self, mgr: BaseBlockManager) -> None: - """ - Adds the references from one manager to another. We assume that both - managers have the same block structure. - """ - if len(self.blocks) != len(mgr.blocks): - # If block structure changes, then we made a copy - return - for i, blk in enumerate(self.blocks): - blk.refs = mgr.blocks[i].refs - blk.refs.add_reference(blk) - - def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool: - """ - Checks if two blocks from two different block managers reference the - same underlying values. - """ - blk = self.blocks[blkno] - return any(blk is ref() for ref in mgr.blocks[blkno].refs.referenced_blocks) - - def get_dtypes(self) -> npt.NDArray[np.object_]: - dtypes = np.array([blk.dtype for blk in self.blocks], dtype=object) - return dtypes.take(self.blknos) - - @property - def arrays(self) -> list[ArrayLike]: - """ - Quick access to the backing arrays of the Blocks. - - Only for compatibility with ArrayManager for testing convenience. - Not to be used in actual code, and return value is not the same as the - ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs). - - Warning! The returned arrays don't handle Copy-on-Write, so this should - be used with caution (only in read-mode). - """ - return [blk.values for blk in self.blocks] - - def __repr__(self) -> str: - output = type(self).__name__ - for i, ax in enumerate(self.axes): - if i == 0: - output += f"\nItems: {ax}" - else: - output += f"\nAxis {i}: {ax}" - - for block in self.blocks: - output += f"\n{block}" - return output - - def _equal_values(self, other: Self) -> bool: - """ - To be implemented by the subclasses. Only check the column values - assuming shape and indexes have already been checked. - """ - raise AbstractMethodError(self) +def _validate_names(names: Sequence[Hashable] | None) -> None: + """ + Raise ValueError if the `names` parameter contains duplicates or has an + invalid data type. - @final - def equals(self, other: object) -> bool: - """ - Implementation for DataFrame.equals - """ - if not isinstance(other, type(self)): - return False + Parameters + ---------- + names : array-like or None + An array containing a list of the names used for the output DataFrame. - self_axes, other_axes = self.axes, other.axes - if len(self_axes) != len(other_axes): - return False - if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): - return False + Raises + ------ + ValueError + If names are not unique or are not ordered (e.g. set). + """ + if names is not None: + if len(names) != len(set(names)): + raise ValueError("Duplicate names are not allowed.") + if not ( + is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView) + ): + raise ValueError("Names should be an ordered collection.") - return self._equal_values(other) - def apply( - self, - f, - align_keys: list[str] | None = None, - **kwargs, - ) -> Self: - """ - Iterate over the blocks, collect and create a new BlockManager. - - Parameters - ---------- - f : str or callable - Name of the Block method to apply. 
- align_keys: List[str] or None, default None - **kwargs - Keywords to pass to `f` - - Returns - ------- - BlockManager - """ - assert "filter" not in kwargs - - align_keys = align_keys or [] - result_blocks: list[Block] = [] - # fillna: Series/DataFrame is responsible for making sure value is aligned - - aligned_args = {k: kwargs[k] for k in align_keys} - - for b in self.blocks: - if aligned_args: - for k, obj in aligned_args.items(): - if isinstance(obj, (ABCSeries, ABCDataFrame)): - # The caller is responsible for ensuring that - # obj.axes[-1].equals(self.items) - if obj.ndim == 1: - kwargs[k] = obj.iloc[b.mgr_locs.indexer]._values - else: - kwargs[k] = obj.iloc[:, b.mgr_locs.indexer]._values - else: - # otherwise we have an ndarray - kwargs[k] = obj[b.mgr_locs.indexer] +def _read( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds +) -> DataFrame | TextFileReader: + """Generic reader of line files.""" + # if we pass a date_parser and parse_dates=False, we should not parse the + # dates GH#44366 + if kwds.get("parse_dates", None) is None: + if ( + kwds.get("date_parser", lib.no_default) is lib.no_default + and kwds.get("date_format", None) is None + ): + kwds["parse_dates"] = False + else: + kwds["parse_dates"] = True - if callable(f): - applied = b.apply(f, **kwargs) - else: - applied = getattr(b, f)(**kwargs) - result_blocks = extend_blocks(applied, result_blocks) - - out = type(self).from_blocks(result_blocks, self.axes) - return out - - @final - def isna(self, func) -> Self: - return self.apply("apply", func=func) - - @final - def fillna(self, value, limit: int | None, inplace: bool) -> Self: - if limit is not None: - # Do this validation even if we go through one of the no-op paths - limit = libalgos.validate_limit(None, limit=limit) - - return self.apply( - "fillna", - value=value, - limit=limit, - inplace=inplace, - ) + # Extract some of the arguments (pass chunksize on). + iterator = kwds.get("iterator", False) + chunksize = kwds.get("chunksize", None) + if kwds.get("engine") == "pyarrow": + if iterator: + raise ValueError( + "The 'iterator' option is not supported with the 'pyarrow' engine" + ) - @final - def where(self, other, cond, align: bool) -> Self: - if align: - align_keys = ["other", "cond"] + if chunksize is not None: + raise ValueError( + "The 'chunksize' option is not supported with the 'pyarrow' engine" + ) + else: + chunksize = validate_integer("chunksize", chunksize, 1) + + nrows = kwds.get("nrows", None) + + # Check for duplicates in names. + _validate_names(kwds.get("names", None)) + + # Create the parser. + parser = TextFileReader(filepath_or_buffer, **kwds) + + if chunksize or iterator: + return parser + + with parser: + return parser.read(nrows) + + +@overload +def read_csv( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + iterator: Literal[True], + chunksize: int | None = ..., + **kwds: Unpack[_read_shared[HashableT]], +) -> TextFileReader: ... + + +@overload +def read_csv( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + iterator: bool = ..., + chunksize: int, + **kwds: Unpack[_read_shared[HashableT]], +) -> TextFileReader: ... + + +@overload +def read_csv( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + iterator: Literal[False] = ..., + chunksize: None = ..., + **kwds: Unpack[_read_shared[HashableT]], +) -> DataFrame: ... 
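A brief, hedged illustration of the contract encoded by the read_csv overloads above and enforced in _read(): with default arguments read_csv returns a DataFrame, while iterator=True or a chunksize returns a TextFileReader (and, per _read(), the 'pyarrow' engine rejects both options). The inline data below is invented for the example.

    >>> from io import StringIO
    >>> import pandas as pd
    >>> data = "a,b\n1,2\n3,4\n5,6\n"
    >>> type(pd.read_csv(StringIO(data))).__name__
    'DataFrame'
    >>> reader = pd.read_csv(StringIO(data), chunksize=2)
    >>> type(reader).__name__
    'TextFileReader'
    >>> [len(chunk) for chunk in reader]   # each chunk is itself a DataFrame
    [2, 1]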
+
+
+@overload
+def read_csv(
+    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
+    *,
+    iterator: bool = ...,
+    chunksize: int | None = ...,
+    **kwds: Unpack[_read_shared[HashableT]],
+) -> DataFrame | TextFileReader: ...
+
+
+# A helper function for the read_csv(...) below.
+# Ensures that all keys in a dtype mapping are of type str,
+# which allows for compatibility with the csv library.
+def parse_dtype(dtype: DtypeArg | None) -> DtypeArg | None:
+    if not isinstance(dtype, dict):
+        # Nothing to normalize: dtype is None or a single dtype for all columns.
+        return dtype
+    temp = {}
+    for key in dtype:
+        if isinstance(key, str):
+            temp[key] = dtype[key]
         else:
-        align_keys = ["cond"]
-        other = extract_array(other, extract_numpy=True)
-
-        return self.apply(
-            "where",
-            align_keys=align_keys,
-            other=other,
-            cond=cond,
+            # Coerce non-string keys to str, as stated above.
+            temp[str(key)] = dtype[key]
+    return temp
+
+
+@Appender(
+    _doc_read_csv_and_table.format(
+        func_name="read_csv",
+        summary="Read a comma-separated values (csv) file into DataFrame.",
+        see_also_func_name="read_table",
+        see_also_func_summary="Read general delimited file into DataFrame.",
+        _default_sep="','",
+        storage_options=_shared_docs["storage_options"],
+        decompression_options=_shared_docs["decompression_options"]
+        % "filepath_or_buffer",
+    )
+)
+def read_csv(
+    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
+    *,
+    sep: str | None | lib.NoDefault = lib.no_default,
+    delimiter: str | None | lib.NoDefault = None,
+    # Column and Index Locations and Names
+    header: int | Sequence[int] | None | Literal["infer"] = "infer",
+    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
+    index_col: IndexLabel | Literal[False] | None = None,
+    usecols: UsecolsArgType = None,
+    # General Parsing Configuration
+    dtype: DtypeArg | None = None,
+    engine: CSVEngine | None = None,
+    converters: Mapping[HashableT, Callable] | None = None,
+    true_values: list | None = None,
+    false_values: list | None = None,
+    skipinitialspace: bool = False,
+    skiprows: list[int] | int | Callable[[Hashable], bool] | None = None,
+    skipfooter: int = 0,
+    nrows: int | None = None,
+    # NA and Missing Data Handling
+    na_values: Hashable
+    | Iterable[Hashable]
+    | Mapping[Hashable, Iterable[Hashable]]
+    | None = None,
+    keep_default_na: bool = True,
+    na_filter: bool = True,
+    skip_blank_lines: bool = True,
+    # Datetime Handling
+    parse_dates: bool | Sequence[Hashable] | None = None,
+    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
+    keep_date_col: bool | lib.NoDefault = lib.no_default,
+    date_parser: Callable | lib.NoDefault = lib.no_default,
+    date_format: str | dict[Hashable, str] | None = None,
+    dayfirst: bool = False,
+    cache_dates: bool = True,
+    # Iteration
+    iterator: bool = False,
+    chunksize: int | None = None,
+    # Quoting, Compression, and File Format
+    compression: CompressionOptions = "infer",
+    thousands: str | None = None,
+    decimal: str = ".",
+    lineterminator: str | None = None,
+    quotechar: str = '"',
+    quoting: int = csv.QUOTE_MINIMAL,
+    doublequote: bool = True,
+    escapechar: str | None = None,
+    comment: str | None = None,
+    encoding: str | None = None,
+    encoding_errors: str | None = "strict",
+    dialect: str | csv.Dialect | None = None,
+    # Error Handling
+    on_bad_lines: str = "error",
+    # Internal
+    delim_whitespace: bool | lib.NoDefault = lib.no_default,
+    low_memory: bool = _c_parser_defaults["low_memory"],
+    memory_map: bool = False,
+    float_precision: Literal["high", "legacy", "round_trip"] | None = None,
+    storage_options: StorageOptions | None = None,
+    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+) -> DataFrame | TextFileReader:
+    # 
ensures that all keys in dtype are a string for compatibility with csv + dtype = parse_dtype(dtype) + + if keep_date_col is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'keep_date_col' keyword in pd.read_csv is deprecated and " + "will be removed in a future version. Explicitly remove unwanted " + "columns after parsing instead.", + FutureWarning, + stacklevel=find_stack_level(), ) + else: + keep_date_col = False + + if lib.is_list_like(parse_dates): + # GH#55569 + depr = False + # error: Item "bool" of "bool | Sequence[Hashable] | None" has no + # attribute "__iter__" (not iterable) + if not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] + depr = True + elif isinstance(parse_dates, dict) and any( + lib.is_list_like(x) for x in parse_dates.values() + ): + depr = True + if depr: + warnings.warn( + "Support for nested sequences for 'parse_dates' in pd.read_csv " + "is deprecated. Combine the desired columns with pd.to_datetime " + "after parsing instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) - @final - def putmask(self, mask, new, align: bool = True) -> Self: - if align: - align_keys = ["new", "mask"] - else: - align_keys = ["mask"] - new = extract_array(new, extract_numpy=True) - - return self.apply( - "putmask", - align_keys=align_keys, - mask=mask, - new=new, + if infer_datetime_format is not lib.no_default: + warnings.warn( + "The argument 'infer_datetime_format' is deprecated and will " + "be removed in a future version. " + "A strict version of it is now the default, see " + "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " + "You can safely remove this argument.", + FutureWarning, + stacklevel=find_stack_level(), ) - @final - def round(self, decimals: int) -> Self: - return self.apply("round", decimals=decimals) - - @final - def replace(self, to_replace, value, inplace: bool) -> Self: - inplace = validate_bool_kwarg(inplace, "inplace") - # NDFrame.replace ensures the not-is_list_likes here - assert not lib.is_list_like(to_replace) - assert not lib.is_list_like(value) - return self.apply( - "replace", - to_replace=to_replace, - value=value, - inplace=inplace, + if delim_whitespace is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'delim_whitespace' keyword in pd.read_csv is deprecated and " + "will be removed in a future version. Use ``sep='\\s+'`` instead", + FutureWarning, + stacklevel=find_stack_level(), ) - - @final - def replace_regex(self, **kwargs) -> Self: - return self.apply("_replace_regex", **kwargs) - - @final - def replace_list( - self, - src_list: list[Any], - dest_list: list[Any], - inplace: bool = False, - regex: bool = False, - ) -> Self: - """do a list replace""" - inplace = validate_bool_kwarg(inplace, "inplace") - - bm = self.apply( - "replace_list", - src_list=src_list, - dest_list=dest_list, - inplace=inplace, - regex=regex, + else: + delim_whitespace = False + + # locals() should never be modified + kwds = locals().copy() + del kwds["filepath_or_buffer"] + del kwds["sep"] + + kwds_defaults = _refine_defaults_read( + dialect, + delimiter, + delim_whitespace, + engine, + sep, + on_bad_lines, + names, + defaults={"delimiter": ","}, + dtype_backend=dtype_backend, + ) + kwds.update(kwds_defaults) + + return _read(filepath_or_buffer, kwds) + + +@overload +def read_table( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + iterator: Literal[True], + chunksize: int | None = ..., + **kwds: Unpack[_read_shared[HashableT]], +) -> TextFileReader: ... 
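To make the intent of the parse_dtype() helper introduced with read_csv above concrete, here is a doctest-style sketch of how it behaves under the corrected version given there. The non-dict pass-through and the str() coercion of non-string keys belong to that sketch, not to any established pandas API; note also that integer keys, which pandas accepts as positional dtype specifiers, would be stringified here, and whether that is desirable is left to review.

    >>> parse_dtype({"a": "float64", 0: "int64"})
    {'a': 'float64', '0': 'int64'}
    >>> parse_dtype("int64")  # a single dtype passes through unchanged
    'int64'
    >>> parse_dtype(None) is None  # the read_csv default
    True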
+ + +@overload +def read_table( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + iterator: bool = ..., + chunksize: int, + **kwds: Unpack[_read_shared[HashableT]], +) -> TextFileReader: ... + + +@overload +def read_table( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + iterator: Literal[False] = ..., + chunksize: None = ..., + **kwds: Unpack[_read_shared[HashableT]], +) -> DataFrame: ... + + +@overload +def read_table( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + iterator: bool = ..., + chunksize: int | None = ..., + **kwds: Unpack[_read_shared[HashableT]], +) -> DataFrame | TextFileReader: ... + + +@Appender( + _doc_read_csv_and_table.format( + func_name="read_table", + summary="Read general delimited file into DataFrame.", + see_also_func_name="read_csv", + see_also_func_summary=( + "Read a comma-separated values (csv) file into DataFrame." + ), + _default_sep=r"'\\t' (tab-stop)", + storage_options=_shared_docs["storage_options"], + decompression_options=_shared_docs["decompression_options"] + % "filepath_or_buffer", + ) +) +def read_table( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + sep: str | None | lib.NoDefault = lib.no_default, + delimiter: str | None | lib.NoDefault = None, + # Column and Index Locations and Names + header: int | Sequence[int] | None | Literal["infer"] = "infer", + names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, + index_col: IndexLabel | Literal[False] | None = None, + usecols: UsecolsArgType = None, + # General Parsing Configuration + dtype: DtypeArg | None = None, + engine: CSVEngine | None = None, + converters: Mapping[HashableT, Callable] | None = None, + true_values: list | None = None, + false_values: list | None = None, + skipinitialspace: bool = False, + skiprows: list[int] | int | Callable[[Hashable], bool] | None = None, + skipfooter: int = 0, + nrows: int | None = None, + # NA and Missing Data Handling + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = None, + keep_default_na: bool = True, + na_filter: bool = True, + skip_blank_lines: bool = True, + # Datetime Handling + parse_dates: bool | Sequence[Hashable] | None = None, + infer_datetime_format: bool | lib.NoDefault = lib.no_default, + keep_date_col: bool | lib.NoDefault = lib.no_default, + date_parser: Callable | lib.NoDefault = lib.no_default, + date_format: str | dict[Hashable, str] | None = None, + dayfirst: bool = False, + cache_dates: bool = True, + # Iteration + iterator: bool = False, + chunksize: int | None = None, + # Quoting, Compression, and File Format + compression: CompressionOptions = "infer", + thousands: str | None = None, + decimal: str = ".", + lineterminator: str | None = None, + quotechar: str = '"', + quoting: int = csv.QUOTE_MINIMAL, + doublequote: bool = True, + escapechar: str | None = None, + comment: str | None = None, + encoding: str | None = None, + encoding_errors: str | None = "strict", + dialect: str | csv.Dialect | None = None, + # Error Handling + on_bad_lines: str = "error", + # Internal + delim_whitespace: bool | lib.NoDefault = lib.no_default, + low_memory: bool = _c_parser_defaults["low_memory"], + memory_map: bool = False, + float_precision: Literal["high", "legacy", "round_trip"] | None = None, + storage_options: StorageOptions | None = None, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, +) -> DataFrame | TextFileReader: + if 
keep_date_col is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'keep_date_col' keyword in pd.read_table is deprecated and " + "will be removed in a future version. Explicitly remove unwanted " + "columns after parsing instead.", + FutureWarning, + stacklevel=find_stack_level(), ) - bm._consolidate_inplace() - return bm - - def interpolate(self, inplace: bool, **kwargs) -> Self: - return self.apply("interpolate", inplace=inplace, **kwargs) - - def pad_or_backfill(self, inplace: bool, **kwargs) -> Self: - return self.apply("pad_or_backfill", inplace=inplace, **kwargs) - - def shift(self, periods: int, fill_value) -> Self: - if fill_value is lib.no_default: - fill_value = None - - return self.apply("shift", periods=periods, fill_value=fill_value) - - def setitem(self, indexer, value) -> Self: - """ - Set values with indexer. - - For SingleBlockManager, this backs s[indexer] = value - """ - if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim: - raise ValueError(f"Cannot set values with ndim > {self.ndim}") - - if not self._has_no_reference(0): - # this method is only called if there is a single block -> hardcoded 0 - # Split blocks to only copy the columns we want to modify - if self.ndim == 2 and isinstance(indexer, tuple): - blk_loc = self.blklocs[indexer[1]] - if is_list_like(blk_loc) and blk_loc.ndim == 2: - blk_loc = np.squeeze(blk_loc, axis=0) - elif not is_list_like(blk_loc): - # Keep dimension and copy data later - blk_loc = [blk_loc] # type: ignore[assignment] - if len(blk_loc) == 0: - return self.copy(deep=False) - - values = self.blocks[0].values - if values.ndim == 2: - values = values[blk_loc] - # "T" has no attribute "_iset_split_block" - self._iset_split_block( # type: ignore[attr-defined] - 0, blk_loc, values - ) - # first block equals values - self.blocks[0].setitem((indexer[0], np.arange(len(blk_loc))), value) - return self - # No need to split if we either set all columns or on a single block - # manager - self = self.copy() - - return self.apply("setitem", indexer=indexer, value=value) - - def diff(self, n: int) -> Self: - # only reached with self.ndim == 2 - return self.apply("diff", n=n) - - def astype(self, dtype, errors: str = "raise") -> Self: - return self.apply("astype", dtype=dtype, errors=errors) - - def convert(self) -> Self: - return self.apply("convert") - - def convert_dtypes(self, **kwargs): - return self.apply("convert_dtypes", **kwargs) - - def get_values_for_csv( - self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None - ) -> Self: - """ - Convert values to native types (strings / python objects) that are used - in formatting (repr / csv). - """ - return self.apply( - "get_values_for_csv", - na_rep=na_rep, - quoting=quoting, - float_format=float_format, - date_format=date_format, - decimal=decimal, + else: + keep_date_col = False + + # error: Item "bool" of "bool | Sequence[Hashable]" has no attribute "__iter__" + if lib.is_list_like(parse_dates) and not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] + # GH#55569 + warnings.warn( + "Support for nested sequences for 'parse_dates' in pd.read_table " + "is deprecated. 
Combine the desired columns with pd.to_datetime " + "after parsing instead.", + FutureWarning, + stacklevel=find_stack_level(), ) - @property - def any_extension_types(self) -> bool: - """Whether any of the blocks in this manager are extension blocks""" - return any(block.is_extension for block in self.blocks) - - @property - def is_view(self) -> bool: - """return a boolean if we are a single block and are a view""" - if len(self.blocks) == 1: - return self.blocks[0].is_view - - # It is technically possible to figure out which blocks are views - # e.g. [ b.values.base is not None for b in self.blocks ] - # but then we have the case of possibly some blocks being a view - # and some blocks not. setting in theory is possible on the non-view - # blocks. But this is a bit - # complicated - - return False - - def _get_data_subset(self, predicate: Callable) -> Self: - blocks = [blk for blk in self.blocks if predicate(blk.values)] - return self._combine(blocks) - - def get_bool_data(self) -> Self: - """ - Select blocks that are bool-dtype and columns from object-dtype blocks - that are all-bool. - """ - - new_blocks = [] - - for blk in self.blocks: - if blk.dtype == bool: - new_blocks.append(blk) - - elif blk.is_object: - nbs = blk._split() - new_blocks.extend(nb for nb in nbs if nb.is_bool) - - return self._combine(new_blocks) - - def get_numeric_data(self) -> Self: - numeric_blocks = [blk for blk in self.blocks if blk.is_numeric] - if len(numeric_blocks) == len(self.blocks): - # Avoid somewhat expensive _combine - return self - return self._combine(numeric_blocks) - - def _combine(self, blocks: list[Block], index: Index | None = None) -> Self: - """return a new manager with the blocks""" - if len(blocks) == 0: - if self.ndim == 2: - # retain our own Index dtype - if index is not None: - axes = [self.items[:0], index] - else: - axes = [self.items[:0]] + self.axes[1:] - return self.make_empty(axes) - return self.make_empty() - - # FIXME: optimization potential - indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) - inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) - - new_blocks: list[Block] = [] - for b in blocks: - nb = b.copy(deep=False) - nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer]) - new_blocks.append(nb) - - axes = list(self.axes) - if index is not None: - axes[-1] = index - axes[0] = self.items.take(indexer) - - return type(self).from_blocks(new_blocks, axes) - - @property - def nblocks(self) -> int: - return len(self.blocks) - - def copy(self, deep: bool | Literal["all"] = True) -> Self: - """ - Make deep or shallow copy of BlockManager - - Parameters - ---------- - deep : bool, string or None, default True - If False or None, return a shallow copy (do not copy data) - If 'all', copy data and a deep copy of the index - - Returns - ------- - BlockManager - """ - # this preserves the notion of view copying of axes - if deep: - # hit in e.g. 
tests.io.json.test_pandas - - def copy_func(ax): - return ax.copy(deep=True) if deep == "all" else ax.view() - - new_axes = [copy_func(ax) for ax in self.axes] - else: - new_axes = [ax.view() for ax in self.axes] - - res = self.apply("copy", deep=deep) - res.axes = new_axes - - if self.ndim > 1: - # Avoid needing to re-compute these - blknos = self._blknos - if blknos is not None: - res._blknos = blknos.copy() - res._blklocs = self._blklocs.copy() - - if deep: - res._consolidate_inplace() - return res - - def is_consolidated(self) -> bool: - return True - - def consolidate(self) -> Self: - """ - Join together blocks having same dtype - - Returns - ------- - y : BlockManager - """ - if self.is_consolidated(): - return self - - bm = type(self)(self.blocks, self.axes, verify_integrity=False) - bm._is_consolidated = False - bm._consolidate_inplace() - return bm - - def _consolidate_inplace(self) -> None: - return - - @final - def reindex_axis( - self, - new_index: Index, - axis: AxisInt, - fill_value=None, - only_slice: bool = False, - ) -> Self: - """ - Conform data manager to new index. - """ - new_index, indexer = self.axes[axis].reindex(new_index) - - return self.reindex_indexer( - new_index, - indexer, - axis=axis, - fill_value=fill_value, - only_slice=only_slice, + if infer_datetime_format is not lib.no_default: + warnings.warn( + "The argument 'infer_datetime_format' is deprecated and will " + "be removed in a future version. " + "A strict version of it is now the default, see " + "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " + "You can safely remove this argument.", + FutureWarning, + stacklevel=find_stack_level(), ) - def reindex_indexer( - self, - new_axis: Index, - indexer: npt.NDArray[np.intp] | None, - axis: AxisInt, - fill_value=None, - allow_dups: bool = False, - only_slice: bool = False, - *, - use_na_proxy: bool = False, - ) -> Self: - """ - Parameters - ---------- - new_axis : Index - indexer : ndarray[intp] or None - axis : int - fill_value : object, default None - allow_dups : bool, default False - only_slice : bool, default False - Whether to take views, not copies, along columns. - use_na_proxy : bool, default False - Whether to use a np.void ndarray for newly introduced columns. - - pandas-indexer with -1's only. 
- """ - if indexer is None: - if new_axis is self.axes[axis]: - return self - - result = self.copy(deep=False) - result.axes = list(self.axes) - result.axes[axis] = new_axis - return result - - # Should be intp, but in some cases we get int64 on 32bit builds - assert isinstance(indexer, np.ndarray) - - # some axes don't allow reindexing with dups - if not allow_dups: - self.axes[axis]._validate_can_reindex(indexer) - - if axis >= self.ndim: - raise IndexError("Requested axis not found in manager") - - if axis == 0: - new_blocks = self._slice_take_blocks_ax0( - indexer, - fill_value=fill_value, - only_slice=only_slice, - use_na_proxy=use_na_proxy, - ) - else: - new_blocks = [ - blk.take_nd( - indexer, - axis=1, - fill_value=( - fill_value if fill_value is not None else blk.fill_value - ), - ) - for blk in self.blocks - ] - - new_axes = list(self.axes) - new_axes[axis] = new_axis - - new_mgr = type(self).from_blocks(new_blocks, new_axes) - if axis == 1: - # We can avoid the need to rebuild these - new_mgr._blknos = self.blknos.copy() - new_mgr._blklocs = self.blklocs.copy() - return new_mgr - - def _slice_take_blocks_ax0( - self, - slice_or_indexer: slice | np.ndarray, - fill_value=lib.no_default, - only_slice: bool = False, - *, - use_na_proxy: bool = False, - ref_inplace_op: bool = False, - ) -> list[Block]: - """ - Slice/take blocks along axis=0. - - Overloaded for SingleBlock - - Parameters - ---------- - slice_or_indexer : slice or np.ndarray[int64] - fill_value : scalar, default lib.no_default - only_slice : bool, default False - If True, we always return views on existing arrays, never copies. - This is used when called from ops.blockwise.operate_blockwise. - use_na_proxy : bool, default False - Whether to use a np.void ndarray for newly introduced columns. - ref_inplace_op: bool, default False - Don't track refs if True because we operate inplace - - Returns - ------- - new_blocks : list of Block - """ - allow_fill = fill_value is not lib.no_default - - sl_type, slobj, sllen = _preprocess_slice_or_indexer( - slice_or_indexer, self.shape[0], allow_fill=allow_fill + if delim_whitespace is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'delim_whitespace' keyword in pd.read_table is deprecated and " + "will be removed in a future version. Use ``sep='\\s+'`` instead", + FutureWarning, + stacklevel=find_stack_level(), ) + else: + delim_whitespace = False + + # locals() should never be modified + kwds = locals().copy() + del kwds["filepath_or_buffer"] + del kwds["sep"] + + kwds_defaults = _refine_defaults_read( + dialect, + delimiter, + delim_whitespace, + engine, + sep, + on_bad_lines, + names, + defaults={"delimiter": "\t"}, + dtype_backend=dtype_backend, + ) + kwds.update(kwds_defaults) + + return _read(filepath_or_buffer, kwds) + + +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + iterator: Literal[True], + chunksize: int | None = ..., + **kwds: Unpack[_read_shared[HashableT]], +) -> TextFileReader: ... + + +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + iterator: bool = ..., + chunksize: int, + **kwds: Unpack[_read_shared[HashableT]], +) -> TextFileReader: ... 
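As the _refine_defaults_read(...) call in read_table above shows, read_table reuses read_csv's machinery and differs essentially in its default delimiter ('\t' instead of ','). A minimal, hedged check of that equivalence; the inline data is invented for the example.

    >>> from io import StringIO
    >>> import pandas as pd
    >>> tsv = "a\tb\n1\t2\n3\t4\n"
    >>> pd.read_table(StringIO(tsv)).equals(pd.read_csv(StringIO(tsv), sep="\t"))
    True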
+ + +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + iterator: Literal[False] = ..., + chunksize: None = ..., + **kwds: Unpack[_read_shared[HashableT]], +) -> DataFrame: ... + + +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = "infer", + widths: Sequence[int] | None = None, + infer_nrows: int = 100, + iterator: bool = False, + chunksize: int | None = None, + **kwds: Unpack[_read_shared[HashableT]], +) -> DataFrame | TextFileReader: + r""" + Read a table of fixed-width formatted lines into DataFrame. + + Also supports optionally iterating or breaking of the file + into chunks. + + Additional help can be found in the `online docs for IO Tools + `_. - if self.is_single_block: - blk = self.blocks[0] - - if sl_type == "slice": - # GH#32959 EABlock would fail since we can't make 0-width - # TODO(EA2D): special casing unnecessary with 2D EAs - if sllen == 0: - return [] - bp = BlockPlacement(slice(0, sllen)) - return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)] - elif not allow_fill or self.ndim == 1: - if allow_fill and fill_value is None: - fill_value = blk.fill_value - - if not allow_fill and only_slice: - # GH#33597 slice instead of take, so we get - # views instead of copies - blocks = [ - blk.getitem_block_columns( - slice(ml, ml + 1), - new_mgr_locs=BlockPlacement(i), - ref_inplace_op=ref_inplace_op, - ) - for i, ml in enumerate(slobj) - ] - return blocks - else: - bp = BlockPlacement(slice(0, sllen)) - return [ - blk.take_nd( - slobj, - axis=0, - new_mgr_locs=bp, - fill_value=fill_value, - ) - ] - - if sl_type == "slice": - blknos = self.blknos[slobj] - blklocs = self.blklocs[slobj] - else: - blknos = algos.take_nd( - self.blknos, slobj, fill_value=-1, allow_fill=allow_fill - ) - blklocs = algos.take_nd( - self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill - ) - - # When filling blknos, make sure blknos is updated before appending to - # blocks list, that way new blkno is exactly len(blocks). - blocks = [] - group = not only_slice - for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group): - if blkno == -1: - # If we've got here, fill_value was not lib.no_default - - blocks.append( - self._make_na_block( - placement=mgr_locs, - fill_value=fill_value, - use_na_proxy=use_na_proxy, - ) - ) - else: - blk = self.blocks[blkno] - - # Otherwise, slicing along items axis is necessary. - if not blk._can_consolidate and not blk._validate_ndim: - # i.e. we dont go through here for DatetimeTZBlock - # A non-consolidatable block, it's easy, because there's - # only one item and each mgr loc is a copy of that single - # item. - deep = False - for mgr_loc in mgr_locs: - newblk = blk.copy(deep=deep) - newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1)) - blocks.append(newblk) + Parameters + ---------- + filepath_or_buffer : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a text ``read()`` function.The string could be a URL. + Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.csv``. + colspecs : list of tuple (int, int) or 'infer'. 
optional + A list of tuples giving the extents of the fixed-width + fields of each line as half-open intervals (i.e., [from, to] ). + String value 'infer' can be used to instruct the parser to try + detecting the column specifications from the first 100 rows of + the data which are not being skipped via skiprows (default='infer'). + widths : list of int, optional + A list of field widths which can be used instead of 'colspecs' if + the intervals are contiguous. + infer_nrows : int, default 100 + The number of rows to consider when letting the parser determine the + `colspecs`. + iterator : bool, default False + Return ``TextFileReader`` object for iteration or getting chunks with + ``get_chunk()``. + chunksize : int, optional + Number of lines to read from the file per chunk. + **kwds : optional + Optional keyword arguments can be passed to ``TextFileReader``. - else: - # GH#32779 to avoid the performance penalty of copying, - # we may try to only slice - taker = blklocs[mgr_locs.indexer] - max_len = max(len(mgr_locs), taker.max() + 1) - taker = lib.maybe_indices_to_slice(taker, max_len) - - if isinstance(taker, slice): - nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs) - blocks.append(nb) - elif only_slice: - # GH#33597 slice instead of take, so we get - # views instead of copies - for i, ml in zip(taker, mgr_locs): - slc = slice(i, i + 1) - bp = BlockPlacement(ml) - nb = blk.getitem_block_columns(slc, new_mgr_locs=bp) - # We have np.shares_memory(nb.values, blk.values) - blocks.append(nb) + Returns + ------- + DataFrame or TextFileReader + A comma-separated values (csv) file is returned as two-dimensional + data structure with labeled axes. + + See Also + -------- + DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. + read_csv : Read a comma-separated values (csv) file into DataFrame. + + Examples + -------- + >>> pd.read_fwf("data.csv") # doctest: +SKIP + """ + # Check input arguments. + if colspecs is None and widths is None: + raise ValueError("Must specify either colspecs or widths") + if colspecs not in (None, "infer") and widths is not None: + raise ValueError("You must specify only one of 'widths' and 'colspecs'") + + # Compute 'colspecs' from 'widths', if specified. 
+ if widths is not None: + colspecs, col = [], 0 + for w in widths: + colspecs.append((col, col + w)) + col += w + + # for mypy + assert colspecs is not None + + # GH#40830 + # Ensure length of `colspecs` matches length of `names` + names = kwds.get("names") + if names is not None and names is not lib.no_default: + if len(names) != len(colspecs) and colspecs != "infer": + # need to check len(index_col) as it might contain + # unnamed indices, in which case it's name is not required + len_index = 0 + if kwds.get("index_col") is not None: + index_col: Any = kwds.get("index_col") + if index_col is not False: + if not is_list_like(index_col): + len_index = 1 else: - nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) - blocks.append(nb) - - return blocks - - def _make_na_block( - self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False - ) -> Block: - # Note: we only get here with self.ndim == 2 - - if use_na_proxy: - assert fill_value is None - shape = (len(placement), self.shape[1]) - vals = np.empty(shape, dtype=np.void) - nb = NumpyBlock(vals, placement, ndim=2) - return nb - - if fill_value is None or fill_value is np.nan: - fill_value = np.nan - # GH45857 avoid unnecessary upcasting - dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) - if dtype is not None and np.issubdtype(dtype.type, np.floating): - fill_value = dtype.type(fill_value) - - shape = (len(placement), self.shape[1]) - - dtype, fill_value = infer_dtype_from_scalar(fill_value) - block_values = make_na_array(dtype, shape, fill_value) - return new_block_2d(block_values, placement=placement) - - def take( - self, - indexer: npt.NDArray[np.intp], - axis: AxisInt = 1, - verify: bool = True, - ) -> Self: - """ - Take items along any axis. - - indexer : np.ndarray[np.intp] - axis : int, default 1 - verify : bool, default True - Check that all entries are between 0 and len(self) - 1, inclusive. - Pass verify=False if this check has been done by the caller. - - Returns - ------- - BlockManager - """ - # Caller is responsible for ensuring indexer annotation is accurate - - n = self.shape[axis] - indexer = maybe_convert_indices(indexer, n, verify=verify) - - new_labels = self.axes[axis].take(indexer) - return self.reindex_indexer( - new_axis=new_labels, - indexer=indexer, - axis=axis, - allow_dups=True, - ) + # for mypy: handled in the if-branch + assert index_col is not lib.no_default + + len_index = len(index_col) + if kwds.get("usecols") is None and len(names) + len_index != len(colspecs): + # If usecols is used colspec may be longer than names + raise ValueError("Length of colspecs must match length of names") + + check_dtype_backend(kwds.setdefault("dtype_backend", lib.no_default)) + return _read( + filepath_or_buffer, + kwds + | { + "colspecs": colspecs, + "infer_nrows": infer_nrows, + "engine": "python-fwf", + "iterator": iterator, + "chunksize": chunksize, + }, + ) -class BlockManager(libinternals.BlockManager, BaseBlockManager): - """ - BaseBlockManager that holds 2D blocks. 
+class TextFileReader(abc.Iterator): """ - ndim = 2 + Passed dialect overrides any of the related parser options - # ---------------------------------------------------------------- - # Constructors + """ def __init__( self, - blocks: Sequence[Block], - axes: Sequence[Index], - verify_integrity: bool = True, + f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list, + engine: CSVEngine | None = None, + **kwds, ) -> None: - if verify_integrity: - # Assertion disabled for performance - # assert all(isinstance(x, Index) for x in axes) - - for block in blocks: - if self.ndim != block.ndim: - raise AssertionError( - f"Number of Block dimensions ({block.ndim}) must equal " - f"number of axes ({self.ndim})" - ) - # As of 2.0, the caller is responsible for ensuring that - # DatetimeTZBlock with block.ndim == 2 has block.values.ndim ==2; - # previously there was a special check for fastparquet compat. - - self._verify_integrity() - - def _verify_integrity(self) -> None: - mgr_shape = self.shape - tot_items = sum(len(x.mgr_locs) for x in self.blocks) - for block in self.blocks: - if block.shape[1:] != mgr_shape[1:]: - raise_construction_error(tot_items, block.shape[1:], self.axes) - if len(self.items) != tot_items: - raise AssertionError( - "Number of manager items must equal union of " - f"block items\n# manager items: {len(self.items)}, # " - f"tot_items: {tot_items}" - ) - - @classmethod - def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> Self: - """ - Constructor for BlockManager and SingleBlockManager with same signature. - """ - return cls(blocks, axes, verify_integrity=False) - - # ---------------------------------------------------------------- - # Indexing - - def fast_xs(self, loc: int) -> SingleBlockManager: - """ - Return the array corresponding to `frame.iloc[loc]`. - - Parameters - ---------- - loc : int - - Returns - ------- - np.ndarray or ExtensionArray - """ - if len(self.blocks) == 1: - # TODO: this could be wrong if blk.mgr_locs is not slice(None)-like; - # is this ruled out in the general case? - result: np.ndarray | ExtensionArray = self.blocks[0].iget( - (slice(None), loc) - ) - # in the case of a single block, the new block is a view - bp = BlockPlacement(slice(0, len(result))) - block = new_block( - result, - placement=bp, - ndim=1, - refs=self.blocks[0].refs, - ) - return SingleBlockManager(block, self.axes[0]) - - dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) - - n = len(self) - - if isinstance(dtype, ExtensionDtype): - # TODO: use object dtype as workaround for non-performant - # EA.__setitem__ methods. 
(primarily ArrowExtensionArray.__setitem__ - # when iteratively setting individual values) - # https://github.com/pandas-dev/pandas/pull/54508#issuecomment-1675827918 - result = np.empty(n, dtype=object) + if engine is not None: + engine_specified = True else: - result = np.empty(n, dtype=dtype) - result = ensure_wrapped_if_datetimelike(result) - - for blk in self.blocks: - # Such assignment may incorrectly coerce NaT to None - # result[blk.mgr_locs] = blk._slice((slice(None), loc)) - for i, rl in enumerate(blk.mgr_locs): - result[rl] = blk.iget((i, loc)) - - if isinstance(dtype, ExtensionDtype): - cls = dtype.construct_array_type() - result = cls._from_sequence(result, dtype=dtype) - - bp = BlockPlacement(slice(0, len(result))) - block = new_block(result, placement=bp, ndim=1) - return SingleBlockManager(block, self.axes[0]) - - def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager: - """ - Return the data as a SingleBlockManager. - """ - block = self.blocks[self.blknos[i]] - values = block.iget(self.blklocs[i]) - - # shortcut for select a single-dim from a 2-dim BM - bp = BlockPlacement(slice(0, len(values))) - nb = type(block)( - values, placement=bp, ndim=1, refs=block.refs if track_ref else None - ) - return SingleBlockManager(nb, self.axes[1]) - - def iget_values(self, i: int) -> ArrayLike: - """ - Return the data for column i as the values (ndarray or ExtensionArray). - - Warning! The returned array is a view but doesn't handle Copy-on-Write, - so this should be used with caution. - """ - # TODO(CoW) making the arrays read-only might make this safer to use? - block = self.blocks[self.blknos[i]] - values = block.iget(self.blklocs[i]) - return values - - @property - def column_arrays(self) -> list[np.ndarray]: - """ - Used in the JSON C code to access column arrays. - This optimizes compared to using `iget_values` by converting each - - Warning! This doesn't handle Copy-on-Write, so should be used with - caution (current use case of consuming this in the JSON code is fine). - """ - # This is an optimized equivalent to - # result = [self.iget_values(i) for i in range(len(self.items))] - result: list[np.ndarray | None] = [None] * len(self.items) - - for blk in self.blocks: - mgr_locs = blk._mgr_locs - values = blk.array_values._values_for_json() - if values.ndim == 1: - # TODO(EA2D): special casing not needed with 2D EAs - result[mgr_locs[0]] = values + engine = "python" + engine_specified = False + self.engine = engine + self._engine_specified = kwds.get("engine_specified", engine_specified) - else: - for i, loc in enumerate(mgr_locs): - result[loc] = values[i] - - # error: Incompatible return value type (got "List[None]", - # expected "List[ndarray[Any, Any]]") - return result # type: ignore[return-value] + _validate_skipfooter(kwds) - def iset( - self, - loc: int | slice | np.ndarray, - value: ArrayLike, - inplace: bool = False, - refs: BlockValuesRefs | None = None, - ) -> None: - """ - Set new item in-place. Does not consolidate. 
Adds new Block if not - contained in the current set of items - """ - - # FIXME: refactor, clearly separate broadcasting & zip-like assignment - # can prob also fix the various if tests for sparse/categorical - if self._blklocs is None and self.ndim > 1: - self._rebuild_blknos_and_blklocs() - - # Note: we exclude DTA/TDA here - value_is_extension_type = is_1d_only_ea_dtype(value.dtype) - if not value_is_extension_type: - if value.ndim == 2: - value = value.T - else: - value = ensure_block_shape(value, ndim=2) - - if value.shape[1:] != self.shape[1:]: - raise AssertionError( - "Shape of new values must be compatible with manager shape" + dialect = _extract_dialect(kwds) + if dialect is not None: + if engine == "pyarrow": + raise ValueError( + "The 'dialect' option is not supported with the 'pyarrow' engine" ) + kwds = _merge_with_dialect_properties(dialect, kwds) - if lib.is_integer(loc): - # We have 6 tests where loc is _not_ an int. - # In this case, get_blkno_placements will yield only one tuple, - # containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1))) - - # Check if we can use _iset_single fastpath - loc = cast(int, loc) - blkno = self.blknos[loc] - blk = self.blocks[blkno] - if len(blk._mgr_locs) == 1: # TODO: fastest way to check this? - return self._iset_single( - loc, - value, - inplace=inplace, - blkno=blkno, - blk=blk, - refs=refs, - ) + if kwds.get("header", "infer") == "infer": + kwds["header"] = 0 if kwds.get("names") is None else None - # error: Incompatible types in assignment (expression has type - # "List[Union[int, slice, ndarray]]", variable has type "Union[int, - # slice, ndarray]") - loc = [loc] # type: ignore[assignment] + self.orig_options = kwds - # categorical/sparse/datetimetz - if value_is_extension_type: + # miscellanea + self._currow = 0 - def value_getitem(placement): - return value + options = self._get_options_with_defaults(engine) + options["storage_options"] = kwds.get("storage_options", None) - else: + self.chunksize = options.pop("chunksize", None) + self.nrows = options.pop("nrows", None) - def value_getitem(placement): - return value[placement.indexer] - - # Accessing public blknos ensures the public versions are initialized - blknos = self.blknos[loc] - blklocs = self.blklocs[loc].copy() - - unfit_mgr_locs = [] - unfit_val_locs = [] - removed_blknos = [] - for blkno_l, val_locs in libinternals.get_blkno_placements(blknos, group=True): - blk = self.blocks[blkno_l] - blk_locs = blklocs[val_locs.indexer] - if inplace and blk.should_store(value): - # Updating inplace -> check if we need to do Copy-on-Write - if not self._has_no_reference_block(blkno_l): - self._iset_split_block( - blkno_l, blk_locs, value_getitem(val_locs), refs=refs - ) - else: - blk.set_inplace(blk_locs, value_getitem(val_locs)) - continue - else: - unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) - unfit_val_locs.append(val_locs) + self._check_file_or_buffer(f, engine) + self.options, self.engine = self._clean_options(options, engine) - # If all block items are unfit, schedule the block for removal. 
- if len(val_locs) == len(blk.mgr_locs): - removed_blknos.append(blkno_l) - continue - else: - # Defer setting the new values to enable consolidation - self._iset_split_block(blkno_l, blk_locs, refs=refs) - - if len(removed_blknos): - # Remove blocks & update blknos accordingly - is_deleted = np.zeros(self.nblocks, dtype=np.bool_) - is_deleted[removed_blknos] = True - - new_blknos = np.empty(self.nblocks, dtype=np.intp) - new_blknos.fill(-1) - new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos)) - self._blknos = new_blknos[self._blknos] - self.blocks = tuple( - blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos) - ) - - if unfit_val_locs: - unfit_idxr = np.concatenate(unfit_mgr_locs) - unfit_count = len(unfit_idxr) - - new_blocks: list[Block] = [] - if value_is_extension_type: - # This code (ab-)uses the fact that EA blocks contain only - # one item. - # TODO(EA2D): special casing unnecessary with 2D EAs - new_blocks.extend( - new_block_2d( - values=value, - placement=BlockPlacement(slice(mgr_loc, mgr_loc + 1)), - refs=refs, - ) - for mgr_loc in unfit_idxr - ) - - self._blknos[unfit_idxr] = np.arange(unfit_count) + len(self.blocks) - self._blklocs[unfit_idxr] = 0 - - else: - # unfit_val_locs contains BlockPlacement objects - unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:]) - - new_blocks.append( - new_block_2d( - values=value_getitem(unfit_val_items), - placement=BlockPlacement(unfit_idxr), - refs=refs, - ) - ) + if "has_index_names" in kwds: + self.options["has_index_names"] = kwds["has_index_names"] - self._blknos[unfit_idxr] = len(self.blocks) - self._blklocs[unfit_idxr] = np.arange(unfit_count) + self.handles: IOHandles | None = None + self._engine = self._make_engine(f, self.engine) - self.blocks += tuple(new_blocks) + def close(self) -> None: + if self.handles is not None: + self.handles.close() + self._engine.close() - # Newly created block's dtype may already be present. - self._known_consolidated = False + def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: + kwds = self.orig_options - def _iset_split_block( - self, - blkno_l: int, - blk_locs: np.ndarray | list[int], - value: ArrayLike | None = None, - refs: BlockValuesRefs | None = None, - ) -> None: - """Removes columns from a block by splitting the block. - - Avoids copying the whole block through slicing and updates the manager - after determinint the new block structure. Optionally adds a new block, - otherwise has to be done by the caller. - - Parameters - ---------- - blkno_l: The block number to operate on, relevant for updating the manager - blk_locs: The locations of our block that should be deleted. - value: The value to set as a replacement. - refs: The reference tracking object of the value to set. 
- """ - blk = self.blocks[blkno_l] - - if self._blklocs is None: - self._rebuild_blknos_and_blklocs() - - nbs_tup = tuple(blk.delete(blk_locs)) - if value is not None: - locs = blk.mgr_locs.as_array[blk_locs] - first_nb = new_block_2d(value, BlockPlacement(locs), refs=refs) - else: - first_nb = nbs_tup[0] - nbs_tup = tuple(nbs_tup[1:]) + options = {} + default: object | None - nr_blocks = len(self.blocks) - blocks_tup = ( - self.blocks[:blkno_l] + (first_nb,) + self.blocks[blkno_l + 1 :] + nbs_tup - ) - self.blocks = blocks_tup + for argname, default in parser_defaults.items(): + value = kwds.get(argname, default) - if not nbs_tup and value is not None: - # No need to update anything if split did not happen - return - - self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb)) - - for i, nb in enumerate(nbs_tup): - self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) - self._blknos[nb.mgr_locs.indexer] = i + nr_blocks - - def _iset_single( - self, - loc: int, - value: ArrayLike, - inplace: bool, - blkno: int, - blk: Block, - refs: BlockValuesRefs | None = None, - ) -> None: - """ - Fastpath for iset when we are only setting a single position and - the Block currently in that position is itself single-column. - - In this case we can swap out the entire Block and blklocs and blknos - are unaffected. - """ - # Caller is responsible for verifying value.shape - - if inplace and blk.should_store(value): - copy = not self._has_no_reference_block(blkno) - iloc = self.blklocs[loc] - blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy) - return - - nb = new_block_2d(value, placement=blk._mgr_locs, refs=refs) - old_blocks = self.blocks - new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :] - self.blocks = new_blocks - return - - def column_setitem( - self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False - ) -> None: - """ - Set values ("setitem") into a single column (not setting the full column). 
- - This is a method on the BlockManager level, to avoid creating an - intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`) - """ - if not self._has_no_reference(loc): - blkno = self.blknos[loc] - # Split blocks to only copy the column we want to modify - blk_loc = self.blklocs[loc] - # Copy our values - values = self.blocks[blkno].values - if values.ndim == 1: - values = values.copy() + # see gh-12935 + if ( + engine == "pyarrow" + and argname in _pyarrow_unsupported + and value != default + and value != getattr(value, "value", default) + ): + raise ValueError( + f"The {argname!r} option is not supported with the " + f"'pyarrow' engine" + ) + options[argname] = value + + for argname, default in _c_parser_defaults.items(): + if argname in kwds: + value = kwds[argname] + + if engine != "c" and value != default: + # TODO: Refactor this logic, its pretty convoluted + if "python" in engine and argname not in _python_unsupported: + pass + elif "pyarrow" in engine and argname not in _pyarrow_unsupported: + pass + else: + raise ValueError( + f"The {argname!r} option is not supported with the " + f"{engine!r} engine" + ) else: - # Use [blk_loc] as indexer to keep ndim=2, this already results in a - # copy - values = values[[blk_loc]] - self._iset_split_block(blkno, [blk_loc], values) - - # this manager is only created temporarily to mutate the values in place - # so don't track references, otherwise the `setitem` would perform CoW again - col_mgr = self.iget(loc, track_ref=False) - if inplace_only: - col_mgr.setitem_inplace(idx, value) - else: - new_mgr = col_mgr.setitem((idx,), value) - self.iset(loc, new_mgr._block.values, inplace=True) - - def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: - """ - Insert item at selected position. - - Parameters - ---------- - loc : int - item : hashable - value : np.ndarray or ExtensionArray - refs : The reference tracking object of the value to set. - """ - new_axis = self.items.insert(loc, item) - - if value.ndim == 2: - value = value.T - if len(value) > 1: + value = default + options[argname] = value + + if engine == "python-fwf": + for argname, default in _fwf_defaults.items(): + options[argname] = kwds.get(argname, default) + + return options + + def _check_file_or_buffer(self, f, engine: CSVEngine) -> None: + # see gh-16530 + if is_file_like(f) and engine != "c" and not hasattr(f, "__iter__"): + # The C engine doesn't need the file-like to have the "__iter__" + # attribute. However, the Python engine needs "__iter__(...)" + # when iterating through such an object, meaning it + # needs to have that attribute + raise ValueError( + "The 'python' engine cannot iterate through this file buffer." + ) + if hasattr(f, "encoding"): + file_encoding = f.encoding + orig_reader_enc = self.orig_options.get("encoding", None) + any_none = file_encoding is None or orig_reader_enc is None + if file_encoding != orig_reader_enc and not any_none: + file_path = getattr(f, "name", None) raise ValueError( - f"Expected a 1D array, got an array with shape {value.T.shape}" + f"The specified reader encoding {orig_reader_enc} is different " + f"from the encoding {file_encoding} of file {file_path}." 
) - else: - value = ensure_block_shape(value, ndim=self.ndim) - bp = BlockPlacement(slice(loc, loc + 1)) - block = new_block_2d(values=value, placement=bp, refs=refs) + def _clean_options( + self, options: dict[str, Any], engine: CSVEngine + ) -> tuple[dict[str, Any], CSVEngine]: + result = options.copy() - if not len(self.blocks): - # Fastpath - self._blklocs = np.array([0], dtype=np.intp) - self._blknos = np.array([0], dtype=np.intp) - else: - self._insert_update_mgr_locs(loc) - self._insert_update_blklocs_and_blknos(loc) + fallback_reason = None - self.axes[0] = new_axis - self.blocks += (block,) + # C engine not supported yet + if engine == "c": + if options["skipfooter"] > 0: + fallback_reason = "the 'c' engine does not support skipfooter" + engine = "python" - self._known_consolidated = False + sep = options["delimiter"] + delim_whitespace = options["delim_whitespace"] - if ( - get_option("performance_warnings") - and sum(not block.is_extension for block in self.blocks) > 100 - ): + if sep is None and not delim_whitespace: + if engine in ("c", "pyarrow"): + fallback_reason = ( + f"the '{engine}' engine does not support " + "sep=None with delim_whitespace=False" + ) + engine = "python" + elif sep is not None and len(sep) > 1: + if engine == "c" and sep == r"\s+": + result["delim_whitespace"] = True + del result["delimiter"] + elif engine not in ("python", "python-fwf"): + # wait until regex engine integrated + fallback_reason = ( + f"the '{engine}' engine does not support " + "regex separators (separators > 1 char and " + r"different from '\s+' are interpreted as regex)" + ) + engine = "python" + elif delim_whitespace: + if "python" in engine: + result["delimiter"] = r"\s+" + elif sep is not None: + encodeable = True + encoding = sys.getfilesystemencoding() or "utf-8" + try: + if len(sep.encode(encoding)) > 1: + encodeable = False + except UnicodeDecodeError: + encodeable = False + if not encodeable and engine not in ("python", "python-fwf"): + fallback_reason = ( + f"the separator encoded in {encoding} " + f"is > 1 char long, and the '{engine}' engine " + "does not support such separators" + ) + engine = "python" + + quotechar = options["quotechar"] + if quotechar is not None and isinstance(quotechar, (str, bytes)): + if ( + len(quotechar) == 1 + and ord(quotechar) > 127 + and engine not in ("python", "python-fwf") + ): + fallback_reason = ( + "ord(quotechar) > 127, meaning the " + "quotechar is larger than one byte, " + f"and the '{engine}' engine does not support such quotechars" + ) + engine = "python" + + if fallback_reason and self._engine_specified: + raise ValueError(fallback_reason) + + if engine == "c": + for arg in _c_unsupported: + del result[arg] + + if "python" in engine: + for arg in _python_unsupported: + if fallback_reason and result[arg] != _c_parser_defaults.get(arg): + raise ValueError( + "Falling back to the 'python' engine because " + f"{fallback_reason}, but this causes {arg!r} to be " + "ignored as it is not supported by the 'python' engine." + ) + del result[arg] + + if fallback_reason: warnings.warn( - "DataFrame is highly fragmented. This is usually the result " - "of calling `frame.insert` many times, which has poor performance. " - "Consider joining all columns at once using pd.concat(axis=1) " - "instead. To get a de-fragmented frame, use `newframe = frame.copy()`", - PerformanceWarning, + ( + "Falling back to the 'python' engine because " + f"{fallback_reason}; you can avoid this warning by specifying " + "engine='python'." 
+ ), + ParserWarning, stacklevel=find_stack_level(), ) - def _insert_update_mgr_locs(self, loc) -> None: - """ - When inserting a new Block at location 'loc', we increment - all of the mgr_locs of blocks above that by one. - """ - for blkno, count in _fast_count_smallints(self.blknos[loc:]): - # .620 this way, .326 of which is in increment_above - blk = self.blocks[blkno] - blk._mgr_locs = blk._mgr_locs.increment_above(loc) - - def _insert_update_blklocs_and_blknos(self, loc) -> None: - """ - When inserting a new Block at location 'loc', we update our - _blklocs and _blknos. - """ - - # Accessing public blklocs ensures the public versions are initialized - if loc == self.blklocs.shape[0]: - # np.append is a lot faster, let's use it if we can. - self._blklocs = np.append(self._blklocs, 0) - self._blknos = np.append(self._blknos, len(self.blocks)) - elif loc == 0: - # As of numpy 1.26.4, np.concatenate faster than np.append - self._blklocs = np.concatenate([[0], self._blklocs]) - self._blknos = np.concatenate([[len(self.blocks)], self._blknos]) + index_col = options["index_col"] + names = options["names"] + converters = options["converters"] + na_values = options["na_values"] + skiprows = options["skiprows"] + + validate_header_arg(options["header"]) + + if index_col is True: + raise ValueError("The value of index_col couldn't be 'True'") + if is_index_col(index_col): + if not isinstance(index_col, (list, tuple, np.ndarray)): + index_col = [index_col] + result["index_col"] = index_col + + names = list(names) if names is not None else names + + # type conversion-related + if converters is not None: + if not isinstance(converters, dict): + raise TypeError( + "Type converters must be a dict or subclass, " + f"input was a {type(converters).__name__}" + ) else: - new_blklocs, new_blknos = libinternals.update_blklocs_and_blknos( - self.blklocs, self.blknos, loc, len(self.blocks) - ) - self._blklocs = new_blklocs - self._blknos = new_blknos - - def idelete(self, indexer) -> BlockManager: - """ - Delete selected locations, returning a new BlockManager. - """ - is_deleted = np.zeros(self.shape[0], dtype=np.bool_) - is_deleted[indexer] = True - taker = (~is_deleted).nonzero()[0] - - nbs = self._slice_take_blocks_ax0(taker, only_slice=True, ref_inplace_op=True) - new_columns = self.items[~is_deleted] - axes = [new_columns, self.axes[1]] - return type(self)(tuple(nbs), axes, verify_integrity=False) - - # ---------------------------------------------------------------- - # Block-wise Operation - - def grouped_reduce(self, func: Callable) -> Self: - """ - Apply grouped reduction function blockwise, returning a new BlockManager. - - Parameters - ---------- - func : grouped reduction function - - Returns - ------- - BlockManager - """ - result_blocks: list[Block] = [] - - for blk in self.blocks: - if blk.is_object: - # split on object-dtype blocks bc some columns may raise - # while others do not. 
- for sb in blk._split(): - applied = sb.apply(func) - result_blocks = extend_blocks(applied, result_blocks) - else: - applied = blk.apply(func) - result_blocks = extend_blocks(applied, result_blocks) + converters = {} - if len(result_blocks) == 0: - nrows = 0 + # Converting values to NA + keep_default_na = options["keep_default_na"] + floatify = engine != "pyarrow" + na_values, na_fvalues = _clean_na_values( + na_values, keep_default_na, floatify=floatify + ) + + # handle skiprows; this is internally handled by the + # c-engine, so only need for python and pyarrow parsers + if engine == "pyarrow": + if not is_integer(skiprows) and skiprows is not None: + # pyarrow expects skiprows to be passed as an integer + raise ValueError( + "skiprows argument must be an integer when using " + "engine='pyarrow'" + ) else: - nrows = result_blocks[0].values.shape[-1] - index = Index(range(nrows)) - - return type(self).from_blocks(result_blocks, [self.axes[0], index]) - - def reduce(self, func: Callable) -> Self: - """ - Apply reduction function blockwise, returning a single-row BlockManager. - - Parameters - ---------- - func : reduction function - - Returns - ------- - BlockManager - """ - # If 2D, we assume that we're operating column-wise - assert self.ndim == 2 - - res_blocks: list[Block] = [] - for blk in self.blocks: - nbs = blk.reduce(func) - res_blocks.extend(nbs) - - index = Index([None]) # placeholder - new_mgr = type(self).from_blocks(res_blocks, [self.items, index]) - return new_mgr - - def operate_blockwise(self, other: BlockManager, array_op) -> BlockManager: - """ - Apply array_op blockwise with another (aligned) BlockManager. - """ - return operate_blockwise(self, other, array_op) - - def _equal_values(self: BlockManager, other: BlockManager) -> bool: - """ - Used in .equals defined in base class. Only check the column values - assuming shape and indexes have already been checked. - """ - return blockwise_all(self, other, array_equals) - - def quantile( + if is_integer(skiprows): + skiprows = range(skiprows) + if skiprows is None: + skiprows = set() + elif not callable(skiprows): + skiprows = set(skiprows) + + # put stuff back + result["names"] = names + result["converters"] = converters + result["na_values"] = na_values + result["na_fvalues"] = na_fvalues + result["skiprows"] = skiprows + + return result, engine + + def __next__(self) -> DataFrame: + try: + return self.get_chunk() + except StopIteration: + self.close() + raise + + def _make_engine( self, - *, - qs: Index, # with dtype float 64 - interpolation: QuantileInterpolation = "linear", - ) -> Self: - """ - Iterate over blocks applying quantile reduction. - This routine is intended for reduction type operations and - will do inference on the generated blocks. - - Parameters - ---------- - interpolation : type of interpolation, default 'linear' - qs : list of the quantiles to be computed - - Returns - ------- - BlockManager - """ - # Series dispatches to DataFrame for quantile, which allows us to - # simplify some of the code here and in the blocks - assert self.ndim >= 2 - assert is_list_like(qs) # caller is responsible for this - - new_axes = list(self.axes) - new_axes[1] = Index(qs, dtype=np.float64) - - blocks = [ - blk.quantile(qs=qs, interpolation=interpolation) for blk in self.blocks - ] - - return type(self)(blocks, new_axes) - - # ---------------------------------------------------------------- - - def unstack(self, unstacker, fill_value) -> BlockManager: - """ - Return a BlockManager with all blocks unstacked. 
- - Parameters - ---------- - unstacker : reshape._Unstacker - fill_value : Any - fill_value for newly introduced missing values. - - Returns - ------- - unstacked : BlockManager - """ - new_columns = unstacker.get_new_columns(self.items) - new_index = unstacker.new_index - - allow_fill = not unstacker.mask_all - if allow_fill: - # calculating the full mask once and passing it to Block._unstack is - # faster than letting calculating it in each repeated call - new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape) - needs_masking = new_mask2D.any(axis=0) - else: - needs_masking = np.zeros(unstacker.full_shape[1], dtype=bool) - - new_blocks: list[Block] = [] - columns_mask: list[np.ndarray] = [] + f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list | IO, + engine: CSVEngine = "c", + ) -> ParserBase: + mapping: dict[str, type[ParserBase]] = { + "c": CParserWrapper, + "python": PythonParser, + "pyarrow": ArrowParserWrapper, + "python-fwf": FixedWidthFieldParser, + } - if len(self.items) == 0: - factor = 1 - else: - fac = len(new_columns) / len(self.items) - assert fac == int(fac) - factor = int(fac) - - for blk in self.blocks: - mgr_locs = blk.mgr_locs - new_placement = mgr_locs.tile_for_unstack(factor) - - blocks, mask = blk._unstack( - unstacker, - fill_value, - new_placement=new_placement, - needs_masking=needs_masking, + if engine not in mapping: + raise ValueError( + f"Unknown engine: {engine} (valid options are {mapping.keys()})" ) + if not isinstance(f, list): + # open file here + is_text = True + mode = "r" + if engine == "pyarrow": + is_text = False + mode = "rb" + elif ( + engine == "c" + and self.options.get("encoding", "utf-8") == "utf-8" + and isinstance(stringify_path(f), str) + ): + # c engine can decode utf-8 bytes, adding TextIOWrapper makes + # the c-engine especially for memory_map=True far slower + is_text = False + if "b" not in mode: + mode += "b" + self.handles = get_handle( + f, + mode, + encoding=self.options.get("encoding", None), + compression=self.options.get("compression", None), + memory_map=self.options.get("memory_map", False), + is_text=is_text, + errors=self.options.get("encoding_errors", "strict"), + storage_options=self.options.get("storage_options", None), + ) + assert self.handles is not None + f = self.handles.handle - new_blocks.extend(blocks) - columns_mask.extend(mask) - - # Block._unstack should ensure this holds, - assert mask.sum() == sum(len(nb._mgr_locs) for nb in blocks) - # In turn this ensures that in the BlockManager call below - # we have len(new_columns) == sum(x.shape[0] for x in new_blocks) - # which suffices to allow us to pass verify_inegrity=False - - new_columns = new_columns[columns_mask] - - bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False) - return bm - - def to_dict(self) -> dict[str, Self]: - """ - Return a dict of str(dtype) -> BlockManager - - Returns - ------- - values : a dict of dtype -> BlockManager - """ + elif engine != "python": + msg = f"Invalid file path or buffer object type: {type(f)}" + raise ValueError(msg) - bd: dict[str, list[Block]] = {} - for b in self.blocks: - bd.setdefault(str(b.dtype), []).append(b) + try: + return mapping[engine](f, **self.options) + except Exception: + if self.handles is not None: + self.handles.close() + raise - # TODO(EA2D): the combine will be unnecessary with 2D EAs - return {dtype: self._combine(blocks) for dtype, blocks in bd.items()} + def _failover_to_python(self) -> None: + raise AbstractMethodError(self) - def as_array( - self, - 
dtype: np.dtype | None = None, - copy: bool = False, - na_value: object = lib.no_default, - ) -> np.ndarray: - """ - Convert the blockmanager data into an numpy array. - - Parameters - ---------- - dtype : np.dtype or None, default None - Data type of the return array. - copy : bool, default False - If True then guarantee that a copy is returned. A value of - False does not guarantee that the underlying data is not - copied. - na_value : object, default lib.no_default - Value to be used as the missing value sentinel. - - Returns - ------- - arr : ndarray - """ - passed_nan = lib.is_float(na_value) and isna(na_value) - - if len(self.blocks) == 0: - arr = np.empty(self.shape, dtype=float) - return arr.transpose() - - if self.is_single_block: - blk = self.blocks[0] - - if na_value is not lib.no_default: - # We want to copy when na_value is provided to avoid - # mutating the original object - if lib.is_np_dtype(blk.dtype, "f") and passed_nan: - # We are already numpy-float and na_value=np.nan - pass + def read(self, nrows: int | None = None) -> DataFrame: + if self.engine == "pyarrow": + try: + # error: "ParserBase" has no attribute "read" + df = self._engine.read() # type: ignore[attr-defined] + except Exception: + self.close() + raise + else: + nrows = validate_integer("nrows", nrows) + try: + # error: "ParserBase" has no attribute "read" + ( + index, + columns, + col_dict, + ) = self._engine.read( # type: ignore[attr-defined] + nrows + ) + except Exception: + self.close() + raise + + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) else: - copy = True - - if blk.is_extension: - # Avoid implicit conversion of extension blocks to object - - # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no - # attribute "to_numpy" - arr = blk.values.to_numpy( # type: ignore[union-attr] - dtype=dtype, - na_value=na_value, - copy=copy, - ).reshape(blk.shape) - elif not copy: - arr = np.asarray(blk.values, dtype=dtype) + new_rows = 0 else: - arr = np.array(blk.values, dtype=dtype, copy=copy) + new_rows = len(index) - if not copy: - arr = arr.view() - arr.flags.writeable = False - else: - arr = self._interleave(dtype=dtype, na_value=na_value) - # The underlying data was copied within _interleave, so no need - # to further copy if copy=True or setting na_value + if hasattr(self, "orig_options"): + dtype_arg = self.orig_options.get("dtype", None) + else: + dtype_arg = None + + if isinstance(dtype_arg, dict): + dtype = defaultdict(lambda: None) # type: ignore[var-annotated] + dtype.update(dtype_arg) + elif dtype_arg is not None and pandas_dtype(dtype_arg) in ( + np.str_, + np.object_, + ): + dtype = defaultdict(lambda: dtype_arg) + else: + dtype = None + + if dtype is not None: + new_col_dict = {} + for k, v in col_dict.items(): + d = ( + dtype[k] + if pandas_dtype(dtype[k]) in (np.str_, np.object_) + else None + ) + new_col_dict[k] = Series(v, index=index, dtype=d, copy=False) + else: + new_col_dict = col_dict - if na_value is lib.no_default: - pass - elif arr.dtype.kind == "f" and passed_nan: - pass - else: - arr[isna(arr)] = na_value + df = DataFrame( + new_col_dict, + columns=columns, + index=index, + copy=False, + ) - return arr.transpose() + self._currow += new_rows + return df - def _interleave( - self, - dtype: np.dtype | None = None, - na_value: object = lib.no_default, - ) -> np.ndarray: - """ - Return ndarray from blocks with specified item order - Items must be 
contained in the blocks - """ - if not dtype: - # Incompatible types in assignment (expression has type - # "Optional[Union[dtype[Any], ExtensionDtype]]", variable has - # type "Optional[dtype[Any]]") - dtype = interleaved_dtype( # type: ignore[assignment] - [blk.dtype for blk in self.blocks] - ) + def get_chunk(self, size: int | None = None) -> DataFrame: + if size is None: + size = self.chunksize + if self.nrows is not None: + if self._currow >= self.nrows: + raise StopIteration + size = min(size, self.nrows - self._currow) + return self.read(nrows=size) - # error: Argument 1 to "ensure_np_dtype" has incompatible type - # "Optional[dtype[Any]]"; expected "Union[dtype[Any], ExtensionDtype]" - dtype = ensure_np_dtype(dtype) # type: ignore[arg-type] - result = np.empty(self.shape, dtype=dtype) - - itemmask = np.zeros(self.shape[0]) - - if dtype == np.dtype("object") and na_value is lib.no_default: - # much more performant than using to_numpy below - for blk in self.blocks: - rl = blk.mgr_locs - arr = blk.get_values(dtype) - result[rl.indexer] = arr - itemmask[rl.indexer] = 1 - return result - - for blk in self.blocks: - rl = blk.mgr_locs - if blk.is_extension: - # Avoid implicit conversion of extension blocks to object - - # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no - # attribute "to_numpy" - arr = blk.values.to_numpy( # type: ignore[union-attr] - dtype=dtype, - na_value=na_value, - ) - else: - arr = blk.get_values(dtype) - result[rl.indexer] = arr - itemmask[rl.indexer] = 1 - - if not itemmask.all(): - raise AssertionError("Some items were not contained in blocks") - - return result - - # ---------------------------------------------------------------- - # Consolidation - - def is_consolidated(self) -> bool: - """ - Return True if more than one block with the same dtype - """ - if not self._known_consolidated: - self._consolidate_check() - return self._is_consolidated - - def _consolidate_check(self) -> None: - if len(self.blocks) == 1: - # fastpath - self._is_consolidated = True - self._known_consolidated = True - return - dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate] - self._is_consolidated = len(dtypes) == len(set(dtypes)) - self._known_consolidated = True - - def _consolidate_inplace(self) -> None: - # In general, _consolidate_inplace should only be called via - # DataFrame._consolidate_inplace, otherwise we will fail to invalidate - # the DataFrame's _item_cache. The exception is for newly-created - # BlockManager objects not yet attached to a DataFrame. - if not self.is_consolidated(): - self.blocks = _consolidate(self.blocks) - self._is_consolidated = True - self._known_consolidated = True - self._rebuild_blknos_and_blklocs() - - # ---------------------------------------------------------------- - # Concatenation - - @classmethod - def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self: - """ - Concatenate uniformly-indexed BlockManagers horizontally. - """ - offset = 0 - blocks: list[Block] = [] - for mgr in mgrs: - for blk in mgr.blocks: - # We need to do getitem_block here otherwise we would be altering - # blk.mgr_locs in place, which would render it invalid. This is only - # relevant in the copy=False case. 
- nb = blk.slice_block_columns(slice(None)) - nb._mgr_locs = nb._mgr_locs.add(offset) - blocks.append(nb) - - offset += len(mgr.items) - - new_mgr = cls(tuple(blocks), axes) - return new_mgr - - @classmethod - def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self: - """ - Concatenate uniformly-indexed BlockManagers vertically. - """ - raise NotImplementedError("This logic lives (for now) in internals.concat") - - -class SingleBlockManager(BaseBlockManager): - """manage a single block with""" - - @property - def ndim(self) -> Literal[1]: - return 1 - - _is_consolidated = True - _known_consolidated = True - __slots__ = () - is_single_block = True + def __enter__(self) -> Self: + return self - def __init__( + def __exit__( self, - block: Block, - axis: Index, - verify_integrity: bool = False, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, ) -> None: - # Assertions disabled for performance - # assert isinstance(block, Block), type(block) - # assert isinstance(axis, Index), type(axis) - - self.axes = [axis] - self.blocks = (block,) - - @classmethod - def from_blocks( - cls, - blocks: list[Block], - axes: list[Index], - ) -> Self: - """ - Constructor for BlockManager and SingleBlockManager with same signature. - """ - assert len(blocks) == 1 - assert len(axes) == 1 - return cls(blocks[0], axes[0], verify_integrity=False) - - @classmethod - def from_array( - cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None - ) -> SingleBlockManager: - """ - Constructor for if we have an array that is not yet a Block. - """ - array = maybe_coerce_values(array) - bp = BlockPlacement(slice(0, len(index))) - block = new_block(array, placement=bp, ndim=1, refs=refs) - return cls(block, index) - - def to_2d_mgr(self, columns: Index) -> BlockManager: - """ - Manager analogue of Series.to_frame - """ - blk = self.blocks[0] - arr = ensure_block_shape(blk.values, ndim=2) - bp = BlockPlacement(0) - new_blk = type(blk)(arr, placement=bp, ndim=2, refs=blk.refs) - axes = [columns, self.axes[0]] - return BlockManager([new_blk], axes=axes, verify_integrity=False) - - def _has_no_reference(self, i: int = 0) -> bool: - """ - Check for column `i` if it has references. - (whether it references another array or is itself being referenced) - Returns True if the column has no references. - """ - return not self.blocks[0].refs.has_reference() - - def __getstate__(self): - block_values = [b.values for b in self.blocks] - block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] - axes_array = list(self.axes) - - extra_state = { - "0.14.1": { - "axes": axes_array, - "blocks": [ - {"values": b.values, "mgr_locs": b.mgr_locs.indexer} - for b in self.blocks - ], - } - } - - # First three elements of the state are to maintain forward - # compatibility with 0.13.1. - return axes_array, block_values, block_items, extra_state - - def __setstate__(self, state) -> None: - def unpickle_block(values, mgr_locs, ndim: int) -> Block: - # TODO(EA2D): ndim would be unnecessary with 2D EAs - # older pickles may store e.g. 
DatetimeIndex instead of DatetimeArray - values = extract_array(values, extract_numpy=True) - if not isinstance(mgr_locs, BlockPlacement): - mgr_locs = BlockPlacement(mgr_locs) - - values = maybe_coerce_values(values) - return new_block(values, placement=mgr_locs, ndim=ndim) - - if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]: - state = state[3]["0.14.1"] - self.axes = [ensure_index(ax) for ax in state["axes"]] - ndim = len(self.axes) - self.blocks = tuple( - unpickle_block(b["values"], b["mgr_locs"], ndim=ndim) - for b in state["blocks"] - ) - else: - raise NotImplementedError("pre-0.14.1 pickles are no longer supported") + self.close() - self._post_setstate() - def _post_setstate(self) -> None: - pass +def TextParser(*args, **kwds) -> TextFileReader: + """ + Converts lists of lists/tuples into DataFrames with proper type inference + and optional (e.g. string to datetime) conversion. Also enables iterating + lazily over chunks of large files - @cache_readonly - def _block(self) -> Block: - return self.blocks[0] + Parameters + ---------- + data : file-like object or list + delimiter : separator character to use + dialect : str or csv.Dialect instance, optional + Ignored if delimiter is longer than 1 character + names : sequence, default + header : int, default 0 + Row to use to parse column labels. Defaults to the first row. Prior + rows will be discarded + index_col : int or list, optional + Column or columns to use as the (possibly hierarchical) index + has_index_names: bool, default False + True if the cols defined in index_col have an index name and are + not in the header. + na_values : scalar, str, list-like, or dict, optional + Additional strings to recognize as NA/NaN. + keep_default_na : bool, default True + thousands : str, optional + Thousands separator + comment : str, optional + Comment out remainder of line + parse_dates : bool, default False + keep_date_col : bool, default False + date_parser : function, optional + + .. deprecated:: 2.0.0 + date_format : str or dict of column -> format, default ``None`` + + .. versionadded:: 2.0.0 + skiprows : list of integers + Row numbers to skip + skipfooter : int + Number of line at bottom of file to skip + converters : dict, optional + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the cell (not column) content, and return the + transformed content. + encoding : str, optional + Encoding to use for UTF when reading/writing (ex. 'utf-8') + float_precision : str, optional + Specifies which converter the C engine should use for floating-point + values. The options are `None` or `high` for the ordinary converter, + `legacy` for the original lower precision pandas converter, and + `round_trip` for the round-trip converter. + """ + kwds["engine"] = "python" + return TextFileReader(*args, **kwds) - @final - @property - def array(self) -> ArrayLike: - """ - Quick access to the backing array of the Block. 
- """ - return self.arrays[0] - # error: Cannot override writeable attribute with read-only property - @property - def _blknos(self) -> None: # type: ignore[override] - """compat with BlockManager""" - return None +def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = True): + na_fvalues: set | dict + if na_values is None: + if keep_default_na: + na_values = STR_NA_VALUES + else: + na_values = set() + na_fvalues = set() + elif isinstance(na_values, dict): + old_na_values = na_values.copy() + na_values = {} # Prevent aliasing. + + # Convert the values in the na_values dictionary + # into array-likes for further use. This is also + # where we append the default NaN values, provided + # that `keep_default_na=True`. + for k, v in old_na_values.items(): + if not is_list_like(v): + v = [v] + + if keep_default_na: + v = set(v) | STR_NA_VALUES + + na_values[k] = v + na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} + else: + if not is_list_like(na_values): + na_values = [na_values] + na_values = _stringify_na_values(na_values, floatify) + if keep_default_na: + na_values = na_values | STR_NA_VALUES + + na_fvalues = _floatify_na_values(na_values) + + return na_values, na_fvalues + + +def _floatify_na_values(na_values): + # create float versions of the na_values + result = set() + for v in na_values: + try: + v = float(v) + if not np.isnan(v): + result.add(v) + except (TypeError, ValueError, OverflowError): + pass + return result + + +def _stringify_na_values(na_values, floatify: bool) -> set[str | float]: + """return a stringified and numeric for these values""" + result: list[str | float] = [] + for x in na_values: + result.append(str(x)) + result.append(x) + try: + v = float(x) + + # we are like 999 here + if v == int(v): + v = int(v) + result.append(f"{v}.0") + result.append(str(v)) + + if floatify: + result.append(v) + except (TypeError, ValueError, OverflowError): + pass + if floatify: + try: + result.append(int(x)) + except (TypeError, ValueError, OverflowError): + pass + return set(result) + + +def _refine_defaults_read( + dialect: str | csv.Dialect | None, + delimiter: str | None | lib.NoDefault, + delim_whitespace: bool, + engine: CSVEngine | None, + sep: str | None | lib.NoDefault, + on_bad_lines: str | Callable, + names: Sequence[Hashable] | None | lib.NoDefault, + defaults: dict[str, Any], + dtype_backend: DtypeBackend | lib.NoDefault, +): + """Validate/refine default values of input parameters of read_csv, read_table. - # error: Cannot override writeable attribute with read-only property - @property - def _blklocs(self) -> None: # type: ignore[override] - """compat with BlockManager""" - return None + Parameters + ---------- + dialect : str or csv.Dialect + If provided, this parameter will override values (default or not) for the + following parameters: `delimiter`, `doublequote`, `escapechar`, + `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to + override values, a ParserWarning will be issued. See csv.Dialect + documentation for more details. + delimiter : str or object + Alias for sep. + delim_whitespace : bool + Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be + used as the sep. Equivalent to setting ``sep='\\s+'``. If this option + is set to True, nothing should be passed in for the ``delimiter`` + parameter. + + .. deprecated:: 2.2.0 + Use ``sep="\\s+"`` instead. + engine : {{'c', 'python'}} + Parser engine to use. 
The C engine is faster while the python engine is + currently more feature-complete. + sep : str or object + A delimiter provided by the user (str) or a sentinel value, i.e. + pandas._libs.lib.no_default. + on_bad_lines : str, callable + An option for handling bad lines or a sentinel value(None). + names : array-like, optional + List of column names to use. If the file contains a header row, + then you should explicitly pass ``header=0`` to override the column names. + Duplicates in this list are not allowed. + defaults: dict + Default values of input parameters. - def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> Self: - # similar to get_slice, but not restricted to slice indexer - blk = self._block - if len(indexer) > 0 and indexer.all(): - return type(self)(blk.copy(deep=False), self.index) - array = blk.values[indexer] + Returns + ------- + kwds : dict + Input parameters with correct values. + + Raises + ------ + ValueError : + If a delimiter was specified with ``sep`` (or ``delimiter``) and + ``delim_whitespace=True``. + """ + # fix types for sep, delimiter to Union(str, Any) + delim_default = defaults["delimiter"] + kwds: dict[str, Any] = {} + # gh-23761 + # + # When a dialect is passed, it overrides any of the overlapping + # parameters passed in directly. We don't want to warn if the + # default parameters were passed in (since it probably means + # that the user didn't pass them in explicitly in the first place). + # + # "delimiter" is the annoying corner case because we alias it to + # "sep" before doing comparison to the dialect values later on. + # Thus, we need a flag to indicate that we need to "override" + # the comparison to dialect values by checking if default values + # for BOTH "delimiter" and "sep" were provided. + if dialect is not None: + kwds["sep_override"] = delimiter is None and ( + sep is lib.no_default or sep == delim_default + ) - if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "b": - # boolean indexing always gives a copy with numpy - refs = None - else: - # TODO(CoW) in theory only need to track reference if new_array is a view - refs = blk.refs - - bp = BlockPlacement(slice(0, len(array))) - block = type(blk)(array, placement=bp, ndim=1, refs=refs) - - new_idx = self.index[indexer] - return type(self)(block, new_idx) - - def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager: - # Assertion disabled for performance - # assert isinstance(slobj, slice), type(slobj) - if axis >= self.ndim: - raise IndexError("Requested axis not found in manager") - - blk = self._block - array = blk.values[slobj] - bp = BlockPlacement(slice(0, len(array))) - # TODO this method is only used in groupby SeriesSplitter at the moment, - # so passing refs is not yet covered by the tests - block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs) - new_index = self.index._getitem_slice(slobj) - return type(self)(block, new_index) - - @property - def index(self) -> Index: - return self.axes[0] - - @property - def dtype(self) -> DtypeObj: - return self._block.dtype - - def get_dtypes(self) -> npt.NDArray[np.object_]: - return np.array([self._block.dtype], dtype=object) - - def external_values(self): - """The array that Series.values returns""" - return self._block.external_values() - - def internal_values(self): - """The array that Series._values returns""" - return self._block.values - - def array_values(self) -> ExtensionArray: - """The array that Series.array returns""" - return self._block.array_values - - def get_numeric_data(self) 
-> Self: - if self._block.is_numeric: - return self.copy(deep=False) - return self.make_empty() - - @property - def _can_hold_na(self) -> bool: - return self._block._can_hold_na - - def setitem_inplace(self, indexer, value) -> None: - """ - Set values with indexer. - - For SingleBlockManager, this backs s[indexer] = value - - This is an inplace version of `setitem()`, mutating the manager/values - in place, not returning a new Manager (and Block), and thus never changing - the dtype. - """ - if not self._has_no_reference(0): - self.blocks = (self._block.copy(),) - self._cache.clear() - - arr = self.array - - # EAs will do this validation in their own __setitem__ methods. - if isinstance(arr, np.ndarray): - # Note: checking for ndarray instead of np.dtype means we exclude - # dt64/td64, which do their own validation. - value = np_can_hold_element(arr.dtype, value) - - if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1: - # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 - value = value[0, ...] - - arr[indexer] = value - - def idelete(self, indexer) -> SingleBlockManager: - """ - Delete single location from SingleBlockManager. - - Ensures that self.blocks doesn't become empty. - """ - nb = self._block.delete(indexer)[0] - self.blocks = (nb,) - self.axes[0] = self.axes[0].delete(indexer) - self._cache.clear() - return self + if delimiter and (sep is not lib.no_default): + raise ValueError("Specified a sep and a delimiter; you can only specify one.") - def fast_xs(self, loc): - """ - fast path for getting a cross-section - return a view of the data - """ - raise NotImplementedError("Use series._values[loc] instead") - - def set_values(self, values: ArrayLike) -> None: - """ - Set the values of the single block in place. - - Use at your own risk! This does not check if the passed values are - valid for the current Block/SingleBlockManager (length, dtype, etc), - and this does not properly keep track of references. - """ - # NOTE(CoW) Currently this is only used for FrameColumnApply.series_generator - # which handles CoW by setting the refs manually if necessary - self.blocks[0].values = values - self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values))) - - def _equal_values(self, other: Self) -> bool: - """ - Used in .equals defined in base class. Only check the column values - assuming shape and indexes have already been checked. 
- """ - # For SingleBlockManager (i.e.Series) - if other.ndim != 1: - return False - left = self.blocks[0].values - right = other.blocks[0].values - return array_equals(left, right) - - def grouped_reduce(self, func): - arr = self.array - res = func(arr) - index = default_index(len(res)) - - mgr = type(self).from_array(res, index) - return mgr - - -# -------------------------------------------------------------------- -# Constructor Helpers - - -def create_block_manager_from_blocks( - blocks: list[Block], - axes: list[Index], - consolidate: bool = True, - verify_integrity: bool = True, -) -> BlockManager: - # If verify_integrity=False, then caller is responsible for checking - # all(x.shape[-1] == len(axes[1]) for x in blocks) - # sum(x.shape[0] for x in blocks) == len(axes[0]) - # set(x for blk in blocks for x in blk.mgr_locs) == set(range(len(axes[0]))) - # all(blk.ndim == 2 for blk in blocks) - # This allows us to safely pass verify_integrity=False - - try: - mgr = BlockManager(blocks, axes, verify_integrity=verify_integrity) - - except ValueError as err: - arrays = [blk.values for blk in blocks] - tot_items = sum(arr.shape[0] for arr in arrays) - raise_construction_error(tot_items, arrays[0].shape[1:], axes, err) - - if consolidate: - mgr._consolidate_inplace() - return mgr - - -def create_block_manager_from_column_arrays( - arrays: list[ArrayLike], - axes: list[Index], - consolidate: bool, - refs: list, -) -> BlockManager: - # Assertions disabled for performance (caller is responsible for verifying) - # assert isinstance(axes, list) - # assert all(isinstance(x, Index) for x in axes) - # assert all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays) - # assert all(type(x) is not NumpyExtensionArray for x in arrays) - # assert all(x.ndim == 1 for x in arrays) - # assert all(len(x) == len(axes[1]) for x in arrays) - # assert len(arrays) == len(axes[0]) - # These last three are sufficient to allow us to safely pass - # verify_integrity=False below. - - try: - blocks = _form_blocks(arrays, consolidate, refs) - mgr = BlockManager(blocks, axes, verify_integrity=False) - except ValueError as e: - raise_construction_error(len(arrays), arrays[0].shape, axes, e) - if consolidate: - mgr._consolidate_inplace() - return mgr - - -def raise_construction_error( - tot_items: int, - block_shape: Shape, - axes: list[Index], - e: ValueError | None = None, -) -> NoReturn: - """raise a helpful message about our construction""" - passed = tuple(map(int, [tot_items] + list(block_shape))) - # Correcting the user facing error message during dataframe construction - if len(passed) <= 2: - passed = passed[::-1] - - implied = tuple(len(ax) for ax in axes) - # Correcting the user facing error message during dataframe construction - if len(implied) <= 2: - implied = implied[::-1] - - # We return the exception object instead of raising it so that we - # can raise it in the caller; mypy plays better with that - if passed == implied and e is not None: - raise e - if block_shape[0] == 0: - raise ValueError("Empty data passed with indices specified.") - raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}") - - -# ----------------------------------------------------------------------- - - -def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, DtypeObj]: - dtype = tup[1].dtype - - if is_1d_only_ea_dtype(dtype): - # We know these won't be consolidated, so don't need to group these. 
- # This avoids expensive comparisons of CategoricalDtype objects - sep = id(dtype) - else: - sep = 0 + kwds["names"] = None if names is lib.no_default else names - return sep, dtype + # Alias sep -> delimiter. + if delimiter is None: + delimiter = sep + if delim_whitespace and (delimiter is not lib.no_default): + raise ValueError( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." + ) -def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]: - tuples = list(enumerate(arrays)) + if delimiter == "\n": + raise ValueError( + r"Specified \n as separator or delimiter. This forces the python engine " + "which does not accept a line terminator. Hence it is not allowed to use " + "the line terminator as separator.", + ) - if not consolidate: - return _tuples_to_blocks_no_consolidate(tuples, refs) + if delimiter is lib.no_default: + # assign default separator value + kwds["delimiter"] = delim_default + else: + kwds["delimiter"] = delimiter - # when consolidating, we can ignore refs (either stacking always copies, - # or the EA is already copied in the calling dict_to_mgr) + if engine is not None: + kwds["engine_specified"] = True + else: + kwds["engine"] = "c" + kwds["engine_specified"] = False + + if on_bad_lines == "error": + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR + elif on_bad_lines == "warn": + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN + elif on_bad_lines == "skip": + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP + elif callable(on_bad_lines): + if engine not in ["python", "pyarrow"]: + raise ValueError( + "on_bad_line can only be a callable function " + "if engine='python' or 'pyarrow'" + ) + kwds["on_bad_lines"] = on_bad_lines + else: + raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") - # group by dtype - grouper = itertools.groupby(tuples, _grouping_func) + check_dtype_backend(dtype_backend) - nbs: list[Block] = [] - for (_, dtype), tup_block in grouper: - block_type = get_block_type(dtype) + kwds["dtype_backend"] = dtype_backend - if isinstance(dtype, np.dtype): - is_dtlike = dtype.kind in "mM" + return kwds - if issubclass(dtype.type, (str, bytes)): - dtype = np.dtype(object) - values, placement = _stack_arrays(list(tup_block), dtype) - if is_dtlike: - values = ensure_wrapped_if_datetimelike(values) - blk = block_type(values, placement=BlockPlacement(placement), ndim=2) - nbs.append(blk) +def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None: + """ + Extract concrete csv dialect instance. 
- elif is_1d_only_ea_dtype(dtype): - dtype_blocks = [ - block_type(x[1], placement=BlockPlacement(x[0]), ndim=2) - for x in tup_block - ] - nbs.extend(dtype_blocks) + Returns + ------- + csv.Dialect or None + """ + if kwds.get("dialect") is None: + return None - else: - dtype_blocks = [ - block_type( - ensure_block_shape(x[1], 2), placement=BlockPlacement(x[0]), ndim=2 - ) - for x in tup_block - ] - nbs.extend(dtype_blocks) - return nbs + dialect = kwds["dialect"] + if dialect in csv.list_dialects(): + dialect = csv.get_dialect(dialect) + _validate_dialect(dialect) -def _tuples_to_blocks_no_consolidate(tuples, refs) -> list[Block]: - # tuples produced within _form_blocks are of the form (placement, array) - return [ - new_block_2d( - ensure_block_shape(arr, ndim=2), placement=BlockPlacement(i), refs=ref - ) - for ((i, arr), ref) in zip(tuples, refs) - ] + return dialect -def _stack_arrays(tuples, dtype: np.dtype): - placement, arrays = zip(*tuples) +MANDATORY_DIALECT_ATTRS = ( + "delimiter", + "doublequote", + "escapechar", + "skipinitialspace", + "quotechar", + "quoting", +) - first = arrays[0] - shape = (len(arrays),) + first.shape - stacked = np.empty(shape, dtype=dtype) - for i, arr in enumerate(arrays): - stacked[i] = arr +def _validate_dialect(dialect: csv.Dialect) -> None: + """ + Validate csv dialect instance. - return stacked, placement + Raises + ------ + ValueError + If incorrect dialect is provided. + """ + for param in MANDATORY_DIALECT_ATTRS: + if not hasattr(dialect, param): + raise ValueError(f"Invalid dialect {dialect} provided") -def _consolidate(blocks: tuple[Block, ...]) -> tuple[Block, ...]: +def _merge_with_dialect_properties( + dialect: csv.Dialect, + defaults: dict[str, Any], +) -> dict[str, Any]: """ - Merge blocks having same dtype, exclude non-consolidating blocks - """ - # sort by _can_consolidate, dtype - gkey = lambda x: x._consolidate_key - grouper = itertools.groupby(sorted(blocks, key=gkey), gkey) - - new_blocks: list[Block] = [] - for (_can_consolidate, dtype), group_blocks in grouper: - merged_blocks, _ = _merge_blocks( - list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate - ) - new_blocks = extend_blocks(merged_blocks, new_blocks) - return tuple(new_blocks) + Merge default kwargs in TextFileReader with dialect parameters. + Parameters + ---------- + dialect : csv.Dialect + Concrete csv dialect. See csv.Dialect documentation for more details. + defaults : dict + Keyword arguments passed to TextFileReader. -def _merge_blocks( - blocks: list[Block], dtype: DtypeObj, can_consolidate: bool -) -> tuple[list[Block], bool]: - if len(blocks) == 1: - return blocks, False + Returns + ------- + kwds : dict + Updated keyword arguments, merged with dialect parameters. + """ + kwds = defaults.copy() - if can_consolidate: - # TODO: optimization potential in case all mgrs contain slices and - # combination of those slices is a slice, too. 
- new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks]) + for param in MANDATORY_DIALECT_ATTRS: + dialect_val = getattr(dialect, param) - new_values: ArrayLike + parser_default = parser_defaults[param] + provided = kwds.get(param, parser_default) - if isinstance(blocks[0].dtype, np.dtype): - # error: List comprehension has incompatible type List[Union[ndarray, - # ExtensionArray]]; expected List[Union[complex, generic, - # Sequence[Union[int, float, complex, str, bytes, generic]], - # Sequence[Sequence[Any]], SupportsArray]] - new_values = np.vstack([b.values for b in blocks]) # type: ignore[misc] - else: - bvals = [blk.values for blk in blocks] - bvals2 = cast(Sequence[NDArrayBackedExtensionArray], bvals) - new_values = bvals2[0]._concat_same_type(bvals2, axis=0) + # Messages for conflicting values between the dialect + # instance and the actual parameters provided. + conflict_msgs = [] - argsort = np.argsort(new_mgr_locs) - new_values = new_values[argsort] - new_mgr_locs = new_mgr_locs[argsort] + # Don't warn if the default parameter was passed in, + # even if it conflicts with the dialect (gh-23761). + if provided not in (parser_default, dialect_val): + msg = ( + f"Conflicting values for '{param}': '{provided}' was " + f"provided, but the dialect specifies '{dialect_val}'. " + "Using the dialect-specified value." + ) - bp = BlockPlacement(new_mgr_locs) - return [new_block_2d(new_values, placement=bp)], True + # Annoying corner case for not warning about + # conflicts between dialect and delimiter parameter. + # Refer to the outer "_read_" function for more info. + if not (param == "delimiter" and kwds.pop("sep_override", False)): + conflict_msgs.append(msg) - # can't consolidate --> no merge - return blocks, False + if conflict_msgs: + warnings.warn( + "\n\n".join(conflict_msgs), ParserWarning, stacklevel=find_stack_level() + ) + kwds[param] = dialect_val + return kwds -def _fast_count_smallints(arr: npt.NDArray[np.intp]): - """Faster version of set(arr) for sequences of small numbers.""" - counts = np.bincount(arr) - nz = counts.nonzero()[0] - # Note: list(zip(...) outperforms list(np.c_[nz, counts[nz]]) here, - # in one benchmark by a factor of 11 - return zip(nz, counts[nz]) +def _validate_skipfooter(kwds: dict[str, Any]) -> None: + """ + Check whether skipfooter is compatible with other kwargs in TextFileReader. + Parameters + ---------- + kwds : dict + Keyword arguments passed to TextFileReader. -def _preprocess_slice_or_indexer( - slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool -): - if isinstance(slice_or_indexer, slice): - return ( - "slice", - slice_or_indexer, - libinternals.slice_len(slice_or_indexer, length), - ) - else: - if ( - not isinstance(slice_or_indexer, np.ndarray) - or slice_or_indexer.dtype.kind != "i" - ): - dtype = getattr(slice_or_indexer, "dtype", None) - raise TypeError(type(slice_or_indexer), dtype) - - indexer = ensure_platform_int(slice_or_indexer) - if not allow_fill: - indexer = maybe_convert_indices(indexer, length) - return "fancy", indexer, len(indexer) - - -def make_na_array(dtype: DtypeObj, shape: Shape, fill_value) -> ArrayLike: - if isinstance(dtype, DatetimeTZDtype): - # NB: exclude e.g. 
pyarrow[dt64tz] dtypes - ts = Timestamp(fill_value).as_unit(dtype.unit) - i8values = np.full(shape, ts._value) - dt64values = i8values.view(f"M8[{dtype.unit}]") - return DatetimeArray._simple_new(dt64values, dtype=dtype) - - elif is_1d_only_ea_dtype(dtype): - dtype = cast(ExtensionDtype, dtype) - cls = dtype.construct_array_type() - - missing_arr = cls._from_sequence([], dtype=dtype) - ncols, nrows = shape - assert ncols == 1, ncols - empty_arr = -1 * np.ones((nrows,), dtype=np.intp) - return missing_arr.take(empty_arr, allow_fill=True, fill_value=fill_value) - elif isinstance(dtype, ExtensionDtype): - # TODO: no tests get here, a handful would if we disabled - # the dt64tz special-case above (which is faster) - cls = dtype.construct_array_type() - missing_arr = cls._empty(shape=shape, dtype=dtype) - missing_arr[:] = fill_value - return missing_arr - else: - # NB: we should never get here with dtype integer or bool; - # if we did, the missing_arr.fill would cast to gibberish - missing_arr_np = np.empty(shape, dtype=dtype) - missing_arr_np.fill(fill_value) - - if dtype.kind in "mM": - missing_arr_np = ensure_wrapped_if_datetimelike(missing_arr_np) - return missing_arr_np - \ No newline at end of file + Raises + ------ + ValueError + If skipfooter is not compatible with other parameters. + """ + if kwds.get("skipfooter"): + if kwds.get("iterator") or kwds.get("chunksize"): + raise ValueError("'skipfooter' not supported for iteration") + if kwds.get("nrows"): + raise ValueError("'skipfooter' not supported with 'nrows'") \ No newline at end of file
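
For reference, a minimal sketch of how the skipfooter handling restored above surfaces through the public read_csv API: _clean_options falls back from the "c" engine to "python" when skipfooter is set, and _validate_skipfooter rejects skipfooter combined with nrows or chunked iteration. Only the public pandas API is assumed; the CSV contents below are made up for illustration.

    from io import StringIO

    import pandas as pd

    data = "a,b\n1,2\n3,4\nTOTAL,6\n"

    # skipfooter is only handled by the python engine. If no engine is given,
    # _clean_options falls back from "c" to "python" and emits a ParserWarning;
    # passing engine="python" explicitly avoids the warning.
    df = pd.read_csv(StringIO(data), skipfooter=1, engine="python")
    print(df)  # two data rows; the trailing TOTAL line is skipped

    # _validate_skipfooter rejects skipfooter together with nrows (and with
    # iterator/chunksize).
    try:
        pd.read_csv(StringIO(data), skipfooter=1, nrows=1)
    except ValueError as err:
        print(err)  # 'skipfooter' not supported with 'nrows'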