From a775f9cb40a6cd41d52c44261cbb1594705ffcee Mon Sep 17 00:00:00 2001
From: barnargh
Date: Sat, 20 Apr 2024 20:19:38 -0500
Subject: [PATCH 1/4] fixed implicit conversion of length-1 arrays inside DataFrames

---
 pandas/core/internals/managers.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 8fda9cd23b508..d09e0fb74463a 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -2170,8 +2170,10 @@ def setitem_inplace(self, indexer, value) -> None:
             # Note: checking for ndarray instead of np.dtype means we exclude
             # dt64/td64, which do their own validation.
             value = np_can_hold_element(arr.dtype, value)
-
-        if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1:
+
+        # only unwrap length-1 arrays when the block is not object dtype
+        implicit_convert = arr.dtype != 'object'
+        if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1 and implicit_convert:
             # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
             value = value[0, ...]

From 1d04d6f959239e709404a4519518f651e9c32d49 Mon Sep 17 00:00:00 2001
From: barnargh
Date: Sat, 20 Apr 2024 22:54:45 -0500
Subject: [PATCH 2/4] fixed issue #57944

---
 pandas/io/parsers/readers.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 70f9a68244164..1eb736880c369 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -716,6 +716,19 @@ def read_csv(
 ) -> DataFrame | TextFileReader: ...
 
 
+# a helper function for read_csv(...) below.
+# ensures that all keys in a dtype mapping are of type str.
+# this allows for compatibility with the csv library
+def parse_dtype(dtype: DtypeArg | None) -> DtypeArg | None:
+    if not isinstance(dtype, dict):
+        # None or a single dtype applied to every column: nothing to convert
+        return dtype
+    temp = {}
+    for key in dtype:
+        temp[key if isinstance(key, str) else str(key)] = dtype[key]
+    return temp
+
+
 @Appender(
     _doc_read_csv_and_table.format(
         func_name="read_csv",
@@ -790,6 +803,9 @@ def read_csv(
     storage_options: StorageOptions | None = None,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
 ) -> DataFrame | TextFileReader:
+    # ensures that all keys in dtype are strings for compatibility with csv
+    dtype = parse_dtype(dtype)
+
     if keep_date_col is not lib.no_default:
         # GH#55569
         warnings.warn(

From 9458f3e2ca33bb304f32c4d354c29f558d80c0a4 Mon Sep 17 00:00:00 2001
From: Gabe Barnard
Date: Sat, 20 Apr 2024 23:47:04 -0500
Subject: [PATCH 3/4] restored to original

---
 pandas/core/internals/managers.py |    6 +-
 pandas/io/parsers/readers.py      | 4209 ++++++++++++++++-------------
 2 files changed, 2343 insertions(+), 1872 deletions(-)

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index d09e0fb74463a..8fda9cd23b508 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -2170,10 +2170,8 @@ def setitem_inplace(self, indexer, value) -> None:
             # Note: checking for ndarray instead of np.dtype means we exclude
             # dt64/td64, which do their own validation.
             value = np_can_hold_element(arr.dtype, value)
-
-        # only unwrap length-1 arrays when the block is not object dtype
-        implicit_convert = arr.dtype != 'object'
-        if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1 and implicit_convert:
+
+        if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1:
             # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
             value = value[0, ...]
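For readers skimming the patches above, a minimal, hypothetical sketch of the behaviour PATCH 1/4 targets (and PATCH 3/4 reverts) follows. The assignments below are illustrative only and are not taken from the PR or its linked issues; whether a particular assignment reaches ``setitem_inplace`` depends on the indexing path pandas selects.

import numpy as np
import pandas as pd

# The length-1 unwrapping performed by ``value = value[0, ...]`` above:
# assigning a one-element ndarray into a single position of a numeric Series
# stores the scalar element rather than the array.
s = pd.Series([1.0, 2.0])
s[0] = np.array([3.0])
print(s[0])  # 3.0

# The ``implicit_convert`` guard added in PATCH 1/4 is meant to skip that
# unwrapping for object-dtype blocks, so that an assignment like this one
# could keep the length-1 ndarray itself as the stored element.
t = pd.Series([None, None], dtype="object")
t[0] = np.array([3.0])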
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 1eb736880c369..0c2332f24de1e 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1,2045 +1,2518 @@ -""" -Module contains tools for processing files into DataFrames or other objects - -GH#48849 provides a convenient way of deprecating keyword arguments -""" - from __future__ import annotations -from collections import ( - abc, - defaultdict, +from collections.abc import ( + Hashable, + Sequence, ) -import csv -import sys -from textwrap import fill +import itertools from typing import ( - IO, TYPE_CHECKING, Any, Callable, - Generic, Literal, - TypedDict, - overload, + NoReturn, + cast, + final, ) import warnings import numpy as np -from pandas._libs import lib -from pandas._libs.parsers import STR_NA_VALUES +from pandas._config.config import get_option + +from pandas._libs import ( + algos as libalgos, + internals as libinternals, + lib, +) +from pandas._libs.internals import ( + BlockPlacement, + BlockValuesRefs, +) +from pandas._libs.tslibs import Timestamp from pandas.errors import ( AbstractMethodError, - ParserWarning, + PerformanceWarning, ) -from pandas.util._decorators import Appender +from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level -from pandas.util._validators import check_dtype_backend +from pandas.util._validators import validate_bool_kwarg +from pandas.core.dtypes.cast import ( + find_common_type, + infer_dtype_from_scalar, + np_can_hold_element, +) from pandas.core.dtypes.common import ( - is_file_like, - is_float, - is_hashable, - is_integer, + ensure_platform_int, + is_1d_only_ea_dtype, is_list_like, - pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + ExtensionDtype, + SparseDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) +from pandas.core.dtypes.missing import ( + array_equals, + isna, ) -from pandas import Series -from pandas.core.frame import DataFrame -from pandas.core.indexes.api import RangeIndex -from pandas.core.shared_docs import _shared_docs - -from pandas.io.common import ( - IOHandles, - get_handle, - stringify_path, - validate_header_arg, +import pandas.core.algorithms as algos +from pandas.core.arrays import DatetimeArray +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.base import PandasObject +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + extract_array, +) +from pandas.core.indexers import maybe_convert_indices +from pandas.core.indexes.api import ( + Index, + default_index, + ensure_index, ) -from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper -from pandas.io.parsers.base_parser import ( - ParserBase, - is_index_col, - parser_defaults, +from pandas.core.internals.blocks import ( + Block, + NumpyBlock, + ensure_block_shape, + extend_blocks, + get_block_type, + maybe_coerce_values, + new_block, + new_block_2d, ) -from pandas.io.parsers.c_parser_wrapper import CParserWrapper -from pandas.io.parsers.python_parser import ( - FixedWidthFieldParser, - PythonParser, +from pandas.core.internals.ops import ( + blockwise_all, + operate_blockwise, ) if TYPE_CHECKING: - from collections.abc import ( - Hashable, - Iterable, - Mapping, - Sequence, - ) - from types import TracebackType - from pandas._typing import ( - CompressionOptions, - CSVEngine, - DtypeArg, - DtypeBackend, - FilePath, - HashableT, - IndexLabel, - ReadCsvBuffer, + ArrayLike, + AxisInt, + DtypeObj, 
+ QuantileInterpolation, Self, - StorageOptions, - Unpack, - UsecolsArgType, + Shape, + npt, ) - class _read_shared(TypedDict, Generic[HashableT], total=False): - # annotations shared between read_csv/fwf/table's overloads - # NOTE: Keep in sync with the annotations of the implementation - sep: str | None | lib.NoDefault - delimiter: str | None | lib.NoDefault - header: int | Sequence[int] | None | Literal["infer"] - names: Sequence[Hashable] | None | lib.NoDefault - index_col: IndexLabel | Literal[False] | None - usecols: UsecolsArgType - dtype: DtypeArg | None - engine: CSVEngine | None - converters: Mapping[HashableT, Callable] | None - true_values: list | None - false_values: list | None - skipinitialspace: bool - skiprows: list[int] | int | Callable[[Hashable], bool] | None - skipfooter: int - nrows: int | None - na_values: ( - Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None - ) - keep_default_na: bool - na_filter: bool - skip_blank_lines: bool - parse_dates: bool | Sequence[Hashable] | None - infer_datetime_format: bool | lib.NoDefault - keep_date_col: bool | lib.NoDefault - date_parser: Callable | lib.NoDefault - date_format: str | dict[Hashable, str] | None - dayfirst: bool - cache_dates: bool - compression: CompressionOptions - thousands: str | None - decimal: str - lineterminator: str | None - quotechar: str - quoting: int - doublequote: bool - escapechar: str | None - comment: str | None - encoding: str | None - encoding_errors: str | None - dialect: str | csv.Dialect | None - on_bad_lines: str - delim_whitespace: bool | lib.NoDefault - low_memory: bool - memory_map: bool - float_precision: Literal["high", "legacy", "round_trip"] | None - storage_options: StorageOptions | None - dtype_backend: DtypeBackend | lib.NoDefault -else: - _read_shared = dict - - -_doc_read_csv_and_table = ( - r""" -{summary} - -Also supports optionally iterating or breaking of the file -into chunks. - -Additional help can be found in the online docs for -`IO Tools `_. - -Parameters ----------- -filepath_or_buffer : str, path object or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is - expected. A local file could be: file://localhost/path/to/table.csv. - - If you want to pass in a path object, pandas accepts any ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, such as - a file handle (e.g. via builtin ``open`` function) or ``StringIO``. -sep : str, default {_default_sep} - Character or regex pattern to treat as the delimiter. If ``sep=None``, the - C engine cannot automatically detect - the separator, but the Python parsing engine can, meaning the latter will - be used and automatically detect the separator from only the first valid - row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. - In addition, separators longer than 1 character and different from - ``'\s+'`` will be interpreted as regular expressions and will also force - the use of the Python parsing engine. Note that regex delimiters are prone - to ignoring quoted data. Regex example: ``'\r\t'``. -delimiter : str, optional - Alias for ``sep``. -header : int, Sequence of int, 'infer' or None, default 'infer' - Row number(s) containing column labels and marking the start of the - data (zero-indexed). 
Default behavior is to infer the column names: if no ``names`` - are passed the behavior is identical to ``header=0`` and column - names are inferred from the first line of the file, if column - names are passed explicitly to ``names`` then the behavior is identical to - ``header=None``. Explicitly pass ``header=0`` to be able to - replace existing names. The header can be a list of integers that - specify row locations for a :class:`~pandas.MultiIndex` on the columns - e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be - skipped (e.g. 2 in this example is skipped). Note that this - parameter ignores commented lines and empty lines if - ``skip_blank_lines=True``, so ``header=0`` denotes the first line of - data rather than the first line of the file. - - When inferred from the file contents, headers are kept distinct from - each other by renaming duplicate names with a numeric suffix of the form - ``".{{count}}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``. - Empty headers are named ``"Unnamed: {{i}}"`` or ``"Unnamed: {{i}}_level_{{level}}"`` - in the case of MultiIndex columns. -names : Sequence of Hashable, optional - Sequence of column labels to apply. If the file contains a header row, - then you should explicitly pass ``header=0`` to override the column names. - Duplicates in this list are not allowed. -index_col : Hashable, Sequence of Hashable or False, optional - Column(s) to use as row label(s), denoted either by column labels or column - indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex` - will be formed for the row labels. - - Note: ``index_col=False`` can be used to force pandas to *not* use the first - column as the index, e.g., when you have a malformed file with delimiters at - the end of each line. -usecols : Sequence of Hashable or Callable, optional - Subset of columns to select, denoted either by column labels or column indices. - If list-like, all elements must either - be positional (i.e. integer indices into the document columns) or strings - that correspond to column names provided either by the user in ``names`` or - inferred from the document header row(s). If ``names`` are given, the document - header row(s) are not taken into account. For example, a valid list-like - ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. - Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. - To instantiate a :class:`~pandas.DataFrame` from ``data`` with element order - preserved use ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` - for columns in ``['foo', 'bar']`` order or - ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` - for ``['bar', 'foo']`` order. - - If callable, the callable function will be evaluated against the column - names, returning names where the callable function evaluates to ``True``. An - example of a valid callable argument would be ``lambda x: x.upper() in - ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster - parsing time and lower memory usage. -dtype : dtype or dict of {{Hashable : dtype}}, optional - Data type(s) to apply to either the whole dataset or individual columns. - E.g., ``{{'a': np.float64, 'b': np.int32, 'c': 'Int64'}}`` - Use ``str`` or ``object`` together with suitable ``na_values`` settings - to preserve and not interpret ``dtype``. - If ``converters`` are specified, they will be applied INSTEAD - of ``dtype`` conversion. - - .. versionadded:: 1.5.0 - - Support for ``defaultdict`` was added. 
Specify a ``defaultdict`` as input where - the default determines the ``dtype`` of the columns which are not explicitly - listed. -engine : {{'c', 'python', 'pyarrow'}}, optional - Parser engine to use. The C and pyarrow engines are faster, while the python engine - is currently more feature-complete. Multithreading is currently only supported by - the pyarrow engine. - - .. versionadded:: 1.4.0 - - The 'pyarrow' engine was added as an *experimental* engine, and some features - are unsupported, or may not work correctly, with this engine. -converters : dict of {{Hashable : Callable}}, optional - Functions for converting values in specified columns. Keys can either - be column labels or column indices. -true_values : list, optional - Values to consider as ``True`` in addition to case-insensitive variants of 'True'. -false_values : list, optional - Values to consider as ``False`` in addition to case-insensitive variants of 'False'. -skipinitialspace : bool, default False - Skip spaces after delimiter. -skiprows : int, list of int or Callable, optional - Line numbers to skip (0-indexed) or number of lines to skip (``int``) - at the start of the file. - - If callable, the callable function will be evaluated against the row - indices, returning ``True`` if the row should be skipped and ``False`` otherwise. - An example of a valid callable argument would be ``lambda x: x in [0, 2]``. -skipfooter : int, default 0 - Number of lines at bottom of file to skip (Unsupported with ``engine='c'``). -nrows : int, optional - Number of rows of file to read. Useful for reading pieces of large files. -na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional - Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific - per-column ``NA`` values. By default the following values are interpreted as - ``NaN``: " """ - + fill('", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") - + """ ". - -keep_default_na : bool, default True - Whether or not to include the default ``NaN`` values when parsing the data. - Depending on whether ``na_values`` is passed in, the behavior is as follows: - - * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values`` - is appended to the default ``NaN`` values used for parsing. - * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only - the default ``NaN`` values are used for parsing. - * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only - the ``NaN`` values specified ``na_values`` are used for parsing. - * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no - strings will be parsed as ``NaN``. - - Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and - ``na_values`` parameters will be ignored. -na_filter : bool, default True - Detect missing value markers (empty strings and the value of ``na_values``). In - data without any ``NA`` values, passing ``na_filter=False`` can improve the - performance of reading a large file. -skip_blank_lines : bool, default True - If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. -parse_dates : bool, None, list of Hashable, list of lists or dict of {{Hashable : \ -list}}, default None - The behavior is as follows: - - * ``bool``. If ``True`` -> try parsing the index. - * ``None``. Behaves like ``True`` if ``date_parser`` or ``date_format`` are - specified. - * ``list`` of ``int`` or names. e.g. 
If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 - each as a separate date column. - * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse - as a single date column. Values are joined with a space before parsing. - * ``dict``, e.g. ``{{'foo' : [1, 3]}}`` -> parse columns 1, 3 as date and call - result 'foo'. Values are joined with a space before parsing. - - If a column or index cannot be represented as an array of ``datetime``, - say because of an unparsable value or a mixture of timezones, the column - or index will be returned unaltered as an ``object`` data type. For - non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after - :func:`~pandas.read_csv`. - - Note: A fast-path exists for iso8601-formatted dates. -infer_datetime_format : bool, default False - If ``True`` and ``parse_dates`` is enabled, pandas will attempt to infer the - format of the ``datetime`` strings in the columns, and if it can be inferred, - switch to a faster method of parsing them. In some cases this can increase - the parsing speed by 5-10x. - - .. deprecated:: 2.0.0 - A strict version of this argument is now the default, passing it has no effect. - -keep_date_col : bool, default False - If ``True`` and ``parse_dates`` specifies combining multiple columns then - keep the original columns. -date_parser : Callable, optional - Function to use for converting a sequence of string columns to an array of - ``datetime`` instances. The default uses ``dateutil.parser.parser`` to do the - conversion. pandas will try to call ``date_parser`` in three different ways, - advancing to the next if an exception occurs: 1) Pass one or more arrays - (as defined by ``parse_dates``) as arguments; 2) concatenate (row-wise) the - string values from the columns defined by ``parse_dates`` into a single array - and pass that; and 3) call ``date_parser`` once for each row using one or - more strings (corresponding to the columns defined by ``parse_dates``) as - arguments. - - .. deprecated:: 2.0.0 - Use ``date_format`` instead, or read in as ``object`` and then apply - :func:`~pandas.to_datetime` as-needed. -date_format : str or dict of column -> format, optional - Format to use for parsing dates when used in conjunction with ``parse_dates``. - The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See - `strftime documentation - `_ for more information on choices, though - note that :const:`"%f"` will parse all the way up to nanoseconds. - You can also pass: - - - "ISO8601", to parse any `ISO8601 `_ - time string (not necessarily in exactly the same format); - - "mixed", to infer the format for each element individually. This is risky, - and you should probably use it along with `dayfirst`. - - .. versionadded:: 2.0.0 -dayfirst : bool, default False - DD/MM format dates, international and European format. -cache_dates : bool, default True - If ``True``, use a cache of unique, converted dates to apply the ``datetime`` - conversion. May produce significant speed-up when parsing duplicate - date strings, especially ones with timezone offsets. - -iterator : bool, default False - Return ``TextFileReader`` object for iteration or getting chunks with - ``get_chunk()``. -chunksize : int, optional - Number of lines to read from the file per chunk. Passing a value will cause the - function to return a ``TextFileReader`` object for iteration. - See the `IO Tools docs - `_ - for more information on ``iterator`` and ``chunksize``. - -{decompression_options} - - .. versionchanged:: 1.4.0 Zstandard support. 
- -thousands : str (length 1), optional - Character acting as the thousands separator in numerical values. -decimal : str (length 1), default '.' - Character to recognize as decimal point (e.g., use ',' for European data). -lineterminator : str (length 1), optional - Character used to denote a line break. Only valid with C parser. -quotechar : str (length 1), optional - Character used to denote the start and end of a quoted item. Quoted - items can include the ``delimiter`` and it will be ignored. -quoting : {{0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, \ -3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL - Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is - ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special - characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``, - or ``lineterminator``. -doublequote : bool, default True - When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate - whether or not to interpret two consecutive ``quotechar`` elements INSIDE a - field as a single ``quotechar`` element. -escapechar : str (length 1), optional - Character used to escape other characters. -comment : str (length 1), optional - Character indicating that the remainder of line should not be parsed. - If found at the beginning - of a line, the line will be ignored altogether. This parameter must be a - single character. Like empty lines (as long as ``skip_blank_lines=True``), - fully commented lines are ignored by the parameter ``header`` but not by - ``skiprows``. For example, if ``comment='#'``, parsing - ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in ``'a,b,c'`` being - treated as the header. -encoding : str, optional, default 'utf-8' - Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). `List of Python - standard encodings - `_ . - -encoding_errors : str, optional, default 'strict' - How encoding errors are treated. `List of possible values - `_ . - - .. versionadded:: 1.3.0 - -dialect : str or csv.Dialect, optional - If provided, this parameter will override values (default or not) for the - following parameters: ``delimiter``, ``doublequote``, ``escapechar``, - ``skipinitialspace``, ``quotechar``, and ``quoting``. If it is necessary to - override values, a ``ParserWarning`` will be issued. See ``csv.Dialect`` - documentation for more details. -on_bad_lines : {{'error', 'warn', 'skip'}} or Callable, default 'error' - Specifies what to do upon encountering a bad line (a line with too many fields). - Allowed values are : - - - ``'error'``, raise an Exception when a bad line is encountered. - - ``'warn'``, raise a warning when a bad line is encountered and skip that line. - - ``'skip'``, skip bad lines without raising or warning when they are encountered. - - .. versionadded:: 1.3.0 - - .. versionadded:: 1.4.0 - - - Callable, function with signature - ``(bad_line: list[str]) -> list[str] | None`` that will process a single - bad line. ``bad_line`` is a list of strings split by the ``sep``. - If the function returns ``None``, the bad line will be ignored. - If the function returns a new ``list`` of strings with more elements than - expected, a ``ParserWarning`` will be emitted while dropping extra elements. - Only supported when ``engine='python'`` - - .. 
versionchanged:: 2.2.0 - - - Callable, function with signature - as described in `pyarrow documentation - `_ when ``engine='pyarrow'`` - -delim_whitespace : bool, default False - Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be - used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option - is set to ``True``, nothing should be passed in for the ``delimiter`` - parameter. - - .. deprecated:: 2.2.0 - Use ``sep="\\s+"`` instead. -low_memory : bool, default True - Internally process the file in chunks, resulting in lower memory use - while parsing, but possibly mixed type inference. To ensure no mixed - types either set ``False``, or specify the type with the ``dtype`` parameter. - Note that the entire file is read into a single :class:`~pandas.DataFrame` - regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in - chunks. (Only valid with C parser). -memory_map : bool, default False - If a filepath is provided for ``filepath_or_buffer``, map the file object - directly onto memory and access the data directly from there. Using this - option can improve performance because there is no longer any I/O overhead. -float_precision : {{'high', 'legacy', 'round_trip'}}, optional - Specifies which converter the C engine should use for floating-point - values. The options are ``None`` or ``'high'`` for the ordinary converter, - ``'legacy'`` for the original lower precision pandas converter, and - ``'round_trip'`` for the round-trip converter. - -{storage_options} - -dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' - Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: - - * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. - - .. versionadded:: 2.0 - -Returns -------- -DataFrame or TextFileReader - A comma-separated values (csv) file is returned as two-dimensional - data structure with labeled axes. - -See Also --------- -DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. -{see_also_func_name} : {see_also_func_summary} -read_fwf : Read a table of fixed-width formatted lines into DataFrame. - -Examples --------- ->>> pd.{func_name}('data.csv') # doctest: +SKIP -""" -) + from pandas.api.extensions import ExtensionArray -class _C_Parser_Defaults(TypedDict): - delim_whitespace: Literal[False] - na_filter: Literal[True] - low_memory: Literal[True] - memory_map: Literal[False] - float_precision: None +def interleaved_dtype(dtypes: list[DtypeObj]) -> DtypeObj | None: + """ + Find the common dtype for `blocks`. + Parameters + ---------- + blocks : List[DtypeObj] -_c_parser_defaults: _C_Parser_Defaults = { - "delim_whitespace": False, - "na_filter": True, - "low_memory": True, - "memory_map": False, - "float_precision": None, -} + Returns + ------- + dtype : np.dtype, ExtensionDtype, or None + None is returned when `blocks` is empty. + """ + if not len(dtypes): + return None + return find_common_type(dtypes) -class _Fwf_Defaults(TypedDict): - colspecs: Literal["infer"] - infer_nrows: Literal[100] - widths: None +def ensure_np_dtype(dtype: DtypeObj) -> np.dtype: + # TODO: https://github.com/pandas-dev/pandas/issues/22791 + # Give EAs some input on what happens here. Sparse needs this. 
+ if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + dtype = cast(np.dtype, dtype) + elif isinstance(dtype, ExtensionDtype): + dtype = np.dtype("object") + elif dtype == np.dtype(str): + dtype = np.dtype("object") + return dtype -_fwf_defaults: _Fwf_Defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} -_c_unsupported = {"skipfooter"} -_python_unsupported = {"low_memory", "float_precision"} -_pyarrow_unsupported = { - "skipfooter", - "float_precision", - "chunksize", - "comment", - "nrows", - "thousands", - "memory_map", - "dialect", - "delim_whitespace", - "quoting", - "lineterminator", - "converters", - "iterator", - "dayfirst", - "skipinitialspace", - "low_memory", -} +class BaseBlockManager(PandasObject): + """ + Core internal data structure to implement DataFrame, Series, etc. -@overload -def validate_integer(name: str, val: None, min_val: int = ...) -> None: ... + Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a + lightweight blocked set of labeled data to be manipulated by the DataFrame + public API class + Attributes + ---------- + shape + ndim + axes + values + items + + Methods + ------- + set_axis(axis, new_labels) + copy(deep=True) -@overload -def validate_integer(name: str, val: float, min_val: int = ...) -> int: ... + get_dtypes + apply(func, axes, block_filter_fn) -@overload -def validate_integer(name: str, val: int | None, min_val: int = ...) -> int | None: ... + get_bool_data + get_numeric_data + get_slice(slice_like, axis) + get(label) + iget(loc) -def validate_integer( - name: str, val: int | float | None, min_val: int = 0 -) -> int | None: - """ - Checks whether the 'name' parameter for parsing is either - an integer OR float that can SAFELY be cast to an integer - without losing accuracy. Raises a ValueError if that is - not the case. + take(indexer, axis) + reindex_axis(new_labels, axis) + reindex_indexer(new_labels, indexer, axis) + + delete(label) + insert(loc, label, value) + set(label, value) Parameters ---------- - name : str - Parameter name (used for error reporting) - val : int or float - The value to check - min_val : int - Minimum allowed value (val < min_val will result in a ValueError) + blocks: Sequence of Block + axes: Sequence of Index + verify_integrity: bool, default True + + Notes + ----- + This is *not* a public API class """ - if val is None: - return val - msg = f"'{name:s}' must be an integer >={min_val:d}" - if is_float(val): - if int(val) != val: - raise ValueError(msg) - val = int(val) - elif not (is_integer(val) and val >= min_val): - raise ValueError(msg) + __slots__ = () + + _blknos: npt.NDArray[np.intp] + _blklocs: npt.NDArray[np.intp] + blocks: tuple[Block, ...] + axes: list[Index] + + @property + def ndim(self) -> int: + raise NotImplementedError + + _known_consolidated: bool + _is_consolidated: bool + + def __init__(self, blocks, axes, verify_integrity: bool = True) -> None: + raise NotImplementedError + + @final + def __len__(self) -> int: + return len(self.items) + + @property + def shape(self) -> Shape: + return tuple(len(ax) for ax in self.axes) + + @classmethod + def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> Self: + raise NotImplementedError + + @property + def blknos(self) -> npt.NDArray[np.intp]: + """ + Suppose we want to find the array corresponding to our i'th column. + + blknos[i] identifies the block from self.blocks that contains this column. 
+ + blklocs[i] identifies the column of interest within + self.blocks[self.blknos[i]] + """ + if self._blknos is None: + # Note: these can be altered by other BlockManager methods. + self._rebuild_blknos_and_blklocs() + + return self._blknos + + @property + def blklocs(self) -> npt.NDArray[np.intp]: + """ + See blknos.__doc__ + """ + if self._blklocs is None: + # Note: these can be altered by other BlockManager methods. + self._rebuild_blknos_and_blklocs() + + return self._blklocs + + def make_empty(self, axes=None) -> Self: + """return an empty BlockManager with the items axis of len 0""" + if axes is None: + axes = [Index([])] + self.axes[1:] + + # preserve dtype if possible + if self.ndim == 1: + assert isinstance(self, SingleBlockManager) # for mypy + blk = self.blocks[0] + arr = blk.values[:0] + bp = BlockPlacement(slice(0, 0)) + nb = blk.make_block_same_class(arr, placement=bp) + blocks = [nb] + else: + blocks = [] + return type(self).from_blocks(blocks, axes) - return int(val) + def __nonzero__(self) -> bool: + return True + # Python3 compat + __bool__ = __nonzero__ -def _validate_names(names: Sequence[Hashable] | None) -> None: - """ - Raise ValueError if the `names` parameter contains duplicates or has an - invalid data type. + def set_axis(self, axis: AxisInt, new_labels: Index) -> None: + # Caller is responsible for ensuring we have an Index object. + self._validate_set_axis(axis, new_labels) + self.axes[axis] = new_labels - Parameters - ---------- - names : array-like or None - An array containing a list of the names used for the output DataFrame. + @final + def _validate_set_axis(self, axis: AxisInt, new_labels: Index) -> None: + # Caller is responsible for ensuring we have an Index object. + old_len = len(self.axes[axis]) + new_len = len(new_labels) - Raises - ------ - ValueError - If names are not unique or are not ordered (e.g. set). - """ - if names is not None: - if len(names) != len(set(names)): - raise ValueError("Duplicate names are not allowed.") - if not ( - is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView) - ): - raise ValueError("Names should be an ordered collection.") - - -def _read( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds -) -> DataFrame | TextFileReader: - """Generic reader of line files.""" - # if we pass a date_parser and parse_dates=False, we should not parse the - # dates GH#44366 - if kwds.get("parse_dates", None) is None: - if ( - kwds.get("date_parser", lib.no_default) is lib.no_default - and kwds.get("date_format", None) is None - ): - kwds["parse_dates"] = False - else: - kwds["parse_dates"] = True + if axis == 1 and len(self.items) == 0: + # If we are setting the index on a DataFrame with no columns, + # it is OK to change the length. + pass - # Extract some of the arguments (pass chunksize on). - iterator = kwds.get("iterator", False) - chunksize = kwds.get("chunksize", None) - if kwds.get("engine") == "pyarrow": - if iterator: + elif new_len != old_len: raise ValueError( - "The 'iterator' option is not supported with the 'pyarrow' engine" + f"Length mismatch: Expected axis has {old_len} elements, new " + f"values have {new_len} elements" ) - if chunksize is not None: - raise ValueError( - "The 'chunksize' option is not supported with the 'pyarrow' engine" - ) - else: - chunksize = validate_integer("chunksize", chunksize, 1) - - nrows = kwds.get("nrows", None) - - # Check for duplicates in names. - _validate_names(kwds.get("names", None)) - - # Create the parser. 
- parser = TextFileReader(filepath_or_buffer, **kwds) - - if chunksize or iterator: - return parser - - with parser: - return parser.read(nrows) - - -@overload -def read_csv( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - iterator: Literal[True], - chunksize: int | None = ..., - **kwds: Unpack[_read_shared[HashableT]], -) -> TextFileReader: ... - - -@overload -def read_csv( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - iterator: bool = ..., - chunksize: int, - **kwds: Unpack[_read_shared[HashableT]], -) -> TextFileReader: ... - - -@overload -def read_csv( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - iterator: Literal[False] = ..., - chunksize: None = ..., - **kwds: Unpack[_read_shared[HashableT]], -) -> DataFrame: ... - - -@overload -def read_csv( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - iterator: bool = ..., - chunksize: int | None = ..., - **kwds: Unpack[_read_shared[HashableT]], -) -> DataFrame | TextFileReader: ... - - -# a helper function for the read_csv(...) below). -# ensures that all keys in dtype are of type str. -# this allows for compatibility with the csv library -def parse_dtype(dtype) -> DtypeArg: - temp = {} - for key in dtype: - if isinstance(key, str): - temp[f"{key}"] = dtype[key] + @property + def is_single_block(self) -> bool: + # Assumes we are 2D; overridden by SingleBlockManager + return len(self.blocks) == 1 + + @property + def items(self) -> Index: + return self.axes[0] + + def _has_no_reference(self, i: int) -> bool: + """ + Check for column `i` if it has references. + (whether it references another array or is itself being referenced) + Returns True if the column has no references. + """ + blkno = self.blknos[i] + return self._has_no_reference_block(blkno) + + def _has_no_reference_block(self, blkno: int) -> bool: + """ + Check for block `i` if it has references. + (whether it references another array or is itself being referenced) + Returns True if the block has no references. + """ + return not self.blocks[blkno].refs.has_reference() + + def add_references(self, mgr: BaseBlockManager) -> None: + """ + Adds the references from one manager to another. We assume that both + managers have the same block structure. + """ + if len(self.blocks) != len(mgr.blocks): + # If block structure changes, then we made a copy + return + for i, blk in enumerate(self.blocks): + blk.refs = mgr.blocks[i].refs + blk.refs.add_reference(blk) + + def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool: + """ + Checks if two blocks from two different block managers reference the + same underlying values. + """ + blk = self.blocks[blkno] + return any(blk is ref() for ref in mgr.blocks[blkno].refs.referenced_blocks) + + def get_dtypes(self) -> npt.NDArray[np.object_]: + dtypes = np.array([blk.dtype for blk in self.blocks], dtype=object) + return dtypes.take(self.blknos) + + @property + def arrays(self) -> list[ArrayLike]: + """ + Quick access to the backing arrays of the Blocks. + + Only for compatibility with ArrayManager for testing convenience. + Not to be used in actual code, and return value is not the same as the + ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs). + + Warning! The returned arrays don't handle Copy-on-Write, so this should + be used with caution (only in read-mode). 
+ """ + return [blk.values for blk in self.blocks] + + def __repr__(self) -> str: + output = type(self).__name__ + for i, ax in enumerate(self.axes): + if i == 0: + output += f"\nItems: {ax}" + else: + output += f"\nAxis {i}: {ax}" + + for block in self.blocks: + output += f"\n{block}" + return output + + def _equal_values(self, other: Self) -> bool: + """ + To be implemented by the subclasses. Only check the column values + assuming shape and indexes have already been checked. + """ + raise AbstractMethodError(self) + + @final + def equals(self, other: object) -> bool: + """ + Implementation for DataFrame.equals + """ + if not isinstance(other, type(self)): + return False + + self_axes, other_axes = self.axes, other.axes + if len(self_axes) != len(other_axes): + return False + if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): + return False + + return self._equal_values(other) + + def apply( + self, + f, + align_keys: list[str] | None = None, + **kwargs, + ) -> Self: + """ + Iterate over the blocks, collect and create a new BlockManager. + + Parameters + ---------- + f : str or callable + Name of the Block method to apply. + align_keys: List[str] or None, default None + **kwargs + Keywords to pass to `f` + + Returns + ------- + BlockManager + """ + assert "filter" not in kwargs + + align_keys = align_keys or [] + result_blocks: list[Block] = [] + # fillna: Series/DataFrame is responsible for making sure value is aligned + + aligned_args = {k: kwargs[k] for k in align_keys} + + for b in self.blocks: + if aligned_args: + for k, obj in aligned_args.items(): + if isinstance(obj, (ABCSeries, ABCDataFrame)): + # The caller is responsible for ensuring that + # obj.axes[-1].equals(self.items) + if obj.ndim == 1: + kwargs[k] = obj.iloc[b.mgr_locs.indexer]._values + else: + kwargs[k] = obj.iloc[:, b.mgr_locs.indexer]._values + else: + # otherwise we have an ndarray + kwargs[k] = obj[b.mgr_locs.indexer] + + if callable(f): + applied = b.apply(f, **kwargs) + else: + applied = getattr(b, f)(**kwargs) + result_blocks = extend_blocks(applied, result_blocks) + + out = type(self).from_blocks(result_blocks, self.axes) + return out + + @final + def isna(self, func) -> Self: + return self.apply("apply", func=func) + + @final + def fillna(self, value, limit: int | None, inplace: bool) -> Self: + if limit is not None: + # Do this validation even if we go through one of the no-op paths + limit = libalgos.validate_limit(None, limit=limit) + + return self.apply( + "fillna", + value=value, + limit=limit, + inplace=inplace, + ) + + @final + def where(self, other, cond, align: bool) -> Self: + if align: + align_keys = ["other", "cond"] else: - temp[key] = dtype[key] - return temp - - -@Appender( - _doc_read_csv_and_table.format( - func_name="read_csv", - summary="Read a comma-separated values (csv) file into DataFrame.", - see_also_func_name="read_table", - see_also_func_summary="Read general delimited file into DataFrame.", - _default_sep="','", - storage_options=_shared_docs["storage_options"], - decompression_options=_shared_docs["decompression_options"] - % "filepath_or_buffer", - ) -) -def read_csv( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - sep: str | None | lib.NoDefault = lib.no_default, - delimiter: str | None | lib.NoDefault = None, - # Column and Index Locations and Names - header: int | Sequence[int] | None | Literal["infer"] = "infer", - names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, - index_col: IndexLabel | 
Literal[False] | None = None, - usecols: UsecolsArgType = None, - # General Parsing Configuration - dtype: DtypeArg | None = None, - engine: CSVEngine | None = None, - converters: Mapping[HashableT, Callable] | None = None, - true_values: list | None = None, - false_values: list | None = None, - skipinitialspace: bool = False, - skiprows: list[int] | int | Callable[[Hashable], bool] | None = None, - skipfooter: int = 0, - nrows: int | None = None, - # NA and Missing Data Handling - na_values: Hashable - | Iterable[Hashable] - | Mapping[Hashable, Iterable[Hashable]] - | None = None, - keep_default_na: bool = True, - na_filter: bool = True, - skip_blank_lines: bool = True, - # Datetime Handling - parse_dates: bool | Sequence[Hashable] | None = None, - infer_datetime_format: bool | lib.NoDefault = lib.no_default, - keep_date_col: bool | lib.NoDefault = lib.no_default, - date_parser: Callable | lib.NoDefault = lib.no_default, - date_format: str | dict[Hashable, str] | None = None, - dayfirst: bool = False, - cache_dates: bool = True, - # Iteration - iterator: bool = False, - chunksize: int | None = None, - # Quoting, Compression, and File Format - compression: CompressionOptions = "infer", - thousands: str | None = None, - decimal: str = ".", - lineterminator: str | None = None, - quotechar: str = '"', - quoting: int = csv.QUOTE_MINIMAL, - doublequote: bool = True, - escapechar: str | None = None, - comment: str | None = None, - encoding: str | None = None, - encoding_errors: str | None = "strict", - dialect: str | csv.Dialect | None = None, - # Error Handling - on_bad_lines: str = "error", - # Internal - delim_whitespace: bool | lib.NoDefault = lib.no_default, - low_memory: bool = _c_parser_defaults["low_memory"], - memory_map: bool = False, - float_precision: Literal["high", "legacy", "round_trip"] | None = None, - storage_options: StorageOptions | None = None, - dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, -) -> DataFrame | TextFileReader: - # ensures that all keys in dtype are a string for compatibility with csv - dtype = parse_dtype(dtype) - - if keep_date_col is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'keep_date_col' keyword in pd.read_csv is deprecated and " - "will be removed in a future version. Explicitly remove unwanted " - "columns after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), + align_keys = ["cond"] + other = extract_array(other, extract_numpy=True) + + return self.apply( + "where", + align_keys=align_keys, + other=other, + cond=cond, ) - else: - keep_date_col = False - - if lib.is_list_like(parse_dates): - # GH#55569 - depr = False - # error: Item "bool" of "bool | Sequence[Hashable] | None" has no - # attribute "__iter__" (not iterable) - if not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] - depr = True - elif isinstance(parse_dates, dict) and any( - lib.is_list_like(x) for x in parse_dates.values() - ): - depr = True - if depr: - warnings.warn( - "Support for nested sequences for 'parse_dates' in pd.read_csv " - "is deprecated. Combine the desired columns with pd.to_datetime " - "after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if infer_datetime_format is not lib.no_default: - warnings.warn( - "The argument 'infer_datetime_format' is deprecated and will " - "be removed in a future version. " - "A strict version of it is now the default, see " - "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. 
" - "You can safely remove this argument.", - FutureWarning, - stacklevel=find_stack_level(), + @final + def putmask(self, mask, new, align: bool = True) -> Self: + if align: + align_keys = ["new", "mask"] + else: + align_keys = ["mask"] + new = extract_array(new, extract_numpy=True) + + return self.apply( + "putmask", + align_keys=align_keys, + mask=mask, + new=new, ) - if delim_whitespace is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'delim_whitespace' keyword in pd.read_csv is deprecated and " - "will be removed in a future version. Use ``sep='\\s+'`` instead", - FutureWarning, - stacklevel=find_stack_level(), + @final + def round(self, decimals: int) -> Self: + return self.apply("round", decimals=decimals) + + @final + def replace(self, to_replace, value, inplace: bool) -> Self: + inplace = validate_bool_kwarg(inplace, "inplace") + # NDFrame.replace ensures the not-is_list_likes here + assert not lib.is_list_like(to_replace) + assert not lib.is_list_like(value) + return self.apply( + "replace", + to_replace=to_replace, + value=value, + inplace=inplace, ) - else: - delim_whitespace = False - - # locals() should never be modified - kwds = locals().copy() - del kwds["filepath_or_buffer"] - del kwds["sep"] - - kwds_defaults = _refine_defaults_read( - dialect, - delimiter, - delim_whitespace, - engine, - sep, - on_bad_lines, - names, - defaults={"delimiter": ","}, - dtype_backend=dtype_backend, - ) - kwds.update(kwds_defaults) - - return _read(filepath_or_buffer, kwds) - - -@overload -def read_table( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - iterator: Literal[True], - chunksize: int | None = ..., - **kwds: Unpack[_read_shared[HashableT]], -) -> TextFileReader: ... - - -@overload -def read_table( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - iterator: bool = ..., - chunksize: int, - **kwds: Unpack[_read_shared[HashableT]], -) -> TextFileReader: ... - - -@overload -def read_table( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - iterator: Literal[False] = ..., - chunksize: None = ..., - **kwds: Unpack[_read_shared[HashableT]], -) -> DataFrame: ... - - -@overload -def read_table( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - iterator: bool = ..., - chunksize: int | None = ..., - **kwds: Unpack[_read_shared[HashableT]], -) -> DataFrame | TextFileReader: ... - - -@Appender( - _doc_read_csv_and_table.format( - func_name="read_table", - summary="Read general delimited file into DataFrame.", - see_also_func_name="read_csv", - see_also_func_summary=( - "Read a comma-separated values (csv) file into DataFrame." 
- ), - _default_sep=r"'\\t' (tab-stop)", - storage_options=_shared_docs["storage_options"], - decompression_options=_shared_docs["decompression_options"] - % "filepath_or_buffer", - ) -) -def read_table( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - sep: str | None | lib.NoDefault = lib.no_default, - delimiter: str | None | lib.NoDefault = None, - # Column and Index Locations and Names - header: int | Sequence[int] | None | Literal["infer"] = "infer", - names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, - index_col: IndexLabel | Literal[False] | None = None, - usecols: UsecolsArgType = None, - # General Parsing Configuration - dtype: DtypeArg | None = None, - engine: CSVEngine | None = None, - converters: Mapping[HashableT, Callable] | None = None, - true_values: list | None = None, - false_values: list | None = None, - skipinitialspace: bool = False, - skiprows: list[int] | int | Callable[[Hashable], bool] | None = None, - skipfooter: int = 0, - nrows: int | None = None, - # NA and Missing Data Handling - na_values: Hashable - | Iterable[Hashable] - | Mapping[Hashable, Iterable[Hashable]] - | None = None, - keep_default_na: bool = True, - na_filter: bool = True, - skip_blank_lines: bool = True, - # Datetime Handling - parse_dates: bool | Sequence[Hashable] | None = None, - infer_datetime_format: bool | lib.NoDefault = lib.no_default, - keep_date_col: bool | lib.NoDefault = lib.no_default, - date_parser: Callable | lib.NoDefault = lib.no_default, - date_format: str | dict[Hashable, str] | None = None, - dayfirst: bool = False, - cache_dates: bool = True, - # Iteration - iterator: bool = False, - chunksize: int | None = None, - # Quoting, Compression, and File Format - compression: CompressionOptions = "infer", - thousands: str | None = None, - decimal: str = ".", - lineterminator: str | None = None, - quotechar: str = '"', - quoting: int = csv.QUOTE_MINIMAL, - doublequote: bool = True, - escapechar: str | None = None, - comment: str | None = None, - encoding: str | None = None, - encoding_errors: str | None = "strict", - dialect: str | csv.Dialect | None = None, - # Error Handling - on_bad_lines: str = "error", - # Internal - delim_whitespace: bool | lib.NoDefault = lib.no_default, - low_memory: bool = _c_parser_defaults["low_memory"], - memory_map: bool = False, - float_precision: Literal["high", "legacy", "round_trip"] | None = None, - storage_options: StorageOptions | None = None, - dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, -) -> DataFrame | TextFileReader: - if keep_date_col is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'keep_date_col' keyword in pd.read_table is deprecated and " - "will be removed in a future version. 
Explicitly remove unwanted " - "columns after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), + + @final + def replace_regex(self, **kwargs) -> Self: + return self.apply("_replace_regex", **kwargs) + + @final + def replace_list( + self, + src_list: list[Any], + dest_list: list[Any], + inplace: bool = False, + regex: bool = False, + ) -> Self: + """do a list replace""" + inplace = validate_bool_kwarg(inplace, "inplace") + + bm = self.apply( + "replace_list", + src_list=src_list, + dest_list=dest_list, + inplace=inplace, + regex=regex, ) - else: - keep_date_col = False - - # error: Item "bool" of "bool | Sequence[Hashable]" has no attribute "__iter__" - if lib.is_list_like(parse_dates) and not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] - # GH#55569 - warnings.warn( - "Support for nested sequences for 'parse_dates' in pd.read_table " - "is deprecated. Combine the desired columns with pd.to_datetime " - "after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), + bm._consolidate_inplace() + return bm + + def interpolate(self, inplace: bool, **kwargs) -> Self: + return self.apply("interpolate", inplace=inplace, **kwargs) + + def pad_or_backfill(self, inplace: bool, **kwargs) -> Self: + return self.apply("pad_or_backfill", inplace=inplace, **kwargs) + + def shift(self, periods: int, fill_value) -> Self: + if fill_value is lib.no_default: + fill_value = None + + return self.apply("shift", periods=periods, fill_value=fill_value) + + def setitem(self, indexer, value) -> Self: + """ + Set values with indexer. + + For SingleBlockManager, this backs s[indexer] = value + """ + if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim: + raise ValueError(f"Cannot set values with ndim > {self.ndim}") + + if not self._has_no_reference(0): + # this method is only called if there is a single block -> hardcoded 0 + # Split blocks to only copy the columns we want to modify + if self.ndim == 2 and isinstance(indexer, tuple): + blk_loc = self.blklocs[indexer[1]] + if is_list_like(blk_loc) and blk_loc.ndim == 2: + blk_loc = np.squeeze(blk_loc, axis=0) + elif not is_list_like(blk_loc): + # Keep dimension and copy data later + blk_loc = [blk_loc] # type: ignore[assignment] + if len(blk_loc) == 0: + return self.copy(deep=False) + + values = self.blocks[0].values + if values.ndim == 2: + values = values[blk_loc] + # "T" has no attribute "_iset_split_block" + self._iset_split_block( # type: ignore[attr-defined] + 0, blk_loc, values + ) + # first block equals values + self.blocks[0].setitem((indexer[0], np.arange(len(blk_loc))), value) + return self + # No need to split if we either set all columns or on a single block + # manager + self = self.copy() + + return self.apply("setitem", indexer=indexer, value=value) + + def diff(self, n: int) -> Self: + # only reached with self.ndim == 2 + return self.apply("diff", n=n) + + def astype(self, dtype, errors: str = "raise") -> Self: + return self.apply("astype", dtype=dtype, errors=errors) + + def convert(self) -> Self: + return self.apply("convert") + + def convert_dtypes(self, **kwargs): + return self.apply("convert_dtypes", **kwargs) + + def get_values_for_csv( + self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None + ) -> Self: + """ + Convert values to native types (strings / python objects) that are used + in formatting (repr / csv). 
+ """ + return self.apply( + "get_values_for_csv", + na_rep=na_rep, + quoting=quoting, + float_format=float_format, + date_format=date_format, + decimal=decimal, ) - if infer_datetime_format is not lib.no_default: - warnings.warn( - "The argument 'infer_datetime_format' is deprecated and will " - "be removed in a future version. " - "A strict version of it is now the default, see " - "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " - "You can safely remove this argument.", - FutureWarning, - stacklevel=find_stack_level(), + @property + def any_extension_types(self) -> bool: + """Whether any of the blocks in this manager are extension blocks""" + return any(block.is_extension for block in self.blocks) + + @property + def is_view(self) -> bool: + """return a boolean if we are a single block and are a view""" + if len(self.blocks) == 1: + return self.blocks[0].is_view + + # It is technically possible to figure out which blocks are views + # e.g. [ b.values.base is not None for b in self.blocks ] + # but then we have the case of possibly some blocks being a view + # and some blocks not. setting in theory is possible on the non-view + # blocks. But this is a bit + # complicated + + return False + + def _get_data_subset(self, predicate: Callable) -> Self: + blocks = [blk for blk in self.blocks if predicate(blk.values)] + return self._combine(blocks) + + def get_bool_data(self) -> Self: + """ + Select blocks that are bool-dtype and columns from object-dtype blocks + that are all-bool. + """ + + new_blocks = [] + + for blk in self.blocks: + if blk.dtype == bool: + new_blocks.append(blk) + + elif blk.is_object: + nbs = blk._split() + new_blocks.extend(nb for nb in nbs if nb.is_bool) + + return self._combine(new_blocks) + + def get_numeric_data(self) -> Self: + numeric_blocks = [blk for blk in self.blocks if blk.is_numeric] + if len(numeric_blocks) == len(self.blocks): + # Avoid somewhat expensive _combine + return self + return self._combine(numeric_blocks) + + def _combine(self, blocks: list[Block], index: Index | None = None) -> Self: + """return a new manager with the blocks""" + if len(blocks) == 0: + if self.ndim == 2: + # retain our own Index dtype + if index is not None: + axes = [self.items[:0], index] + else: + axes = [self.items[:0]] + self.axes[1:] + return self.make_empty(axes) + return self.make_empty() + + # FIXME: optimization potential + indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) + inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) + + new_blocks: list[Block] = [] + for b in blocks: + nb = b.copy(deep=False) + nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer]) + new_blocks.append(nb) + + axes = list(self.axes) + if index is not None: + axes[-1] = index + axes[0] = self.items.take(indexer) + + return type(self).from_blocks(new_blocks, axes) + + @property + def nblocks(self) -> int: + return len(self.blocks) + + def copy(self, deep: bool | Literal["all"] = True) -> Self: + """ + Make deep or shallow copy of BlockManager + + Parameters + ---------- + deep : bool, string or None, default True + If False or None, return a shallow copy (do not copy data) + If 'all', copy data and a deep copy of the index + + Returns + ------- + BlockManager + """ + # this preserves the notion of view copying of axes + if deep: + # hit in e.g. 
tests.io.json.test_pandas + + def copy_func(ax): + return ax.copy(deep=True) if deep == "all" else ax.view() + + new_axes = [copy_func(ax) for ax in self.axes] + else: + new_axes = [ax.view() for ax in self.axes] + + res = self.apply("copy", deep=deep) + res.axes = new_axes + + if self.ndim > 1: + # Avoid needing to re-compute these + blknos = self._blknos + if blknos is not None: + res._blknos = blknos.copy() + res._blklocs = self._blklocs.copy() + + if deep: + res._consolidate_inplace() + return res + + def is_consolidated(self) -> bool: + return True + + def consolidate(self) -> Self: + """ + Join together blocks having same dtype + + Returns + ------- + y : BlockManager + """ + if self.is_consolidated(): + return self + + bm = type(self)(self.blocks, self.axes, verify_integrity=False) + bm._is_consolidated = False + bm._consolidate_inplace() + return bm + + def _consolidate_inplace(self) -> None: + return + + @final + def reindex_axis( + self, + new_index: Index, + axis: AxisInt, + fill_value=None, + only_slice: bool = False, + ) -> Self: + """ + Conform data manager to new index. + """ + new_index, indexer = self.axes[axis].reindex(new_index) + + return self.reindex_indexer( + new_index, + indexer, + axis=axis, + fill_value=fill_value, + only_slice=only_slice, ) - if delim_whitespace is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'delim_whitespace' keyword in pd.read_table is deprecated and " - "will be removed in a future version. Use ``sep='\\s+'`` instead", - FutureWarning, - stacklevel=find_stack_level(), + def reindex_indexer( + self, + new_axis: Index, + indexer: npt.NDArray[np.intp] | None, + axis: AxisInt, + fill_value=None, + allow_dups: bool = False, + only_slice: bool = False, + *, + use_na_proxy: bool = False, + ) -> Self: + """ + Parameters + ---------- + new_axis : Index + indexer : ndarray[intp] or None + axis : int + fill_value : object, default None + allow_dups : bool, default False + only_slice : bool, default False + Whether to take views, not copies, along columns. + use_na_proxy : bool, default False + Whether to use a np.void ndarray for newly introduced columns. + + pandas-indexer with -1's only. + """ + if indexer is None: + if new_axis is self.axes[axis]: + return self + + result = self.copy(deep=False) + result.axes = list(self.axes) + result.axes[axis] = new_axis + return result + + # Should be intp, but in some cases we get int64 on 32bit builds + assert isinstance(indexer, np.ndarray) + + # some axes don't allow reindexing with dups + if not allow_dups: + self.axes[axis]._validate_can_reindex(indexer) + + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + if axis == 0: + new_blocks = self._slice_take_blocks_ax0( + indexer, + fill_value=fill_value, + only_slice=only_slice, + use_na_proxy=use_na_proxy, + ) + else: + new_blocks = [ + blk.take_nd( + indexer, + axis=1, + fill_value=( + fill_value if fill_value is not None else blk.fill_value + ), + ) + for blk in self.blocks + ] + + new_axes = list(self.axes) + new_axes[axis] = new_axis + + new_mgr = type(self).from_blocks(new_blocks, new_axes) + if axis == 1: + # We can avoid the need to rebuild these + new_mgr._blknos = self.blknos.copy() + new_mgr._blklocs = self.blklocs.copy() + return new_mgr + + def _slice_take_blocks_ax0( + self, + slice_or_indexer: slice | np.ndarray, + fill_value=lib.no_default, + only_slice: bool = False, + *, + use_na_proxy: bool = False, + ref_inplace_op: bool = False, + ) -> list[Block]: + """ + Slice/take blocks along axis=0. 
+ + Overloaded for SingleBlock + + Parameters + ---------- + slice_or_indexer : slice or np.ndarray[int64] + fill_value : scalar, default lib.no_default + only_slice : bool, default False + If True, we always return views on existing arrays, never copies. + This is used when called from ops.blockwise.operate_blockwise. + use_na_proxy : bool, default False + Whether to use a np.void ndarray for newly introduced columns. + ref_inplace_op: bool, default False + Don't track refs if True because we operate inplace + + Returns + ------- + new_blocks : list of Block + """ + allow_fill = fill_value is not lib.no_default + + sl_type, slobj, sllen = _preprocess_slice_or_indexer( + slice_or_indexer, self.shape[0], allow_fill=allow_fill ) - else: - delim_whitespace = False - - # locals() should never be modified - kwds = locals().copy() - del kwds["filepath_or_buffer"] - del kwds["sep"] - - kwds_defaults = _refine_defaults_read( - dialect, - delimiter, - delim_whitespace, - engine, - sep, - on_bad_lines, - names, - defaults={"delimiter": "\t"}, - dtype_backend=dtype_backend, - ) - kwds.update(kwds_defaults) - - return _read(filepath_or_buffer, kwds) - - -@overload -def read_fwf( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - colspecs: Sequence[tuple[int, int]] | str | None = ..., - widths: Sequence[int] | None = ..., - infer_nrows: int = ..., - iterator: Literal[True], - chunksize: int | None = ..., - **kwds: Unpack[_read_shared[HashableT]], -) -> TextFileReader: ... - - -@overload -def read_fwf( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - colspecs: Sequence[tuple[int, int]] | str | None = ..., - widths: Sequence[int] | None = ..., - infer_nrows: int = ..., - iterator: bool = ..., - chunksize: int, - **kwds: Unpack[_read_shared[HashableT]], -) -> TextFileReader: ... - - -@overload -def read_fwf( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - colspecs: Sequence[tuple[int, int]] | str | None = ..., - widths: Sequence[int] | None = ..., - infer_nrows: int = ..., - iterator: Literal[False] = ..., - chunksize: None = ..., - **kwds: Unpack[_read_shared[HashableT]], -) -> DataFrame: ... - - -def read_fwf( - filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - *, - colspecs: Sequence[tuple[int, int]] | str | None = "infer", - widths: Sequence[int] | None = None, - infer_nrows: int = 100, - iterator: bool = False, - chunksize: int | None = None, - **kwds: Unpack[_read_shared[HashableT]], -) -> DataFrame | TextFileReader: - r""" - Read a table of fixed-width formatted lines into DataFrame. - - Also supports optionally iterating or breaking of the file - into chunks. - - Additional help can be found in the `online docs for IO Tools - `_. - Parameters - ---------- - filepath_or_buffer : str, path object, or file-like object - String, path object (implementing ``os.PathLike[str]``), or file-like - object implementing a text ``read()`` function.The string could be a URL. - Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is - expected. A local file could be: - ``file://localhost/path/to/table.csv``. - colspecs : list of tuple (int, int) or 'infer'. optional - A list of tuples giving the extents of the fixed-width - fields of each line as half-open intervals (i.e., [from, to] ). 
- String value 'infer' can be used to instruct the parser to try - detecting the column specifications from the first 100 rows of - the data which are not being skipped via skiprows (default='infer'). - widths : list of int, optional - A list of field widths which can be used instead of 'colspecs' if - the intervals are contiguous. - infer_nrows : int, default 100 - The number of rows to consider when letting the parser determine the - `colspecs`. - iterator : bool, default False - Return ``TextFileReader`` object for iteration or getting chunks with - ``get_chunk()``. - chunksize : int, optional - Number of lines to read from the file per chunk. - **kwds : optional - Optional keyword arguments can be passed to ``TextFileReader``. + if self.is_single_block: + blk = self.blocks[0] + + if sl_type == "slice": + # GH#32959 EABlock would fail since we can't make 0-width + # TODO(EA2D): special casing unnecessary with 2D EAs + if sllen == 0: + return [] + bp = BlockPlacement(slice(0, sllen)) + return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)] + elif not allow_fill or self.ndim == 1: + if allow_fill and fill_value is None: + fill_value = blk.fill_value + + if not allow_fill and only_slice: + # GH#33597 slice instead of take, so we get + # views instead of copies + blocks = [ + blk.getitem_block_columns( + slice(ml, ml + 1), + new_mgr_locs=BlockPlacement(i), + ref_inplace_op=ref_inplace_op, + ) + for i, ml in enumerate(slobj) + ] + return blocks + else: + bp = BlockPlacement(slice(0, sllen)) + return [ + blk.take_nd( + slobj, + axis=0, + new_mgr_locs=bp, + fill_value=fill_value, + ) + ] - Returns - ------- - DataFrame or TextFileReader - A comma-separated values (csv) file is returned as two-dimensional - data structure with labeled axes. - - See Also - -------- - DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. - read_csv : Read a comma-separated values (csv) file into DataFrame. - - Examples - -------- - >>> pd.read_fwf("data.csv") # doctest: +SKIP - """ - # Check input arguments. - if colspecs is None and widths is None: - raise ValueError("Must specify either colspecs or widths") - if colspecs not in (None, "infer") and widths is not None: - raise ValueError("You must specify only one of 'widths' and 'colspecs'") - - # Compute 'colspecs' from 'widths', if specified. - if widths is not None: - colspecs, col = [], 0 - for w in widths: - colspecs.append((col, col + w)) - col += w - - # for mypy - assert colspecs is not None - - # GH#40830 - # Ensure length of `colspecs` matches length of `names` - names = kwds.get("names") - if names is not None and names is not lib.no_default: - if len(names) != len(colspecs) and colspecs != "infer": - # need to check len(index_col) as it might contain - # unnamed indices, in which case it's name is not required - len_index = 0 - if kwds.get("index_col") is not None: - index_col: Any = kwds.get("index_col") - if index_col is not False: - if not is_list_like(index_col): - len_index = 1 + if sl_type == "slice": + blknos = self.blknos[slobj] + blklocs = self.blklocs[slobj] + else: + blknos = algos.take_nd( + self.blknos, slobj, fill_value=-1, allow_fill=allow_fill + ) + blklocs = algos.take_nd( + self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill + ) + + # When filling blknos, make sure blknos is updated before appending to + # blocks list, that way new blkno is exactly len(blocks). 
+ blocks = [] + group = not only_slice + for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group): + if blkno == -1: + # If we've got here, fill_value was not lib.no_default + + blocks.append( + self._make_na_block( + placement=mgr_locs, + fill_value=fill_value, + use_na_proxy=use_na_proxy, + ) + ) + else: + blk = self.blocks[blkno] + + # Otherwise, slicing along items axis is necessary. + if not blk._can_consolidate and not blk._validate_ndim: + # i.e. we dont go through here for DatetimeTZBlock + # A non-consolidatable block, it's easy, because there's + # only one item and each mgr loc is a copy of that single + # item. + deep = False + for mgr_loc in mgr_locs: + newblk = blk.copy(deep=deep) + newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1)) + blocks.append(newblk) + + else: + # GH#32779 to avoid the performance penalty of copying, + # we may try to only slice + taker = blklocs[mgr_locs.indexer] + max_len = max(len(mgr_locs), taker.max() + 1) + taker = lib.maybe_indices_to_slice(taker, max_len) + + if isinstance(taker, slice): + nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs) + blocks.append(nb) + elif only_slice: + # GH#33597 slice instead of take, so we get + # views instead of copies + for i, ml in zip(taker, mgr_locs): + slc = slice(i, i + 1) + bp = BlockPlacement(ml) + nb = blk.getitem_block_columns(slc, new_mgr_locs=bp) + # We have np.shares_memory(nb.values, blk.values) + blocks.append(nb) else: - # for mypy: handled in the if-branch - assert index_col is not lib.no_default - - len_index = len(index_col) - if kwds.get("usecols") is None and len(names) + len_index != len(colspecs): - # If usecols is used colspec may be longer than names - raise ValueError("Length of colspecs must match length of names") - - check_dtype_backend(kwds.setdefault("dtype_backend", lib.no_default)) - return _read( - filepath_or_buffer, - kwds - | { - "colspecs": colspecs, - "infer_nrows": infer_nrows, - "engine": "python-fwf", - "iterator": iterator, - "chunksize": chunksize, - }, - ) + nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) + blocks.append(nb) + return blocks -class TextFileReader(abc.Iterator): - """ + def _make_na_block( + self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False + ) -> Block: + # Note: we only get here with self.ndim == 2 + + if use_na_proxy: + assert fill_value is None + shape = (len(placement), self.shape[1]) + vals = np.empty(shape, dtype=np.void) + nb = NumpyBlock(vals, placement, ndim=2) + return nb + + if fill_value is None or fill_value is np.nan: + fill_value = np.nan + # GH45857 avoid unnecessary upcasting + dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) + if dtype is not None and np.issubdtype(dtype.type, np.floating): + fill_value = dtype.type(fill_value) - Passed dialect overrides any of the related parser options + shape = (len(placement), self.shape[1]) + + dtype, fill_value = infer_dtype_from_scalar(fill_value) + block_values = make_na_array(dtype, shape, fill_value) + return new_block_2d(block_values, placement=placement) + + def take( + self, + indexer: npt.NDArray[np.intp], + axis: AxisInt = 1, + verify: bool = True, + ) -> Self: + """ + Take items along any axis. + + indexer : np.ndarray[np.intp] + axis : int, default 1 + verify : bool, default True + Check that all entries are between 0 and len(self) - 1, inclusive. + Pass verify=False if this check has been done by the caller. 
+ + Returns + ------- + BlockManager + """ + # Caller is responsible for ensuring indexer annotation is accurate + + n = self.shape[axis] + indexer = maybe_convert_indices(indexer, n, verify=verify) + + new_labels = self.axes[axis].take(indexer) + return self.reindex_indexer( + new_axis=new_labels, + indexer=indexer, + axis=axis, + allow_dups=True, + ) + +class BlockManager(libinternals.BlockManager, BaseBlockManager): + """ + BaseBlockManager that holds 2D blocks. """ + ndim = 2 + + # ---------------------------------------------------------------- + # Constructors + def __init__( self, - f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list, - engine: CSVEngine | None = None, - **kwds, + blocks: Sequence[Block], + axes: Sequence[Index], + verify_integrity: bool = True, ) -> None: - if engine is not None: - engine_specified = True - else: - engine = "python" - engine_specified = False - self.engine = engine - self._engine_specified = kwds.get("engine_specified", engine_specified) + if verify_integrity: + # Assertion disabled for performance + # assert all(isinstance(x, Index) for x in axes) + + for block in blocks: + if self.ndim != block.ndim: + raise AssertionError( + f"Number of Block dimensions ({block.ndim}) must equal " + f"number of axes ({self.ndim})" + ) + # As of 2.0, the caller is responsible for ensuring that + # DatetimeTZBlock with block.ndim == 2 has block.values.ndim ==2; + # previously there was a special check for fastparquet compat. + + self._verify_integrity() + + def _verify_integrity(self) -> None: + mgr_shape = self.shape + tot_items = sum(len(x.mgr_locs) for x in self.blocks) + for block in self.blocks: + if block.shape[1:] != mgr_shape[1:]: + raise_construction_error(tot_items, block.shape[1:], self.axes) + if len(self.items) != tot_items: + raise AssertionError( + "Number of manager items must equal union of " + f"block items\n# manager items: {len(self.items)}, # " + f"tot_items: {tot_items}" + ) - _validate_skipfooter(kwds) + @classmethod + def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> Self: + """ + Constructor for BlockManager and SingleBlockManager with same signature. + """ + return cls(blocks, axes, verify_integrity=False) + + # ---------------------------------------------------------------- + # Indexing + + def fast_xs(self, loc: int) -> SingleBlockManager: + """ + Return the array corresponding to `frame.iloc[loc]`. + + Parameters + ---------- + loc : int + + Returns + ------- + np.ndarray or ExtensionArray + """ + if len(self.blocks) == 1: + # TODO: this could be wrong if blk.mgr_locs is not slice(None)-like; + # is this ruled out in the general case? 
+ result: np.ndarray | ExtensionArray = self.blocks[0].iget( + (slice(None), loc) + ) + # in the case of a single block, the new block is a view + bp = BlockPlacement(slice(0, len(result))) + block = new_block( + result, + placement=bp, + ndim=1, + refs=self.blocks[0].refs, + ) + return SingleBlockManager(block, self.axes[0]) - dialect = _extract_dialect(kwds) - if dialect is not None: - if engine == "pyarrow": - raise ValueError( - "The 'dialect' option is not supported with the 'pyarrow' engine" - ) - kwds = _merge_with_dialect_properties(dialect, kwds) + dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) - if kwds.get("header", "infer") == "infer": - kwds["header"] = 0 if kwds.get("names") is None else None + n = len(self) - self.orig_options = kwds + if isinstance(dtype, ExtensionDtype): + # TODO: use object dtype as workaround for non-performant + # EA.__setitem__ methods. (primarily ArrowExtensionArray.__setitem__ + # when iteratively setting individual values) + # https://github.com/pandas-dev/pandas/pull/54508#issuecomment-1675827918 + result = np.empty(n, dtype=object) + else: + result = np.empty(n, dtype=dtype) + result = ensure_wrapped_if_datetimelike(result) + + for blk in self.blocks: + # Such assignment may incorrectly coerce NaT to None + # result[blk.mgr_locs] = blk._slice((slice(None), loc)) + for i, rl in enumerate(blk.mgr_locs): + result[rl] = blk.iget((i, loc)) + + if isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + result = cls._from_sequence(result, dtype=dtype) + + bp = BlockPlacement(slice(0, len(result))) + block = new_block(result, placement=bp, ndim=1) + return SingleBlockManager(block, self.axes[0]) + + def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager: + """ + Return the data as a SingleBlockManager. + """ + block = self.blocks[self.blknos[i]] + values = block.iget(self.blklocs[i]) + + # shortcut for select a single-dim from a 2-dim BM + bp = BlockPlacement(slice(0, len(values))) + nb = type(block)( + values, placement=bp, ndim=1, refs=block.refs if track_ref else None + ) + return SingleBlockManager(nb, self.axes[1]) + + def iget_values(self, i: int) -> ArrayLike: + """ + Return the data for column i as the values (ndarray or ExtensionArray). + + Warning! The returned array is a view but doesn't handle Copy-on-Write, + so this should be used with caution. + """ + # TODO(CoW) making the arrays read-only might make this safer to use? + block = self.blocks[self.blknos[i]] + values = block.iget(self.blklocs[i]) + return values + + @property + def column_arrays(self) -> list[np.ndarray]: + """ + Used in the JSON C code to access column arrays. + This optimizes compared to using `iget_values` by converting each + + Warning! This doesn't handle Copy-on-Write, so should be used with + caution (current use case of consuming this in the JSON code is fine). 
+ """ + # This is an optimized equivalent to + # result = [self.iget_values(i) for i in range(len(self.items))] + result: list[np.ndarray | None] = [None] * len(self.items) + + for blk in self.blocks: + mgr_locs = blk._mgr_locs + values = blk.array_values._values_for_json() + if values.ndim == 1: + # TODO(EA2D): special casing not needed with 2D EAs + result[mgr_locs[0]] = values - # miscellanea - self._currow = 0 + else: + for i, loc in enumerate(mgr_locs): + result[loc] = values[i] - options = self._get_options_with_defaults(engine) - options["storage_options"] = kwds.get("storage_options", None) + # error: Incompatible return value type (got "List[None]", + # expected "List[ndarray[Any, Any]]") + return result # type: ignore[return-value] - self.chunksize = options.pop("chunksize", None) - self.nrows = options.pop("nrows", None) + def iset( + self, + loc: int | slice | np.ndarray, + value: ArrayLike, + inplace: bool = False, + refs: BlockValuesRefs | None = None, + ) -> None: + """ + Set new item in-place. Does not consolidate. Adds new Block if not + contained in the current set of items + """ + + # FIXME: refactor, clearly separate broadcasting & zip-like assignment + # can prob also fix the various if tests for sparse/categorical + if self._blklocs is None and self.ndim > 1: + self._rebuild_blknos_and_blklocs() + + # Note: we exclude DTA/TDA here + value_is_extension_type = is_1d_only_ea_dtype(value.dtype) + if not value_is_extension_type: + if value.ndim == 2: + value = value.T + else: + value = ensure_block_shape(value, ndim=2) - self._check_file_or_buffer(f, engine) - self.options, self.engine = self._clean_options(options, engine) + if value.shape[1:] != self.shape[1:]: + raise AssertionError( + "Shape of new values must be compatible with manager shape" + ) - if "has_index_names" in kwds: - self.options["has_index_names"] = kwds["has_index_names"] + if lib.is_integer(loc): + # We have 6 tests where loc is _not_ an int. + # In this case, get_blkno_placements will yield only one tuple, + # containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1))) + + # Check if we can use _iset_single fastpath + loc = cast(int, loc) + blkno = self.blknos[loc] + blk = self.blocks[blkno] + if len(blk._mgr_locs) == 1: # TODO: fastest way to check this? 
+ return self._iset_single( + loc, + value, + inplace=inplace, + blkno=blkno, + blk=blk, + refs=refs, + ) - self.handles: IOHandles | None = None - self._engine = self._make_engine(f, self.engine) + # error: Incompatible types in assignment (expression has type + # "List[Union[int, slice, ndarray]]", variable has type "Union[int, + # slice, ndarray]") + loc = [loc] # type: ignore[assignment] - def close(self) -> None: - if self.handles is not None: - self.handles.close() - self._engine.close() + # categorical/sparse/datetimetz + if value_is_extension_type: - def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: - kwds = self.orig_options + def value_getitem(placement): + return value - options = {} - default: object | None + else: - for argname, default in parser_defaults.items(): - value = kwds.get(argname, default) + def value_getitem(placement): + return value[placement.indexer] + + # Accessing public blknos ensures the public versions are initialized + blknos = self.blknos[loc] + blklocs = self.blklocs[loc].copy() + + unfit_mgr_locs = [] + unfit_val_locs = [] + removed_blknos = [] + for blkno_l, val_locs in libinternals.get_blkno_placements(blknos, group=True): + blk = self.blocks[blkno_l] + blk_locs = blklocs[val_locs.indexer] + if inplace and blk.should_store(value): + # Updating inplace -> check if we need to do Copy-on-Write + if not self._has_no_reference_block(blkno_l): + self._iset_split_block( + blkno_l, blk_locs, value_getitem(val_locs), refs=refs + ) + else: + blk.set_inplace(blk_locs, value_getitem(val_locs)) + continue + else: + unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) + unfit_val_locs.append(val_locs) - # see gh-12935 - if ( - engine == "pyarrow" - and argname in _pyarrow_unsupported - and value != default - and value != getattr(value, "value", default) - ): - raise ValueError( - f"The {argname!r} option is not supported with the " - f"'pyarrow' engine" + # If all block items are unfit, schedule the block for removal. + if len(val_locs) == len(blk.mgr_locs): + removed_blknos.append(blkno_l) + continue + else: + # Defer setting the new values to enable consolidation + self._iset_split_block(blkno_l, blk_locs, refs=refs) + + if len(removed_blknos): + # Remove blocks & update blknos accordingly + is_deleted = np.zeros(self.nblocks, dtype=np.bool_) + is_deleted[removed_blknos] = True + + new_blknos = np.empty(self.nblocks, dtype=np.intp) + new_blknos.fill(-1) + new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos)) + self._blknos = new_blknos[self._blknos] + self.blocks = tuple( + blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos) + ) + + if unfit_val_locs: + unfit_idxr = np.concatenate(unfit_mgr_locs) + unfit_count = len(unfit_idxr) + + new_blocks: list[Block] = [] + if value_is_extension_type: + # This code (ab-)uses the fact that EA blocks contain only + # one item. 
+ # TODO(EA2D): special casing unnecessary with 2D EAs + new_blocks.extend( + new_block_2d( + values=value, + placement=BlockPlacement(slice(mgr_loc, mgr_loc + 1)), + refs=refs, + ) + for mgr_loc in unfit_idxr ) - options[argname] = value - - for argname, default in _c_parser_defaults.items(): - if argname in kwds: - value = kwds[argname] - - if engine != "c" and value != default: - # TODO: Refactor this logic, its pretty convoluted - if "python" in engine and argname not in _python_unsupported: - pass - elif "pyarrow" in engine and argname not in _pyarrow_unsupported: - pass - else: - raise ValueError( - f"The {argname!r} option is not supported with the " - f"{engine!r} engine" - ) + + self._blknos[unfit_idxr] = np.arange(unfit_count) + len(self.blocks) + self._blklocs[unfit_idxr] = 0 + else: - value = default - options[argname] = value - - if engine == "python-fwf": - for argname, default in _fwf_defaults.items(): - options[argname] = kwds.get(argname, default) - - return options - - def _check_file_or_buffer(self, f, engine: CSVEngine) -> None: - # see gh-16530 - if is_file_like(f) and engine != "c" and not hasattr(f, "__iter__"): - # The C engine doesn't need the file-like to have the "__iter__" - # attribute. However, the Python engine needs "__iter__(...)" - # when iterating through such an object, meaning it - # needs to have that attribute - raise ValueError( - "The 'python' engine cannot iterate through this file buffer." - ) - if hasattr(f, "encoding"): - file_encoding = f.encoding - orig_reader_enc = self.orig_options.get("encoding", None) - any_none = file_encoding is None or orig_reader_enc is None - if file_encoding != orig_reader_enc and not any_none: - file_path = getattr(f, "name", None) - raise ValueError( - f"The specified reader encoding {orig_reader_enc} is different " - f"from the encoding {file_encoding} of file {file_path}." + # unfit_val_locs contains BlockPlacement objects + unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:]) + + new_blocks.append( + new_block_2d( + values=value_getitem(unfit_val_items), + placement=BlockPlacement(unfit_idxr), + refs=refs, + ) ) - def _clean_options( - self, options: dict[str, Any], engine: CSVEngine - ) -> tuple[dict[str, Any], CSVEngine]: - result = options.copy() + self._blknos[unfit_idxr] = len(self.blocks) + self._blklocs[unfit_idxr] = np.arange(unfit_count) - fallback_reason = None + self.blocks += tuple(new_blocks) - # C engine not supported yet - if engine == "c": - if options["skipfooter"] > 0: - fallback_reason = "the 'c' engine does not support skipfooter" - engine = "python" + # Newly created block's dtype may already be present. + self._known_consolidated = False - sep = options["delimiter"] - delim_whitespace = options["delim_whitespace"] + def _iset_split_block( + self, + blkno_l: int, + blk_locs: np.ndarray | list[int], + value: ArrayLike | None = None, + refs: BlockValuesRefs | None = None, + ) -> None: + """Removes columns from a block by splitting the block. + + Avoids copying the whole block through slicing and updates the manager + after determinint the new block structure. Optionally adds a new block, + otherwise has to be done by the caller. + + Parameters + ---------- + blkno_l: The block number to operate on, relevant for updating the manager + blk_locs: The locations of our block that should be deleted. + value: The value to set as a replacement. + refs: The reference tracking object of the value to set. 
+ """ + blk = self.blocks[blkno_l] + + if self._blklocs is None: + self._rebuild_blknos_and_blklocs() + + nbs_tup = tuple(blk.delete(blk_locs)) + if value is not None: + locs = blk.mgr_locs.as_array[blk_locs] + first_nb = new_block_2d(value, BlockPlacement(locs), refs=refs) + else: + first_nb = nbs_tup[0] + nbs_tup = tuple(nbs_tup[1:]) - if sep is None and not delim_whitespace: - if engine in ("c", "pyarrow"): - fallback_reason = ( - f"the '{engine}' engine does not support " - "sep=None with delim_whitespace=False" - ) - engine = "python" - elif sep is not None and len(sep) > 1: - if engine == "c" and sep == r"\s+": - result["delim_whitespace"] = True - del result["delimiter"] - elif engine not in ("python", "python-fwf"): - # wait until regex engine integrated - fallback_reason = ( - f"the '{engine}' engine does not support " - "regex separators (separators > 1 char and " - r"different from '\s+' are interpreted as regex)" - ) - engine = "python" - elif delim_whitespace: - if "python" in engine: - result["delimiter"] = r"\s+" - elif sep is not None: - encodeable = True - encoding = sys.getfilesystemencoding() or "utf-8" - try: - if len(sep.encode(encoding)) > 1: - encodeable = False - except UnicodeDecodeError: - encodeable = False - if not encodeable and engine not in ("python", "python-fwf"): - fallback_reason = ( - f"the separator encoded in {encoding} " - f"is > 1 char long, and the '{engine}' engine " - "does not support such separators" - ) - engine = "python" - - quotechar = options["quotechar"] - if quotechar is not None and isinstance(quotechar, (str, bytes)): - if ( - len(quotechar) == 1 - and ord(quotechar) > 127 - and engine not in ("python", "python-fwf") - ): - fallback_reason = ( - "ord(quotechar) > 127, meaning the " - "quotechar is larger than one byte, " - f"and the '{engine}' engine does not support such quotechars" + nr_blocks = len(self.blocks) + blocks_tup = ( + self.blocks[:blkno_l] + (first_nb,) + self.blocks[blkno_l + 1 :] + nbs_tup + ) + self.blocks = blocks_tup + + if not nbs_tup and value is not None: + # No need to update anything if split did not happen + return + + self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb)) + + for i, nb in enumerate(nbs_tup): + self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) + self._blknos[nb.mgr_locs.indexer] = i + nr_blocks + + def _iset_single( + self, + loc: int, + value: ArrayLike, + inplace: bool, + blkno: int, + blk: Block, + refs: BlockValuesRefs | None = None, + ) -> None: + """ + Fastpath for iset when we are only setting a single position and + the Block currently in that position is itself single-column. + + In this case we can swap out the entire Block and blklocs and blknos + are unaffected. + """ + # Caller is responsible for verifying value.shape + + if inplace and blk.should_store(value): + copy = not self._has_no_reference_block(blkno) + iloc = self.blklocs[loc] + blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy) + return + + nb = new_block_2d(value, placement=blk._mgr_locs, refs=refs) + old_blocks = self.blocks + new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :] + self.blocks = new_blocks + return + + def column_setitem( + self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False + ) -> None: + """ + Set values ("setitem") into a single column (not setting the full column). 
+ + This is a method on the BlockManager level, to avoid creating an + intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`) + """ + if not self._has_no_reference(loc): + blkno = self.blknos[loc] + # Split blocks to only copy the column we want to modify + blk_loc = self.blklocs[loc] + # Copy our values + values = self.blocks[blkno].values + if values.ndim == 1: + values = values.copy() + else: + # Use [blk_loc] as indexer to keep ndim=2, this already results in a + # copy + values = values[[blk_loc]] + self._iset_split_block(blkno, [blk_loc], values) + + # this manager is only created temporarily to mutate the values in place + # so don't track references, otherwise the `setitem` would perform CoW again + col_mgr = self.iget(loc, track_ref=False) + if inplace_only: + col_mgr.setitem_inplace(idx, value) + else: + new_mgr = col_mgr.setitem((idx,), value) + self.iset(loc, new_mgr._block.values, inplace=True) + + def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: + """ + Insert item at selected position. + + Parameters + ---------- + loc : int + item : hashable + value : np.ndarray or ExtensionArray + refs : The reference tracking object of the value to set. + """ + new_axis = self.items.insert(loc, item) + + if value.ndim == 2: + value = value.T + if len(value) > 1: + raise ValueError( + f"Expected a 1D array, got an array with shape {value.T.shape}" ) - engine = "python" - - if fallback_reason and self._engine_specified: - raise ValueError(fallback_reason) - - if engine == "c": - for arg in _c_unsupported: - del result[arg] - - if "python" in engine: - for arg in _python_unsupported: - if fallback_reason and result[arg] != _c_parser_defaults.get(arg): - raise ValueError( - "Falling back to the 'python' engine because " - f"{fallback_reason}, but this causes {arg!r} to be " - "ignored as it is not supported by the 'python' engine." - ) - del result[arg] + else: + value = ensure_block_shape(value, ndim=self.ndim) + + bp = BlockPlacement(slice(loc, loc + 1)) + block = new_block_2d(values=value, placement=bp, refs=refs) + + if not len(self.blocks): + # Fastpath + self._blklocs = np.array([0], dtype=np.intp) + self._blknos = np.array([0], dtype=np.intp) + else: + self._insert_update_mgr_locs(loc) + self._insert_update_blklocs_and_blknos(loc) + + self.axes[0] = new_axis + self.blocks += (block,) + + self._known_consolidated = False - if fallback_reason: + if ( + get_option("performance_warnings") + and sum(not block.is_extension for block in self.blocks) > 100 + ): warnings.warn( - ( - "Falling back to the 'python' engine because " - f"{fallback_reason}; you can avoid this warning by specifying " - "engine='python'." - ), - ParserWarning, + "DataFrame is highly fragmented. This is usually the result " + "of calling `frame.insert` many times, which has poor performance. " + "Consider joining all columns at once using pd.concat(axis=1) " + "instead. 
To get a de-fragmented frame, use `newframe = frame.copy()`", + PerformanceWarning, stacklevel=find_stack_level(), ) - index_col = options["index_col"] - names = options["names"] - converters = options["converters"] - na_values = options["na_values"] - skiprows = options["skiprows"] - - validate_header_arg(options["header"]) - - if index_col is True: - raise ValueError("The value of index_col couldn't be 'True'") - if is_index_col(index_col): - if not isinstance(index_col, (list, tuple, np.ndarray)): - index_col = [index_col] - result["index_col"] = index_col - - names = list(names) if names is not None else names - - # type conversion-related - if converters is not None: - if not isinstance(converters, dict): - raise TypeError( - "Type converters must be a dict or subclass, " - f"input was a {type(converters).__name__}" - ) + def _insert_update_mgr_locs(self, loc) -> None: + """ + When inserting a new Block at location 'loc', we increment + all of the mgr_locs of blocks above that by one. + """ + for blkno, count in _fast_count_smallints(self.blknos[loc:]): + # .620 this way, .326 of which is in increment_above + blk = self.blocks[blkno] + blk._mgr_locs = blk._mgr_locs.increment_above(loc) + + def _insert_update_blklocs_and_blknos(self, loc) -> None: + """ + When inserting a new Block at location 'loc', we update our + _blklocs and _blknos. + """ + + # Accessing public blklocs ensures the public versions are initialized + if loc == self.blklocs.shape[0]: + # np.append is a lot faster, let's use it if we can. + self._blklocs = np.append(self._blklocs, 0) + self._blknos = np.append(self._blknos, len(self.blocks)) + elif loc == 0: + # As of numpy 1.26.4, np.concatenate faster than np.append + self._blklocs = np.concatenate([[0], self._blklocs]) + self._blknos = np.concatenate([[len(self.blocks)], self._blknos]) else: - converters = {} - - # Converting values to NA - keep_default_na = options["keep_default_na"] - floatify = engine != "pyarrow" - na_values, na_fvalues = _clean_na_values( - na_values, keep_default_na, floatify=floatify - ) + new_blklocs, new_blknos = libinternals.update_blklocs_and_blknos( + self.blklocs, self.blknos, loc, len(self.blocks) + ) + self._blklocs = new_blklocs + self._blknos = new_blknos + + def idelete(self, indexer) -> BlockManager: + """ + Delete selected locations, returning a new BlockManager. + """ + is_deleted = np.zeros(self.shape[0], dtype=np.bool_) + is_deleted[indexer] = True + taker = (~is_deleted).nonzero()[0] + + nbs = self._slice_take_blocks_ax0(taker, only_slice=True, ref_inplace_op=True) + new_columns = self.items[~is_deleted] + axes = [new_columns, self.axes[1]] + return type(self)(tuple(nbs), axes, verify_integrity=False) + + # ---------------------------------------------------------------- + # Block-wise Operation + + def grouped_reduce(self, func: Callable) -> Self: + """ + Apply grouped reduction function blockwise, returning a new BlockManager. + + Parameters + ---------- + func : grouped reduction function + + Returns + ------- + BlockManager + """ + result_blocks: list[Block] = [] + + for blk in self.blocks: + if blk.is_object: + # split on object-dtype blocks bc some columns may raise + # while others do not. 
+ for sb in blk._split(): + applied = sb.apply(func) + result_blocks = extend_blocks(applied, result_blocks) + else: + applied = blk.apply(func) + result_blocks = extend_blocks(applied, result_blocks) - # handle skiprows; this is internally handled by the - # c-engine, so only need for python and pyarrow parsers - if engine == "pyarrow": - if not is_integer(skiprows) and skiprows is not None: - # pyarrow expects skiprows to be passed as an integer - raise ValueError( - "skiprows argument must be an integer when using " - "engine='pyarrow'" - ) + if len(result_blocks) == 0: + nrows = 0 else: - if is_integer(skiprows): - skiprows = range(skiprows) - if skiprows is None: - skiprows = set() - elif not callable(skiprows): - skiprows = set(skiprows) - - # put stuff back - result["names"] = names - result["converters"] = converters - result["na_values"] = na_values - result["na_fvalues"] = na_fvalues - result["skiprows"] = skiprows - - return result, engine - - def __next__(self) -> DataFrame: - try: - return self.get_chunk() - except StopIteration: - self.close() - raise - - def _make_engine( + nrows = result_blocks[0].values.shape[-1] + index = Index(range(nrows)) + + return type(self).from_blocks(result_blocks, [self.axes[0], index]) + + def reduce(self, func: Callable) -> Self: + """ + Apply reduction function blockwise, returning a single-row BlockManager. + + Parameters + ---------- + func : reduction function + + Returns + ------- + BlockManager + """ + # If 2D, we assume that we're operating column-wise + assert self.ndim == 2 + + res_blocks: list[Block] = [] + for blk in self.blocks: + nbs = blk.reduce(func) + res_blocks.extend(nbs) + + index = Index([None]) # placeholder + new_mgr = type(self).from_blocks(res_blocks, [self.items, index]) + return new_mgr + + def operate_blockwise(self, other: BlockManager, array_op) -> BlockManager: + """ + Apply array_op blockwise with another (aligned) BlockManager. + """ + return operate_blockwise(self, other, array_op) + + def _equal_values(self: BlockManager, other: BlockManager) -> bool: + """ + Used in .equals defined in base class. Only check the column values + assuming shape and indexes have already been checked. + """ + return blockwise_all(self, other, array_equals) + + def quantile( self, - f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list | IO, - engine: CSVEngine = "c", - ) -> ParserBase: - mapping: dict[str, type[ParserBase]] = { - "c": CParserWrapper, - "python": PythonParser, - "pyarrow": ArrowParserWrapper, - "python-fwf": FixedWidthFieldParser, - } + *, + qs: Index, # with dtype float 64 + interpolation: QuantileInterpolation = "linear", + ) -> Self: + """ + Iterate over blocks applying quantile reduction. + This routine is intended for reduction type operations and + will do inference on the generated blocks. 
+ + Parameters + ---------- + interpolation : type of interpolation, default 'linear' + qs : list of the quantiles to be computed + + Returns + ------- + BlockManager + """ + # Series dispatches to DataFrame for quantile, which allows us to + # simplify some of the code here and in the blocks + assert self.ndim >= 2 + assert is_list_like(qs) # caller is responsible for this + + new_axes = list(self.axes) + new_axes[1] = Index(qs, dtype=np.float64) + + blocks = [ + blk.quantile(qs=qs, interpolation=interpolation) for blk in self.blocks + ] + + return type(self)(blocks, new_axes) + + # ---------------------------------------------------------------- + + def unstack(self, unstacker, fill_value) -> BlockManager: + """ + Return a BlockManager with all blocks unstacked. + + Parameters + ---------- + unstacker : reshape._Unstacker + fill_value : Any + fill_value for newly introduced missing values. + + Returns + ------- + unstacked : BlockManager + """ + new_columns = unstacker.get_new_columns(self.items) + new_index = unstacker.new_index + + allow_fill = not unstacker.mask_all + if allow_fill: + # calculating the full mask once and passing it to Block._unstack is + # faster than letting calculating it in each repeated call + new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape) + needs_masking = new_mask2D.any(axis=0) + else: + needs_masking = np.zeros(unstacker.full_shape[1], dtype=bool) - if engine not in mapping: - raise ValueError( - f"Unknown engine: {engine} (valid options are {mapping.keys()})" - ) - if not isinstance(f, list): - # open file here - is_text = True - mode = "r" - if engine == "pyarrow": - is_text = False - mode = "rb" - elif ( - engine == "c" - and self.options.get("encoding", "utf-8") == "utf-8" - and isinstance(stringify_path(f), str) - ): - # c engine can decode utf-8 bytes, adding TextIOWrapper makes - # the c-engine especially for memory_map=True far slower - is_text = False - if "b" not in mode: - mode += "b" - self.handles = get_handle( - f, - mode, - encoding=self.options.get("encoding", None), - compression=self.options.get("compression", None), - memory_map=self.options.get("memory_map", False), - is_text=is_text, - errors=self.options.get("encoding_errors", "strict"), - storage_options=self.options.get("storage_options", None), + new_blocks: list[Block] = [] + columns_mask: list[np.ndarray] = [] + + if len(self.items) == 0: + factor = 1 + else: + fac = len(new_columns) / len(self.items) + assert fac == int(fac) + factor = int(fac) + + for blk in self.blocks: + mgr_locs = blk.mgr_locs + new_placement = mgr_locs.tile_for_unstack(factor) + + blocks, mask = blk._unstack( + unstacker, + fill_value, + new_placement=new_placement, + needs_masking=needs_masking, ) - assert self.handles is not None - f = self.handles.handle - elif engine != "python": - msg = f"Invalid file path or buffer object type: {type(f)}" - raise ValueError(msg) + new_blocks.extend(blocks) + columns_mask.extend(mask) - try: - return mapping[engine](f, **self.options) - except Exception: - if self.handles is not None: - self.handles.close() - raise + # Block._unstack should ensure this holds, + assert mask.sum() == sum(len(nb._mgr_locs) for nb in blocks) + # In turn this ensures that in the BlockManager call below + # we have len(new_columns) == sum(x.shape[0] for x in new_blocks) + # which suffices to allow us to pass verify_inegrity=False - def _failover_to_python(self) -> None: - raise AbstractMethodError(self) + new_columns = new_columns[columns_mask] - def read(self, nrows: int | None = 
None) -> DataFrame: - if self.engine == "pyarrow": - try: - # error: "ParserBase" has no attribute "read" - df = self._engine.read() # type: ignore[attr-defined] - except Exception: - self.close() - raise - else: - nrows = validate_integer("nrows", nrows) - try: - # error: "ParserBase" has no attribute "read" - ( - index, - columns, - col_dict, - ) = self._engine.read( # type: ignore[attr-defined] - nrows - ) - except Exception: - self.close() - raise - - if index is None: - if col_dict: - # Any column is actually fine: - new_rows = len(next(iter(col_dict.values()))) - index = RangeIndex(self._currow, self._currow + new_rows) + bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False) + return bm + + def to_dict(self) -> dict[str, Self]: + """ + Return a dict of str(dtype) -> BlockManager + + Returns + ------- + values : a dict of dtype -> BlockManager + """ + + bd: dict[str, list[Block]] = {} + for b in self.blocks: + bd.setdefault(str(b.dtype), []).append(b) + + # TODO(EA2D): the combine will be unnecessary with 2D EAs + return {dtype: self._combine(blocks) for dtype, blocks in bd.items()} + + def as_array( + self, + dtype: np.dtype | None = None, + copy: bool = False, + na_value: object = lib.no_default, + ) -> np.ndarray: + """ + Convert the blockmanager data into an numpy array. + + Parameters + ---------- + dtype : np.dtype or None, default None + Data type of the return array. + copy : bool, default False + If True then guarantee that a copy is returned. A value of + False does not guarantee that the underlying data is not + copied. + na_value : object, default lib.no_default + Value to be used as the missing value sentinel. + + Returns + ------- + arr : ndarray + """ + passed_nan = lib.is_float(na_value) and isna(na_value) + + if len(self.blocks) == 0: + arr = np.empty(self.shape, dtype=float) + return arr.transpose() + + if self.is_single_block: + blk = self.blocks[0] + + if na_value is not lib.no_default: + # We want to copy when na_value is provided to avoid + # mutating the original object + if lib.is_np_dtype(blk.dtype, "f") and passed_nan: + # We are already numpy-float and na_value=np.nan + pass else: - new_rows = 0 + copy = True + + if blk.is_extension: + # Avoid implicit conversion of extension blocks to object + + # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no + # attribute "to_numpy" + arr = blk.values.to_numpy( # type: ignore[union-attr] + dtype=dtype, + na_value=na_value, + copy=copy, + ).reshape(blk.shape) + elif not copy: + arr = np.asarray(blk.values, dtype=dtype) else: - new_rows = len(index) + arr = np.array(blk.values, dtype=dtype, copy=copy) - if hasattr(self, "orig_options"): - dtype_arg = self.orig_options.get("dtype", None) - else: - dtype_arg = None - - if isinstance(dtype_arg, dict): - dtype = defaultdict(lambda: None) # type: ignore[var-annotated] - dtype.update(dtype_arg) - elif dtype_arg is not None and pandas_dtype(dtype_arg) in ( - np.str_, - np.object_, - ): - dtype = defaultdict(lambda: dtype_arg) - else: - dtype = None - - if dtype is not None: - new_col_dict = {} - for k, v in col_dict.items(): - d = ( - dtype[k] - if pandas_dtype(dtype[k]) in (np.str_, np.object_) - else None - ) - new_col_dict[k] = Series(v, index=index, dtype=d, copy=False) - else: - new_col_dict = col_dict + if not copy: + arr = arr.view() + arr.flags.writeable = False + else: + arr = self._interleave(dtype=dtype, na_value=na_value) + # The underlying data was copied within _interleave, so no need + # to further copy if copy=True or 
setting na_value - df = DataFrame( - new_col_dict, - columns=columns, - index=index, - copy=False, - ) + if na_value is lib.no_default: + pass + elif arr.dtype.kind == "f" and passed_nan: + pass + else: + arr[isna(arr)] = na_value - self._currow += new_rows - return df + return arr.transpose() - def get_chunk(self, size: int | None = None) -> DataFrame: - if size is None: - size = self.chunksize - if self.nrows is not None: - if self._currow >= self.nrows: - raise StopIteration - size = min(size, self.nrows - self._currow) - return self.read(nrows=size) + def _interleave( + self, + dtype: np.dtype | None = None, + na_value: object = lib.no_default, + ) -> np.ndarray: + """ + Return ndarray from blocks with specified item order + Items must be contained in the blocks + """ + if not dtype: + # Incompatible types in assignment (expression has type + # "Optional[Union[dtype[Any], ExtensionDtype]]", variable has + # type "Optional[dtype[Any]]") + dtype = interleaved_dtype( # type: ignore[assignment] + [blk.dtype for blk in self.blocks] + ) - def __enter__(self) -> Self: - return self + # error: Argument 1 to "ensure_np_dtype" has incompatible type + # "Optional[dtype[Any]]"; expected "Union[dtype[Any], ExtensionDtype]" + dtype = ensure_np_dtype(dtype) # type: ignore[arg-type] + result = np.empty(self.shape, dtype=dtype) + + itemmask = np.zeros(self.shape[0]) + + if dtype == np.dtype("object") and na_value is lib.no_default: + # much more performant than using to_numpy below + for blk in self.blocks: + rl = blk.mgr_locs + arr = blk.get_values(dtype) + result[rl.indexer] = arr + itemmask[rl.indexer] = 1 + return result + + for blk in self.blocks: + rl = blk.mgr_locs + if blk.is_extension: + # Avoid implicit conversion of extension blocks to object + + # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no + # attribute "to_numpy" + arr = blk.values.to_numpy( # type: ignore[union-attr] + dtype=dtype, + na_value=na_value, + ) + else: + arr = blk.get_values(dtype) + result[rl.indexer] = arr + itemmask[rl.indexer] = 1 + + if not itemmask.all(): + raise AssertionError("Some items were not contained in blocks") + + return result + + # ---------------------------------------------------------------- + # Consolidation + + def is_consolidated(self) -> bool: + """ + Return True if more than one block with the same dtype + """ + if not self._known_consolidated: + self._consolidate_check() + return self._is_consolidated + + def _consolidate_check(self) -> None: + if len(self.blocks) == 1: + # fastpath + self._is_consolidated = True + self._known_consolidated = True + return + dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate] + self._is_consolidated = len(dtypes) == len(set(dtypes)) + self._known_consolidated = True + + def _consolidate_inplace(self) -> None: + # In general, _consolidate_inplace should only be called via + # DataFrame._consolidate_inplace, otherwise we will fail to invalidate + # the DataFrame's _item_cache. The exception is for newly-created + # BlockManager objects not yet attached to a DataFrame. + if not self.is_consolidated(): + self.blocks = _consolidate(self.blocks) + self._is_consolidated = True + self._known_consolidated = True + self._rebuild_blknos_and_blklocs() + + # ---------------------------------------------------------------- + # Concatenation + + @classmethod + def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self: + """ + Concatenate uniformly-indexed BlockManagers horizontally. 
+ """ + offset = 0 + blocks: list[Block] = [] + for mgr in mgrs: + for blk in mgr.blocks: + # We need to do getitem_block here otherwise we would be altering + # blk.mgr_locs in place, which would render it invalid. This is only + # relevant in the copy=False case. + nb = blk.slice_block_columns(slice(None)) + nb._mgr_locs = nb._mgr_locs.add(offset) + blocks.append(nb) + + offset += len(mgr.items) + + new_mgr = cls(tuple(blocks), axes) + return new_mgr + + @classmethod + def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self: + """ + Concatenate uniformly-indexed BlockManagers vertically. + """ + raise NotImplementedError("This logic lives (for now) in internals.concat") + + +class SingleBlockManager(BaseBlockManager): + """manage a single block with""" + + @property + def ndim(self) -> Literal[1]: + return 1 + + _is_consolidated = True + _known_consolidated = True + __slots__ = () + is_single_block = True - def __exit__( + def __init__( self, - exc_type: type[BaseException] | None, - exc_value: BaseException | None, - traceback: TracebackType | None, + block: Block, + axis: Index, + verify_integrity: bool = False, ) -> None: - self.close() + # Assertions disabled for performance + # assert isinstance(block, Block), type(block) + # assert isinstance(axis, Index), type(axis) + + self.axes = [axis] + self.blocks = (block,) + + @classmethod + def from_blocks( + cls, + blocks: list[Block], + axes: list[Index], + ) -> Self: + """ + Constructor for BlockManager and SingleBlockManager with same signature. + """ + assert len(blocks) == 1 + assert len(axes) == 1 + return cls(blocks[0], axes[0], verify_integrity=False) + + @classmethod + def from_array( + cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None + ) -> SingleBlockManager: + """ + Constructor for if we have an array that is not yet a Block. + """ + array = maybe_coerce_values(array) + bp = BlockPlacement(slice(0, len(index))) + block = new_block(array, placement=bp, ndim=1, refs=refs) + return cls(block, index) + + def to_2d_mgr(self, columns: Index) -> BlockManager: + """ + Manager analogue of Series.to_frame + """ + blk = self.blocks[0] + arr = ensure_block_shape(blk.values, ndim=2) + bp = BlockPlacement(0) + new_blk = type(blk)(arr, placement=bp, ndim=2, refs=blk.refs) + axes = [columns, self.axes[0]] + return BlockManager([new_blk], axes=axes, verify_integrity=False) + + def _has_no_reference(self, i: int = 0) -> bool: + """ + Check for column `i` if it has references. + (whether it references another array or is itself being referenced) + Returns True if the column has no references. + """ + return not self.blocks[0].refs.has_reference() + + def __getstate__(self): + block_values = [b.values for b in self.blocks] + block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] + axes_array = list(self.axes) + + extra_state = { + "0.14.1": { + "axes": axes_array, + "blocks": [ + {"values": b.values, "mgr_locs": b.mgr_locs.indexer} + for b in self.blocks + ], + } + } + # First three elements of the state are to maintain forward + # compatibility with 0.13.1. + return axes_array, block_values, block_items, extra_state + + def __setstate__(self, state) -> None: + def unpickle_block(values, mgr_locs, ndim: int) -> Block: + # TODO(EA2D): ndim would be unnecessary with 2D EAs + # older pickles may store e.g. 
DatetimeIndex instead of DatetimeArray + values = extract_array(values, extract_numpy=True) + if not isinstance(mgr_locs, BlockPlacement): + mgr_locs = BlockPlacement(mgr_locs) + + values = maybe_coerce_values(values) + return new_block(values, placement=mgr_locs, ndim=ndim) + + if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]: + state = state[3]["0.14.1"] + self.axes = [ensure_index(ax) for ax in state["axes"]] + ndim = len(self.axes) + self.blocks = tuple( + unpickle_block(b["values"], b["mgr_locs"], ndim=ndim) + for b in state["blocks"] + ) + else: + raise NotImplementedError("pre-0.14.1 pickles are no longer supported") -def TextParser(*args, **kwds) -> TextFileReader: - """ - Converts lists of lists/tuples into DataFrames with proper type inference - and optional (e.g. string to datetime) conversion. Also enables iterating - lazily over chunks of large files + self._post_setstate() - Parameters - ---------- - data : file-like object or list - delimiter : separator character to use - dialect : str or csv.Dialect instance, optional - Ignored if delimiter is longer than 1 character - names : sequence, default - header : int, default 0 - Row to use to parse column labels. Defaults to the first row. Prior - rows will be discarded - index_col : int or list, optional - Column or columns to use as the (possibly hierarchical) index - has_index_names: bool, default False - True if the cols defined in index_col have an index name and are - not in the header. - na_values : scalar, str, list-like, or dict, optional - Additional strings to recognize as NA/NaN. - keep_default_na : bool, default True - thousands : str, optional - Thousands separator - comment : str, optional - Comment out remainder of line - parse_dates : bool, default False - keep_date_col : bool, default False - date_parser : function, optional - - .. deprecated:: 2.0.0 - date_format : str or dict of column -> format, default ``None`` - - .. versionadded:: 2.0.0 - skiprows : list of integers - Row numbers to skip - skipfooter : int - Number of line at bottom of file to skip - converters : dict, optional - Dict of functions for converting values in certain columns. Keys can - either be integers or column labels, values are functions that take one - input argument, the cell (not column) content, and return the - transformed content. - encoding : str, optional - Encoding to use for UTF when reading/writing (ex. 'utf-8') - float_precision : str, optional - Specifies which converter the C engine should use for floating-point - values. The options are `None` or `high` for the ordinary converter, - `legacy` for the original lower precision pandas converter, and - `round_trip` for the round-trip converter. - """ - kwds["engine"] = "python" - return TextFileReader(*args, **kwds) + def _post_setstate(self) -> None: + pass + @cache_readonly + def _block(self) -> Block: + return self.blocks[0] -def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = True): - na_fvalues: set | dict - if na_values is None: - if keep_default_na: - na_values = STR_NA_VALUES - else: - na_values = set() - na_fvalues = set() - elif isinstance(na_values, dict): - old_na_values = na_values.copy() - na_values = {} # Prevent aliasing. - - # Convert the values in the na_values dictionary - # into array-likes for further use. This is also - # where we append the default NaN values, provided - # that `keep_default_na=True`. 
- for k, v in old_na_values.items(): - if not is_list_like(v): - v = [v] - - if keep_default_na: - v = set(v) | STR_NA_VALUES - - na_values[k] = v - na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} - else: - if not is_list_like(na_values): - na_values = [na_values] - na_values = _stringify_na_values(na_values, floatify) - if keep_default_na: - na_values = na_values | STR_NA_VALUES - - na_fvalues = _floatify_na_values(na_values) - - return na_values, na_fvalues - - -def _floatify_na_values(na_values): - # create float versions of the na_values - result = set() - for v in na_values: - try: - v = float(v) - if not np.isnan(v): - result.add(v) - except (TypeError, ValueError, OverflowError): - pass - return result - - -def _stringify_na_values(na_values, floatify: bool) -> set[str | float]: - """return a stringified and numeric for these values""" - result: list[str | float] = [] - for x in na_values: - result.append(str(x)) - result.append(x) - try: - v = float(x) - - # we are like 999 here - if v == int(v): - v = int(v) - result.append(f"{v}.0") - result.append(str(v)) - - if floatify: - result.append(v) - except (TypeError, ValueError, OverflowError): - pass - if floatify: - try: - result.append(int(x)) - except (TypeError, ValueError, OverflowError): - pass - return set(result) - - -def _refine_defaults_read( - dialect: str | csv.Dialect | None, - delimiter: str | None | lib.NoDefault, - delim_whitespace: bool, - engine: CSVEngine | None, - sep: str | None | lib.NoDefault, - on_bad_lines: str | Callable, - names: Sequence[Hashable] | None | lib.NoDefault, - defaults: dict[str, Any], - dtype_backend: DtypeBackend | lib.NoDefault, -): - """Validate/refine default values of input parameters of read_csv, read_table. + @final + @property + def array(self) -> ArrayLike: + """ + Quick access to the backing array of the Block. + """ + return self.arrays[0] - Parameters - ---------- - dialect : str or csv.Dialect - If provided, this parameter will override values (default or not) for the - following parameters: `delimiter`, `doublequote`, `escapechar`, - `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to - override values, a ParserWarning will be issued. See csv.Dialect - documentation for more details. - delimiter : str or object - Alias for sep. - delim_whitespace : bool - Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be - used as the sep. Equivalent to setting ``sep='\\s+'``. If this option - is set to True, nothing should be passed in for the ``delimiter`` - parameter. - - .. deprecated:: 2.2.0 - Use ``sep="\\s+"`` instead. - engine : {{'c', 'python'}} - Parser engine to use. The C engine is faster while the python engine is - currently more feature-complete. - sep : str or object - A delimiter provided by the user (str) or a sentinel value, i.e. - pandas._libs.lib.no_default. - on_bad_lines : str, callable - An option for handling bad lines or a sentinel value(None). - names : array-like, optional - List of column names to use. If the file contains a header row, - then you should explicitly pass ``header=0`` to override the column names. - Duplicates in this list are not allowed. - defaults: dict - Default values of input parameters. + # error: Cannot override writeable attribute with read-only property + @property + def _blknos(self) -> None: # type: ignore[override] + """compat with BlockManager""" + return None - Returns - ------- - kwds : dict - Input parameters with correct values. 
- - Raises - ------ - ValueError : - If a delimiter was specified with ``sep`` (or ``delimiter``) and - ``delim_whitespace=True``. - """ - # fix types for sep, delimiter to Union(str, Any) - delim_default = defaults["delimiter"] - kwds: dict[str, Any] = {} - # gh-23761 - # - # When a dialect is passed, it overrides any of the overlapping - # parameters passed in directly. We don't want to warn if the - # default parameters were passed in (since it probably means - # that the user didn't pass them in explicitly in the first place). - # - # "delimiter" is the annoying corner case because we alias it to - # "sep" before doing comparison to the dialect values later on. - # Thus, we need a flag to indicate that we need to "override" - # the comparison to dialect values by checking if default values - # for BOTH "delimiter" and "sep" were provided. - if dialect is not None: - kwds["sep_override"] = delimiter is None and ( - sep is lib.no_default or sep == delim_default - ) + # error: Cannot override writeable attribute with read-only property + @property + def _blklocs(self) -> None: # type: ignore[override] + """compat with BlockManager""" + return None - if delimiter and (sep is not lib.no_default): - raise ValueError("Specified a sep and a delimiter; you can only specify one.") + def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> Self: + # similar to get_slice, but not restricted to slice indexer + blk = self._block + if len(indexer) > 0 and indexer.all(): + return type(self)(blk.copy(deep=False), self.index) + array = blk.values[indexer] - kwds["names"] = None if names is lib.no_default else names + if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "b": + # boolean indexing always gives a copy with numpy + refs = None + else: + # TODO(CoW) in theory only need to track reference if new_array is a view + refs = blk.refs + + bp = BlockPlacement(slice(0, len(array))) + block = type(blk)(array, placement=bp, ndim=1, refs=refs) + + new_idx = self.index[indexer] + return type(self)(block, new_idx) + + def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager: + # Assertion disabled for performance + # assert isinstance(slobj, slice), type(slobj) + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + blk = self._block + array = blk.values[slobj] + bp = BlockPlacement(slice(0, len(array))) + # TODO this method is only used in groupby SeriesSplitter at the moment, + # so passing refs is not yet covered by the tests + block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs) + new_index = self.index._getitem_slice(slobj) + return type(self)(block, new_index) + + @property + def index(self) -> Index: + return self.axes[0] + + @property + def dtype(self) -> DtypeObj: + return self._block.dtype + + def get_dtypes(self) -> npt.NDArray[np.object_]: + return np.array([self._block.dtype], dtype=object) + + def external_values(self): + """The array that Series.values returns""" + return self._block.external_values() + + def internal_values(self): + """The array that Series._values returns""" + return self._block.values + + def array_values(self) -> ExtensionArray: + """The array that Series.array returns""" + return self._block.array_values + + def get_numeric_data(self) -> Self: + if self._block.is_numeric: + return self.copy(deep=False) + return self.make_empty() + + @property + def _can_hold_na(self) -> bool: + return self._block._can_hold_na + + def setitem_inplace(self, indexer, value) -> None: + """ + Set values with indexer. 
+ + For SingleBlockManager, this backs s[indexer] = value + + This is an inplace version of `setitem()`, mutating the manager/values + in place, not returning a new Manager (and Block), and thus never changing + the dtype. + """ + if not self._has_no_reference(0): + self.blocks = (self._block.copy(),) + self._cache.clear() + + arr = self.array + + # EAs will do this validation in their own __setitem__ methods. + if isinstance(arr, np.ndarray): + # Note: checking for ndarray instead of np.dtype means we exclude + # dt64/td64, which do their own validation. + value = np_can_hold_element(arr.dtype, value) + + if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1: + # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 + value = value[0, ...] + + arr[indexer] = value + + def idelete(self, indexer) -> SingleBlockManager: + """ + Delete single location from SingleBlockManager. + + Ensures that self.blocks doesn't become empty. + """ + nb = self._block.delete(indexer)[0] + self.blocks = (nb,) + self.axes[0] = self.axes[0].delete(indexer) + self._cache.clear() + return self - # Alias sep -> delimiter. - if delimiter is None: - delimiter = sep + def fast_xs(self, loc): + """ + fast path for getting a cross-section + return a view of the data + """ + raise NotImplementedError("Use series._values[loc] instead") + + def set_values(self, values: ArrayLike) -> None: + """ + Set the values of the single block in place. + + Use at your own risk! This does not check if the passed values are + valid for the current Block/SingleBlockManager (length, dtype, etc), + and this does not properly keep track of references. + """ + # NOTE(CoW) Currently this is only used for FrameColumnApply.series_generator + # which handles CoW by setting the refs manually if necessary + self.blocks[0].values = values + self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values))) + + def _equal_values(self, other: Self) -> bool: + """ + Used in .equals defined in base class. Only check the column values + assuming shape and indexes have already been checked. 
+ """ + # For SingleBlockManager (i.e.Series) + if other.ndim != 1: + return False + left = self.blocks[0].values + right = other.blocks[0].values + return array_equals(left, right) + + def grouped_reduce(self, func): + arr = self.array + res = func(arr) + index = default_index(len(res)) + + mgr = type(self).from_array(res, index) + return mgr + + +# -------------------------------------------------------------------- +# Constructor Helpers + + +def create_block_manager_from_blocks( + blocks: list[Block], + axes: list[Index], + consolidate: bool = True, + verify_integrity: bool = True, +) -> BlockManager: + # If verify_integrity=False, then caller is responsible for checking + # all(x.shape[-1] == len(axes[1]) for x in blocks) + # sum(x.shape[0] for x in blocks) == len(axes[0]) + # set(x for blk in blocks for x in blk.mgr_locs) == set(range(len(axes[0]))) + # all(blk.ndim == 2 for blk in blocks) + # This allows us to safely pass verify_integrity=False + + try: + mgr = BlockManager(blocks, axes, verify_integrity=verify_integrity) + + except ValueError as err: + arrays = [blk.values for blk in blocks] + tot_items = sum(arr.shape[0] for arr in arrays) + raise_construction_error(tot_items, arrays[0].shape[1:], axes, err) + + if consolidate: + mgr._consolidate_inplace() + return mgr + + +def create_block_manager_from_column_arrays( + arrays: list[ArrayLike], + axes: list[Index], + consolidate: bool, + refs: list, +) -> BlockManager: + # Assertions disabled for performance (caller is responsible for verifying) + # assert isinstance(axes, list) + # assert all(isinstance(x, Index) for x in axes) + # assert all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays) + # assert all(type(x) is not NumpyExtensionArray for x in arrays) + # assert all(x.ndim == 1 for x in arrays) + # assert all(len(x) == len(axes[1]) for x in arrays) + # assert len(arrays) == len(axes[0]) + # These last three are sufficient to allow us to safely pass + # verify_integrity=False below. + + try: + blocks = _form_blocks(arrays, consolidate, refs) + mgr = BlockManager(blocks, axes, verify_integrity=False) + except ValueError as e: + raise_construction_error(len(arrays), arrays[0].shape, axes, e) + if consolidate: + mgr._consolidate_inplace() + return mgr + + +def raise_construction_error( + tot_items: int, + block_shape: Shape, + axes: list[Index], + e: ValueError | None = None, +) -> NoReturn: + """raise a helpful message about our construction""" + passed = tuple(map(int, [tot_items] + list(block_shape))) + # Correcting the user facing error message during dataframe construction + if len(passed) <= 2: + passed = passed[::-1] + + implied = tuple(len(ax) for ax in axes) + # Correcting the user facing error message during dataframe construction + if len(implied) <= 2: + implied = implied[::-1] + + # We return the exception object instead of raising it so that we + # can raise it in the caller; mypy plays better with that + if passed == implied and e is not None: + raise e + if block_shape[0] == 0: + raise ValueError("Empty data passed with indices specified.") + raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}") + + +# ----------------------------------------------------------------------- + + +def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, DtypeObj]: + dtype = tup[1].dtype + + if is_1d_only_ea_dtype(dtype): + # We know these won't be consolidated, so don't need to group these. 
+ # This avoids expensive comparisons of CategoricalDtype objects + sep = id(dtype) + else: + sep = 0 - if delim_whitespace and (delimiter is not lib.no_default): - raise ValueError( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) + return sep, dtype - if delimiter == "\n": - raise ValueError( - r"Specified \n as separator or delimiter. This forces the python engine " - "which does not accept a line terminator. Hence it is not allowed to use " - "the line terminator as separator.", - ) - if delimiter is lib.no_default: - # assign default separator value - kwds["delimiter"] = delim_default - else: - kwds["delimiter"] = delimiter +def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]: + tuples = list(enumerate(arrays)) - if engine is not None: - kwds["engine_specified"] = True - else: - kwds["engine"] = "c" - kwds["engine_specified"] = False - - if on_bad_lines == "error": - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR - elif on_bad_lines == "warn": - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN - elif on_bad_lines == "skip": - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP - elif callable(on_bad_lines): - if engine not in ["python", "pyarrow"]: - raise ValueError( - "on_bad_line can only be a callable function " - "if engine='python' or 'pyarrow'" - ) - kwds["on_bad_lines"] = on_bad_lines - else: - raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") + if not consolidate: + return _tuples_to_blocks_no_consolidate(tuples, refs) - check_dtype_backend(dtype_backend) + # when consolidating, we can ignore refs (either stacking always copies, + # or the EA is already copied in the calling dict_to_mgr) - kwds["dtype_backend"] = dtype_backend + # group by dtype + grouper = itertools.groupby(tuples, _grouping_func) - return kwds + nbs: list[Block] = [] + for (_, dtype), tup_block in grouper: + block_type = get_block_type(dtype) + if isinstance(dtype, np.dtype): + is_dtlike = dtype.kind in "mM" -def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None: - """ - Extract concrete csv dialect instance. 
+ if issubclass(dtype.type, (str, bytes)): + dtype = np.dtype(object) - Returns - ------- - csv.Dialect or None - """ - if kwds.get("dialect") is None: - return None + values, placement = _stack_arrays(list(tup_block), dtype) + if is_dtlike: + values = ensure_wrapped_if_datetimelike(values) + blk = block_type(values, placement=BlockPlacement(placement), ndim=2) + nbs.append(blk) - dialect = kwds["dialect"] - if dialect in csv.list_dialects(): - dialect = csv.get_dialect(dialect) + elif is_1d_only_ea_dtype(dtype): + dtype_blocks = [ + block_type(x[1], placement=BlockPlacement(x[0]), ndim=2) + for x in tup_block + ] + nbs.extend(dtype_blocks) - _validate_dialect(dialect) + else: + dtype_blocks = [ + block_type( + ensure_block_shape(x[1], 2), placement=BlockPlacement(x[0]), ndim=2 + ) + for x in tup_block + ] + nbs.extend(dtype_blocks) + return nbs - return dialect +def _tuples_to_blocks_no_consolidate(tuples, refs) -> list[Block]: + # tuples produced within _form_blocks are of the form (placement, array) + return [ + new_block_2d( + ensure_block_shape(arr, ndim=2), placement=BlockPlacement(i), refs=ref + ) + for ((i, arr), ref) in zip(tuples, refs) + ] -MANDATORY_DIALECT_ATTRS = ( - "delimiter", - "doublequote", - "escapechar", - "skipinitialspace", - "quotechar", - "quoting", -) +def _stack_arrays(tuples, dtype: np.dtype): + placement, arrays = zip(*tuples) -def _validate_dialect(dialect: csv.Dialect) -> None: - """ - Validate csv dialect instance. + first = arrays[0] + shape = (len(arrays),) + first.shape - Raises - ------ - ValueError - If incorrect dialect is provided. - """ - for param in MANDATORY_DIALECT_ATTRS: - if not hasattr(dialect, param): - raise ValueError(f"Invalid dialect {dialect} provided") + stacked = np.empty(shape, dtype=dtype) + for i, arr in enumerate(arrays): + stacked[i] = arr + + return stacked, placement -def _merge_with_dialect_properties( - dialect: csv.Dialect, - defaults: dict[str, Any], -) -> dict[str, Any]: +def _consolidate(blocks: tuple[Block, ...]) -> tuple[Block, ...]: """ - Merge default kwargs in TextFileReader with dialect parameters. + Merge blocks having same dtype, exclude non-consolidating blocks + """ + # sort by _can_consolidate, dtype + gkey = lambda x: x._consolidate_key + grouper = itertools.groupby(sorted(blocks, key=gkey), gkey) + + new_blocks: list[Block] = [] + for (_can_consolidate, dtype), group_blocks in grouper: + merged_blocks, _ = _merge_blocks( + list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate + ) + new_blocks = extend_blocks(merged_blocks, new_blocks) + return tuple(new_blocks) - Parameters - ---------- - dialect : csv.Dialect - Concrete csv dialect. See csv.Dialect documentation for more details. - defaults : dict - Keyword arguments passed to TextFileReader. - Returns - ------- - kwds : dict - Updated keyword arguments, merged with dialect parameters. - """ - kwds = defaults.copy() +def _merge_blocks( + blocks: list[Block], dtype: DtypeObj, can_consolidate: bool +) -> tuple[list[Block], bool]: + if len(blocks) == 1: + return blocks, False - for param in MANDATORY_DIALECT_ATTRS: - dialect_val = getattr(dialect, param) + if can_consolidate: + # TODO: optimization potential in case all mgrs contain slices and + # combination of those slices is a slice, too. 
+ new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks]) - parser_default = parser_defaults[param] - provided = kwds.get(param, parser_default) + new_values: ArrayLike - # Messages for conflicting values between the dialect - # instance and the actual parameters provided. - conflict_msgs = [] + if isinstance(blocks[0].dtype, np.dtype): + # error: List comprehension has incompatible type List[Union[ndarray, + # ExtensionArray]]; expected List[Union[complex, generic, + # Sequence[Union[int, float, complex, str, bytes, generic]], + # Sequence[Sequence[Any]], SupportsArray]] + new_values = np.vstack([b.values for b in blocks]) # type: ignore[misc] + else: + bvals = [blk.values for blk in blocks] + bvals2 = cast(Sequence[NDArrayBackedExtensionArray], bvals) + new_values = bvals2[0]._concat_same_type(bvals2, axis=0) - # Don't warn if the default parameter was passed in, - # even if it conflicts with the dialect (gh-23761). - if provided not in (parser_default, dialect_val): - msg = ( - f"Conflicting values for '{param}': '{provided}' was " - f"provided, but the dialect specifies '{dialect_val}'. " - "Using the dialect-specified value." - ) + argsort = np.argsort(new_mgr_locs) + new_values = new_values[argsort] + new_mgr_locs = new_mgr_locs[argsort] - # Annoying corner case for not warning about - # conflicts between dialect and delimiter parameter. - # Refer to the outer "_read_" function for more info. - if not (param == "delimiter" and kwds.pop("sep_override", False)): - conflict_msgs.append(msg) + bp = BlockPlacement(new_mgr_locs) + return [new_block_2d(new_values, placement=bp)], True - if conflict_msgs: - warnings.warn( - "\n\n".join(conflict_msgs), ParserWarning, stacklevel=find_stack_level() - ) - kwds[param] = dialect_val - return kwds + # can't consolidate --> no merge + return blocks, False -def _validate_skipfooter(kwds: dict[str, Any]) -> None: - """ - Check whether skipfooter is compatible with other kwargs in TextFileReader. +def _fast_count_smallints(arr: npt.NDArray[np.intp]): + """Faster version of set(arr) for sequences of small numbers.""" + counts = np.bincount(arr) + nz = counts.nonzero()[0] + # Note: list(zip(...) outperforms list(np.c_[nz, counts[nz]]) here, + # in one benchmark by a factor of 11 + return zip(nz, counts[nz]) - Parameters - ---------- - kwds : dict - Keyword arguments passed to TextFileReader. - Raises - ------ - ValueError - If skipfooter is not compatible with other parameters. - """ - if kwds.get("skipfooter"): - if kwds.get("iterator") or kwds.get("chunksize"): - raise ValueError("'skipfooter' not supported for iteration") - if kwds.get("nrows"): - raise ValueError("'skipfooter' not supported with 'nrows'") +def _preprocess_slice_or_indexer( + slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool +): + if isinstance(slice_or_indexer, slice): + return ( + "slice", + slice_or_indexer, + libinternals.slice_len(slice_or_indexer, length), + ) + else: + if ( + not isinstance(slice_or_indexer, np.ndarray) + or slice_or_indexer.dtype.kind != "i" + ): + dtype = getattr(slice_or_indexer, "dtype", None) + raise TypeError(type(slice_or_indexer), dtype) + + indexer = ensure_platform_int(slice_or_indexer) + if not allow_fill: + indexer = maybe_convert_indices(indexer, length) + return "fancy", indexer, len(indexer) + + +def make_na_array(dtype: DtypeObj, shape: Shape, fill_value) -> ArrayLike: + if isinstance(dtype, DatetimeTZDtype): + # NB: exclude e.g. 
pyarrow[dt64tz] dtypes + ts = Timestamp(fill_value).as_unit(dtype.unit) + i8values = np.full(shape, ts._value) + dt64values = i8values.view(f"M8[{dtype.unit}]") + return DatetimeArray._simple_new(dt64values, dtype=dtype) + + elif is_1d_only_ea_dtype(dtype): + dtype = cast(ExtensionDtype, dtype) + cls = dtype.construct_array_type() + + missing_arr = cls._from_sequence([], dtype=dtype) + ncols, nrows = shape + assert ncols == 1, ncols + empty_arr = -1 * np.ones((nrows,), dtype=np.intp) + return missing_arr.take(empty_arr, allow_fill=True, fill_value=fill_value) + elif isinstance(dtype, ExtensionDtype): + # TODO: no tests get here, a handful would if we disabled + # the dt64tz special-case above (which is faster) + cls = dtype.construct_array_type() + missing_arr = cls._empty(shape=shape, dtype=dtype) + missing_arr[:] = fill_value + return missing_arr + else: + # NB: we should never get here with dtype integer or bool; + # if we did, the missing_arr.fill would cast to gibberish + missing_arr_np = np.empty(shape, dtype=dtype) + missing_arr_np.fill(fill_value) + + if dtype.kind in "mM": + missing_arr_np = ensure_wrapped_if_datetimelike(missing_arr_np) + return missing_arr_np + \ No newline at end of file From 4059ff670b5163e24616e3df8ebd0d297b019d85 Mon Sep 17 00:00:00 2001 From: Gabe Barnard Date: Sat, 20 Apr 2024 23:54:51 -0500 Subject: [PATCH 4/4] fixed implicit conversion of 1-arrays inside data frames --- pandas/core/internals/managers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 8fda9cd23b508..6b04f04262b13 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -2171,7 +2171,10 @@ def setitem_inplace(self, indexer, value) -> None: # dt64/td64, which do their own validation. value = np_can_hold_element(arr.dtype, value) - if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1: + # check if the dtype of the block is object + implicit_convert = arr.dtype != 'object' + if (isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1 + and implicit_convert): # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 value = value[0, ...]
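
The hunk above only changes when the length-1 unwrapping runs: it is skipped for
object-dtype blocks, so a one-element ndarray assigned into object-dtype data can be
stored as the array itself instead of being collapsed to its lone element. As a rough
standalone sketch of that guard (not part of the patch; the helper name and sample
values are illustrative only):

    import numpy as np

    def maybe_unwrap_length_one(block_dtype: np.dtype, value):
        # Mirrors the gated unwrap added above: only collapse a 1-D, length-1
        # ndarray to its element when the destination block is not object dtype.
        implicit_convert = block_dtype != "object"
        if (
            isinstance(value, np.ndarray)
            and value.ndim == 1
            and len(value) == 1
            and implicit_convert
        ):
            value = value[0, ...]  # same NumPy 1.25-safe unwrap as before
        return value

    print(maybe_unwrap_length_one(np.dtype("float64"), np.array([1.5])))  # 1.5
    print(maybe_unwrap_length_one(np.dtype(object), np.array([1.5])))     # [1.5]

In Series terms the intended effect appears to be that an assignment like
s[loc] = np.array([1.5]) on an object-dtype Series keeps the one-element array rather
than the unwrapped scalar; the commit messages do not spell out a user-facing repro,
so treat that reading as an assumption rather than a documented behaviour change.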