From 323838195fac7122cac9537f53637e10c0fb5692 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 20 Feb 2023 11:26:34 +0000 Subject: [PATCH 1/2] TYP: Use Self for type checking (pandas/core/internals/) --- .startup.ipy | 48 +++++++++++++++++ pandas/core/internals/array_manager.py | 68 ++++++++++++------------ pandas/core/internals/base.py | 22 ++++---- pandas/core/internals/managers.py | 72 +++++++++++++------------- 4 files changed, 128 insertions(+), 82 deletions(-) create mode 100644 .startup.ipy diff --git a/.startup.ipy b/.startup.ipy new file mode 100644 index 0000000000000..15c80b2ab6f30 --- /dev/null +++ b/.startup.ipy @@ -0,0 +1,48 @@ +>>> from itertools import product +>>> import numpy as np +>>> import pandas as pd +>>> from pandas.core.reshape.concat import _Concatenator +>>> +>>> def manual_concat(df_list: list[pd.DataFrame]) -> pd.DataFrame: +... columns = [col for df in df_list for col in df.columns] +... columns = list(dict.fromkeys(columns)) +... index = np.hstack([df.index.values for df in df_list]) +... df_list = [df.reindex(columns=columns) for df in df_list] +... values = np.vstack([df.values for df in df_list]) +... return pd.DataFrame(values, index=index, columns=columns, dtype=df_list[0].dtypes[0]) +>>> +>>> def compare_frames(df_list: list[pd.DataFrame]) -> None: +... concat_df = pd.concat(df_list) +... manual_df = manual_concat(df_list) +... if not concat_df.equals(manual_df): +... raise ValueError("different concatenations!") +>>> +>>> def make_dataframes(num_dfs, num_idx, num_cols, dtype=np.int32, drop_column=False) -> list[pd.DataFrame]: +... values = np.random.randint(-100, 100, size=[num_idx, num_cols]) +... index = [f"i{i}" for i in range(num_idx)] +... columns = np.random.choice([f"c{i}" for i in range(num_cols)], num_cols, replace=False) +... df = pd.DataFrame(values, index=index, columns=columns, dtype=dtype) +... +... df_list = [] +... for i in range(num_dfs): +... new_df = df.copy() +... if drop_column: +... label = new_df.columns[i] +... new_df = new_df.drop(label, axis=1) +... df_list.append(new_df) +... return df_list +>>> +>>> test_data = [ # num_idx, num_cols, num_dfs +... [100, 1_000, 3], +... ] +>>> for i, (num_idx, num_cols, num_dfs) in enumerate(test_data): +... print(f"\n{i}: {num_dfs=}, {num_idx=}, {num_cols=}") +... df_list = make_dataframes(num_dfs, num_idx, num_cols, drop_column=False) +... df_list_dropped = make_dataframes(num_dfs, num_idx, num_cols, drop_column=True) +... print("manual:") +... %timeit manual_concat(df_list) +... compare_frames(df_list) +... for use_dropped in [False, True]: +... print(f"pd.concat: {use_dropped=}") +... this_df_list = df_list if not use_dropped else df_list_dropped +... %timeit pd.concat(this_df_list) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 2de970466e19f..02b12d635739c 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -9,7 +9,6 @@ Callable, Hashable, Literal, - TypeVar, ) import numpy as np @@ -92,10 +91,10 @@ ArrayLike, AxisInt, DtypeObj, + Self, QuantileInterpolation, npt, ) -T = TypeVar("T", bound="BaseArrayManager") class BaseArrayManager(DataManager): @@ -131,7 +130,7 @@ def __init__( ) -> None: raise NotImplementedError - def make_empty(self: T, axes=None) -> T: + def make_empty(self, axes=None) -> Self: """Return an empty ArrayManager with the items axis of len 0 (no columns)""" if axes is None: axes = [self.axes[1:], Index([])] @@ -195,11 +194,11 @@ def __repr__(self) -> str: return output def apply( - self: T, + self, f, align_keys: list[str] | None = None, **kwargs, - ) -> T: + ) -> Self: """ Iterate over the arrays, collect and create a new ArrayManager. @@ -257,8 +256,8 @@ def apply( return type(self)(result_arrays, new_axes) # type: ignore[arg-type] def apply_with_block( - self: T, f, align_keys=None, swap_axis: bool = True, **kwargs - ) -> T: + self, f, align_keys=None, swap_axis: bool = True, **kwargs + ) -> Self: # switch axis to follow BlockManager logic if swap_axis and "axis" in kwargs and self.ndim == 2: kwargs["axis"] = 1 if kwargs["axis"] == 0 else 0 @@ -311,7 +310,7 @@ def apply_with_block( return type(self)(result_arrays, self._axes) - def where(self: T, other, cond, align: bool) -> T: + def where(self, other, cond, align: bool) -> Self: if align: align_keys = ["other", "cond"] else: @@ -325,13 +324,13 @@ def where(self: T, other, cond, align: bool) -> T: cond=cond, ) - def round(self: T, decimals: int, using_cow: bool = False) -> T: + def round(self, decimals: int, using_cow: bool = False) -> Self: return self.apply_with_block("round", decimals=decimals, using_cow=using_cow) - def setitem(self: T, indexer, value) -> T: + def setitem(self, indexer, value) -> Self: return self.apply_with_block("setitem", indexer=indexer, value=value) - def putmask(self: T, mask, new, align: bool = True) -> T: + def putmask(self, mask, new, align: bool = True) -> Self: if align: align_keys = ["new", "mask"] else: @@ -345,14 +344,14 @@ def putmask(self: T, mask, new, align: bool = True) -> T: new=new, ) - def diff(self: T, n: int, axis: AxisInt) -> T: + def diff(self, n: int, axis: AxisInt) -> Self: assert self.ndim == 2 and axis == 0 # caller ensures return self.apply(algos.diff, n=n, axis=axis) - def interpolate(self: T, **kwargs) -> T: + def interpolate(self, **kwargs) -> Self: return self.apply_with_block("interpolate", swap_axis=False, **kwargs) - def shift(self: T, periods: int, axis: AxisInt, fill_value) -> T: + def shift(self, periods: int, axis: AxisInt, fill_value) -> Self: if fill_value is lib.no_default: fill_value = None @@ -364,7 +363,7 @@ def shift(self: T, periods: int, axis: AxisInt, fill_value) -> T: "shift", periods=periods, axis=axis, fill_value=fill_value ) - def fillna(self: T, value, limit, inplace: bool, downcast) -> T: + def fillna(self, value, limit, inplace: bool, downcast) -> Self: if limit is not None: # Do this validation even if we go through one of the no-op paths limit = libalgos.validate_limit(None, limit=limit) @@ -373,13 +372,13 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T: "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast ) - def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T: + def astype(self, dtype, copy: bool | None = False, errors: str = "raise") -> Self: if copy is None: copy = True return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors) - def convert(self: T, copy: bool | None) -> T: + def convert(self, copy: bool | None) -> Self: if copy is None: copy = True @@ -402,10 +401,10 @@ def _convert(arr): return self.apply(_convert) - def replace_regex(self: T, **kwargs) -> T: + def replace_regex(self, **kwargs) -> Self: return self.apply_with_block("_replace_regex", **kwargs) - def replace(self: T, to_replace, value, inplace: bool) -> T: + def replace(self, to_replace, value, inplace: bool) -> Self: inplace = validate_bool_kwarg(inplace, "inplace") assert np.ndim(value) == 0, value # TODO "replace" is right now implemented on the blocks, we should move @@ -415,12 +414,12 @@ def replace(self: T, to_replace, value, inplace: bool) -> T: ) def replace_list( - self: T, + self, src_list: list[Any], dest_list: list[Any], inplace: bool = False, regex: bool = False, - ) -> T: + ) -> Self: """do a list replace""" inplace = validate_bool_kwarg(inplace, "inplace") @@ -432,7 +431,7 @@ def replace_list( regex=regex, ) - def to_native_types(self: T, **kwargs) -> T: + def to_native_types(self, **kwargs) -> Self: return self.apply(to_native_types, **kwargs) @property @@ -458,7 +457,7 @@ def is_view(self) -> bool: def is_single_block(self) -> bool: return len(self.arrays) == 1 - def _get_data_subset(self: T, predicate: Callable) -> T: + def _get_data_subset(self, predicate: Callable) -> Self: indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)] arrays = [self.arrays[i] for i in indices] # TODO copy? @@ -469,7 +468,7 @@ def _get_data_subset(self: T, predicate: Callable) -> T: new_axes = [self._axes[0], new_cols] return type(self)(arrays, new_axes, verify_integrity=False) - def get_bool_data(self: T, copy: bool = False) -> T: + def get_bool_data(self, copy: bool = False) -> Self: """ Select columns that are bool-dtype and object-dtype columns that are all-bool. @@ -480,7 +479,7 @@ def get_bool_data(self: T, copy: bool = False) -> T: """ return self._get_data_subset(lambda x: x.dtype == np.dtype(bool)) - def get_numeric_data(self: T, copy: bool = False) -> T: + def get_numeric_data(self, copy: bool = False) -> Self: """ Select columns that have a numeric dtype. @@ -494,7 +493,7 @@ def get_numeric_data(self: T, copy: bool = False) -> T: or getattr(arr.dtype, "_is_numeric", False) ) - def copy(self: T, deep: bool | Literal["all"] | None = True) -> T: + def copy(self, deep: bool | Literal["all"] | None = True) -> Self: """ Make deep or shallow copy of ArrayManager @@ -531,7 +530,7 @@ def copy_func(ax): return type(self)(new_arrays, new_axes, verify_integrity=False) def reindex_indexer( - self: T, + self, new_axis, indexer, axis: AxisInt, @@ -542,7 +541,7 @@ def reindex_indexer( only_slice: bool = False, # ArrayManager specific keywords use_na_proxy: bool = False, - ) -> T: + ) -> Self: axis = self._normalize_axis(axis) return self._reindex_indexer( new_axis, @@ -555,7 +554,7 @@ def reindex_indexer( ) def _reindex_indexer( - self: T, + self, new_axis, indexer: npt.NDArray[np.intp] | None, axis: AxisInt, @@ -563,7 +562,7 @@ def _reindex_indexer( allow_dups: bool = False, copy: bool | None = True, use_na_proxy: bool = False, - ) -> T: + ) -> Self: """ Parameters ---------- @@ -634,11 +633,12 @@ def _reindex_indexer( return type(self)(new_arrays, new_axes, verify_integrity=False) def take( - self: T, + self, indexer: npt.NDArray[np.intp], axis: AxisInt = 1, verify: bool = True, - ) -> T: + convert_indices: bool = True, + ) -> Self: """ Take items along any axis. """ @@ -926,7 +926,7 @@ def idelete(self, indexer) -> ArrayManager: # -------------------------------------------------------------------- # Array-wise Operation - def grouped_reduce(self: T, func: Callable) -> T: + def grouped_reduce(self, func: Callable) -> Self: """ Apply grouped reduction function columnwise, returning a new ArrayManager. @@ -965,7 +965,7 @@ def grouped_reduce(self: T, func: Callable) -> T: # expected "List[Union[ndarray, ExtensionArray]]" return type(self)(result_arrays, [index, columns]) # type: ignore[arg-type] - def reduce(self: T, func: Callable) -> T: + def reduce(self, func: Callable) -> Self: """ Apply reduction function column-wise, returning a single-row ArrayManager. diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 7ed6863aa60c1..2770f8f71bf6c 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -7,7 +7,6 @@ from typing import ( TYPE_CHECKING, Literal, - TypeVar, final, ) @@ -31,10 +30,9 @@ ArrayLike, AxisInt, DtypeObj, + Self, Shape, ) -T = TypeVar("T", bound="DataManager") - class DataManager(PandasObject): # TODO share more methods/attributes @@ -75,7 +73,7 @@ def _validate_set_axis(self, axis: AxisInt, new_labels: Index) -> None: ) def reindex_indexer( - self: T, + self, new_axis, indexer, axis: AxisInt, @@ -83,17 +81,17 @@ def reindex_indexer( allow_dups: bool = False, copy: bool = True, only_slice: bool = False, - ) -> T: + ) -> Self: raise AbstractMethodError(self) @final def reindex_axis( - self: T, + self, new_index: Index, axis: AxisInt, fill_value=None, only_slice: bool = False, - ) -> T: + ) -> Self: """ Conform data manager to new index. """ @@ -108,7 +106,7 @@ def reindex_axis( only_slice=only_slice, ) - def _equal_values(self: T, other: T) -> bool: + def _equal_values(self, other: Self) -> bool: """ To be implemented by the subclasses. Only check the column values assuming shape and indexes have already been checked. @@ -132,15 +130,15 @@ def equals(self, other: object) -> bool: return self._equal_values(other) def apply( - self: T, + self, f, align_keys: list[str] | None = None, **kwargs, - ) -> T: + ) -> Self: raise AbstractMethodError(self) @final - def isna(self: T, func) -> T: + def isna(self, func) -> Self: return self.apply("apply", func=func) # -------------------------------------------------------------------- @@ -149,7 +147,7 @@ def isna(self: T, func) -> T: def is_consolidated(self) -> bool: return True - def consolidate(self: T) -> T: + def consolidate(self) -> Self: return self def _consolidate_inplace(self) -> None: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index e5f50bb35d6bd..f3643e7b2011f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -8,7 +8,6 @@ Hashable, Literal, Sequence, - TypeVar, cast, ) import warnings @@ -84,11 +83,11 @@ AxisInt, DtypeObj, QuantileInterpolation, + Self, Shape, npt, type_t, ) -T = TypeVar("T", bound="BaseBlockManager") class BaseBlockManager(DataManager): @@ -160,7 +159,7 @@ def __init__(self, blocks, axes, verify_integrity: bool = True) -> None: raise NotImplementedError @classmethod - def from_blocks(cls: type_t[T], blocks: list[Block], axes: list[Index]) -> T: + def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> Self: raise NotImplementedError @property @@ -190,7 +189,7 @@ def blklocs(self) -> npt.NDArray[np.intp]: return self._blklocs - def make_empty(self: T, axes=None) -> T: + def make_empty(self, axes=None) -> Self: """return an empty BlockManager with the items axis of len 0""" if axes is None: axes = [Index([])] + self.axes[1:] @@ -303,11 +302,11 @@ def __repr__(self) -> str: return output def apply( - self: T, + self, f, align_keys: list[str] | None = None, **kwargs, - ) -> T: + ) -> Self: """ Iterate over the blocks, collect and create a new BlockManager. @@ -354,7 +353,7 @@ def apply( out = type(self).from_blocks(result_blocks, self.axes) return out - def where(self: T, other, cond, align: bool) -> T: + def where(self, other, cond, align: bool) -> Self: if align: align_keys = ["other", "cond"] else: @@ -369,14 +368,14 @@ def where(self: T, other, cond, align: bool) -> T: using_cow=using_copy_on_write(), ) - def round(self: T, decimals: int, using_cow: bool = False) -> T: + def round(self, decimals: int, using_cow: bool = False) -> Self: return self.apply( "round", decimals=decimals, using_cow=using_cow, ) - def setitem(self: T, indexer, value) -> T: + def setitem(self, indexer, value) -> Self: """ Set values with indexer. @@ -407,24 +406,24 @@ def putmask(self, mask, new, align: bool = True): using_cow=using_copy_on_write(), ) - def diff(self: T, n: int, axis: AxisInt) -> T: + def diff(self, n: int, axis: AxisInt) -> Self: # only reached with self.ndim == 2 and axis == 1 axis = self._normalize_axis(axis) return self.apply("diff", n=n, axis=axis) - def interpolate(self: T, inplace: bool, **kwargs) -> T: + def interpolate(self, inplace: bool, **kwargs) -> Self: return self.apply( "interpolate", inplace=inplace, **kwargs, using_cow=using_copy_on_write() ) - def shift(self: T, periods: int, axis: AxisInt, fill_value) -> T: + def shift(self, periods: int, axis: AxisInt, fill_value) -> Self: axis = self._normalize_axis(axis) if fill_value is lib.no_default: fill_value = None return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value) - def fillna(self: T, value, limit, inplace: bool, downcast) -> T: + def fillna(self, value, limit, inplace: bool, downcast) -> Self: if limit is not None: # Do this validation even if we go through one of the no-op paths limit = libalgos.validate_limit(None, limit=limit) @@ -438,7 +437,7 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T: using_cow=using_copy_on_write(), ) - def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T: + def astype(self, dtype, copy: bool | None = False, errors: str = "raise") -> Self: if copy is None: if using_copy_on_write(): copy = False @@ -455,7 +454,7 @@ def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> using_cow=using_copy_on_write(), ) - def convert(self: T, copy: bool | None) -> T: + def convert(self, copy: bool | None) -> Self: if copy is None: if using_copy_on_write(): copy = False @@ -466,7 +465,7 @@ def convert(self: T, copy: bool | None) -> T: return self.apply("convert", copy=copy, using_cow=using_copy_on_write()) - def replace(self: T, to_replace, value, inplace: bool) -> T: + def replace(self, to_replace, value, inplace: bool) -> Self: inplace = validate_bool_kwarg(inplace, "inplace") # NDFrame.replace ensures the not-is_list_likes here assert not is_list_like(to_replace) @@ -483,12 +482,12 @@ def replace_regex(self, **kwargs): return self.apply("_replace_regex", **kwargs, using_cow=using_copy_on_write()) def replace_list( - self: T, + self, src_list: list[Any], dest_list: list[Any], inplace: bool = False, regex: bool = False, - ) -> T: + ) -> Self: """do a list replace""" inplace = validate_bool_kwarg(inplace, "inplace") @@ -503,7 +502,7 @@ def replace_list( bm._consolidate_inplace() return bm - def to_native_types(self: T, **kwargs) -> T: + def to_native_types(self, **kwargs) -> Self: """ Convert values to native types (strings / python objects) that are used in formatting (repr / csv). @@ -534,11 +533,11 @@ def is_view(self) -> bool: return False - def _get_data_subset(self: T, predicate: Callable) -> T: + def _get_data_subset(self, predicate: Callable) -> Self: blocks = [blk for blk in self.blocks if predicate(blk.values)] return self._combine(blocks, copy=False) - def get_bool_data(self: T, copy: bool = False) -> T: + def get_bool_data(self, copy: bool = False) -> Self: """ Select blocks that are bool-dtype and columns from object-dtype blocks that are all-bool. @@ -563,7 +562,7 @@ def get_bool_data(self: T, copy: bool = False) -> T: return self._combine(new_blocks, copy) - def get_numeric_data(self: T, copy: bool = False) -> T: + def get_numeric_data(self, copy: bool = False) -> Self: """ Parameters ---------- @@ -579,8 +578,8 @@ def get_numeric_data(self: T, copy: bool = False) -> T: return self._combine(numeric_blocks, copy) def _combine( - self: T, blocks: list[Block], copy: bool = True, index: Index | None = None - ) -> T: + self, blocks: list[Block], copy: bool = True, index: Index | None = None + ) -> Self: """return a new manager with the blocks""" if len(blocks) == 0: if self.ndim == 2: @@ -616,7 +615,7 @@ def _combine( def nblocks(self) -> int: return len(self.blocks) - def copy(self: T, deep: bool | None | Literal["all"] = True) -> T: + def copy(self, deep: bool | None | Literal["all"] = True) -> Self: """ Make deep or shallow copy of BlockManager @@ -663,7 +662,7 @@ def copy_func(ax): res._consolidate_inplace() return res - def consolidate(self: T) -> T: + def consolidate(self) -> Self: """ Join together blocks having same dtype @@ -680,7 +679,7 @@ def consolidate(self: T) -> T: return bm def reindex_indexer( - self: T, + self, new_axis: Index, indexer: npt.NDArray[np.intp] | None, axis: AxisInt, @@ -690,7 +689,7 @@ def reindex_indexer( only_slice: bool = False, *, use_na_proxy: bool = False, - ) -> T: + ) -> Self: """ Parameters ---------- @@ -926,11 +925,12 @@ def _make_na_block( return new_block_2d(block_values, placement=placement) def take( - self: T, + self, indexer: npt.NDArray[np.intp], axis: AxisInt = 1, verify: bool = True, - ) -> T: + convert_indices: bool = True, + ) -> Self: """ Take items along any axis. @@ -1006,7 +1006,7 @@ def _verify_integrity(self) -> None: ) @classmethod - def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> BlockManager: + def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> Self: """ Constructor for BlockManager and SingleBlockManager with same signature. """ @@ -1472,7 +1472,7 @@ def idelete(self, indexer) -> BlockManager: # ---------------------------------------------------------------- # Block-wise Operation - def grouped_reduce(self: T, func: Callable) -> T: + def grouped_reduce(self, func: Callable) -> Self: """ Apply grouped reduction function blockwise, returning a new BlockManager. @@ -1505,7 +1505,7 @@ def grouped_reduce(self: T, func: Callable) -> T: return type(self).from_blocks(result_blocks, [self.axes[0], index]) - def reduce(self: T, func: Callable) -> T: + def reduce(self, func: Callable) -> Self: """ Apply reduction function blockwise, returning a single-row BlockManager. @@ -1543,12 +1543,12 @@ def _equal_values(self: BlockManager, other: BlockManager) -> bool: return blockwise_all(self, other, array_equals) def quantile( - self: T, + self, *, qs: Index, # with dtype float 64 axis: AxisInt = 0, interpolation: QuantileInterpolation = "linear", - ) -> T: + ) -> Self: """ Iterate over blocks applying quantile reduction. This routine is intended for reduction type operations and @@ -2062,7 +2062,7 @@ def set_values(self, values: ArrayLike) -> None: self.blocks[0].values = values self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values))) - def _equal_values(self: T, other: T) -> bool: + def _equal_values(self, other: Self) -> bool: """ Used in .equals defined in base class. Only check the column values assuming shape and indexes have already been checked. From 3620d66850b69712c8e1fed90a294fad3cd472de Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 21 Feb 2023 06:05:57 +0000 Subject: [PATCH 2/2] TYP: Use Self for type checking (remaining locations/) --- .startup.ipy | 48 -------------------------- pandas/core/base.py | 6 ++-- pandas/core/dtypes/base.py | 5 ++- pandas/core/indexing.py | 8 ++--- pandas/core/internals/array_manager.py | 3 +- pandas/core/internals/base.py | 1 + pandas/core/internals/managers.py | 2 -- 7 files changed, 8 insertions(+), 65 deletions(-) delete mode 100644 .startup.ipy diff --git a/.startup.ipy b/.startup.ipy deleted file mode 100644 index 15c80b2ab6f30..0000000000000 --- a/.startup.ipy +++ /dev/null @@ -1,48 +0,0 @@ ->>> from itertools import product ->>> import numpy as np ->>> import pandas as pd ->>> from pandas.core.reshape.concat import _Concatenator ->>> ->>> def manual_concat(df_list: list[pd.DataFrame]) -> pd.DataFrame: -... columns = [col for df in df_list for col in df.columns] -... columns = list(dict.fromkeys(columns)) -... index = np.hstack([df.index.values for df in df_list]) -... df_list = [df.reindex(columns=columns) for df in df_list] -... values = np.vstack([df.values for df in df_list]) -... return pd.DataFrame(values, index=index, columns=columns, dtype=df_list[0].dtypes[0]) ->>> ->>> def compare_frames(df_list: list[pd.DataFrame]) -> None: -... concat_df = pd.concat(df_list) -... manual_df = manual_concat(df_list) -... if not concat_df.equals(manual_df): -... raise ValueError("different concatenations!") ->>> ->>> def make_dataframes(num_dfs, num_idx, num_cols, dtype=np.int32, drop_column=False) -> list[pd.DataFrame]: -... values = np.random.randint(-100, 100, size=[num_idx, num_cols]) -... index = [f"i{i}" for i in range(num_idx)] -... columns = np.random.choice([f"c{i}" for i in range(num_cols)], num_cols, replace=False) -... df = pd.DataFrame(values, index=index, columns=columns, dtype=dtype) -... -... df_list = [] -... for i in range(num_dfs): -... new_df = df.copy() -... if drop_column: -... label = new_df.columns[i] -... new_df = new_df.drop(label, axis=1) -... df_list.append(new_df) -... return df_list ->>> ->>> test_data = [ # num_idx, num_cols, num_dfs -... [100, 1_000, 3], -... ] ->>> for i, (num_idx, num_cols, num_dfs) in enumerate(test_data): -... print(f"\n{i}: {num_dfs=}, {num_idx=}, {num_cols=}") -... df_list = make_dataframes(num_dfs, num_idx, num_cols, drop_column=False) -... df_list_dropped = make_dataframes(num_dfs, num_idx, num_cols, drop_column=True) -... print("manual:") -... %timeit manual_concat(df_list) -... compare_frames(df_list) -... for use_dropped in [False, True]: -... print(f"pd.concat: {use_dropped=}") -... this_df_list = df_list if not use_dropped else df_list_dropped -... %timeit pd.concat(this_df_list) diff --git a/pandas/core/base.py b/pandas/core/base.py index 9ca4b9389113d..08e395a3cea11 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -12,7 +12,6 @@ Hashable, Iterator, Literal, - TypeVar, cast, final, overload, @@ -29,6 +28,7 @@ DtypeObj, IndexLabel, NDFrameT, + Self, Shape, npt, ) @@ -91,8 +91,6 @@ "duplicated": "IndexOpsMixin", } -_T = TypeVar("_T", bound="IndexOpsMixin") - class PandasObject(DirNamesMixin): """ @@ -285,7 +283,7 @@ def _values(self) -> ExtensionArray | np.ndarray: raise AbstractMethodError(self) @final - def transpose(self: _T, *args, **kwargs) -> _T: + def transpose(self, *args, **kwargs) -> Self: """ Return the transpose, which is by definition self. diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 34f22cf13f8e9..7c05ec1ba33a9 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -26,6 +26,7 @@ if TYPE_CHECKING: from pandas._typing import ( DtypeObj, + Self, Shape, npt, type_t, @@ -228,9 +229,7 @@ def empty(self, shape: Shape) -> type_t[ExtensionArray]: return cls._empty(shape, dtype=self) @classmethod - def construct_from_string( - cls: type_t[ExtensionDtypeT], string: str - ) -> ExtensionDtypeT: + def construct_from_string(cls, string: str) -> Self: r""" Construct this type from a string. diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 4a24ed221c89f..8db08fc15c0f4 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -6,7 +6,6 @@ TYPE_CHECKING, Hashable, Sequence, - TypeVar, cast, final, ) @@ -78,6 +77,7 @@ from pandas._typing import ( Axis, AxisInt, + Self, ) from pandas import ( @@ -85,8 +85,6 @@ Series, ) -_LocationIndexerT = TypeVar("_LocationIndexerT", bound="_LocationIndexer") - # "null slice" _NS = slice(None, None) _one_ellipsis_message = "indexer may only contain one '...' entry" @@ -669,9 +667,7 @@ class _LocationIndexer(NDFrameIndexerBase): _takeable: bool @final - def __call__( - self: _LocationIndexerT, axis: Axis | None = None - ) -> _LocationIndexerT: + def __call__(self, axis: Axis | None = None) -> Self: # we need to return a copy of ourselves new_self = type(self)(self.name, self.obj) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 02b12d635739c..3430874bb86ef 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -91,8 +91,8 @@ ArrayLike, AxisInt, DtypeObj, - Self, QuantileInterpolation, + Self, npt, ) @@ -637,7 +637,6 @@ def take( indexer: npt.NDArray[np.intp], axis: AxisInt = 1, verify: bool = True, - convert_indices: bool = True, ) -> Self: """ Take items along any axis. diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 2770f8f71bf6c..523dee97a3c5c 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -34,6 +34,7 @@ Shape, ) + class DataManager(PandasObject): # TODO share more methods/attributes diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f3643e7b2011f..e5fd08e84ad07 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -86,7 +86,6 @@ Self, Shape, npt, - type_t, ) @@ -929,7 +928,6 @@ def take( indexer: npt.NDArray[np.intp], axis: AxisInt = 1, verify: bool = True, - convert_indices: bool = True, ) -> Self: """ Take items along any axis.