From 08fb7099272da4a9b2ff6a6f1b26353513fefdcb Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Sat, 4 Jun 2022 12:59:28 -0400 Subject: [PATCH 1/4] API: Disallow sets as index and columns argument in DataFrame constructor --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/_typing.py | 3 +-- pandas/core/frame.py | 21 +++++++++------------ pandas/core/indexes/base.py | 4 ++-- pandas/tests/frame/test_constructors.py | 8 ++++++++ 5 files changed, 21 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 6bf6fd65f5633..fb44408fa7699 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -152,6 +152,7 @@ Other enhancements - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`) - ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`) - :class:`DataError`, :class:`SpecificationError`, :class:`SettingWithCopyError`, and :class:`SettingWithCopyWarning` are now exposed in ``pandas.errors`` (:issue:`27656`) +- :class:`DataFrame` constructor raises if ``index`` or ``columns`` arguments are sets (:issue:`47215`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: diff --git a/pandas/_typing.py b/pandas/_typing.py index a85820a403fde..37ecd4f540fa9 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -10,7 +10,6 @@ TYPE_CHECKING, Any, Callable, - Collection, Dict, Hashable, Iterator, @@ -115,7 +114,7 @@ Ordered = Optional[bool] JSONSerializable = Optional[Union[PythonScalar, List, Dict]] Frequency = Union[str, "DateOffset"] -Axes = Collection[Any] +Axes = Union[AnyArrayLike, List, Dict, range, Sequence[str]] RandomState = Union[ int, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c695c04052f9d..ec9837964e07a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -627,6 +627,12 @@ def __init__( manager = get_option("mode.data_manager") + # GH47215 + if index is not None and isinstance(index, set): + raise ValueError("index cannot be a set") + if columns is not None and isinstance(columns, set): + raise ValueError("columns cannot be a set") + if copy is None: if isinstance(data, dict): # retain pre-GH#38939 default behavior @@ -730,10 +736,7 @@ def __init__( if not isinstance(data, np.ndarray) and treat_as_nested(data): # exclude ndarray as we may have cast it a few lines above if columns is not None: - # error: Argument 1 to "ensure_index" has incompatible type - # "Collection[Any]"; expected "Union[Union[Union[ExtensionArray, - # ndarray], Index, Series], Sequence[Any]]" - columns = ensure_index(columns) # type: ignore[arg-type] + columns = ensure_index(columns) arrays, columns, index = nested_data_to_arrays( # error: Argument 3 to "nested_data_to_arrays" has incompatible # type "Optional[Collection[Any]]"; expected "Optional[Index]" @@ -771,14 +774,8 @@ def __init__( if index is None or columns is None: raise ValueError("DataFrame constructor not properly called!") - # Argument 1 to "ensure_index" has incompatible type "Collection[Any]"; - # expected "Union[Union[Union[ExtensionArray, ndarray], - # Index, Series], Sequence[Any]]" - index = ensure_index(index) # type: ignore[arg-type] - # Argument 1 to "ensure_index" has incompatible type "Collection[Any]"; - # expected "Union[Union[Union[ExtensionArray, ndarray], - # Index, Series], Sequence[Any]]" - columns = ensure_index(columns) # type: ignore[arg-type] + index = ensure_index(index) + columns = ensure_index(columns) if not dtype: dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8d8b413b53792..e958ceeff6f28 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -42,8 +42,8 @@ tz_compare, ) from pandas._typing import ( - AnyArrayLike, ArrayLike, + Axes, Dtype, DtypeObj, F, @@ -7273,7 +7273,7 @@ def ensure_index_from_sequences(sequences, names=None) -> Index: return MultiIndex.from_arrays(sequences, names=names) -def ensure_index(index_like: AnyArrayLike | Sequence, copy: bool = False) -> Index: +def ensure_index(index_like: Axes, copy: bool = False) -> Index: """ Ensure that we have an index from some index-like object. diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index e62c050fbf812..6ec0157082ff1 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2976,6 +2976,14 @@ def test_construction_from_ndarray_with_eadtype_mismatched_columns(self): with pytest.raises(ValueError, match=msg): DataFrame(arr2, columns=["foo", "bar"]) + def test_columns_indexes_raise_on_sets(self): + # GH 47215 + data = [[1, 2, 3], [4, 5, 6]] + with pytest.raises(ValueError, match="index cannot be a set"): + DataFrame(data, index={"a", "b"}) + with pytest.raises(ValueError, match="columns cannot be a set"): + DataFrame(data, columns={"a", "b", "c"}) + def get1(obj): # TODO: make a helper in tm? if isinstance(obj, Series): From 3e4901636fb8520d36fb6d349c9c1b58b5fec763 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Sat, 4 Jun 2022 14:06:31 -0400 Subject: [PATCH 2/4] TYP: Remove Sequence[str] from to_string, to_html --- pandas/_typing.py | 2 +- pandas/core/frame.py | 20 ++++++++++---------- pandas/io/formats/csvs.py | 2 +- pandas/io/formats/format.py | 9 +++++---- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 37ecd4f540fa9..96ba957bcdcaa 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -114,7 +114,7 @@ Ordered = Optional[bool] JSONSerializable = Optional[Union[PythonScalar, List, Dict]] Frequency = Union[str, "DateOffset"] -Axes = Union[AnyArrayLike, List, Dict, range, Sequence[str]] +Axes = Union[AnyArrayLike, List, Dict, range] RandomState = Union[ int, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ec9837964e07a..8e57165da9dee 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1105,9 +1105,9 @@ def _repr_html_(self) -> str | None: def to_string( self, buf: None = ..., - columns: Sequence[str] | None = ..., + columns: Axes | None = ..., col_space: int | list[int] | dict[Hashable, int] | None = ..., - header: bool | Sequence[str] = ..., + header: bool | list = ..., index: bool = ..., na_rep: str = ..., formatters: fmt.FormattersType | None = ..., @@ -1130,9 +1130,9 @@ def to_string( def to_string( self, buf: FilePath | WriteBuffer[str], - columns: Sequence[str] | None = ..., + columns: Axes | None = ..., col_space: int | list[int] | dict[Hashable, int] | None = ..., - header: bool | Sequence[str] = ..., + header: bool | list = ..., index: bool = ..., na_rep: str = ..., formatters: fmt.FormattersType | None = ..., @@ -1152,8 +1152,8 @@ def to_string( ... @Substitution( - header_type="bool or sequence of str", - header="Write out the column names. If a list of strings " + header_type="bool or array-like of column names", + header="Write out the column names. If a list of columns " "is given, it is assumed to be aliases for the " "column names", col_space_type="int, list or dict of int", @@ -1165,9 +1165,9 @@ def to_string( def to_string( self, buf: FilePath | WriteBuffer[str] | None = None, - columns: Sequence[str] | None = None, + columns: Axes | None = None, col_space: int | list[int] | dict[Hashable, int] | None = None, - header: bool | Sequence[str] = True, + header: bool | list = True, index: bool = True, na_rep: str = "NaN", formatters: fmt.FormattersType | None = None, @@ -2911,9 +2911,9 @@ def to_parquet( def to_html( self, buf: FilePath | WriteBuffer[str] | None = None, - columns: Sequence[str] | None = None, + columns: Axes | None = None, col_space: ColspaceArgType | None = None, - header: bool | Sequence[str] = True, + header: bool = True, index: bool = True, na_rep: str = "NaN", formatters: FormattersType | None = None, diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index c577acfaeba8e..80872a107d1aa 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -102,7 +102,7 @@ def decimal(self) -> str: return self.fmt.decimal @property - def header(self) -> bool | Sequence[str]: + def header(self) -> bool | list: return self.fmt.header @property diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 3019aa1fc2dc7..9fca4b2c2c786 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -47,6 +47,7 @@ from pandas._libs.tslibs.nattype import NaTType from pandas._typing import ( ArrayLike, + Axes, ColspaceArgType, ColspaceType, CompressionOptions, @@ -119,7 +120,7 @@ ---------- buf : str, Path or StringIO-like, optional, default None Buffer to write to. If None, the output is returned as a string. - columns : sequence, optional, default None + columns : array-like, optional, default None The subset of columns to write. Writes all columns by default. col_space : %(col_space_type)s, optional %(col_space)s. @@ -561,9 +562,9 @@ class DataFrameFormatter: def __init__( self, frame: DataFrame, - columns: Sequence[str] | None = None, + columns: Axes | None = None, col_space: ColspaceArgType | None = None, - header: bool | Sequence[str] = True, + header: bool | list = True, index: bool = True, na_rep: str = "NaN", formatters: FormattersType | None = None, @@ -683,7 +684,7 @@ def _initialize_justify(self, justify: str | None) -> str: else: return justify - def _initialize_columns(self, columns: Sequence[str] | None) -> Index: + def _initialize_columns(self, columns: Axes | None) -> Index: if columns is not None: cols = ensure_index(columns) self.frame = self.frame[cols] From a9e284983b468962bc4bd7438be87fe3c7ea0b5b Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Sun, 5 Jun 2022 11:52:20 -0400 Subject: [PATCH 3/4] use list[str] for header argument. Remove dict as index and columns argument for DataFrame. Define ListLike --- pandas/_typing.py | 9 ++++++++- pandas/core/frame.py | 8 ++++---- pandas/io/formats/csvs.py | 2 +- pandas/io/formats/format.py | 2 +- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 96ba957bcdcaa..6e55512ce6b87 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -79,6 +79,13 @@ ArrayLike = Union["ExtensionArray", np.ndarray] AnyArrayLike = Union[ArrayLike, "Index", "Series"] +# list-like + +# Cannot use `Sequence` because a string is a sequence, and we don't want to +# accept that. Could refine if https://github.com/python/typing/issues/256 is +# resolved to differentiate between Sequence[str] and str +ListLike = Union[AnyArrayLike, List, range] + # scalars PythonScalar = Union[str, int, float, bool] @@ -114,7 +121,7 @@ Ordered = Optional[bool] JSONSerializable = Optional[Union[PythonScalar, List, Dict]] Frequency = Union[str, "DateOffset"] -Axes = Union[AnyArrayLike, List, Dict, range] +Axes = ListLike RandomState = Union[ int, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8e57165da9dee..0ab567e9c583a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1107,7 +1107,7 @@ def to_string( buf: None = ..., columns: Axes | None = ..., col_space: int | list[int] | dict[Hashable, int] | None = ..., - header: bool | list = ..., + header: bool | list[str] = ..., index: bool = ..., na_rep: str = ..., formatters: fmt.FormattersType | None = ..., @@ -1132,7 +1132,7 @@ def to_string( buf: FilePath | WriteBuffer[str], columns: Axes | None = ..., col_space: int | list[int] | dict[Hashable, int] | None = ..., - header: bool | list = ..., + header: bool | list[str] = ..., index: bool = ..., na_rep: str = ..., formatters: fmt.FormattersType | None = ..., @@ -1152,7 +1152,7 @@ def to_string( ... @Substitution( - header_type="bool or array-like of column names", + header_type="bool or list of str", header="Write out the column names. If a list of columns " "is given, it is assumed to be aliases for the " "column names", @@ -1167,7 +1167,7 @@ def to_string( buf: FilePath | WriteBuffer[str] | None = None, columns: Axes | None = None, col_space: int | list[int] | dict[Hashable, int] | None = None, - header: bool | list = True, + header: bool | list[str] = True, index: bool = True, na_rep: str = "NaN", formatters: fmt.FormattersType | None = None, diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 80872a107d1aa..03432c62de7a5 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -102,7 +102,7 @@ def decimal(self) -> str: return self.fmt.decimal @property - def header(self) -> bool | list: + def header(self) -> bool | list[str]: return self.fmt.header @property diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 9fca4b2c2c786..dceb4220c3d70 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -564,7 +564,7 @@ def __init__( frame: DataFrame, columns: Axes | None = None, col_space: ColspaceArgType | None = None, - header: bool | list = True, + header: bool | list[str] = True, index: bool = True, na_rep: str = "NaN", formatters: FormattersType | None = None, From ca9b7accf87d6861408e815ef420265f2db06f20 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Tue, 4 Oct 2022 16:06:11 -0400 Subject: [PATCH 4/4] use Axes rather than Sequence[Hashable] --- pandas/_typing.py | 2 +- pandas/core/frame.py | 10 +++++----- pandas/core/generic.py | 13 +++++++------ pandas/io/formats/format.py | 8 +++----- 4 files changed, 16 insertions(+), 17 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index b13eb362039fb..df79d28e48151 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -132,7 +132,7 @@ Ordered = Optional[bool] JSONSerializable = Optional[Union[PythonScalar, List, Dict]] Frequency = Union[str, "BaseOffset"] -Axes = Union[AnyArrayLike, List, range] +Axes = ListLike RandomState = Union[ int, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4858f1ea39186..f1c4e897d6724 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3092,9 +3092,9 @@ def to_orc( def to_html( self, buf: FilePath | WriteBuffer[str], - columns: Sequence[Level] | None = ..., + columns: Axes | None = ..., col_space: ColspaceArgType | None = ..., - header: bool | Sequence[str] = ..., + header: bool = ..., index: bool = ..., na_rep: str = ..., formatters: FormattersType | None = ..., @@ -3121,9 +3121,9 @@ def to_html( def to_html( self, buf: None = ..., - columns: Sequence[Level] | None = ..., + columns: Axes | None = ..., col_space: ColspaceArgType | None = ..., - header: bool | Sequence[str] = ..., + header: bool = ..., index: bool = ..., na_rep: str = ..., formatters: FormattersType | None = ..., @@ -3159,7 +3159,7 @@ def to_html( def to_html( self, buf: FilePath | WriteBuffer[str] | None = None, - columns: Sequence[Level] | None = None, + columns: Axes | None = None, col_space: ColspaceArgType | None = None, header: bool = True, index: bool = True, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3df5d2aaf9896..f413c4a37ae53 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -44,6 +44,7 @@ AlignJoin, AnyArrayLike, ArrayLike, + Axes, Axis, AxisInt, ColspaceArgType, @@ -3238,9 +3239,9 @@ class (index) object 'bird' 'bird' 'mammal' 'mammal' def to_latex( self, buf: None = ..., - columns: Sequence[Hashable] | None = ..., + columns: Axes | None = ..., col_space: ColspaceArgType | None = ..., - header: bool_t | Sequence[str] = ..., + header: bool_t | list[str] = ..., index: bool_t = ..., na_rep: str = ..., formatters: FormattersType | None = ..., @@ -3266,9 +3267,9 @@ def to_latex( def to_latex( self, buf: FilePath | WriteBuffer[str], - columns: Sequence[Hashable] | None = ..., + columns: Axes | None = ..., col_space: ColspaceArgType | None = ..., - header: bool_t | Sequence[str] = ..., + header: bool_t | list[str] = ..., index: bool_t = ..., na_rep: str = ..., formatters: FormattersType | None = ..., @@ -3295,9 +3296,9 @@ def to_latex( def to_latex( self, buf: FilePath | WriteBuffer[str] | None = None, - columns: Sequence[Hashable] | None = None, + columns: Axes | None = None, col_space: ColspaceArgType | None = None, - header: bool_t | Sequence[str] = True, + header: bool_t | list[str] = True, index: bool_t = True, na_rep: str = "NaN", formatters: FormattersType | None = None, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index cf5159c470558..6050691754d66 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -566,7 +566,7 @@ class DataFrameFormatter: def __init__( self, frame: DataFrame, - columns: Sequence[Hashable] | None = None, + columns: Axes | None = None, col_space: ColspaceArgType | None = None, header: bool | list[str] = True, index: bool = True, @@ -688,11 +688,9 @@ def _initialize_justify(self, justify: str | None) -> str: else: return justify - def _initialize_columns(self, columns: Sequence[Hashable] | None) -> Index: + def _initialize_columns(self, columns: Axes | None) -> Index: if columns is not None: - # GH 47231 - columns doesn't have to be `Sequence[str]` - # Will fix in later PR - cols = ensure_index(cast(Axes, columns)) + cols = ensure_index(columns) self.frame = self.frame[cols] return cols else: