From 08fb7099272da4a9b2ff6a6f1b26353513fefdcb Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Sat, 4 Jun 2022 12:59:28 -0400 Subject: [PATCH 1/4] API: Disallow sets as index and columns argument in DataFrame constructor --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/_typing.py | 3 +-- pandas/core/frame.py | 21 +++++++++------------ pandas/core/indexes/base.py | 4 ++-- pandas/tests/frame/test_constructors.py | 8 ++++++++ 5 files changed, 21 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 6bf6fd65f5633..fb44408fa7699 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -152,6 +152,7 @@ Other enhancements - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`) - ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`) - :class:`DataError`, :class:`SpecificationError`, :class:`SettingWithCopyError`, and :class:`SettingWithCopyWarning` are now exposed in ``pandas.errors`` (:issue:`27656`) +- :class:`DataFrame` constructor raises if ``index`` or ``columns`` arguments are sets (:issue:`47215`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: diff --git a/pandas/_typing.py b/pandas/_typing.py index a85820a403fde..37ecd4f540fa9 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -10,7 +10,6 @@ TYPE_CHECKING, Any, Callable, - Collection, Dict, Hashable, Iterator, @@ -115,7 +114,7 @@ Ordered = Optional[bool] JSONSerializable = Optional[Union[PythonScalar, List, Dict]] Frequency = Union[str, "DateOffset"] -Axes = Collection[Any] +Axes = Union[AnyArrayLike, List, Dict, range, Sequence[str]] RandomState = Union[ int, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c695c04052f9d..ec9837964e07a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -627,6 +627,12 @@ def __init__( manager = get_option("mode.data_manager") + # GH47215 + if index is not None and isinstance(index, set): + raise ValueError("index cannot be a set") + if columns is not None and isinstance(columns, set): + raise ValueError("columns cannot be a set") + if copy is None: if isinstance(data, dict): # retain pre-GH#38939 default behavior @@ -730,10 +736,7 @@ def __init__( if not isinstance(data, np.ndarray) and treat_as_nested(data): # exclude ndarray as we may have cast it a few lines above if columns is not None: - # error: Argument 1 to "ensure_index" has incompatible type - # "Collection[Any]"; expected "Union[Union[Union[ExtensionArray, - # ndarray], Index, Series], Sequence[Any]]" - columns = ensure_index(columns) # type: ignore[arg-type] + columns = ensure_index(columns) arrays, columns, index = nested_data_to_arrays( # error: Argument 3 to "nested_data_to_arrays" has incompatible # type "Optional[Collection[Any]]"; expected "Optional[Index]" @@ -771,14 +774,8 @@ def __init__( if index is None or columns is None: raise ValueError("DataFrame constructor not properly called!") - # Argument 1 to "ensure_index" has incompatible type "Collection[Any]"; - # expected "Union[Union[Union[ExtensionArray, ndarray], - # Index, Series], Sequence[Any]]" - index = ensure_index(index) # type: ignore[arg-type] - # Argument 1 to "ensure_index" has incompatible type "Collection[Any]"; - # expected "Union[Union[Union[ExtensionArray, ndarray], - # Index, Series], Sequence[Any]]" - columns = ensure_index(columns) # type: ignore[arg-type] + index = ensure_index(index) + columns = ensure_index(columns) if not dtype: dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8d8b413b53792..e958ceeff6f28 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -42,8 +42,8 @@ tz_compare, ) from pandas._typing import ( - AnyArrayLike, ArrayLike, + Axes, Dtype, DtypeObj, F, @@ -7273,7 +7273,7 @@ def ensure_index_from_sequences(sequences, names=None) -> Index: return MultiIndex.from_arrays(sequences, names=names) -def ensure_index(index_like: AnyArrayLike | Sequence, copy: bool = False) -> Index: +def ensure_index(index_like: Axes, copy: bool = False) -> Index: """ Ensure that we have an index from some index-like object. diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index e62c050fbf812..6ec0157082ff1 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2976,6 +2976,14 @@ def test_construction_from_ndarray_with_eadtype_mismatched_columns(self): with pytest.raises(ValueError, match=msg): DataFrame(arr2, columns=["foo", "bar"]) + def test_columns_indexes_raise_on_sets(self): + # GH 47215 + data = [[1, 2, 3], [4, 5, 6]] + with pytest.raises(ValueError, match="index cannot be a set"): + DataFrame(data, index={"a", "b"}) + with pytest.raises(ValueError, match="columns cannot be a set"): + DataFrame(data, columns={"a", "b", "c"}) + def get1(obj): # TODO: make a helper in tm? if isinstance(obj, Series): From 606506670d45bbc14c5f3f048ccab2c91cf79a7d Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Mon, 6 Jun 2022 21:33:13 -0400 Subject: [PATCH 2/4] remove Dict and Sequence[str] from Axes --- pandas/_typing.py | 2 +- pandas/io/formats/format.py | 5 ++++- test_fast.bat | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 37ecd4f540fa9..0a16e43348729 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -114,7 +114,7 @@ Ordered = Optional[bool] JSONSerializable = Optional[Union[PythonScalar, List, Dict]] Frequency = Union[str, "DateOffset"] -Axes = Union[AnyArrayLike, List, Dict, range, Sequence[str]] +Axes = Union[AnyArrayLike, List, range] RandomState = Union[ int, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 3019aa1fc2dc7..41436267cdc67 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -47,6 +47,7 @@ from pandas._libs.tslibs.nattype import NaTType from pandas._typing import ( ArrayLike, + Axes, ColspaceArgType, ColspaceType, CompressionOptions, @@ -685,7 +686,9 @@ def _initialize_justify(self, justify: str | None) -> str: def _initialize_columns(self, columns: Sequence[str] | None) -> Index: if columns is not None: - cols = ensure_index(columns) + # GH 47231 - columns doesn't have to be `Sequence[str]` + # Will fix in later PR + cols = ensure_index(cast(Axes, columns)) self.frame = self.frame[cols] return cols else: diff --git a/test_fast.bat b/test_fast.bat index 85ce305ab2d64..4d0b46bdf596b 100644 --- a/test_fast.bat +++ b/test_fast.bat @@ -1,3 +1,3 @@ :: test on windows set PYTHONHASHSEED=314159265 -pytest --skip-slow --skip-network --skip-db -m "not single_cpu" -n 4 -r sXX pandas +pytest --skip-slow --skip-network --skip-db -m "not single_cpu" -n 8 -r sXX pandas From 40684cf394a7a42d3c9bd84d59574c024a546c61 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Sun, 10 Jul 2022 12:46:23 -0400 Subject: [PATCH 3/4] move whatsnew message to api section --- doc/source/whatsnew/v1.5.0.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 688e71e50359a..2aa03ede4fb61 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -278,7 +278,6 @@ Other enhancements - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`) - :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`) - :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`) -- :class:`DataFrame` constructor raises if ``index`` or ``columns`` arguments are sets (:issue:`47215`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: @@ -491,7 +490,7 @@ Other API changes - :func:`read_json` now raises ``FileNotFoundError`` (previously ``ValueError``) when input is a string ending in ``.json``, ``.json.gz``, ``.json.bz2``, etc. but no such file exists. (:issue:`29102`) - Operations with :class:`Timestamp` or :class:`Timedelta` that would previously raise ``OverflowError`` instead raise ``OutOfBoundsDatetime`` or ``OutOfBoundsTimedelta`` where appropriate (:issue:`47268`) - When :func:`read_sas` previously returned ``None``, it now returns an empty :class:`DataFrame` (:issue:`47410`) -- +- :class:`DataFrame` constructor raises if ``index`` or ``columns`` arguments are sets (:issue:`47215`) .. --------------------------------------------------------------------------- .. _whatsnew_150.deprecations: From b543db40fe972f8a336a862144e31acef3212a5c Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Sun, 10 Jul 2022 17:08:52 -0400 Subject: [PATCH 4/4] fix _set_axis() - arguments too broad --- pandas/core/generic.py | 2 +- pandas/core/series.py | 2 +- test_fast.bat | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e392802bdb5ea..9dd1fad50aa86 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -758,7 +758,7 @@ def _set_axis_nocheck(self, labels, axis: Axis, inplace: bool_t): obj.set_axis(labels, axis=axis, inplace=True) return obj - def _set_axis(self, axis: int, labels: AnyArrayLike | Sequence) -> None: + def _set_axis(self, axis: int, labels: AnyArrayLike | list) -> None: labels = ensure_index(labels) self._mgr.set_axis(axis, labels) self._clear_item_cache() diff --git a/pandas/core/series.py b/pandas/core/series.py index ef4ea0172c505..f6fe375cf5cb4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -559,7 +559,7 @@ def _constructor_expanddim(self) -> Callable[..., DataFrame]: def _can_hold_na(self) -> bool: return self._mgr._can_hold_na - def _set_axis(self, axis: int, labels: AnyArrayLike | Sequence) -> None: + def _set_axis(self, axis: int, labels: AnyArrayLike | list) -> None: """ Override generic, we want to set the _typ here. diff --git a/test_fast.bat b/test_fast.bat index 4d0b46bdf596b..85ce305ab2d64 100644 --- a/test_fast.bat +++ b/test_fast.bat @@ -1,3 +1,3 @@ :: test on windows set PYTHONHASHSEED=314159265 -pytest --skip-slow --skip-network --skip-db -m "not single_cpu" -n 8 -r sXX pandas +pytest --skip-slow --skip-network --skip-db -m "not single_cpu" -n 4 -r sXX pandas