diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 8c0534159d056..2aa03ede4fb61 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -490,7 +490,7 @@ Other API changes - :func:`read_json` now raises ``FileNotFoundError`` (previously ``ValueError``) when input is a string ending in ``.json``, ``.json.gz``, ``.json.bz2``, etc. but no such file exists. (:issue:`29102`) - Operations with :class:`Timestamp` or :class:`Timedelta` that would previously raise ``OverflowError`` instead raise ``OutOfBoundsDatetime`` or ``OutOfBoundsTimedelta`` where appropriate (:issue:`47268`) - When :func:`read_sas` previously returned ``None``, it now returns an empty :class:`DataFrame` (:issue:`47410`) -- +- The :class:`DataFrame` constructor now raises ``ValueError`` if the ``index`` or ``columns`` argument is a ``set`` (:issue:`47215`) .. --------------------------------------------------------------------------- .. _whatsnew_150.deprecations: diff --git a/pandas/_typing.py b/pandas/_typing.py index 4bc5f75400455..edb8ca6eb9516 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -10,7 +10,6 @@ TYPE_CHECKING, Any, Callable, - Collection, Dict, Hashable, Iterator, @@ -115,7 +114,7 @@ Ordered = Optional[bool] JSONSerializable = Optional[Union[PythonScalar, List, Dict]] Frequency = Union[str, "DateOffset"] -Axes = Collection[Any] +Axes = Union[AnyArrayLike, List, range] RandomState = Union[ int, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ead4ea744c647..d24f7eda478ac 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -626,6 +626,12 @@ def __init__( manager = get_option("mode.data_manager") + # GH47215 + if index is not None and isinstance(index, set): + raise ValueError("index cannot be a set") + if columns is not None and isinstance(columns, set): + raise ValueError("columns cannot be a set") + if copy is None: if isinstance(data, dict): # retain pre-GH#38939 default behavior @@ -729,10 +735,7 @@ def __init__( if not 
isinstance(data, np.ndarray) and treat_as_nested(data): # exclude ndarray as we may have cast it a few lines above if columns is not None: - # error: Argument 1 to "ensure_index" has incompatible type - # "Collection[Any]"; expected "Union[Union[Union[ExtensionArray, - # ndarray], Index, Series], Sequence[Any]]" - columns = ensure_index(columns) # type: ignore[arg-type] + columns = ensure_index(columns) arrays, columns, index = nested_data_to_arrays( # error: Argument 3 to "nested_data_to_arrays" has incompatible # type "Optional[Collection[Any]]"; expected "Optional[Index]" @@ -770,14 +773,8 @@ def __init__( if index is None or columns is None: raise ValueError("DataFrame constructor not properly called!") - # Argument 1 to "ensure_index" has incompatible type "Collection[Any]"; - # expected "Union[Union[Union[ExtensionArray, ndarray], - # Index, Series], Sequence[Any]]" - index = ensure_index(index) # type: ignore[arg-type] - # Argument 1 to "ensure_index" has incompatible type "Collection[Any]"; - # expected "Union[Union[Union[ExtensionArray, ndarray], - # Index, Series], Sequence[Any]]" - columns = ensure_index(columns) # type: ignore[arg-type] + index = ensure_index(index) + columns = ensure_index(columns) if not dtype: dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e392802bdb5ea..9dd1fad50aa86 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -758,7 +758,7 @@ def _set_axis_nocheck(self, labels, axis: Axis, inplace: bool_t): obj.set_axis(labels, axis=axis, inplace=True) return obj - def _set_axis(self, axis: int, labels: AnyArrayLike | Sequence) -> None: + def _set_axis(self, axis: int, labels: AnyArrayLike | list) -> None: labels = ensure_index(labels) self._mgr.set_axis(axis, labels) self._clear_item_cache() diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index fc5fcaeab7d2a..b51cb029131ff 100644 --- a/pandas/core/indexes/base.py +++ 
b/pandas/core/indexes/base.py @@ -43,8 +43,8 @@ tz_compare, ) from pandas._typing import ( - AnyArrayLike, ArrayLike, + Axes, Dtype, DtypeObj, F, @@ -7272,7 +7272,7 @@ def ensure_index_from_sequences(sequences, names=None) -> Index: return MultiIndex.from_arrays(sequences, names=names) -def ensure_index(index_like: AnyArrayLike | Sequence, copy: bool = False) -> Index: +def ensure_index(index_like: Axes, copy: bool = False) -> Index: """ Ensure that we have an index from some index-like object. diff --git a/pandas/core/series.py b/pandas/core/series.py index ef4ea0172c505..f6fe375cf5cb4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -559,7 +559,7 @@ def _constructor_expanddim(self) -> Callable[..., DataFrame]: def _can_hold_na(self) -> bool: return self._mgr._can_hold_na - def _set_axis(self, axis: int, labels: AnyArrayLike | Sequence) -> None: + def _set_axis(self, axis: int, labels: AnyArrayLike | list) -> None: """ Override generic, we want to set the _typ here. diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 6554b4c1f1afd..b94e26cd0ac5a 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -50,6 +50,7 @@ from pandas._libs.tslibs.nattype import NaTType from pandas._typing import ( ArrayLike, + Axes, ColspaceArgType, ColspaceType, CompressionOptions, @@ -688,7 +689,9 @@ def _initialize_justify(self, justify: str | None) -> str: def _initialize_columns(self, columns: Sequence[Hashable] | None) -> Index: if columns is not None: - cols = ensure_index(columns) + # GH 47231 - columns doesn't have to be `Sequence[str]` + # Will fix in later PR + cols = ensure_index(cast(Axes, columns)) self.frame = self.frame[cols] return cols else: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index f06641002e039..71da409bea367 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2982,6 +2982,14 @@ def 
test_construction_from_ndarray_with_eadtype_mismatched_columns(self): with pytest.raises(ValueError, match=msg): DataFrame(arr2, columns=["foo", "bar"]) + def test_columns_indexes_raise_on_sets(self): + # GH 47215 + data = [[1, 2, 3], [4, 5, 6]] + with pytest.raises(ValueError, match="index cannot be a set"): + DataFrame(data, index={"a", "b"}) + with pytest.raises(ValueError, match="columns cannot be a set"): + DataFrame(data, columns={"a", "b", "c"}) + def get1(obj): # TODO: make a helper in tm? if isinstance(obj, Series):