From 08fb7099272da4a9b2ff6a6f1b26353513fefdcb Mon Sep 17 00:00:00 2001
From: Irv Lustig <irv@princeton.com>
Date: Sat, 4 Jun 2022 12:59:28 -0400
Subject: [PATCH 1/4] API: Disallow sets as index and columns argument in
 DataFrame constructor

---
 doc/source/whatsnew/v1.5.0.rst          |  1 +
 pandas/_typing.py                       |  3 +--
 pandas/core/frame.py                    | 21 +++++++++------------
 pandas/core/indexes/base.py             |  4 ++--
 pandas/tests/frame/test_constructors.py |  8 ++++++++
 5 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 6bf6fd65f5633..fb44408fa7699 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -152,6 +152,7 @@ Other enhancements
 - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`)
 - ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`)
 - :class:`DataError`, :class:`SpecificationError`, :class:`SettingWithCopyError`, and :class:`SettingWithCopyWarning` are now exposed in ``pandas.errors`` (:issue:`27656`)
+- :class:`DataFrame` constructor now raises a ``ValueError`` if the ``index`` or ``columns`` argument is a ``set`` (:issue:`47215`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.notable_bug_fixes:
diff --git a/pandas/_typing.py b/pandas/_typing.py
index a85820a403fde..37ecd4f540fa9 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -10,7 +10,6 @@
     TYPE_CHECKING,
     Any,
     Callable,
-    Collection,
     Dict,
     Hashable,
     Iterator,
@@ -115,7 +114,7 @@
 Ordered = Optional[bool]
 JSONSerializable = Optional[Union[PythonScalar, List, Dict]]
 Frequency = Union[str, "DateOffset"]
-Axes = Collection[Any]
+Axes = Union[AnyArrayLike, List, Dict, range, Sequence[str]]
 
 RandomState = Union[
     int,
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index c695c04052f9d..ec9837964e07a 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -627,6 +627,12 @@ def __init__(
 
         manager = get_option("mode.data_manager")
 
+        # GH47215
+        if index is not None and isinstance(index, set):
+            raise ValueError("index cannot be a set")
+        if columns is not None and isinstance(columns, set):
+            raise ValueError("columns cannot be a set")
+
         if copy is None:
             if isinstance(data, dict):
                 # retain pre-GH#38939 default behavior
@@ -730,10 +736,7 @@ def __init__(
                 if not isinstance(data, np.ndarray) and treat_as_nested(data):
                     # exclude ndarray as we may have cast it a few lines above
                     if columns is not None:
-                        # error: Argument 1 to "ensure_index" has incompatible type
-                        # "Collection[Any]"; expected "Union[Union[Union[ExtensionArray,
-                        # ndarray], Index, Series], Sequence[Any]]"
-                        columns = ensure_index(columns)  # type: ignore[arg-type]
+                        columns = ensure_index(columns)
                     arrays, columns, index = nested_data_to_arrays(
                         # error: Argument 3 to "nested_data_to_arrays" has incompatible
                         # type "Optional[Collection[Any]]"; expected "Optional[Index]"
@@ -771,14 +774,8 @@ def __init__(
             if index is None or columns is None:
                 raise ValueError("DataFrame constructor not properly called!")
 
-            # Argument 1 to "ensure_index" has incompatible type "Collection[Any]";
-            # expected "Union[Union[Union[ExtensionArray, ndarray],
-            # Index, Series], Sequence[Any]]"
-            index = ensure_index(index)  # type: ignore[arg-type]
-            # Argument 1 to "ensure_index" has incompatible type "Collection[Any]";
-            # expected "Union[Union[Union[ExtensionArray, ndarray],
-            # Index, Series], Sequence[Any]]"
-            columns = ensure_index(columns)  # type: ignore[arg-type]
+            index = ensure_index(index)
+            columns = ensure_index(columns)
 
             if not dtype:
                 dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 8d8b413b53792..e958ceeff6f28 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -42,8 +42,8 @@
     tz_compare,
 )
 from pandas._typing import (
-    AnyArrayLike,
     ArrayLike,
+    Axes,
     Dtype,
     DtypeObj,
     F,
@@ -7273,7 +7273,7 @@ def ensure_index_from_sequences(sequences, names=None) -> Index:
         return MultiIndex.from_arrays(sequences, names=names)
 
 
-def ensure_index(index_like: AnyArrayLike | Sequence, copy: bool = False) -> Index:
+def ensure_index(index_like: Axes, copy: bool = False) -> Index:
     """
     Ensure that we have an index from some index-like object.
 
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index e62c050fbf812..6ec0157082ff1 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -2976,6 +2976,14 @@ def test_construction_from_ndarray_with_eadtype_mismatched_columns(self):
         with pytest.raises(ValueError, match=msg):
             DataFrame(arr2, columns=["foo", "bar"])
 
+    def test_columns_indexes_raise_on_sets(self):
+        # GH 47215
+        data = [[1, 2, 3], [4, 5, 6]]
+        with pytest.raises(ValueError, match="index cannot be a set"):
+            DataFrame(data, index={"a", "b"})
+        with pytest.raises(ValueError, match="columns cannot be a set"):
+            DataFrame(data, columns={"a", "b", "c"})
+
 
 def get1(obj):  # TODO: make a helper in tm?
     if isinstance(obj, Series):

From 3e4901636fb8520d36fb6d349c9c1b58b5fec763 Mon Sep 17 00:00:00 2001
From: Irv Lustig <irv@princeton.com>
Date: Sat, 4 Jun 2022 14:06:31 -0400
Subject: [PATCH 2/4] TYP: Remove Sequence[str] from to_string, to_html

---
 pandas/_typing.py           |  2 +-
 pandas/core/frame.py        | 20 ++++++++++----------
 pandas/io/formats/csvs.py   |  2 +-
 pandas/io/formats/format.py |  9 +++++----
 4 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/pandas/_typing.py b/pandas/_typing.py
index 37ecd4f540fa9..96ba957bcdcaa 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -114,7 +114,7 @@
 Ordered = Optional[bool]
 JSONSerializable = Optional[Union[PythonScalar, List, Dict]]
 Frequency = Union[str, "DateOffset"]
-Axes = Union[AnyArrayLike, List, Dict, range, Sequence[str]]
+Axes = Union[AnyArrayLike, List, Dict, range]
 
 RandomState = Union[
     int,
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index ec9837964e07a..8e57165da9dee 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1105,9 +1105,9 @@ def _repr_html_(self) -> str | None:
     def to_string(
         self,
         buf: None = ...,
-        columns: Sequence[str] | None = ...,
+        columns: Axes | None = ...,
         col_space: int | list[int] | dict[Hashable, int] | None = ...,
-        header: bool | Sequence[str] = ...,
+        header: bool | list = ...,
         index: bool = ...,
         na_rep: str = ...,
         formatters: fmt.FormattersType | None = ...,
@@ -1130,9 +1130,9 @@ def to_string(
     def to_string(
         self,
         buf: FilePath | WriteBuffer[str],
-        columns: Sequence[str] | None = ...,
+        columns: Axes | None = ...,
         col_space: int | list[int] | dict[Hashable, int] | None = ...,
-        header: bool | Sequence[str] = ...,
+        header: bool | list = ...,
         index: bool = ...,
         na_rep: str = ...,
         formatters: fmt.FormattersType | None = ...,
@@ -1152,8 +1152,8 @@ def to_string(
         ...
 
     @Substitution(
-        header_type="bool or sequence of str",
-        header="Write out the column names. If a list of strings "
+        header_type="bool or array-like of column names",
+        header="Write out the column names. If a list of columns "
         "is given, it is assumed to be aliases for the "
         "column names",
         col_space_type="int, list or dict of int",
@@ -1165,9 +1165,9 @@ def to_string(
     def to_string(
         self,
         buf: FilePath | WriteBuffer[str] | None = None,
-        columns: Sequence[str] | None = None,
+        columns: Axes | None = None,
         col_space: int | list[int] | dict[Hashable, int] | None = None,
-        header: bool | Sequence[str] = True,
+        header: bool | list = True,
         index: bool = True,
         na_rep: str = "NaN",
         formatters: fmt.FormattersType | None = None,
@@ -2911,9 +2911,9 @@ def to_parquet(
     def to_html(
         self,
         buf: FilePath | WriteBuffer[str] | None = None,
-        columns: Sequence[str] | None = None,
+        columns: Axes | None = None,
         col_space: ColspaceArgType | None = None,
-        header: bool | Sequence[str] = True,
+        header: bool = True,
         index: bool = True,
         na_rep: str = "NaN",
         formatters: FormattersType | None = None,
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index c577acfaeba8e..80872a107d1aa 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -102,7 +102,7 @@ def decimal(self) -> str:
         return self.fmt.decimal
 
     @property
-    def header(self) -> bool | Sequence[str]:
+    def header(self) -> bool | list:
         return self.fmt.header
 
     @property
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index 3019aa1fc2dc7..9fca4b2c2c786 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -47,6 +47,7 @@
 from pandas._libs.tslibs.nattype import NaTType
 from pandas._typing import (
     ArrayLike,
+    Axes,
     ColspaceArgType,
     ColspaceType,
     CompressionOptions,
@@ -119,7 +120,7 @@
         ----------
         buf : str, Path or StringIO-like, optional, default None
             Buffer to write to. If None, the output is returned as a string.
-        columns : sequence, optional, default None
+        columns : array-like, optional, default None
             The subset of columns to write. Writes all columns by default.
         col_space : %(col_space_type)s, optional
             %(col_space)s.
@@ -561,9 +562,9 @@ class DataFrameFormatter:
     def __init__(
         self,
         frame: DataFrame,
-        columns: Sequence[str] | None = None,
+        columns: Axes | None = None,
         col_space: ColspaceArgType | None = None,
-        header: bool | Sequence[str] = True,
+        header: bool | list = True,
         index: bool = True,
         na_rep: str = "NaN",
         formatters: FormattersType | None = None,
@@ -683,7 +684,7 @@ def _initialize_justify(self, justify: str | None) -> str:
         else:
             return justify
 
-    def _initialize_columns(self, columns: Sequence[str] | None) -> Index:
+    def _initialize_columns(self, columns: Axes | None) -> Index:
         if columns is not None:
             cols = ensure_index(columns)
             self.frame = self.frame[cols]

From a9e284983b468962bc4bd7438be87fe3c7ea0b5b Mon Sep 17 00:00:00 2001
From: Irv Lustig <irv@princeton.com>
Date: Sun, 5 Jun 2022 11:52:20 -0400
Subject: [PATCH 3/4] use list[str] for header argument.  Remove dict as index
 and columns argument for DataFrame.  Define ListLike

---
 pandas/_typing.py           | 9 ++++++++-
 pandas/core/frame.py        | 8 ++++----
 pandas/io/formats/csvs.py   | 2 +-
 pandas/io/formats/format.py | 2 +-
 4 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/pandas/_typing.py b/pandas/_typing.py
index 96ba957bcdcaa..6e55512ce6b87 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -79,6 +79,13 @@
 ArrayLike = Union["ExtensionArray", np.ndarray]
 AnyArrayLike = Union[ArrayLike, "Index", "Series"]
 
+# list-like
+
+# Cannot use `Sequence`: `str` is itself a `Sequence[str]`, and a bare string
+# must not be accepted as list-like.  Could refine if typing gains a way to tell
+# the two apart (https://github.com/python/typing/issues/256)
+ListLike = Union[AnyArrayLike, List, range]
+
 # scalars
 
 PythonScalar = Union[str, int, float, bool]
@@ -114,7 +121,7 @@
 Ordered = Optional[bool]
 JSONSerializable = Optional[Union[PythonScalar, List, Dict]]
 Frequency = Union[str, "DateOffset"]
-Axes = Union[AnyArrayLike, List, Dict, range]
+Axes = ListLike
 
 RandomState = Union[
     int,
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 8e57165da9dee..0ab567e9c583a 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1107,7 +1107,7 @@ def to_string(
         buf: None = ...,
         columns: Axes | None = ...,
         col_space: int | list[int] | dict[Hashable, int] | None = ...,
-        header: bool | list = ...,
+        header: bool | list[str] = ...,
         index: bool = ...,
         na_rep: str = ...,
         formatters: fmt.FormattersType | None = ...,
@@ -1132,7 +1132,7 @@ def to_string(
         buf: FilePath | WriteBuffer[str],
         columns: Axes | None = ...,
         col_space: int | list[int] | dict[Hashable, int] | None = ...,
-        header: bool | list = ...,
+        header: bool | list[str] = ...,
         index: bool = ...,
         na_rep: str = ...,
         formatters: fmt.FormattersType | None = ...,
@@ -1152,7 +1152,7 @@ def to_string(
         ...
 
     @Substitution(
-        header_type="bool or array-like of column names",
+        header_type="bool or list of str",
         header="Write out the column names. If a list of columns "
         "is given, it is assumed to be aliases for the "
         "column names",
@@ -1167,7 +1167,7 @@ def to_string(
         buf: FilePath | WriteBuffer[str] | None = None,
         columns: Axes | None = None,
         col_space: int | list[int] | dict[Hashable, int] | None = None,
-        header: bool | list = True,
+        header: bool | list[str] = True,
         index: bool = True,
         na_rep: str = "NaN",
         formatters: fmt.FormattersType | None = None,
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 80872a107d1aa..03432c62de7a5 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -102,7 +102,7 @@ def decimal(self) -> str:
         return self.fmt.decimal
 
     @property
-    def header(self) -> bool | list:
+    def header(self) -> bool | list[str]:
         return self.fmt.header
 
     @property
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index 9fca4b2c2c786..dceb4220c3d70 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -564,7 +564,7 @@ def __init__(
         frame: DataFrame,
         columns: Axes | None = None,
         col_space: ColspaceArgType | None = None,
-        header: bool | list = True,
+        header: bool | list[str] = True,
         index: bool = True,
         na_rep: str = "NaN",
         formatters: FormattersType | None = None,

From ca9b7accf87d6861408e815ef420265f2db06f20 Mon Sep 17 00:00:00 2001
From: Irv Lustig <irv@princeton.com>
Date: Tue, 4 Oct 2022 16:06:11 -0400
Subject: [PATCH 4/4] use Axes rather than Sequence[Hashable]

---
 pandas/_typing.py           |  2 +-
 pandas/core/frame.py        | 10 +++++-----
 pandas/core/generic.py      | 13 +++++++------
 pandas/io/formats/format.py |  8 +++-----
 4 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/pandas/_typing.py b/pandas/_typing.py
index b13eb362039fb..df79d28e48151 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -132,7 +132,7 @@
 Ordered = Optional[bool]
 JSONSerializable = Optional[Union[PythonScalar, List, Dict]]
 Frequency = Union[str, "BaseOffset"]
-Axes = Union[AnyArrayLike, List, range]
+Axes = ListLike
 
 RandomState = Union[
     int,
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 4858f1ea39186..f1c4e897d6724 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3092,9 +3092,9 @@ def to_orc(
     def to_html(
         self,
         buf: FilePath | WriteBuffer[str],
-        columns: Sequence[Level] | None = ...,
+        columns: Axes | None = ...,
         col_space: ColspaceArgType | None = ...,
-        header: bool | Sequence[str] = ...,
+        header: bool = ...,
         index: bool = ...,
         na_rep: str = ...,
         formatters: FormattersType | None = ...,
@@ -3121,9 +3121,9 @@ def to_html(
     def to_html(
         self,
         buf: None = ...,
-        columns: Sequence[Level] | None = ...,
+        columns: Axes | None = ...,
         col_space: ColspaceArgType | None = ...,
-        header: bool | Sequence[str] = ...,
+        header: bool = ...,
         index: bool = ...,
         na_rep: str = ...,
         formatters: FormattersType | None = ...,
@@ -3159,7 +3159,7 @@ def to_html(
     def to_html(
         self,
         buf: FilePath | WriteBuffer[str] | None = None,
-        columns: Sequence[Level] | None = None,
+        columns: Axes | None = None,
         col_space: ColspaceArgType | None = None,
         header: bool = True,
         index: bool = True,
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 3df5d2aaf9896..f413c4a37ae53 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -44,6 +44,7 @@
     AlignJoin,
     AnyArrayLike,
     ArrayLike,
+    Axes,
     Axis,
     AxisInt,
     ColspaceArgType,
@@ -3238,9 +3239,9 @@ class      (index) object 'bird' 'bird' 'mammal' 'mammal'
     def to_latex(
         self,
         buf: None = ...,
-        columns: Sequence[Hashable] | None = ...,
+        columns: Axes | None = ...,
         col_space: ColspaceArgType | None = ...,
-        header: bool_t | Sequence[str] = ...,
+        header: bool_t | list[str] = ...,
         index: bool_t = ...,
         na_rep: str = ...,
         formatters: FormattersType | None = ...,
@@ -3266,9 +3267,9 @@ def to_latex(
     def to_latex(
         self,
         buf: FilePath | WriteBuffer[str],
-        columns: Sequence[Hashable] | None = ...,
+        columns: Axes | None = ...,
         col_space: ColspaceArgType | None = ...,
-        header: bool_t | Sequence[str] = ...,
+        header: bool_t | list[str] = ...,
         index: bool_t = ...,
         na_rep: str = ...,
         formatters: FormattersType | None = ...,
@@ -3295,9 +3296,9 @@ def to_latex(
     def to_latex(
         self,
         buf: FilePath | WriteBuffer[str] | None = None,
-        columns: Sequence[Hashable] | None = None,
+        columns: Axes | None = None,
         col_space: ColspaceArgType | None = None,
-        header: bool_t | Sequence[str] = True,
+        header: bool_t | list[str] = True,
         index: bool_t = True,
         na_rep: str = "NaN",
         formatters: FormattersType | None = None,
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index cf5159c470558..6050691754d66 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -566,7 +566,7 @@ class DataFrameFormatter:
     def __init__(
         self,
         frame: DataFrame,
-        columns: Sequence[Hashable] | None = None,
+        columns: Axes | None = None,
         col_space: ColspaceArgType | None = None,
         header: bool | list[str] = True,
         index: bool = True,
@@ -688,11 +688,9 @@ def _initialize_justify(self, justify: str | None) -> str:
         else:
             return justify
 
-    def _initialize_columns(self, columns: Sequence[Hashable] | None) -> Index:
+    def _initialize_columns(self, columns: Axes | None) -> Index:
         if columns is not None:
-            # GH 47231 - columns doesn't have to be `Sequence[str]`
-            # Will fix in later PR
-            cols = ensure_index(cast(Axes, columns))
+            cols = ensure_index(columns)
             self.frame = self.frame[cols]
             return cols
         else: