From 0959372791249b782a9d8348f93e4d2dd52d3c7e Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 12 Dec 2021 10:41:37 +0000 Subject: [PATCH 01/11] Factor out existing occurrences --- pandas/core/common.py | 8 ++++++++ pandas/core/frame.py | 6 ++---- pandas/core/indexes/multi.py | 7 +------ pandas/io/json/_table_schema.py | 6 +----- pandas/io/pytables.py | 4 +--- pandas/io/sql.py | 6 ++---- 6 files changed, 15 insertions(+), 22 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 590296c4b12f5..e9e8215628070 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -604,3 +604,11 @@ def is_builtin_func(arg): otherwise return the arg """ return _builtin_table.get(arg, arg) + + +def fill_missing_names(names): + return [f"level_{i}" if name is None else name for i, name in enumerate(names)] + + +def revert_missing_names(names): + return [None if name == f"level_{i}" else name for i, name in enumerate(names)] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fa5e9dc51419a..50d683e46bf06 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -150,6 +150,7 @@ TimedeltaArray, ) from pandas.core.arrays.sparse import SparseFrameAccessor +import pandas.core.common as com from pandas.core.construction import ( extract_array, sanitize_array, @@ -5796,10 +5797,7 @@ class max type if not drop: to_insert: Iterable[tuple[Any, Any | None]] if isinstance(self.index, MultiIndex): - names = [ - (n if n is not None else f"level_{i}") - for i, n in enumerate(self.index.names) - ] + names = com.fill_missing_names(self.index.names) to_insert = zip(self.index.levels, self.index.codes) else: default = "index" if "index" not in self else "level_0" diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 88b37ffaa9493..6a322e73590ca 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -736,12 +736,7 @@ def dtypes(self) -> Series: """ from pandas import Series - return Series( - { - f"level_{idx}" if level.name is None else level.name: level.dtype - for idx, level in enumerate(self.levels) - } - ) + return Series(com.fill_missing_names(self.levels)) def __len__(self) -> int: return len(self.codes[0]) diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 75fd950cd6076..ed33784f44464 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -103,11 +103,7 @@ def set_default_names(data): data = data.copy() if data.index.nlevels > 1: - names = [ - name if name is not None else f"level_{i}" - for i, name in enumerate(data.index.names) - ] - data.index.names = names + data.index.names = com.fill_missing_names(data.index.names) else: data.index.name = data.index.name or "index" return data diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 18b2ff3837a15..997a6bfc67dbc 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3377,9 +3377,7 @@ def validate_multiindex( validate that we can store the multi-index; reset and return the new object """ - levels = [ - l if l is not None else f"level_{i}" for i, l in enumerate(obj.index.names) - ] + levels = com.fill_missing_names(obj.index.names) try: reset_obj = obj.reset_index() except ValueError as err: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 26869a660f4b4..548bd617a285f 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -44,6 +44,7 @@ Series, ) from pandas.core.base import PandasObject +import pandas.core.common as com from pandas.core.tools.datetimes import to_datetime from pandas.util.version import Version @@ -1010,10 +1011,7 @@ def _index_name(self, index, index_label): ): return ["index"] else: - return [ - l if l is not None else f"level_{i}" - for i, l in enumerate(self.frame.index.names) - ] + return com.fill_missing_names(self.frame.index.names) # for reading: index=(list of) string to specify column to set as index elif isinstance(index, str): From 276bdffa6f43cf76e19399ae10f71214e7cf510c Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 14 Dec 2021 10:17:00 +0000 Subject: [PATCH 02/11] Don't change index.to_frame --- pandas/core/common.py | 4 ---- pandas/core/frame.py | 7 +------ pandas/tests/frame/methods/test_to_records.py | 2 +- 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index e9e8215628070..1f7d9356f2de8 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -608,7 +608,3 @@ def is_builtin_func(arg): def fill_missing_names(names): return [f"level_{i}" if name is None else name for i, name in enumerate(names)] - - -def revert_missing_names(names): - return [None if name == f"level_{i}" else name for i, name in enumerate(names)] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 50d683e46bf06..b73b064f4dc95 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -150,7 +150,6 @@ TimedeltaArray, ) from pandas.core.arrays.sparse import SparseFrameAccessor -import pandas.core.common as com from pandas.core.construction import ( extract_array, sanitize_array, @@ -2371,11 +2370,7 @@ def to_records( index_names = list(self.index.names) if isinstance(self.index, MultiIndex): - count = 0 - for i, n in enumerate(index_names): - if n is None: - index_names[i] = f"level_{count}" - count += 1 + index_names = com.fill_missing_names(index_names) elif index_names[0] is None: index_names = ["index"] diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index 2c96cf291c154..2f54da687ba76 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -91,7 +91,7 @@ def test_to_records_index_name(self): df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]) df.index.names = ["A", None] rs = df.to_records() - assert "level_0" in rs.dtype.fields + assert "level_1" in rs.dtype.fields def test_to_records_with_unicode_index(self): # GH#13172 From 10bcdc4c1767c9365da86f009ae065a28ad0dbe6 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 14 Dec 2021 11:59:00 +0000 Subject: [PATCH 03/11] Update v1.4.0.rst --- doc/source/whatsnew/v1.4.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 916bcf3db9a4a..853bdeb3c68d0 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -854,6 +854,7 @@ Other - Bug in :meth:`DataFrame.shift` with ``axis=1`` and ``ExtensionDtype`` columns incorrectly raising when an incompatible ``fill_value`` is passed (:issue:`44564`) - Bug in :meth:`DataFrame.diff` when passing a NumPy integer object instead of an ``int`` object (:issue:`44572`) - Bug in :meth:`Series.replace` raising ``ValueError`` when using ``regex=True`` with a :class:`Series` containing ``np.nan`` values (:issue:`43344`) +- Bug in :meth:`DataFrame.to_records` missing names filled incorrectly (:issue:`44818`) .. ***DO NOT USE THIS SECTION*** From e2a1247aa0c349a5659a4a68ccb01887b3a6ca07 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 14 Dec 2021 12:39:11 +0000 Subject: [PATCH 04/11] Add docstring --- pandas/core/common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/common.py b/pandas/core/common.py index 1f7d9356f2de8..997cc394f311e 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -607,4 +607,7 @@ def is_builtin_func(arg): def fill_missing_names(names): + """ + If a name is missing then replace it by level_n, where n is the count + """ return [f"level_{i}" if name is None else name for i, name in enumerate(names)] From aa6374ad82018d83e5f8eb9ada02533e28dcba04 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 14 Dec 2021 13:50:17 +0000 Subject: [PATCH 05/11] Update multi.py --- pandas/core/indexes/multi.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 6a322e73590ca..a764a49719ad9 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -736,7 +736,10 @@ def dtypes(self) -> Series: """ from pandas import Series - return Series(com.fill_missing_names(self.levels)) + names = com.fill_missing_names([level.name for level in self.levels]) + return Series( + {names[idx]: level.dtype for idx, level in enumerate(self.levels)} + ) def __len__(self) -> int: return len(self.codes[0]) From a6c21091cf474e24ed8bd57ecb4642e0454a73f7 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 14 Dec 2021 18:29:35 +0000 Subject: [PATCH 06/11] Improve whatsnew and test --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/tests/frame/methods/test_to_records.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 853bdeb3c68d0..5200a36950f11 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -854,7 +854,7 @@ Other - Bug in :meth:`DataFrame.shift` with ``axis=1`` and ``ExtensionDtype`` columns incorrectly raising when an incompatible ``fill_value`` is passed (:issue:`44564`) - Bug in :meth:`DataFrame.diff` when passing a NumPy integer object instead of an ``int`` object (:issue:`44572`) - Bug in :meth:`Series.replace` raising ``ValueError`` when using ``regex=True`` with a :class:`Series` containing ``np.nan`` values (:issue:`43344`) -- Bug in :meth:`DataFrame.to_records` missing names filled incorrectly (:issue:`44818`) +- Bug in :meth:`DataFrame.to_records` where an incorrect n was used when missing names were replaced by level_n (:issue:`44818`) .. ***DO NOT USE THIS SECTION*** diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index 2f54da687ba76..a2e94782142ac 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -90,8 +90,16 @@ def test_to_records_index_name(self): df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]) df.index.names = ["A", None] - rs = df.to_records() - assert "level_1" in rs.dtype.fields + result = df.to_records() + expected = np.rec.fromarrays( + [np.array(["a", "a", "b"]), np.array(["x", "y", "z"])] + + [np.asarray(df.iloc[:, i]) for i in range(3)], + dtype={ + "names": ["A", "level_1", "0", "1", "2"], + "formats": [" Date: Tue, 14 Dec 2021 23:33:15 +0000 Subject: [PATCH 07/11] Trigger CI From 94a86a0218c08dc1ec879161ae571172886fdc30 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 15 Dec 2021 09:04:44 +0000 Subject: [PATCH 08/11] Trigger CI From f311e49f51a5f992998d6016338fd4a574388ea4 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 16 Dec 2021 13:04:22 +0000 Subject: [PATCH 09/11] Add types and improve docstring --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/core/common.py | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 05ae83f9f70a1..c11544765ed3c 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -856,7 +856,7 @@ Other - Bug in :meth:`DataFrame.shift` with ``axis=1`` and ``ExtensionDtype`` columns incorrectly raising when an incompatible ``fill_value`` is passed (:issue:`44564`) - Bug in :meth:`DataFrame.diff` when passing a NumPy integer object instead of an ``int`` object (:issue:`44572`) - Bug in :meth:`Series.replace` raising ``ValueError`` when using ``regex=True`` with a :class:`Series` containing ``np.nan`` values (:issue:`43344`) -- Bug in :meth:`DataFrame.to_records` where an incorrect n was used when missing names were replaced by level_n (:issue:`44818`) +- Bug in :meth:`DataFrame.to_records` where an incorrect ``n`` was used when missing names were replaced by ``level_n`` (:issue:`44818`) .. ***DO NOT USE THIS SECTION*** diff --git a/pandas/core/common.py b/pandas/core/common.py index 997cc394f311e..d250646958fd5 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -20,6 +20,8 @@ Collection, Iterable, Iterator, + Hashable, + Sequence, cast, overload, ) @@ -606,8 +608,20 @@ def is_builtin_func(arg): return _builtin_table.get(arg, arg) -def fill_missing_names(names): +def fill_missing_names(names: Sequence[Hashable|None]) -> list[Hashable]: """ If a name is missing then replace it by level_n, where n is the count + + .. versionadded:: 1.4.0 + + Parameters + ---------- + names : list-like + list of column names or None values. + + Returns + ------- + list + list of column names with the None values replaced. """ return [f"level_{i}" if name is None else name for i, name in enumerate(names)] From ac573b89ee712e9bee2809e4e041c1603010e016 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 16 Dec 2021 13:11:01 +0000 Subject: [PATCH 10/11] Update common.py --- pandas/core/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index d250646958fd5..0a96e94bfc610 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -608,7 +608,7 @@ def is_builtin_func(arg): return _builtin_table.get(arg, arg) -def fill_missing_names(names: Sequence[Hashable|None]) -> list[Hashable]: +def fill_missing_names(names: Sequence[Hashable | None]) -> list[Hashable]: """ If a name is missing then replace it by level_n, where n is the count @@ -618,7 +618,7 @@ def fill_missing_names(names: Sequence[Hashable|None]) -> list[Hashable]: ---------- names : list-like list of column names or None values. - + Returns ------- list From 76e864b96473ec831c7bdacb2c1a2db4ef3ed7c8 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 16 Dec 2021 13:27:48 +0000 Subject: [PATCH 11/11] Update common.py --- pandas/core/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 0a96e94bfc610..2ebdfccc88f4e 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -18,9 +18,9 @@ Any, Callable, Collection, + Hashable, Iterable, Iterator, - Hashable, Sequence, cast, overload,