Skip to content

Commit 11881ea

Browse files
authored
BUG/PERF: DataFrame.to_records inconsistent dtypes for MultiIndex (#47279)
Fix DataFrame.to_records returning inconsistent dtypes when the index is a MultiIndex
1 parent 5e3b51d commit 11881ea

File tree

4 files changed

+54
-8
lines changed

4 files changed

+54
-8
lines changed

asv_bench/benchmarks/frame_methods.py

+20
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,26 @@ def time_values_mixed_wide(self):
288288
self.df_mixed_wide.values
289289

290290

291+
class ToRecords:
292+
def setup(self):
293+
N = 100_000
294+
data = np.random.randn(N, 2)
295+
mi = MultiIndex.from_arrays(
296+
[
297+
np.arange(N),
298+
date_range("1970-01-01", periods=N, freq="ms"),
299+
]
300+
)
301+
self.df = DataFrame(data)
302+
self.df_mi = DataFrame(data, index=mi)
303+
304+
def time_to_records(self):
305+
self.df.to_records(index=True)
306+
307+
def time_to_records_multiindex(self):
308+
self.df_mi.to_records(index=True)
309+
310+
291311
class Repr:
292312
def setup(self):
293313
nrows = 10000

doc/source/whatsnew/v1.5.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -704,6 +704,7 @@ Performance improvements
704704
- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions when only a single group exists (:issue:`44977`)
705705
- Performance improvement in :meth:`.GroupBy.apply` when grouping on a non-unique unsorted index (:issue:`46527`)
706706
- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`45681`, :issue:`46040`, :issue:`46330`)
707+
- Performance improvement in :meth:`DataFrame.to_records` when the index is a :class:`MultiIndex` (:issue:`47263`)
707708
- Performance improvement in :attr:`MultiIndex.values` when the MultiIndex contains levels of type DatetimeIndex, TimedeltaIndex or ExtensionDtypes (:issue:`46288`)
708709
- Performance improvement in :func:`merge` when left and/or right are empty (:issue:`45838`)
709710
- Performance improvement in :meth:`DataFrame.join` when left and/or right are empty (:issue:`46015`)
@@ -765,6 +766,7 @@ Conversion
765766
- Bug in :func:`array` with ``FloatingDtype`` and values containing float-castable strings incorrectly raising (:issue:`45424`)
766767
- Bug when comparing string and ``datetime64[ns]`` objects raising ``OverflowError`` (:issue:`45506`)
767768
- Bug in metaclass of generic abstract dtypes causing :meth:`DataFrame.apply` and :meth:`Series.apply` to raise for the built-in function ``type`` (:issue:`46684`)
769+
- Bug in :meth:`DataFrame.to_records` returning inconsistent NumPy dtypes for the index fields when the index was a :class:`MultiIndex` (:issue:`47263`)
768770
- Bug in :meth:`DataFrame.to_dict` for ``orient="list"`` or ``orient="index"`` was not returning native types (:issue:`46751`)
769771

770772
Strings

pandas/core/frame.py

+4-7
Original file line numberDiff line numberDiff line change
@@ -2416,13 +2416,10 @@ def to_records(
24162416
dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
24172417
"""
24182418
if index:
2419-
if isinstance(self.index, MultiIndex):
2420-
# array of tuples to numpy cols. copy copy copy
2421-
ix_vals = list(map(np.array, zip(*self.index._values)))
2422-
else:
2423-
# error: List item 0 has incompatible type "ArrayLike"; expected
2424-
# "ndarray"
2425-
ix_vals = [self.index.values] # type: ignore[list-item]
2419+
ix_vals = [
2420+
np.asarray(self.index.get_level_values(i))
2421+
for i in range(self.index.nlevels)
2422+
]
24262423

24272424
arrays = ix_vals + [
24282425
np.asarray(self.iloc[:, i]) for i in range(len(self.columns))

pandas/tests/frame/methods/test_to_records.py

+28-1
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def test_to_records_index_name(self):
9696
+ [np.asarray(df.iloc[:, i]) for i in range(3)],
9797
dtype={
9898
"names": ["A", "level_1", "0", "1", "2"],
99-
"formats": ["<U1", "<U1", "<f8", "<f8", "<f8"],
99+
"formats": ["O", "O", "<f8", "<f8", "<f8"],
100100
},
101101
)
102102
tm.assert_numpy_array_equal(result, expected)
@@ -108,6 +108,33 @@ def test_to_records_with_unicode_index(self):
108108
expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")])
109109
tm.assert_almost_equal(result, expected)
110110

111+
def test_to_records_index_dtype(self):
112+
# GH 47263: consistent data types for Index and MultiIndex
113+
df = DataFrame(
114+
{
115+
1: date_range("2022-01-01", periods=2),
116+
2: date_range("2022-01-01", periods=2),
117+
3: date_range("2022-01-01", periods=2),
118+
}
119+
)
120+
121+
expected = np.rec.array(
122+
[
123+
("2022-01-01", "2022-01-01", "2022-01-01"),
124+
("2022-01-02", "2022-01-02", "2022-01-02"),
125+
],
126+
dtype=[("1", "<M8[ns]"), ("2", "<M8[ns]"), ("3", "<M8[ns]")],
127+
)
128+
129+
result = df.to_records(index=False)
130+
tm.assert_almost_equal(result, expected)
131+
132+
result = df.set_index(1).to_records(index=True)
133+
tm.assert_almost_equal(result, expected)
134+
135+
result = df.set_index([1, 2]).to_records(index=True)
136+
tm.assert_almost_equal(result, expected)
137+
111138
def test_to_records_with_unicode_column_names(self):
112139
# xref issue: https://github.com/numpy/numpy/issues/2407
113140
# Issue GH#11879. to_records used to raise an exception when used

0 commit comments

Comments
 (0)