Skip to content

Commit 6a73045

Browse files
authored
PERF/CLN: Preserve concat(keys=range) RangeIndex level in the result (#57755)
* PERF/CLN: Preserve RangeIndex level in the result
* Whitespace
* Whitespace
* Fix test
* Address review
1 parent: 95ab36d; commit: 6a73045

File tree

5 files changed

+58
-37
lines changed

5 files changed

+58
-37
lines changed

doc/source/whatsnew/v3.0.0.rst

+2
Original file line number | Diff line number | Diff line change
@@ -204,6 +204,7 @@ Removal of prior version deprecations/changes
204204
- Enforced deprecation of ``axis=None`` acting the same as ``axis=0`` in the DataFrame reductions ``sum``, ``prod``, ``std``, ``var``, and ``sem``, passing ``axis=None`` will now reduce over both axes; this is particularly the case when doing e.g. ``numpy.sum(df)`` (:issue:`21597`)
205205
- Enforced deprecation of passing a dictionary to :meth:`SeriesGroupBy.agg` (:issue:`52268`)
206206
- Enforced deprecation of string ``A`` denoting frequency in :class:`YearEnd` and strings ``A-DEC``, ``A-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`57699`)
207+
- Enforced deprecation of the behavior of :func:`concat` when ``len(keys) != len(objs)`` would truncate to the shorter of the two. Now this raises a ``ValueError`` (:issue:`43485`)
207208
- Enforced silent-downcasting deprecation for :ref:`all relevant methods <whatsnew_220.silent_downcasting>` (:issue:`54710`)
208209
- In :meth:`DataFrame.stack`, the default value of ``future_stack`` is now ``True``; specifying ``False`` will raise a ``FutureWarning`` (:issue:`55448`)
209210
- Iterating over a :class:`.DataFrameGroupBy` or :class:`.SeriesGroupBy` will return tuples of length 1 for the groups when grouping by ``level`` a list of length 1 (:issue:`50064`)
@@ -255,6 +256,7 @@ Removal of prior version deprecations/changes
255256

256257
Performance improvements
257258
~~~~~~~~~~~~~~~~~~~~~~~~
259+
- :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`)
258260
- :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`)
259261
- :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`)
260262
- Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)

pandas/core/groupby/groupby.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1202,7 +1202,7 @@ def _concat_objects(
12021202
else:
12031203
# GH5610, returns a MI, with the first level being a
12041204
# range index
1205-
keys = list(range(len(values)))
1205+
keys = RangeIndex(len(values))
12061206
result = concat(values, axis=0, keys=keys)
12071207

12081208
elif not not_indexed_same:

pandas/core/reshape/concat.py

+13-20
Original file line number | Diff line number | Diff line change
@@ -12,12 +12,10 @@
1212
cast,
1313
overload,
1414
)
15-
import warnings
1615

1716
import numpy as np
1817

1918
from pandas.util._decorators import cache_readonly
20-
from pandas.util._exceptions import find_stack_level
2119

2220
from pandas.core.dtypes.common import (
2321
is_bool,
@@ -493,32 +491,27 @@ def _clean_keys_and_objs(
493491
objs_list = list(com.not_none(*objs_list))
494492
else:
495493
# GH#1649
496-
clean_keys = []
494+
key_indices = []
497495
clean_objs = []
498496
if is_iterator(keys):
499497
keys = list(keys)
500498
if len(keys) != len(objs_list):
501499
# GH#43485
502-
warnings.warn(
503-
"The behavior of pd.concat with len(keys) != len(objs) is "
504-
"deprecated. In a future version this will raise instead of "
505-
"truncating to the smaller of the two sequences",
506-
FutureWarning,
507-
stacklevel=find_stack_level(),
500+
raise ValueError(
501+
f"The length of the keys ({len(keys)}) must match "
502+
f"the length of the objects to concatenate ({len(objs_list)})"
508503
)
509-
for k, v in zip(keys, objs_list):
510-
if v is None:
511-
continue
512-
clean_keys.append(k)
513-
clean_objs.append(v)
504+
for i, obj in enumerate(objs_list):
505+
if obj is not None:
506+
key_indices.append(i)
507+
clean_objs.append(obj)
514508
objs_list = clean_objs
515509

516-
if isinstance(keys, MultiIndex):
517-
# TODO: retain levels?
518-
keys = type(keys).from_tuples(clean_keys, names=keys.names)
519-
else:
520-
name = getattr(keys, "name", None)
521-
keys = Index(clean_keys, name=name, dtype=getattr(keys, "dtype", None))
510+
if not isinstance(keys, Index):
511+
keys = Index(keys)
512+
513+
if len(key_indices) < len(keys):
514+
keys = keys.take(key_indices)
522515

523516
if len(objs_list) == 0:
524517
raise ValueError("All objects passed were None")

pandas/tests/groupby/methods/test_describe.py

+12-10
Original file line number | Diff line number | Diff line change
@@ -90,20 +90,22 @@ def test_frame_describe_multikey(tsframe):
9090

9191
def test_frame_describe_tupleindex():
9292
# GH 14848 - regression from 0.19.0 to 0.19.1
93-
df1 = DataFrame(
93+
name = "k"
94+
df = DataFrame(
9495
{
9596
"x": [1, 2, 3, 4, 5] * 3,
96-
"y": [10, 20, 30, 40, 50] * 3,
97-
"z": [100, 200, 300, 400, 500] * 3,
97+
name: [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5,
9898
}
9999
)
100-
df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
101-
df2 = df1.rename(columns={"k": "key"})
102-
msg = "Names should be list-like for a MultiIndex"
103-
with pytest.raises(ValueError, match=msg):
104-
df1.groupby("k").describe()
105-
with pytest.raises(ValueError, match=msg):
106-
df2.groupby("key").describe()
100+
result = df.groupby(name).describe()
101+
expected = DataFrame(
102+
[[5.0, 3.0, 1.581139, 1.0, 2.0, 3.0, 4.0, 5.0]] * 3,
103+
index=Index([(0, 0, 1), (0, 1, 0), (1, 0, 0)], tupleize_cols=False, name=name),
104+
columns=MultiIndex.from_arrays(
105+
[["x"] * 8, ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]]
106+
),
107+
)
108+
tm.assert_frame_equal(result, expected)
107109

108110

109111
def test_frame_describe_unstacked_format():

pandas/tests/reshape/concat/test_concat.py

+30-6
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
Index,
1818
MultiIndex,
1919
PeriodIndex,
20+
RangeIndex,
2021
Series,
2122
concat,
2223
date_range,
@@ -395,6 +396,29 @@ def test_concat_keys_with_none(self):
395396
expected = concat([df0, df0[:2], df0[:1], df0], keys=["b", "c", "d", "e"])
396397
tm.assert_frame_equal(result, expected)
397398

399+
@pytest.mark.parametrize("klass", [range, RangeIndex])
400+
@pytest.mark.parametrize("include_none", [True, False])
401+
def test_concat_preserves_rangeindex(self, klass, include_none):
402+
df = DataFrame([1, 2])
403+
df2 = DataFrame([3, 4])
404+
data = [df, None, df2, None] if include_none else [df, df2]
405+
keys_length = 4 if include_none else 2
406+
result = concat(data, keys=klass(keys_length))
407+
expected = DataFrame(
408+
[1, 2, 3, 4],
409+
index=MultiIndex(
410+
levels=(
411+
RangeIndex(start=0, stop=keys_length, step=keys_length / 2),
412+
RangeIndex(start=0, stop=2, step=1),
413+
),
414+
codes=(
415+
np.array([0, 0, 1, 1], dtype=np.int8),
416+
np.array([0, 1, 0, 1], dtype=np.int8),
417+
),
418+
),
419+
)
420+
tm.assert_frame_equal(result, expected)
421+
398422
def test_concat_bug_1719(self):
399423
ts1 = Series(
400424
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
@@ -705,7 +729,7 @@ def test_concat_multiindex_with_empty_rangeindex():
705729
# GH#41234
706730
mi = MultiIndex.from_tuples([("B", 1), ("C", 1)])
707731
df1 = DataFrame([[1, 2]], columns=mi)
708-
df2 = DataFrame(index=[1], columns=pd.RangeIndex(0))
732+
df2 = DataFrame(index=[1], columns=RangeIndex(0))
709733

710734
result = concat([df1, df2])
711735
expected = DataFrame([[1, 2], [np.nan, np.nan]], columns=mi)
@@ -830,14 +854,14 @@ def test_concat_mismatched_keys_length():
830854
sers = [ser + n for n in range(4)]
831855
keys = ["A", "B", "C"]
832856

833-
msg = r"The behavior of pd.concat with len\(keys\) != len\(objs\) is deprecated"
834-
with tm.assert_produces_warning(FutureWarning, match=msg):
857+
msg = r"The length of the keys"
858+
with pytest.raises(ValueError, match=msg):
835859
concat(sers, keys=keys, axis=1)
836-
with tm.assert_produces_warning(FutureWarning, match=msg):
860+
with pytest.raises(ValueError, match=msg):
837861
concat(sers, keys=keys, axis=0)
838-
with tm.assert_produces_warning(FutureWarning, match=msg):
862+
with pytest.raises(ValueError, match=msg):
839863
concat((x for x in sers), keys=(y for y in keys), axis=1)
840-
with tm.assert_produces_warning(FutureWarning, match=msg):
864+
with pytest.raises(ValueError, match=msg):
841865
concat((x for x in sers), keys=(y for y in keys), axis=0)
842866

843867

0 commit comments

Comments
 (0)