Skip to content

Commit e93ee07

Browse files
topper-123Terji Petersenmroeschke
authored
BUG/API: Indexes on empty frames/series should be RangeIndex (#49637)
* BUG/API: Indexes on empty frames/series should be RangeIndex, are Index[object] * fix black * fix window stuff * Add docs * double ticks * unneeded line * update thatsnew text * update whatsnew text * fix rst * Update doc/source/whatsnew/v2.0.0.rst Co-authored-by: Matthew Roeschke <[email protected]> Co-authored-by: Terji Petersen <[email protected]> Co-authored-by: Matthew Roeschke <[email protected]>
1 parent afca9f8 commit e93ee07

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+158
-109
lines changed

doc/source/whatsnew/v2.0.0.rst

+30
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,35 @@ The new behavior, as for datetime64, either gives exactly the requested dtype or
312312
ser.astype("timedelta64[s]")
313313
ser.astype("timedelta64[D]")
314314
315+
.. _whatsnew_200.api_breaking.zero_len_indexes:
316+
317+
Empty DataFrames/Series will now default to have a ``RangeIndex``
318+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
319+
320+
Before, constructing an empty (where ``data`` is ``None`` or an empty list-like argument) :class:`Series` or :class:`DataFrame` without
321+
specifying the axes (``index=None``, ``columns=None``) would return the axes as empty :class:`Index` with object dtype.
322+
323+
Now, the axes return an empty :class:`RangeIndex`.
324+
325+
*Previous behavior*:
326+
327+
.. code-block:: ipython
328+
329+
In [8]: pd.Series().index
330+
Out[8]:
331+
Index([], dtype='object')
332+
333+
In [9]: pd.DataFrame().axes
334+
Out[9]:
335+
[Index([], dtype='object'), Index([], dtype='object')]
336+
337+
*New behavior*:
338+
339+
.. ipython:: python
340+
341+
pd.Series().index
342+
pd.DataFrame().axes
343+
315344
.. _whatsnew_200.api_breaking.deps:
316345

317346
Increased minimum versions for dependencies
@@ -373,6 +402,7 @@ Other API changes
373402
- Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`)
374403
- Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`)
375404
- Files are now closed when encountering an exception in :func:`read_json` (:issue:`49921`)
405+
- Changed behavior of :func:`read_csv`, :func:`read_json` & :func:`read_fwf`, where the index will now always be a :class:`RangeIndex`, when no index is specified. Previously the index would be an :class:`Index` with dtype ``object`` if the new DataFrame/Series had length 0 (:issue:`49572`)
376406
- :meth:`DataFrame.values`, :meth:`DataFrame.to_numpy`, :meth:`DataFrame.xs`, :meth:`DataFrame.reindex`, :meth:`DataFrame.fillna`, and :meth:`DataFrame.replace` no longer silently consolidate the underlying arrays; do ``df = df.copy()`` to ensure consolidation (:issue:`49356`)
377407
-
378408

pandas/core/frame.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -632,8 +632,6 @@ def __init__(
632632
copy: bool | None = None,
633633
) -> None:
634634

635-
if data is None:
636-
data = {}
637635
if dtype is not None:
638636
dtype = self._validate_dtype(dtype)
639637

@@ -671,6 +669,12 @@ def __init__(
671669
else:
672670
copy = False
673671

672+
if data is None:
673+
index = index if index is not None else default_index(0)
674+
columns = columns if columns is not None else default_index(0)
675+
dtype = dtype if dtype is not None else pandas_dtype(object)
676+
data = []
677+
674678
if isinstance(data, (BlockManager, ArrayManager)):
675679
mgr = self._init_mgr(
676680
data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
@@ -777,7 +781,7 @@ def __init__(
777781
mgr = dict_to_mgr(
778782
{},
779783
index,
780-
columns,
784+
columns if columns is not None else default_index(0),
781785
dtype=dtype,
782786
typ=manager,
783787
)
@@ -2309,8 +2313,7 @@ def maybe_reorder(
23092313

23102314
result_index = None
23112315
if len(arrays) == 0 and index is None and length == 0:
2312-
# for backward compat use an object Index instead of RangeIndex
2313-
result_index = Index([])
2316+
result_index = default_index(0)
23142317

23152318
arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length)
23162319
return arrays, arr_columns, result_index

pandas/core/internals/construction.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -582,7 +582,7 @@ def _extract_index(data) -> Index:
582582
"""
583583
index: Index
584584
if len(data) == 0:
585-
return Index([])
585+
return default_index(0)
586586

587587
raw_lengths = []
588588
indexes: list[list[Hashable] | Index] = []

pandas/core/reshape/merge.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1084,8 +1084,8 @@ def _get_join_info(
10841084
else:
10851085
join_index = default_index(len(left_indexer))
10861086

1087-
if len(join_index) == 0:
1088-
join_index = join_index.astype(object)
1087+
if len(join_index) == 0 and not isinstance(join_index, MultiIndex):
1088+
join_index = default_index(0).set_names(join_index.name)
10891089
return join_index, left_indexer, right_indexer
10901090

10911091
def _create_join_index(

pandas/core/series.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -385,11 +385,16 @@ def __init__(
385385
if index is not None:
386386
index = ensure_index(index)
387387

388-
if data is None:
389-
data = {}
390388
if dtype is not None:
391389
dtype = self._validate_dtype(dtype)
392390

391+
if data is None:
392+
index = index if index is not None else default_index(0)
393+
if len(index) or dtype is not None:
394+
data = na_value_for_dtype(pandas_dtype(dtype), compat=False)
395+
else:
396+
data = []
397+
393398
if isinstance(data, MultiIndex):
394399
raise NotImplementedError(
395400
"initializing a Series from a MultiIndex is not supported"

pandas/core/window/common.py

+2
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ def dataframe_from_int_dict(data, frame_template):
3030
result = DataFrame(data, index=frame_template.index)
3131
if len(result.columns) > 0:
3232
result.columns = frame_template.columns[result.columns]
33+
else:
34+
result.columns = frame_template.columns.copy()
3335
return result
3436

3537
results = {}

pandas/io/parsers/base_parser.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@
8484
from pandas.core.indexes.api import (
8585
Index,
8686
MultiIndex,
87+
default_index,
8788
ensure_index_from_sequences,
8889
)
8990
from pandas.core.series import Series
@@ -1093,8 +1094,9 @@ def _get_empty_meta(
10931094
#
10941095
# Both must be non-null to ensure a successful construction. Otherwise,
10951096
# we have to create a generic empty Index.
1097+
index: Index
10961098
if (index_col is None or index_col is False) or index_names is None:
1097-
index = Index([])
1099+
index = default_index(0)
10981100
else:
10991101
data = [Series([], dtype=dtype_dict[name]) for name in index_names]
11001102
index = ensure_index_from_sequences(data, names=index_names)

pandas/tests/apply/test_frame_apply.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -114,14 +114,14 @@ def test_apply_with_reduce_empty():
114114
result = empty_frame.apply(x.append, axis=1, result_type="expand")
115115
tm.assert_frame_equal(result, empty_frame)
116116
result = empty_frame.apply(x.append, axis=1, result_type="reduce")
117-
expected = Series([], index=pd.Index([], dtype=object), dtype=np.float64)
117+
expected = Series([], dtype=np.float64)
118118
tm.assert_series_equal(result, expected)
119119

120120
empty_with_cols = DataFrame(columns=["a", "b", "c"])
121121
result = empty_with_cols.apply(x.append, axis=1, result_type="expand")
122122
tm.assert_frame_equal(result, empty_with_cols)
123123
result = empty_with_cols.apply(x.append, axis=1, result_type="reduce")
124-
expected = Series([], index=pd.Index([], dtype=object), dtype=np.float64)
124+
expected = Series([], dtype=np.float64)
125125
tm.assert_series_equal(result, expected)
126126

127127
# Ensure that x.append hasn't been called
@@ -147,7 +147,7 @@ def test_nunique_empty():
147147
tm.assert_series_equal(result, expected)
148148

149149
result = df.T.nunique()
150-
expected = Series([], index=pd.Index([]), dtype=np.float64)
150+
expected = Series([], dtype=np.float64)
151151
tm.assert_series_equal(result, expected)
152152

153153

pandas/tests/apply/test_str.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
from pandas import (
1010
DataFrame,
11-
Index,
1211
Series,
1312
)
1413
import pandas._testing as tm
@@ -149,8 +148,8 @@ def test_agg_cython_table_series(series, func, expected):
149148
tm.get_cython_table_params(
150149
Series(dtype=np.float64),
151150
[
152-
("cumprod", Series([], Index([]), dtype=np.float64)),
153-
("cumsum", Series([], Index([]), dtype=np.float64)),
151+
("cumprod", Series([], dtype=np.float64)),
152+
("cumsum", Series([], dtype=np.float64)),
154153
],
155154
),
156155
tm.get_cython_table_params(

pandas/tests/extension/base/constructors.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def test_construct_empty_dataframe(self, dtype):
119119
# GH 33623
120120
result = pd.DataFrame(columns=["a"], dtype=dtype)
121121
expected = pd.DataFrame(
122-
{"a": pd.array([], dtype=dtype)}, index=pd.Index([], dtype="object")
122+
{"a": pd.array([], dtype=dtype)}, index=pd.RangeIndex(0)
123123
)
124124
self.assert_frame_equal(result, expected)
125125

pandas/tests/extension/base/missing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def test_dropna_frame(self, data_missing):
5555

5656
# axis = 1
5757
result = df.dropna(axis="columns")
58-
expected = pd.DataFrame(index=[0, 1])
58+
expected = pd.DataFrame(index=pd.RangeIndex(2), columns=pd.Index([]))
5959
self.assert_frame_equal(result, expected)
6060

6161
# multiple

pandas/tests/frame/indexing/test_xs.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def test_xs_corner(self):
8484
# no columns but Index(dtype=object)
8585
df = DataFrame(index=["a", "b", "c"])
8686
result = df.xs("a")
87-
expected = Series([], name="a", index=Index([]), dtype=np.float64)
87+
expected = Series([], name="a", dtype=np.float64)
8888
tm.assert_series_equal(result, expected)
8989

9090
def test_xs_duplicates(self):

pandas/tests/frame/methods/test_count.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def test_count(self):
2828

2929
df = DataFrame()
3030
result = df.count()
31-
expected = Series(0, index=[])
31+
expected = Series(dtype="int64")
3232
tm.assert_series_equal(result, expected)
3333

3434
def test_count_objects(self, float_string_frame):

pandas/tests/frame/methods/test_get_numeric_data.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def test_get_numeric_data_preserve_dtype(self):
1717
# get the numeric data
1818
obj = DataFrame({"A": [1, "2", 3.0]})
1919
result = obj._get_numeric_data()
20-
expected = DataFrame(index=[0, 1, 2], dtype=object)
20+
expected = DataFrame(dtype=object, index=pd.RangeIndex(3), columns=[])
2121
tm.assert_frame_equal(result, expected)
2222

2323
def test_get_numeric_data(self):

pandas/tests/frame/methods/test_quantile.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,7 @@ def test_quantile_datetime(self):
420420
tm.assert_series_equal(result, expected)
421421

422422
result = df[["a", "c"]].quantile([0.5], numeric_only=True)
423-
expected = DataFrame(index=[0.5])
423+
expected = DataFrame(index=[0.5], columns=[])
424424
tm.assert_frame_equal(result, expected)
425425

426426
@pytest.mark.parametrize(
@@ -451,7 +451,7 @@ def test_quantile_dt64_empty(self, dtype, interp_method):
451451
interpolation=interpolation,
452452
method=method,
453453
)
454-
expected = DataFrame(index=[0.5])
454+
expected = DataFrame(index=[0.5], columns=[])
455455
tm.assert_frame_equal(res, expected)
456456

457457
@pytest.mark.parametrize("invalid", [-1, 2, [0.5, -1], [0.5, 2]])

pandas/tests/frame/methods/test_rank.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -483,7 +483,7 @@ def test_rank_object_first(self, frame_or_series, na_option, ascending, expected
483483
"data,expected",
484484
[
485485
({"a": [1, 2, "a"], "b": [4, 5, 6]}, DataFrame({"b": [1.0, 2.0, 3.0]})),
486-
({"a": [1, 2, "a"]}, DataFrame(index=range(3))),
486+
({"a": [1, 2, "a"]}, DataFrame(index=range(3), columns=[])),
487487
],
488488
)
489489
def test_rank_mixed_axis_zero(self, data, expected):

pandas/tests/frame/methods/test_to_csv.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ def test_to_csv_dup_cols(self, nrows):
390390
def test_to_csv_empty(self):
391391
df = DataFrame(index=np.arange(10))
392392
result, expected = self._return_result_expected(df, 1000)
393-
tm.assert_frame_equal(result, expected, check_names=False)
393+
tm.assert_frame_equal(result, expected, check_column_type=False)
394394

395395
@pytest.mark.slow
396396
def test_to_csv_chunksize(self):

pandas/tests/frame/test_constructors.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -193,13 +193,11 @@ def test_series_with_name_not_matching_column(self):
193193
[
194194
lambda: DataFrame(),
195195
lambda: DataFrame(None),
196-
lambda: DataFrame({}),
197196
lambda: DataFrame(()),
198197
lambda: DataFrame([]),
199198
lambda: DataFrame(_ for _ in []),
200199
lambda: DataFrame(range(0)),
201200
lambda: DataFrame(data=None),
202-
lambda: DataFrame(data={}),
203201
lambda: DataFrame(data=()),
204202
lambda: DataFrame(data=[]),
205203
lambda: DataFrame(data=(_ for _ in [])),
@@ -213,6 +211,20 @@ def test_empty_constructor(self, constructor):
213211
assert len(result.columns) == 0
214212
tm.assert_frame_equal(result, expected)
215213

214+
@pytest.mark.parametrize(
215+
"constructor",
216+
[
217+
lambda: DataFrame({}),
218+
lambda: DataFrame(data={}),
219+
],
220+
)
221+
def test_empty_constructor_object_index(self, constructor):
222+
expected = DataFrame(columns=Index([]))
223+
result = constructor()
224+
assert len(result.index) == 0
225+
assert len(result.columns) == 0
226+
tm.assert_frame_equal(result, expected, check_index_type=True)
227+
216228
@pytest.mark.parametrize(
217229
"emptylike,expected_index,expected_columns",
218230
[
@@ -1391,7 +1403,7 @@ def test_constructor_generator(self):
13911403
def test_constructor_list_of_dicts(self):
13921404

13931405
result = DataFrame([{}])
1394-
expected = DataFrame(index=[0])
1406+
expected = DataFrame(index=RangeIndex(1), columns=[])
13951407
tm.assert_frame_equal(result, expected)
13961408

13971409
def test_constructor_ordered_dict_nested_preserve_order(self):
@@ -1762,7 +1774,7 @@ def test_constructor_empty_with_string_dtype(self):
17621774

17631775
def test_constructor_empty_with_string_extension(self, nullable_string_dtype):
17641776
# GH 34915
1765-
expected = DataFrame(index=[], columns=["c1"], dtype=nullable_string_dtype)
1777+
expected = DataFrame(columns=["c1"], dtype=nullable_string_dtype)
17661778
df = DataFrame(columns=["c1"], dtype=nullable_string_dtype)
17671779
tm.assert_frame_equal(df, expected)
17681780

pandas/tests/frame/test_reductions.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1163,7 +1163,7 @@ def test_any_all_bool_only(self):
11631163
)
11641164

11651165
result = df.all(bool_only=True)
1166-
expected = Series(dtype=np.bool_)
1166+
expected = Series(dtype=np.bool_, index=[])
11671167
tm.assert_series_equal(result, expected)
11681168

11691169
df = DataFrame(

pandas/tests/frame/test_stack_unstack.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1251,7 +1251,8 @@ def test_stack_timezone_aware_values():
12511251
@pytest.mark.parametrize("dropna", [True, False])
12521252
def test_stack_empty_frame(dropna):
12531253
# GH 36113
1254-
expected = Series(index=MultiIndex([[], []], [[], []]), dtype=np.float64)
1254+
levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)]
1255+
expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []]))
12551256
result = DataFrame(dtype=np.float64).stack(dropna=dropna)
12561257
tm.assert_series_equal(result, expected)
12571258

pandas/tests/groupby/aggregate/test_aggregate.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -674,7 +674,7 @@ def test_no_args_raises(self):
674674

675675
# but we do allow this
676676
result = gr.agg([])
677-
expected = DataFrame()
677+
expected = DataFrame(columns=[])
678678
tm.assert_frame_equal(result, expected)
679679

680680
def test_series_named_agg_duplicates_no_raises(self):

pandas/tests/groupby/aggregate/test_cython.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,9 @@ def test_cython_agg_nothing_to_agg():
103103
with pytest.raises(TypeError, match="Could not convert"):
104104
frame[["b"]].groupby(frame["a"]).mean()
105105
result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
106-
expected = DataFrame([], index=frame["a"].sort_values().drop_duplicates())
106+
expected = DataFrame(
107+
[], index=frame["a"].sort_values().drop_duplicates(), columns=[]
108+
)
107109
tm.assert_frame_equal(result, expected)
108110

109111

pandas/tests/groupby/test_grouping.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -683,7 +683,7 @@ def test_list_grouper_with_nat(self):
683683
[
684684
(
685685
"transform",
686-
Series(name=2, dtype=np.float64, index=Index([])),
686+
Series(name=2, dtype=np.float64),
687687
),
688688
(
689689
"agg",
@@ -875,7 +875,7 @@ def test_groupby_with_single_column(self):
875875
df = DataFrame({"a": list("abssbab")})
876876
tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]])
877877
# GH 13530
878-
exp = DataFrame(index=Index(["a", "b", "s"], name="a"))
878+
exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[])
879879
tm.assert_frame_equal(df.groupby("a").count(), exp)
880880
tm.assert_frame_equal(df.groupby("a").sum(), exp)
881881

pandas/tests/indexing/test_iloc.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ def check(result, expected):
246246
tm.assert_frame_equal(result, expected)
247247

248248
dfl = DataFrame(np.random.randn(5, 2), columns=list("AB"))
249-
check(dfl.iloc[:, 2:3], DataFrame(index=dfl.index))
249+
check(dfl.iloc[:, 2:3], DataFrame(index=dfl.index, columns=[]))
250250
check(dfl.iloc[:, 1:3], dfl.iloc[:, [1]])
251251
check(dfl.iloc[4:6], dfl.iloc[[4]])
252252

0 commit comments

Comments
 (0)