Skip to content

BUG/API: Indexes on empty frames/series should be RangeIndex #49637

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,35 @@ The new behavior, as for datetime64, either gives exactly the requested dtype or
ser.astype("timedelta64[s]")
ser.astype("timedelta64[D]")

.. _whatsnew_200.api_breaking.zero_len_indexes:

Empty DataFrames/Series will now default to have a ``RangeIndex``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously, constructing an empty :class:`Series` or :class:`DataFrame` (where ``data`` is ``None`` or an empty list-like argument) without
specifying the axes (``index=None``, ``columns=None``) would return the axes as an empty :class:`Index` with object dtype.

Now, the axes are returned as an empty :class:`RangeIndex`.

*Previous behavior*:

.. code-block:: ipython

In [8]: pd.Series().index
Out[8]:
Index([], dtype='object')

In [9]: pd.DataFrame().axes
Out[9]:
[Index([], dtype='object'), Index([], dtype='object')]

*New behavior*:

.. ipython:: python

pd.Series().index
pd.DataFrame().axes

.. _whatsnew_200.api_breaking.deps:

Increased minimum versions for dependencies
Expand Down Expand Up @@ -370,6 +399,7 @@ Other API changes
- Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`)
- Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`)
- Files are now closed when encountering an exception in :func:`read_json` (:issue:`49921`)
- Changed behavior of :func:`read_csv`, :func:`read_json` & :func:`read_fwf`, where the index will now always be a :class:`RangeIndex` when no index is specified. Previously, the index would be an :class:`Index` with dtype ``object`` if the new DataFrame/Series had length 0 (:issue:`49572`)
- :meth:`DataFrame.values`, :meth:`DataFrame.to_numpy`, :meth:`DataFrame.xs`, :meth:`DataFrame.reindex`, :meth:`DataFrame.fillna`, and :meth:`DataFrame.replace` no longer silently consolidate the underlying arrays; do ``df = df.copy()`` to ensure consolidation (:issue:`49356`)
-

Expand Down
13 changes: 8 additions & 5 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,8 +632,6 @@ def __init__(
copy: bool | None = None,
) -> None:

if data is None:
data = {}
if dtype is not None:
dtype = self._validate_dtype(dtype)

Expand Down Expand Up @@ -671,6 +669,12 @@ def __init__(
else:
copy = False

if data is None:
index = index if index is not None else default_index(0)
columns = columns if columns is not None else default_index(0)
dtype = dtype if dtype is not None else pandas_dtype(object)
data = []

if isinstance(data, (BlockManager, ArrayManager)):
mgr = self._init_mgr(
data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
Expand Down Expand Up @@ -777,7 +781,7 @@ def __init__(
mgr = dict_to_mgr(
{},
index,
columns,
columns if columns is not None else default_index(0),
dtype=dtype,
typ=manager,
)
Expand Down Expand Up @@ -2309,8 +2313,7 @@ def maybe_reorder(

result_index = None
if len(arrays) == 0 and index is None and length == 0:
# for backward compat use an object Index instead of RangeIndex
result_index = Index([])
result_index = default_index(0)

arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length)
return arrays, arr_columns, result_index
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,7 @@ def _extract_index(data) -> Index:
"""
index: Index
if len(data) == 0:
return Index([])
return default_index(0)

raw_lengths = []
indexes: list[list[Hashable] | Index] = []
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1084,8 +1084,8 @@ def _get_join_info(
else:
join_index = default_index(len(left_indexer))

if len(join_index) == 0:
join_index = join_index.astype(object)
if len(join_index) == 0 and not isinstance(join_index, MultiIndex):
join_index = default_index(0).set_names(join_index.name)
return join_index, left_indexer, right_indexer

def _create_join_index(
Expand Down
9 changes: 7 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,11 +385,16 @@ def __init__(
if index is not None:
index = ensure_index(index)

if data is None:
data = {}
if dtype is not None:
dtype = self._validate_dtype(dtype)

if data is None:
index = index if index is not None else default_index(0)
if len(index) or dtype is not None:
data = na_value_for_dtype(pandas_dtype(dtype), compat=False)
else:
data = []

if isinstance(data, MultiIndex):
raise NotImplementedError(
"initializing a Series from a MultiIndex is not supported"
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/window/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ def dataframe_from_int_dict(data, frame_template):
result = DataFrame(data, index=frame_template.index)
if len(result.columns) > 0:
result.columns = frame_template.columns[result.columns]
else:
result.columns = frame_template.columns.copy()
return result

results = {}
Expand Down
4 changes: 3 additions & 1 deletion pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
from pandas.core.indexes.api import (
Index,
MultiIndex,
default_index,
ensure_index_from_sequences,
)
from pandas.core.series import Series
Expand Down Expand Up @@ -1093,8 +1094,9 @@ def _get_empty_meta(
#
# Both must be non-null to ensure a successful construction. Otherwise,
# we have to create a generic empty Index.
index: Index
if (index_col is None or index_col is False) or index_names is None:
index = Index([])
index = default_index(0)
else:
data = [Series([], dtype=dtype_dict[name]) for name in index_names]
index = ensure_index_from_sequences(data, names=index_names)
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/apply/test_frame_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,14 +114,14 @@ def test_apply_with_reduce_empty():
result = empty_frame.apply(x.append, axis=1, result_type="expand")
tm.assert_frame_equal(result, empty_frame)
result = empty_frame.apply(x.append, axis=1, result_type="reduce")
expected = Series([], index=pd.Index([], dtype=object), dtype=np.float64)
expected = Series([], dtype=np.float64)
tm.assert_series_equal(result, expected)

empty_with_cols = DataFrame(columns=["a", "b", "c"])
result = empty_with_cols.apply(x.append, axis=1, result_type="expand")
tm.assert_frame_equal(result, empty_with_cols)
result = empty_with_cols.apply(x.append, axis=1, result_type="reduce")
expected = Series([], index=pd.Index([], dtype=object), dtype=np.float64)
expected = Series([], dtype=np.float64)
tm.assert_series_equal(result, expected)

# Ensure that x.append hasn't been called
Expand All @@ -147,7 +147,7 @@ def test_nunique_empty():
tm.assert_series_equal(result, expected)

result = df.T.nunique()
expected = Series([], index=pd.Index([]), dtype=np.float64)
expected = Series([], dtype=np.float64)
tm.assert_series_equal(result, expected)


Expand Down
5 changes: 2 additions & 3 deletions pandas/tests/apply/test_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm
Expand Down Expand Up @@ -149,8 +148,8 @@ def test_agg_cython_table_series(series, func, expected):
tm.get_cython_table_params(
Series(dtype=np.float64),
[
("cumprod", Series([], Index([]), dtype=np.float64)),
("cumsum", Series([], Index([]), dtype=np.float64)),
("cumprod", Series([], dtype=np.float64)),
("cumsum", Series([], dtype=np.float64)),
],
),
tm.get_cython_table_params(
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/base/constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def test_construct_empty_dataframe(self, dtype):
# GH 33623
result = pd.DataFrame(columns=["a"], dtype=dtype)
expected = pd.DataFrame(
{"a": pd.array([], dtype=dtype)}, index=pd.Index([], dtype="object")
{"a": pd.array([], dtype=dtype)}, index=pd.RangeIndex(0)
)
self.assert_frame_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/base/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def test_dropna_frame(self, data_missing):

# axis = 1
result = df.dropna(axis="columns")
expected = pd.DataFrame(index=[0, 1])
expected = pd.DataFrame(index=pd.RangeIndex(2), columns=pd.Index([]))
self.assert_frame_equal(result, expected)

# multiple
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/indexing/test_xs.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def test_xs_corner(self):
# no columns but Index(dtype=object)
df = DataFrame(index=["a", "b", "c"])
result = df.xs("a")
expected = Series([], name="a", index=Index([]), dtype=np.float64)
expected = Series([], name="a", dtype=np.float64)
tm.assert_series_equal(result, expected)

def test_xs_duplicates(self):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def test_count(self):

df = DataFrame()
result = df.count()
expected = Series(0, index=[])
expected = Series(dtype="int64")
tm.assert_series_equal(result, expected)

def test_count_objects(self, float_string_frame):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_get_numeric_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def test_get_numeric_data_preserve_dtype(self):
# get the numeric data
obj = DataFrame({"A": [1, "2", 3.0]})
result = obj._get_numeric_data()
expected = DataFrame(index=[0, 1, 2], dtype=object)
expected = DataFrame(dtype=object, index=pd.RangeIndex(3), columns=[])
tm.assert_frame_equal(result, expected)

def test_get_numeric_data(self):
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ def test_quantile_datetime(self):
tm.assert_series_equal(result, expected)

result = df[["a", "c"]].quantile([0.5], numeric_only=True)
expected = DataFrame(index=[0.5])
expected = DataFrame(index=[0.5], columns=[])
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
Expand Down Expand Up @@ -451,7 +451,7 @@ def test_quantile_dt64_empty(self, dtype, interp_method):
interpolation=interpolation,
method=method,
)
expected = DataFrame(index=[0.5])
expected = DataFrame(index=[0.5], columns=[])
tm.assert_frame_equal(res, expected)

@pytest.mark.parametrize("invalid", [-1, 2, [0.5, -1], [0.5, 2]])
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@ def test_rank_object_first(self, frame_or_series, na_option, ascending, expected
"data,expected",
[
({"a": [1, 2, "a"], "b": [4, 5, 6]}, DataFrame({"b": [1.0, 2.0, 3.0]})),
({"a": [1, 2, "a"]}, DataFrame(index=range(3))),
({"a": [1, 2, "a"]}, DataFrame(index=range(3), columns=[])),
],
)
def test_rank_mixed_axis_zero(self, data, expected):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ def test_to_csv_dup_cols(self, nrows):
def test_to_csv_empty(self):
df = DataFrame(index=np.arange(10))
result, expected = self._return_result_expected(df, 1000)
tm.assert_frame_equal(result, expected, check_names=False)
tm.assert_frame_equal(result, expected, check_column_type=False)

@pytest.mark.slow
def test_to_csv_chunksize(self):
Expand Down
20 changes: 16 additions & 4 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,13 +193,11 @@ def test_series_with_name_not_matching_column(self):
[
lambda: DataFrame(),
lambda: DataFrame(None),
lambda: DataFrame({}),
lambda: DataFrame(()),
lambda: DataFrame([]),
lambda: DataFrame(_ for _ in []),
lambda: DataFrame(range(0)),
lambda: DataFrame(data=None),
lambda: DataFrame(data={}),
lambda: DataFrame(data=()),
lambda: DataFrame(data=[]),
lambda: DataFrame(data=(_ for _ in [])),
Expand All @@ -213,6 +211,20 @@ def test_empty_constructor(self, constructor):
assert len(result.columns) == 0
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
"constructor",
[
lambda: DataFrame({}),
lambda: DataFrame(data={}),
],
)
def test_empty_constructor_object_index(self, constructor):
expected = DataFrame(columns=Index([]))
result = constructor()
assert len(result.index) == 0
assert len(result.columns) == 0
tm.assert_frame_equal(result, expected, check_index_type=True)

@pytest.mark.parametrize(
"emptylike,expected_index,expected_columns",
[
Expand Down Expand Up @@ -1391,7 +1403,7 @@ def test_constructor_generator(self):
def test_constructor_list_of_dicts(self):

result = DataFrame([{}])
expected = DataFrame(index=[0])
expected = DataFrame(index=RangeIndex(1), columns=[])
tm.assert_frame_equal(result, expected)

def test_constructor_ordered_dict_nested_preserve_order(self):
Expand Down Expand Up @@ -1762,7 +1774,7 @@ def test_constructor_empty_with_string_dtype(self):

def test_constructor_empty_with_string_extension(self, nullable_string_dtype):
# GH 34915
expected = DataFrame(index=[], columns=["c1"], dtype=nullable_string_dtype)
expected = DataFrame(columns=["c1"], dtype=nullable_string_dtype)
df = DataFrame(columns=["c1"], dtype=nullable_string_dtype)
tm.assert_frame_equal(df, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1163,7 +1163,7 @@ def test_any_all_bool_only(self):
)

result = df.all(bool_only=True)
expected = Series(dtype=np.bool_)
expected = Series(dtype=np.bool_, index=[])
tm.assert_series_equal(result, expected)

df = DataFrame(
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1251,7 +1251,8 @@ def test_stack_timezone_aware_values():
@pytest.mark.parametrize("dropna", [True, False])
def test_stack_empty_frame(dropna):
# GH 36113
expected = Series(index=MultiIndex([[], []], [[], []]), dtype=np.float64)
levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)]
expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []]))
result = DataFrame(dtype=np.float64).stack(dropna=dropna)
tm.assert_series_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,7 @@ def test_no_args_raises(self):

# but we do allow this
result = gr.agg([])
expected = DataFrame()
expected = DataFrame(columns=[])
tm.assert_frame_equal(result, expected)

def test_series_named_agg_duplicates_no_raises(self):
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/groupby/aggregate/test_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,9 @@ def test_cython_agg_nothing_to_agg():
with pytest.raises(TypeError, match="Could not convert"):
frame[["b"]].groupby(frame["a"]).mean()
result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
expected = DataFrame([], index=frame["a"].sort_values().drop_duplicates())
expected = DataFrame(
[], index=frame["a"].sort_values().drop_duplicates(), columns=[]
)
tm.assert_frame_equal(result, expected)


Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/groupby/test_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,7 +683,7 @@ def test_list_grouper_with_nat(self):
[
(
"transform",
Series(name=2, dtype=np.float64, index=Index([])),
Series(name=2, dtype=np.float64),
),
(
"agg",
Expand Down Expand Up @@ -875,7 +875,7 @@ def test_groupby_with_single_column(self):
df = DataFrame({"a": list("abssbab")})
tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]])
# GH 13530
exp = DataFrame(index=Index(["a", "b", "s"], name="a"))
exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[])
tm.assert_frame_equal(df.groupby("a").count(), exp)
tm.assert_frame_equal(df.groupby("a").sum(), exp)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexing/test_iloc.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def check(result, expected):
tm.assert_frame_equal(result, expected)

dfl = DataFrame(np.random.randn(5, 2), columns=list("AB"))
check(dfl.iloc[:, 2:3], DataFrame(index=dfl.index))
check(dfl.iloc[:, 2:3], DataFrame(index=dfl.index, columns=[]))
check(dfl.iloc[:, 1:3], dfl.iloc[:, [1]])
check(dfl.iloc[4:6], dfl.iloc[[4]])

Expand Down
Loading