Skip to content

BUG: inconsistent behaviour for empty DataFrames #48327

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -3177,3 +3177,35 @@ def test_frame_allow_non_nano(self, arr):
def test_frame_from_dict_allow_non_nano(self, arr):
df = DataFrame({0: arr})
assert df.dtypes[0] == arr.dtype


def test_dtype_warning_on_empty_list_df():
# pd.Series([]) without a specified dtype warns the user
expected = pd.DataFrame({"a": pd.Series([]), "b": pd.Series([])})

with tm.assert_produces_warning(FutureWarning):
# Lists do not warn the user
result = pd.DataFrame({"a": [], "b": []})
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Error

E           AssertionError: Did not see expected warning of class 'FutureWarning'

pandas/_testing/_warnings.py:143: AssertionError

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

>>> df_with_series = pd.DataFrame({"a": pd.Series([]), "b": pd.Series([])}) # Prints a warning
<stdin>:1: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
>>> df_with_list = pd.DataFrame({"a": [], "b": []}) # Does not print a warning
>>> tm.assert_frame_equal(df_with_series, df_with_list) # Both df are equal

tm.assert_frame_equal(result, expected) # This is true


def test_empty_constructs():
# There should be a consistency for dtype when it's not supplied by the user
result = pd.DataFrame({"a": [], "b": []})
expected = pd.DataFrame(columns=["a", "b"])

tm.assert_frame_equal(result, expected)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Error

E       AssertionError: DataFrame.index are different
E       
E       DataFrame.index classes are different
E       [left]:  RangeIndex(start=0, stop=0, step=1)
E       [right]: Index([], dtype='object')

pandas/_testing/asserters.py:318: AssertionError



def test_empty_df_without_column_names():
# Given
result_with_data = pd.DataFrame([1, 2, 3])
expected_with_data = pd.DataFrame(pd.Series([1, 2, 3]))
# Then
tm.assert_frame_equal(result_with_data, expected_with_data) # True

# But when it's empty
result_empty = pd.DataFrame([])
expected_empty = pd.DataFrame(pd.Series([]))

tm.assert_frame_equal(result_empty, expected_empty)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Error returned

E       AssertionError: DataFrame are different
E       
E       DataFrame shape mismatch
E       [left]:  (0, 0)
E       [right]: (0, 1)

pandas/tests/frame/test_constructors.py:3211: AssertionError

14 changes: 14 additions & 0 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1331,3 +1331,17 @@ def test_result_name_when_one_group(name):
expected = Series([1, 2], name=name)

tm.assert_series_equal(result, expected)


def test_apply_on_empty_groupby_dataframe():
df = pd.DataFrame([(date.today(), 2, 3)], columns=["date", "a", "b"])
df["date"] = pd.to_datetime(df["date"])
df = df[df["b"] == 1] # An empty dataframe
result = df.set_index("date").groupby("a", group_keys=True).apply(lambda x: x)

df2 = pd.DataFrame([(date.today(), 2, 3)], columns=["date", "a", "b"])
df2["date"] = pd.to_datetime(df2["date"])
df3 = df2.set_index("date").groupby("a", group_keys=True).apply(lambda x: x)
expected = df3.iloc[:0] # An empty dataframe

tm.assert_frame_equal(result, expected)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Error

E       AssertionError: DataFrame.index are different
E       
E       DataFrame.index classes are different
E       [left]:  Int64Index([], dtype='int64', name='a')
E       [right]: MultiIndex([], names=['a', 'date'])

pandas/_testing/asserters.py:318: AssertionError

9 changes: 9 additions & 0 deletions pandas/tests/groupby/test_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,12 @@ def test_groupby_sample_with_selections():
result = df.groupby("a")[["b", "c"]].sample(n=None, frac=None)
expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=result.index)
tm.assert_frame_equal(result, expected)


def test_groupby_sample_with_empty_inputs():
df = DataFrame({"a": [], "b": []})

gb_df = df.groupby("a").sample()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [1], in <cell line: 3>()
      1 empty_df = pd.DataFrame({'a': [], 'b': []})
----> 2 empty_df.groupby('a').sample()

File ~.../pandas/core/groupby/groupby.py:4284, in sample(self, n, frac, replace, weights, random_state)
   4275         assert frac is not None
   4276         sample_size = round(frac * group_size)
   4278     grp_sample = sample.sample(
   4279         group_size,
   4280         size=sample_size,
   4281         replace=replace,
   4282         weights=None if weights is None else weights_arr[grp_indices],
   4283         random_state=random_state,
-> 4284     )
   4285     sampled_indices.append(grp_indices[grp_sample])
   4287 sampled_indices = np.concatenate(sampled_indices)

File <__array_function__ internals>:180, in concatenate(*args, **kwargs)

ValueError: need at least one array to concatenate

result = gb_df.empty
expected = True
assert result == expected