
Commit 692c153

fix tests which trigger NullKeyWarning
This will help with PDEP-11 (pandas-dev#53094) as an intermediate step: it identifies the tests that would fail under the proposed new default value.
1 parent 41131a1 commit 692c153
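
For context, a minimal sketch of the two remediation patterns this commit applies, assuming a development build in which groupby emits pandas.errors.NullKeyWarning whenever a group key contains nulls and dropna is left at its default; the names df and test_sum_drops_null_keys below are illustrative, not taken from the commit:

    import numpy as np
    import pandas as pd
    import pytest

    df = pd.DataFrame({"key": ["x", np.nan, "x"], "val": [1, 2, 3]})

    # Pattern 1: pass dropna=True explicitly. Behavior is unchanged today
    # (null keys are dropped), but the call no longer depends on the
    # default, so no NullKeyWarning is emitted and the assertion stays
    # valid if the default flips.
    result = df.groupby("key", dropna=True)["val"].sum()

    # Pattern 2: leave the call untouched and silence the warning at the
    # test level; such tests are thereby flagged for review under the
    # new default.
    @pytest.mark.filterwarnings("ignore::pandas.errors.NullKeyWarning")
    def test_sum_drops_null_keys():
        # dropna is left at its default here, which would otherwise warn
        assert df.groupby("key")["val"].sum()["x"] == 4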

5 files changed: +33 -26 lines

pandas/tests/groupby/test_categorical.py (+8 -6)

@@ -192,7 +192,7 @@ def test_basic_cut_grouping():
     # GH 9603
     df = DataFrame({"a": [1, 0, 0, 0]})
     c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd")))
-    result = df.groupby(c, observed=False).apply(len)
+    result = df.groupby(c, observed=False, dropna=True).apply(len)

     exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered)
     expected = Series([1, 0, 0, 0], index=exp_index)
@@ -568,7 +568,7 @@ def test_observed_groups_with_nan(observed):
             "vals": [1, 2, 3],
         }
     )
-    g = df.groupby("cat", observed=observed)
+    g = df.groupby("cat", observed=observed, dropna=True)
     result = g.groups
     if observed:
         expected = {"a": Index([0, 2], dtype="int64")}
@@ -587,7 +587,7 @@ def test_observed_nth():
     ser = Series([1, 2, 3])
     df = DataFrame({"cat": cat, "ser": ser})

-    result = df.groupby("cat", observed=False)["ser"].nth(0)
+    result = df.groupby("cat", observed=False, dropna=True)["ser"].nth(0)
     expected = df["ser"].iloc[[0]]
     tm.assert_series_equal(result, expected)

@@ -597,7 +597,7 @@ def test_dataframe_categorical_with_nan(observed):
     s1 = Categorical([np.nan, "a", np.nan, "a"], categories=["a", "b", "c"])
     s2 = Series([1, 2, 3, 4])
     df = DataFrame({"s1": s1, "s2": s2})
-    result = df.groupby("s1", observed=observed).first().reset_index()
+    result = df.groupby("s1", observed=observed, dropna=True).first().reset_index()
     if observed:
         expected = DataFrame(
             {"s1": Categorical(["a"], categories=["a", "b", "c"]), "s2": [2]}
@@ -768,7 +768,9 @@ def test_categorical_series(series, data):
     # Group the given series by a series with categorical data type such that group A
     # takes indices 0 and 3 and group B indices 1 and 2, obtaining the values mapped in
     # the given data.
-    groupby = series.groupby(Series(list("ABBA"), dtype="category"), observed=False)
+    groupby = series.groupby(
+        Series(list("ABBA"), dtype="category"), observed=False, dropna=True
+    )
     result = groupby.aggregate(list)
     expected = Series(data, index=CategoricalIndex(data.keys()))
     tm.assert_series_equal(result, expected)
@@ -973,7 +975,7 @@ def test_groupby_empty_with_category():
     # test fix for when group by on None resulted in
     # coercion of dtype categorical -> float
     df = DataFrame({"A": [None] * 3, "B": Categorical(["train", "train", "test"])})
-    result = df.groupby("A").first()["B"]
+    result = df.groupby("A", dropna=True).first()["B"]
     expected = Series(
         Categorical([], categories=["test", "train"]),
         index=Series([], dtype="object", name="A"),

pandas/tests/groupby/test_groupby.py (+11 -10)

@@ -140,9 +140,9 @@ def test_len():
 def test_len_nan_group():
     # issue 11016
     df = DataFrame({"a": [np.nan] * 3, "b": [1, 2, 3]})
-    assert len(df.groupby("a")) == 0
+    assert len(df.groupby("a", dropna=True)) == 0
     assert len(df.groupby("b")) == 3
-    assert len(df.groupby(["a", "b"])) == 0
+    assert len(df.groupby(["a", "b"], dropna=True)) == 0


 def test_groupby_timedelta_median():
@@ -922,6 +922,7 @@ def test_groupby_complex_numbers():
     tm.assert_frame_equal(result, expected)


+@pytest.mark.filterwarnings("ignore::pandas.errors.NullKeyWarning")
 def test_groupby_series_indexed_differently():
     s1 = Series(
         [5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7],
@@ -1215,7 +1216,7 @@ def test_groupby_nat_exclude():
             "str": [np.nan, "a", np.nan, "a", np.nan, "a", np.nan, "b"],
         }
     )
-    grouped = df.groupby("dt")
+    grouped = df.groupby("dt", dropna=True)

     expected = [
         RangeIndex(start=1, stop=13, step=6),
@@ -1253,7 +1254,7 @@ def test_groupby_nat_exclude():
     assert nan_df["nat"].dtype == "datetime64[s]"

     for key in ["nan", "nat"]:
-        grouped = nan_df.groupby(key)
+        grouped = nan_df.groupby(key, dropna=True)
         assert grouped.groups == {}
         assert grouped.ngroups == 0
         assert grouped.indices == {}
@@ -1266,7 +1267,7 @@ def test_groupby_nat_exclude():
 def test_groupby_two_group_keys_all_nan():
     # GH #36842: Grouping over two group keys shouldn't raise an error
     df = DataFrame({"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 2]})
-    result = df.groupby(["a", "b"]).indices
+    result = df.groupby(["a", "b"], dropna=True).indices
     assert result == {}


@@ -2050,7 +2051,7 @@ def test_groupby_only_none_group():
     # see GH21624
     # this was crashing with "ValueError: Length of passed values is 1, index implies 0"
     df = DataFrame({"g": [None], "x": 1})
-    actual = df.groupby("g")["x"].transform("sum")
+    actual = df.groupby("g", dropna=True)["x"].transform("sum")
     expected = Series([np.nan], name="x")

     tm.assert_series_equal(actual, expected)
@@ -2295,7 +2296,7 @@ def test_groupby_mean_duplicate_index(rand_series_with_duplicate_datetimeindex):
 def test_groupby_all_nan_groups_drop():
     # GH 15036
     s = Series([1, 2, 3], [np.nan, np.nan, np.nan])
-    result = s.groupby(s.index).sum()
+    result = s.groupby(s.index, dropna=True).sum()
     expected = Series([], index=Index([], dtype=np.float64), dtype=np.int64)
     tm.assert_series_equal(result, expected)

@@ -2459,7 +2460,7 @@ def test_groupby_none_in_first_mi_level():
     # GH#47348
     arr = [[None, 1, 0, 1], [2, 3, 2, 3]]
     ser = Series(1, index=MultiIndex.from_arrays(arr, names=["a", "b"]))
-    result = ser.groupby(level=[0, 1]).sum()
+    result = ser.groupby(level=[0, 1], dropna=True).sum()
     expected = Series(
         [1, 2], MultiIndex.from_tuples([(0.0, 2), (1.0, 3)], names=["a", "b"])
     )
@@ -2632,9 +2633,9 @@ def test_groupby_method_drop_na(method):
     df = DataFrame({"A": ["a", np.nan, "b", np.nan, "c"], "B": range(5)})

     if method == "nth":
-        result = getattr(df.groupby("A"), method)(n=0)
+        result = getattr(df.groupby("A", dropna=True), method)(n=0)
     else:
-        result = getattr(df.groupby("A"), method)()
+        result = getattr(df.groupby("A", dropna=True), method)()

     if method in ["first", "last"]:
         expected = DataFrame({"B": [0, 2, 4]}).set_index(

pandas/tests/groupby/test_grouping.py (+2 -2)

@@ -691,7 +691,7 @@ def test_groupby_level_with_nas(self, sort):

         # factorizing doesn't confuse things
         s = Series(np.arange(8.0), index=index)
-        result = s.groupby(level=0, sort=sort).sum()
+        result = s.groupby(level=0, sort=sort, dropna=True).sum()
         expected = Series([6.0, 18.0], index=[0.0, 1.0])
         tm.assert_series_equal(result, expected)

@@ -817,7 +817,7 @@ def test_groupby_level_index_value_all_na(self):
         df = DataFrame(
             [["x", np.nan, 10], [None, np.nan, 20]], columns=["A", "B", "C"]
         ).set_index(["A", "B"])
-        result = df.groupby(level=["A", "B"]).sum()
+        result = df.groupby(level=["A", "B"], dropna=True).sum()
         expected = DataFrame(
             data=[],
             index=MultiIndex(

pandas/tests/groupby/test_indexing.py (+1 -0)

@@ -294,6 +294,7 @@ def test_groupby_duplicated_columns(func):
     tm.assert_frame_equal(result, expected)


+@pytest.mark.filterwarnings("ignore::pandas.errors.NullKeyWarning")
 def test_groupby_get_nonexisting_groups():
     # GH#32492
     df = pd.DataFrame(

pandas/tests/groupby/test_reductions.py (+11 -8)

@@ -656,32 +656,34 @@ def test_multifunc_skipna(func, values, dtype, result_dtype, skipna):
     tm.assert_series_equal(result, expected)


-def test_cython_median():
+def test_cython_median(dropna):
     arr = np.random.default_rng(2).standard_normal(1000)
     arr[::2] = np.nan
     df = DataFrame(arr)

     labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)
     labels[::17] = np.nan

-    result = df.groupby(labels).median()
-    exp = df.groupby(labels).agg(np.nanmedian)
+    result = df.groupby(labels, dropna=dropna).median()
+    exp = df.groupby(labels, dropna=dropna).agg(np.nanmedian)
     tm.assert_frame_equal(result, exp)

     df = DataFrame(np.random.default_rng(2).standard_normal((1000, 5)))
-    rs = df.groupby(labels).agg(np.median)
-    xp = df.groupby(labels).median()
+    rs = df.groupby(labels, dropna=dropna).agg(np.median)
+    xp = df.groupby(labels, dropna=dropna).median()
     tm.assert_frame_equal(rs, xp)


-def test_median_empty_bins(observed):
+def test_median_empty_bins(observed, dropna):
     df = DataFrame(np.random.default_rng(2).integers(0, 44, 500))

     grps = range(0, 55, 5)
     bins = pd.cut(df[0], grps)

-    result = df.groupby(bins, observed=observed).median()
-    expected = df.groupby(bins, observed=observed).agg(lambda x: x.median())
+    result = df.groupby(bins, observed=observed, dropna=dropna).median()
+    expected = df.groupby(bins, observed=observed, dropna=dropna).agg(
+        lambda x: x.median()
+    )
     tm.assert_frame_equal(result, expected)

@@ -1069,6 +1071,7 @@ def test_max_nan_bug():


 @pytest.mark.slow
+@pytest.mark.filterwarnings("ignore::pandas.errors.NullKeyWarning")
 @pytest.mark.parametrize("with_nan", [True, False])
 @pytest.mark.parametrize("keys", [["joe"], ["joe", "jim"]])
 def test_series_groupby_nunique(sort, dropna, as_index, with_nan, keys):
