forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_size.py
130 lines (106 loc) · 4.15 KB
/
test_size.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas.core.dtypes.common import is_integer_dtype
from pandas import (
DataFrame,
Index,
PeriodIndex,
Series,
)
import pandas._testing as tm
@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
def test_size(df, by):
grouped = df.groupby(by=by)
result = grouped.size()
for key, group in grouped:
assert result[key] == len(group)
@pytest.mark.parametrize(
"by",
[
[0, 0, 0, 0],
[0, 1, 1, 1],
[1, 0, 1, 1],
[0, None, None, None],
pytest.param([None, None, None, None], marks=pytest.mark.xfail),
],
)
def test_size_axis_1(df, axis_1, by, sort, dropna):
# GH#45715
counts = {key: sum(value == key for value in by) for key in dict.fromkeys(by)}
if dropna:
counts = {key: value for key, value in counts.items() if key is not None}
expected = Series(counts, dtype="int64")
if sort:
expected = expected.sort_index()
if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by):
expected.index = expected.index.astype(int)
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
grouped = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna)
result = grouped.size()
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
@pytest.mark.parametrize("sort", [True, False])
def test_size_sort(sort, by):
df = DataFrame(np.random.default_rng(2).choice(20, (1000, 3)), columns=list("ABC"))
left = df.groupby(by=by, sort=sort).size()
right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0])
tm.assert_series_equal(left, right, check_names=False)
def test_size_series_dataframe():
# https://github.com/pandas-dev/pandas/issues/11699
df = DataFrame(columns=["A", "B"])
out = Series(dtype="int64", index=Index([], name="A"))
tm.assert_series_equal(df.groupby("A").size(), out)
def test_size_groupby_all_null():
# https://github.com/pandas-dev/pandas/issues/23050
# Assert no 'Value Error : Length of passed values is 2, index implies 0'
df = DataFrame({"A": [None, None]}) # all-null groups
result = df.groupby("A").size()
expected = Series(dtype="int64", index=Index([], name="A"))
tm.assert_series_equal(result, expected)
def test_size_period_index():
# https://github.com/pandas-dev/pandas/issues/34010
ser = Series([1], index=PeriodIndex(["2000"], name="A", freq="D"))
grp = ser.groupby(level="A")
result = grp.size()
tm.assert_series_equal(result, ser)
@pytest.mark.parametrize("as_index", [True, False])
def test_size_on_categorical(as_index):
df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"])
df["A"] = df["A"].astype("category")
result = df.groupby(["A", "B"], as_index=as_index, observed=False).size()
expected = DataFrame(
[[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"]
)
expected["A"] = expected["A"].astype("category")
if as_index:
expected = expected.set_index(["A", "B"])["size"].rename(None)
tm.assert_equal(result, expected)
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
def test_size_series_masked_type_returns_Int64(dtype):
# GH 54132
ser = Series([1, 1, 1], index=["a", "a", "b"], dtype=dtype)
result = ser.groupby(level=0).size()
expected = Series([2, 1], dtype="Int64", index=["a", "b"])
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"dtype",
[
object,
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
],
)
def test_size_strings(dtype):
# GH#55627
df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
result = df.groupby("a")["b"].size()
exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"
expected = Series(
[2, 1],
index=Index(["a", "b"], name="a", dtype=dtype),
name="b",
dtype=exp_dtype,
)
tm.assert_series_equal(result, expected)