|
1 | 1 | from itertools import product
|
| 2 | +from string import ascii_lowercase |
2 | 3 |
|
3 | 4 | import numpy as np
|
4 | 5 | import pytest
|
5 | 6 |
|
6 |
| -from pandas import DataFrame, Index, MultiIndex, Period, Series, Timedelta, Timestamp |
| 7 | +import pandas as pd |
| 8 | +from pandas import ( |
| 9 | + DataFrame, |
| 10 | + Index, |
| 11 | + MultiIndex, |
| 12 | + Period, |
| 13 | + Series, |
| 14 | + Timedelta, |
| 15 | + Timestamp, |
| 16 | + date_range, |
| 17 | +) |
7 | 18 | import pandas._testing as tm
|
8 | 19 |
|
9 | 20 |
|
@@ -229,3 +240,125 @@ def test_count_groupby_column_with_nan_in_groupby_column(self):
|
229 | 240 | index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]}
|
230 | 241 | )
|
231 | 242 | tm.assert_frame_equal(expected, res)
|
| 243 | + |
| 244 | + |
def test_groupby_timedelta_cython_count():
    """Counting a timedelta64 column per group gives two entries per key."""
    frame = DataFrame(
        {
            "g": ["a", "b", "a", "b"],
            "delt": np.arange(4).astype("timedelta64[ns]"),
        }
    )
    counts = frame.groupby("g")["delt"].count()

    group_index = pd.Index(["a", "b"], name="g")
    wanted = Series([2, 2], index=group_index, name="delt")
    tm.assert_series_equal(wanted, counts)
| 252 | + |
| 253 | + |
def test_count():
    """Compare ``groupby(...).count()`` with ``apply(DataFrame.count)``.

    A 32768-row frame with string, integer, float, datetime, timedelta and
    categorical columns is built. Missing values are injected into every
    column except "1st", "2nd" (the grouping keys) and "4th", and the two
    ways of counting non-null values are compared for single-column and
    multi-column keys.
    """
    n = 1 << 15
    # Fixed seed: a failing run can be reproduced exactly.
    rng = np.random.default_rng(2)
    letters = list(ascii_lowercase)
    # "min" is the minute alias accepted by both old and new pandas; the
    # single-letter "T" spelling is deprecated in recent releases.
    dr = date_range("2015-08-30", periods=n // 10, freq="min")

    df = DataFrame(
        {
            "1st": rng.choice(letters, n),
            "2nd": rng.integers(0, 5, n),
            "3rd": rng.standard_normal(n).round(3),
            "4th": rng.integers(-10, 10, n),
            "5th": rng.choice(dr, n),
            "6th": rng.standard_normal(n).round(3),
            "7th": rng.standard_normal(n).round(3),
            "8th": rng.choice(dr, n) - rng.choice(dr, 1),
            "9th": rng.choice(letters, n),
        }
    )

    # Blank out roughly a tenth of the rows (positions are drawn with
    # replacement) in every column that is not a key or the "4th" column.
    for col in df.columns.drop(["1st", "2nd", "4th"]):
        df.loc[rng.choice(n, n // 10), col] = np.nan

    df["9th"] = df["9th"].astype("category")

    for key in ["1st", "2nd", ["1st", "2nd"]]:
        left = df.groupby(key).count()
        # Depending on the pandas version, ``apply`` may or may not hand the
        # grouping columns to the applied function; drop them only if present.
        right = (
            df.groupby(key)
            .apply(DataFrame.count)
            .drop(columns=key, errors="ignore")
        )
        tm.assert_frame_equal(left, right)
| 281 | + |
| 282 | + |
def test_count_non_nulls():
    """``count`` counts non-null values per group, with and without as_index."""
    # GH#5610
    rows = [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]]
    frame = pd.DataFrame(rows, columns=["A", "B", "C"])

    # Group A == 1 has one non-null "B" and two non-null "C";
    # group A == 3 has none of either.
    expected = DataFrame(
        [[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3]
    ).rename_axis("A")

    grouped_as_index = frame.groupby("A").count()
    grouped_flat = frame.groupby("A", as_index=False).count()

    tm.assert_frame_equal(grouped_flat, expected.reset_index())
    tm.assert_frame_equal(grouped_as_index, expected)

    b_counts = frame.groupby("A")["B"].count()
    tm.assert_series_equal(b_counts, expected["B"])
| 301 | + |
| 302 | + |
def test_count_object():
    """Count a string column per group, first without and then with NaNs."""
    keys = [2] * 3 + [3] * 3
    cases = [
        # (values of column "a", expected non-null count per group)
        (["a"] * 3 + ["b"] * 3, [3, 3]),
        (["a", np.nan, np.nan] + ["b"] * 3, [1, 3]),
    ]

    for values, counts in cases:
        frame = pd.DataFrame({"a": values, "c": keys})
        result = frame.groupby("c")["a"].count()
        expected = pd.Series(counts, index=pd.Index([2, 3], name="c"), name="a")
        tm.assert_series_equal(result, expected)
| 313 | + |
| 314 | + |
def test_count_cross_type():
    """Group counts must be the same whatever dtype the counted columns have.

    Columns "a" and "b" are counted as float64, then as float32, then as
    object; every result is compared with the float64 baseline.
    """
    # GH8169
    # Fixed seed: a failing run can be reproduced exactly.
    rng = np.random.default_rng(2)
    # Use float64 from the start so that writing NaN below does not depend
    # on an implicit integer -> float upcast.
    vals = np.hstack(
        (rng.integers(0, 5, (100, 2)), rng.integers(0, 2, (100, 2)))
    ).astype("float64")

    df = DataFrame(vals, columns=["a", "b", "c", "d"])
    # Only "a"/"b" can contain 2; "c"/"d" are drawn from {0, 1}.
    df[df == 2] = np.nan
    expected = df.groupby(["c", "d"]).count()

    for t in ["float32", "object"]:
        df["a"] = df["a"].astype(t)
        df["b"] = df["b"].astype(t)
        result = df.groupby(["c", "d"]).count()
        tm.assert_frame_equal(result, expected)
| 330 | + |
| 331 | + |
def test_lower_int_prec_count():
    """int8, uint32 and int16 columns are each counted as two rows per group."""
    data = {
        "a": np.array([0, 1, 2, 100], np.int8),
        "b": np.array([1, 2, 3, 6], np.uint32),
        "c": np.array([4, 5, 6, 8], np.int16),
        "grp": ["a", "b", "a", "b"],
    }
    result = DataFrame(data).groupby("grp").count()

    group_index = pd.Index(["a", "b"], name="grp")
    expected = DataFrame({name: [2, 2] for name in "abc"}, index=group_index)
    tm.assert_frame_equal(result, expected)
| 346 | + |
| 347 | + |
def test_count_uses_size_on_exception():
    """Counting a column of objects whose ``__eq__`` raises gives 2 per group."""

    class RaisingObjectException(Exception):
        pass

    class RaisingObject:
        def __init__(self, msg="I will raise inside Cython"):
            super().__init__()
            self.msg = msg

        def __eq__(self, other):
            # gets called in Cython to check that raising calls the method
            raise RaisingObjectException(self.msg)

    objects = [RaisingObject() for _ in range(4)]
    frame = DataFrame({"a": objects, "grp": ["a", "b", "a", "b"]})

    result = frame.groupby("grp").count()

    group_index = pd.Index(["a", "b"], name="grp")
    expected = DataFrame({"a": [2, 2]}, index=group_index)
    tm.assert_frame_equal(result, expected)
0 commit comments