Skip to content

Commit 1cb13a3

Browse files
authored
Merge pull request #59 from pandas-dev/master
Sync Fork from Upstream Repo
2 parents d9198f0 + 7017599 commit 1cb13a3

File tree

7 files changed

+75
-23
lines changed

7 files changed

+75
-23
lines changed

doc/source/whatsnew/v1.0.2.rst

+4-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Fixed regressions
1919
- Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`)
2020
- Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`)
2121
- Fixed regression in :meth:`rolling(..).corr() <pandas.core.window.Rolling.corr>` when using a time offset (:issue:`31789`)
22+
- Fixed regression in :meth:`DataFrameGroupBy.nunique` which was modifying the original values if ``NaN`` values were present (:issue:`31950`)
2223
- Fixed regression where :func:`read_pickle` raised a ``UnicodeDecodeError`` when reading a py27 pickle with :class:`MultiIndex` column (:issue:`31988`).
2324
- Fixed regression in :class:`DataFrame` arithmetic operations with mis-matched columns (:issue:`31623`)
2425
- Fixed regression in :meth:`GroupBy.agg` calling a user-provided function an extra time on an empty input (:issue:`31760`)
@@ -65,6 +66,7 @@ Bug fixes
6566
**Categorical**
6667

6768
- Fixed bug where :meth:`Categorical.from_codes` improperly raised a ``ValueError`` when passed nullable integer codes. (:issue:`31779`)
69+
- Fixed bug where the :class:`Categorical` constructor would raise a ``TypeError`` when given a numpy array containing ``pd.NA``. (:issue:`31927`)
6870
- Bug in :class:`Categorical` that would ignore or crash when calling :meth:`Series.replace` with a list-like ``to_replace`` (:issue:`31720`)
6971

7072
**I/O**
@@ -76,6 +78,7 @@ Bug fixes
7678

7779
- Fixed bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the ``"string"`` dtype (:issue:`31731`).
7880
- Fixed bug in setting values using a slice indexer with string dtype (:issue:`31772`)
81+
- Fixed bug where :meth:`GroupBy.first` and :meth:`GroupBy.last` would raise a ``TypeError`` when groups contained ``pd.NA`` in a column of object dtype (:issue:`32123`)
7982
- Fixed bug in :meth:`Series.convert_dtypes` for series with a mix of integers and strings (:issue:`32117`)
8083

8184
.. ---------------------------------------------------------------------------
@@ -85,4 +88,4 @@ Bug fixes
8588
Contributors
8689
~~~~~~~~~~~~
8790

88-
.. contributors:: v1.0.1..v1.0.2|HEAD
91+
.. contributors:: v1.0.1..v1.0.2|HEAD

pandas/_libs/groupby.pyx

+4-2
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ from pandas._libs.algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE,
2222
from pandas._libs.algos import (take_2d_axis1_float64_float64,
2323
groupsort_indexer, tiebreakers)
2424

25+
from pandas._libs.missing cimport checknull
26+
2527
cdef int64_t NPY_NAT = get_nat()
2628
_int64_max = np.iinfo(np.int64).max
2729

@@ -887,7 +889,7 @@ def group_last(rank_t[:, :] out,
887889
for j in range(K):
888890
val = values[i, j]
889891

890-
if val == val:
892+
if not checknull(val):
891893
# NB: use _treat_as_na here once
892894
# conditional-nogil is available.
893895
nobs[lab, j] += 1
@@ -976,7 +978,7 @@ def group_nth(rank_t[:, :] out,
976978
for j in range(K):
977979
val = values[i, j]
978980

979-
if val == val:
981+
if not checknull(val):
980982
# NB: use _treat_as_na here once
981983
# conditional-nogil is available.
982984
nobs[lab, j] += 1

pandas/_libs/hashtable_class_helper.pxi.in

+7-2
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
1010
# ----------------------------------------------------------------------
1111

1212
from pandas._libs.tslibs.util cimport get_c_string
13+
from pandas._libs.missing cimport C_NA
1314

1415
{{py:
1516

@@ -1032,8 +1033,12 @@ cdef class PyObjectHashTable(HashTable):
10321033
val = values[i]
10331034
hash(val)
10341035

1035-
if ignore_na and ((val != val or val is None)
1036-
or (use_na_value and val == na_value)):
1036+
if ignore_na and (
1037+
(val is C_NA)
1038+
or (val != val)
1039+
or (val is None)
1040+
or (use_na_value and val == na_value)
1041+
):
10371042
# if missing values do not count as unique values (i.e. if
10381043
# ignore_na is True), skip the hashtable entry for them, and
10391044
# replace the corresponding label with na_sentinel

pandas/core/groupby/generic.py

+6-18
Original file line numberDiff line numberDiff line change
@@ -591,30 +591,18 @@ def nunique(self, dropna: bool = True) -> Series:
591591

592592
val = self.obj._internal_get_values()
593593

594-
# GH 27951
595-
# temporary fix while we wait for NumPy bug 12629 to be fixed
596-
val[isna(val)] = np.datetime64("NaT")
597-
598-
try:
599-
sorter = np.lexsort((val, ids))
600-
except TypeError: # catches object dtypes
601-
msg = f"val.dtype must be object, got {val.dtype}"
602-
assert val.dtype == object, msg
603-
val, _ = algorithms.factorize(val, sort=False)
604-
sorter = np.lexsort((val, ids))
605-
_isna = lambda a: a == -1
606-
else:
607-
_isna = isna
608-
609-
ids, val = ids[sorter], val[sorter]
594+
codes, _ = algorithms.factorize(val, sort=False)
595+
sorter = np.lexsort((codes, ids))
596+
codes = codes[sorter]
597+
ids = ids[sorter]
610598

611599
# group boundaries are where group ids change
612600
# unique observations are where sorted values change
613601
idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
614-
inc = np.r_[1, val[1:] != val[:-1]]
602+
inc = np.r_[1, codes[1:] != codes[:-1]]
615603

616604
# 1st item of each group is a new unique observation
617-
mask = _isna(val)
605+
mask = codes == -1
618606
if dropna:
619607
inc[idx] = 1
620608
inc[mask] = 0

pandas/tests/arrays/categorical/test_constructors.py

+12
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,18 @@ def test_constructor_with_categorical_categories(self):
458458
result = Categorical(["a", "b"], categories=CategoricalIndex(["a", "b", "c"]))
459459
tm.assert_categorical_equal(result, expected)
460460

461+
@pytest.mark.parametrize("klass", [lambda x: np.array(x, dtype=object), list])
462+
def test_construction_with_null(self, klass, nulls_fixture):
463+
# https://github.com/pandas-dev/pandas/issues/31927
464+
values = klass(["a", nulls_fixture, "b"])
465+
result = Categorical(values)
466+
467+
dtype = CategoricalDtype(["a", "b"])
468+
codes = [0, -1, 1]
469+
expected = Categorical.from_codes(codes=codes, dtype=dtype)
470+
471+
tm.assert_categorical_equal(result, expected)
472+
461473
def test_from_codes(self):
462474

463475
# too few categories

pandas/tests/groupby/test_function.py

+2
Original file line numberDiff line numberDiff line change
@@ -1017,6 +1017,7 @@ def test_frame_describe_unstacked_format():
10171017
@pytest.mark.parametrize("dropna", [False, True])
10181018
def test_series_groupby_nunique(n, m, sort, dropna):
10191019
def check_nunique(df, keys, as_index=True):
1020+
original_df = df.copy()
10201021
gr = df.groupby(keys, as_index=as_index, sort=sort)
10211022
left = gr["julie"].nunique(dropna=dropna)
10221023

@@ -1026,6 +1027,7 @@ def check_nunique(df, keys, as_index=True):
10261027
right = right.reset_index(drop=True)
10271028

10281029
tm.assert_series_equal(left, right, check_names=False)
1030+
tm.assert_frame_equal(df, original_df)
10291031

10301032
days = date_range("2015-08-23", periods=10)
10311033

pandas/tests/groupby/test_nth.py

+40
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,46 @@ def test_first_last_nth(df):
5454
tm.assert_frame_equal(result, expected)
5555

5656

57+
@pytest.mark.parametrize("method", ["first", "last"])
58+
def test_first_last_with_na_object(method, nulls_fixture):
59+
# https://github.com/pandas-dev/pandas/issues/32123
60+
groups = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby(
61+
"a"
62+
)
63+
result = getattr(groups, method)()
64+
65+
if method == "first":
66+
values = [1, 3]
67+
else:
68+
values = [2, 3]
69+
70+
values = np.array(values, dtype=result["b"].dtype)
71+
idx = pd.Index([1, 2], name="a")
72+
expected = pd.DataFrame({"b": values}, index=idx)
73+
74+
tm.assert_frame_equal(result, expected)
75+
76+
77+
@pytest.mark.parametrize("index", [0, -1])
78+
def test_nth_with_na_object(index, nulls_fixture):
79+
# https://github.com/pandas-dev/pandas/issues/32123
80+
groups = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby(
81+
"a"
82+
)
83+
result = groups.nth(index)
84+
85+
if index == 0:
86+
values = [1, 3]
87+
else:
88+
values = [2, nulls_fixture]
89+
90+
values = np.array(values, dtype=result["b"].dtype)
91+
idx = pd.Index([1, 2], name="a")
92+
expected = pd.DataFrame({"b": values}, index=idx)
93+
94+
tm.assert_frame_equal(result, expected)
95+
96+
5797
def test_first_last_nth_dtypes(df_mixed_floats):
5898

5999
df = df_mixed_floats.copy()

0 commit comments

Comments
 (0)