Skip to content

Commit 24ad221

Browse files
committed
BUG: DataFrame.groupby with as_index=False shouldn't modify grouping columns
1 parent 1ce9f0c commit 24ad221

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+786
-350
lines changed

asv_bench/benchmarks/arithmetic.py

+53
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,59 @@ def time_frame_op_with_series_axis1(self, opname):
101101
getattr(operator, opname)(self.df, self.ser)
102102

103103

104+
class FrameWithFrameWide:
105+
# Many-columns, mixed dtypes
106+
107+
params = [
108+
[
109+
# GH#32779 has discussion of which operators are included here
110+
operator.add,
111+
operator.floordiv,
112+
operator.gt,
113+
]
114+
]
115+
param_names = ["op"]
116+
117+
def setup(self, op):
118+
# we choose dtypes so as to make the blocks
119+
# a) not perfectly match between right and left
120+
# b) appreciably bigger than single columns
121+
n_cols = 2000
122+
n_rows = 500
123+
124+
# construct dataframe with 2 blocks
125+
arr1 = np.random.randn(n_rows, int(n_cols / 2)).astype("f8")
126+
arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("f4")
127+
df = pd.concat(
128+
[pd.DataFrame(arr1), pd.DataFrame(arr2)], axis=1, ignore_index=True,
129+
)
130+
# should already be the case, but just to be sure
131+
df._consolidate_inplace()
132+
133+
# TODO: GH#33198 the setting here shouldn't need two steps
134+
arr1 = np.random.randn(n_rows, int(n_cols / 4)).astype("f8")
135+
arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("i8")
136+
arr3 = np.random.randn(n_rows, int(n_cols / 4)).astype("f8")
137+
df2 = pd.concat(
138+
[pd.DataFrame(arr1), pd.DataFrame(arr2), pd.DataFrame(arr3)],
139+
axis=1,
140+
ignore_index=True,
141+
)
142+
# should already be the case, but just to be sure
143+
df2._consolidate_inplace()
144+
145+
self.left = df
146+
self.right = df2
147+
148+
def time_op_different_blocks(self, op):
149+
# blocks (and dtypes) are not aligned
150+
op(self.left, self.right)
151+
152+
def time_op_same_blocks(self, op):
153+
# blocks (and dtypes) are aligned
154+
op(self.left, self.left)
155+
156+
104157
class Ops:
105158

106159
params = [[True, False], ["default", 1]]

doc/source/whatsnew/v1.1.0.rst

+61-2
Original file line numberDiff line numberDiff line change
@@ -561,6 +561,63 @@ Assignment to multiple columns of a :class:`DataFrame` when some of the columns
561561
df[['a', 'c']] = 1
562562
df
563563
564+
.. _whatsnew_110.api_breaking.groupby_nunique:
565+
566+
Using groupby with ``nunique`` and ``as_index=True``
567+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
568+
569+
Previously, using :meth:`DataFrame.groupby` with ``as_index=True`` and the aggregation ``nunique`` would include the grouping column(s) in the columns of the result. Now the grouping column(s) appear only in the index, which is consistent with other aggregation functions. (:issue:`32579`)
570+
571+
.. ipython:: python
572+
573+
df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [1, 1, 2, 3]})
574+
df
575+
576+
*Previous behavior*:
577+
578+
.. code-block:: ipython
579+
580+
In [3]: df.groupby("a", as_index=True).nunique()
581+
Out[3]:
582+
a b
583+
a
584+
x 1 1
585+
y 1 2
586+
587+
*New behavior*:
588+
589+
.. ipython:: python
590+
591+
df.groupby("a", as_index=True).nunique()
592+
593+
.. _whatsnew_110.api_breaking.groupby_as_index_false:
594+
595+
Using groupby with ``as_index=False``
596+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
597+
598+
Previously, using :meth:`DataFrame.groupby` with ``as_index=False`` and one of the functions ``idxmax``, ``idxmin``, ``mad``, ``nunique``, or ``skew`` would modify the grouping column. Now the grouping column remains unchanged. (:issue:`21090`)
599+
600+
.. ipython:: python
601+
602+
df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [1, 1, 2, 3]})
603+
df
604+
605+
*Previous behavior*:
606+
607+
.. code-block:: ipython
608+
609+
In [3]: df.groupby("a", as_index=False).nunique()
610+
Out[3]:
611+
a b
612+
0 1 1
613+
1 1 2
614+
615+
*New behavior*:
616+
617+
.. ipython:: python
618+
619+
df.groupby("a", as_index=False).nunique()
620+
564621
.. _whatsnew_110.deprecations:
565622

566623
Deprecations
@@ -611,7 +668,7 @@ Performance improvements
611668
and :meth:`~pandas.core.groupby.groupby.Groupby.last` (:issue:`34178`)
612669
- Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`).
613670
- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`).
614-
671+
- Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`)
615672

616673
.. ---------------------------------------------------------------------------
617674
@@ -816,9 +873,10 @@ Groupby/resample/rolling
816873
- Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`)
817874
- Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`)
818875
- Bug in :meth:`Rolling.min` and :meth:`Rolling.max`: Growing memory usage after multiple calls when using a fixed window (:issue:`30726`)
876+
- Bug in :meth:`Series.groupby` where a ``ValueError`` would be raised when grouping by a :class:`PeriodIndex` level (:issue:`34010`)
819877
- Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`)
820878
- Bug in :meth:`GroupBy.rolling.apply` ignores args and kwargs parameters (:issue:`33433`)
821-
879+
- Bug in :meth:`DataFrameGroupBy.std` and :meth:`DataFrameGroupBy.sem` where the grouped-by columns would be modified when ``as_index=False`` (:issue:`10355`)
822880

823881
Reshaping
824882
^^^^^^^^^
@@ -883,6 +941,7 @@ Other
883941
- Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`)
884942
- Bug in :meth:`DataFrame.plot.scatter` caused an error when plotting variable marker sizes (:issue:`32904`)
885943
- :class:`IntegerArray` now implements the ``sum`` operation (:issue:`33172`)
944+
- More informative error message with ``np.min`` or ``np.max`` on unordered :class:`Categorical` (:issue:`33115`)
886945
- Bug in :class:`Tick` comparisons raising ``TypeError`` when comparing against timedelta-like objects (:issue:`34088`)
887946

888947
.. ---------------------------------------------------------------------------

pandas/_libs/index.pyx

+3-2
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ cnp.import_array()
2222
from pandas._libs cimport util
2323

2424
from pandas._libs.tslibs.nattype cimport c_NaT as NaT
25-
from pandas._libs.tslibs.base cimport ABCTimestamp, ABCTimedelta, ABCPeriod
25+
from pandas._libs.tslibs.base cimport ABCTimestamp, ABCTimedelta
26+
from pandas._libs.tslibs.period cimport is_period_object
2627

2728
from pandas._libs.hashtable cimport HashTable
2829

@@ -479,7 +480,7 @@ cdef class PeriodEngine(Int64Engine):
479480
cdef int64_t _unbox_scalar(self, scalar) except? -1:
480481
if scalar is NaT:
481482
return scalar.value
482-
if isinstance(scalar, ABCPeriod):
483+
if is_period_object(scalar):
483484
# NB: we assume that we have the correct freq here.
484485
return scalar.ordinal
485486
raise TypeError(scalar)

pandas/_libs/internals.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ cdef class BlockPlacement:
4949
else:
5050
# Cython memoryview interface requires ndarray to be writeable.
5151
arr = np.require(val, dtype=np.int64, requirements='W')
52-
assert arr.ndim == 1
52+
assert arr.ndim == 1, arr.shape
5353
self._as_array = arr
5454
self._has_array = True
5555

pandas/_libs/lib.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ from pandas._libs.tslibs.nattype cimport (
7474
from pandas._libs.tslibs.conversion cimport convert_to_tsobject
7575
from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64
7676
from pandas._libs.tslibs.timezones cimport get_timezone, tz_compare
77-
from pandas._libs.tslibs.base cimport is_period_object
77+
from pandas._libs.tslibs.period cimport is_period_object
7878

7979
from pandas._libs.missing cimport (
8080
checknull,

pandas/_libs/tslibs/base.pxd

-7
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,3 @@ cdef class ABCTimedelta(timedelta):
66

77
cdef class ABCTimestamp(datetime):
88
pass
9-
10-
11-
cdef class ABCPeriod:
12-
pass
13-
14-
15-
cdef bint is_period_object(object obj)

pandas/_libs/tslibs/base.pyx

-8
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,3 @@ cdef class ABCTimedelta(timedelta):
1414

1515
cdef class ABCTimestamp(datetime):
1616
pass
17-
18-
19-
cdef class ABCPeriod:
20-
pass
21-
22-
23-
cdef bint is_period_object(object obj):
24-
return isinstance(obj, ABCPeriod)

pandas/_libs/tslibs/conversion.pyx

+5-4
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ from cpython.datetime cimport (datetime, time, tzinfo,
1313
PyDateTime_IMPORT)
1414
PyDateTime_IMPORT
1515

16-
from pandas._libs.tslibs.base cimport ABCTimestamp, is_period_object
16+
from pandas._libs.tslibs.base cimport ABCTimestamp
1717

1818
from pandas._libs.tslibs.np_datetime cimport (
1919
check_dts_bounds, npy_datetimestruct, pandas_datetime_to_datetimestruct,
@@ -290,10 +290,11 @@ cdef convert_to_tsobject(object ts, object tz, object unit,
290290
# Keep the converter same as PyDateTime's
291291
ts = datetime.combine(ts, time())
292292
return convert_datetime_to_tsobject(ts, tz)
293-
elif is_period_object(ts):
294-
raise ValueError("Cannot convert Period to Timestamp "
295-
"unambiguously. Use to_timestamp")
296293
else:
294+
from .period import Period
295+
if isinstance(ts, Period):
296+
raise ValueError("Cannot convert Period to Timestamp "
297+
"unambiguously. Use to_timestamp")
297298
raise TypeError(f'Cannot convert input [{ts}] of type {type(ts)} to '
298299
f'Timestamp')
299300

pandas/_libs/tslibs/nattype.pyx

+4-3
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ from pandas._libs.tslibs.np_datetime cimport (
2929
get_timedelta64_value,
3030
)
3131
cimport pandas._libs.tslibs.util as util
32-
from pandas._libs.tslibs.base cimport is_period_object
3332

3433

3534
# ----------------------------------------------------------------------
@@ -149,7 +148,7 @@ cdef class _NaT(datetime):
149148
elif util.is_offset_object(other):
150149
return c_NaT
151150

152-
elif util.is_integer_object(other) or is_period_object(other):
151+
elif util.is_integer_object(other):
153152
# For Period compat
154153
# TODO: the integer behavior is deprecated, remove it
155154
return c_NaT
@@ -163,6 +162,7 @@ cdef class _NaT(datetime):
163162
return result
164163
raise TypeError(f"Cannot add NaT to ndarray with dtype {other.dtype}")
165164

165+
# Includes Period going through here
166166
return NotImplemented
167167

168168
def __sub__(self, other):
@@ -185,7 +185,7 @@ cdef class _NaT(datetime):
185185
elif util.is_offset_object(other):
186186
return c_NaT
187187

188-
elif util.is_integer_object(other) or is_period_object(other):
188+
elif util.is_integer_object(other):
189189
# For Period compat
190190
# TODO: the integer behavior is deprecated, remove it
191191
return c_NaT
@@ -216,6 +216,7 @@ cdef class _NaT(datetime):
216216
f"Cannot subtract NaT from ndarray with dtype {other.dtype}"
217217
)
218218

219+
# Includes Period going through here
219220
return NotImplemented
220221

221222
def __pos__(self):

0 commit comments

Comments
 (0)