From 6646c809f3857cdde841a3ee00104aad3015dba9 Mon Sep 17 00:00:00 2001 From: Nicolas Lepleux Date: Thu, 12 Dec 2019 15:17:18 +0100 Subject: [PATCH 1/9] Fix groupby crash when axis = 1 and columns is not a MultiIndex --- pandas/core/groupby/grouper.py | 10 ++++++++-- pandas/tests/groupby/test_grouping.py | 12 ++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index b0df04f18ff1d..dca3062f67d01 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -491,8 +491,14 @@ def get_grouper( raise ValueError("multiple levels only valid with MultiIndex") if isinstance(level, str): - if obj.index.name != level: - raise ValueError(f"level name {level} is not the name of the index") + if axis == 0: + axis_name = "index" + else: + axis_name = "columns" + if getattr(obj, axis_name).name != level: + raise ValueError( + f"level name {level} is not the name of the {axis_name}" + ) elif level > 0 or level < -1: raise ValueError("level > 0 or level < -1 only valid with MultiIndex") diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 2c84c2f034fc6..c4096ef76ce9e 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -511,6 +511,18 @@ def test_groupby_level_index_names(self): with pytest.raises(ValueError, match=msg): df.groupby(level="foo") + def test_groupby_level_columns_names(self): + # This used to raise with 'level name exp is not the name of the index' + df = ( + DataFrame({"exp": ["A"] * 3 + ["B"] * 3, "var1": range(6)}) + .set_index("exp") + .T + ) + df.groupby(level="exp", axis=1) + msg = "level name foo is not the name of the columns" + with pytest.raises(ValueError, match=msg): + df.groupby(level="foo", axis=1) + @pytest.mark.parametrize("sort", [True, False]) def test_groupby_level_with_nas(self, sort): # GH 17537 From 0135842c53320f8a08a350723b2d0fb50fbd860e Mon Sep 17 00:00:00 2001 From: Nicolas Lepleux Date: Thu, 12 Dec 2019 16:22:37 +0100 Subject: [PATCH 2/9] Suggested changes by jreback --- pandas/core/groupby/grouper.py | 5 +---- pandas/tests/groupby/test_grouping.py | 23 +++++++---------------- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index dca3062f67d01..b98ec356a3e3b 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -491,10 +491,7 @@ def get_grouper( raise ValueError("multiple levels only valid with MultiIndex") if isinstance(level, str): - if axis == 0: - axis_name = "index" - else: - axis_name = "columns" + axis_name = obj._get_axis_name(axis) if getattr(obj, axis_name).name != level: raise ValueError( f"level name {level} is not the name of the {axis_name}" diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index c4096ef76ce9e..f62aab9f8513b 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -501,27 +501,18 @@ def test_groupby_level(self, sort, mframe, df): with pytest.raises(ValueError, match=msg): df.groupby(level=1) - def test_groupby_level_index_names(self): + @pytest.mark.parametrize("axis", [0, 1]) + def test_groupby_level_index_names(self, axis): # GH4014 this used to raise ValueError since 'exp'>1 (in py2) df = DataFrame({"exp": ["A"] * 3 + ["B"] * 3, "var1": range(6)}).set_index( "exp" ) - df.groupby(level="exp") - msg = "level name foo is not the name of the index" + if axis: + df = df.T + df.groupby(level="exp", axis=axis) + msg = f"level name foo is not the name of the {df._get_axis_name(axis)}" with pytest.raises(ValueError, match=msg): - df.groupby(level="foo") - - def test_groupby_level_columns_names(self): - # This used to raise with 'level name exp is not the name of the index' - df = ( - DataFrame({"exp": ["A"] * 3 + ["B"] * 3, "var1": range(6)}) - .set_index("exp") - .T - ) - df.groupby(level="exp", axis=1) - msg = "level name foo is not the name of the columns" - with pytest.raises(ValueError, match=msg): - df.groupby(level="foo", axis=1) + df.groupby(level="foo", axis=axis) @pytest.mark.parametrize("sort", [True, False]) def test_groupby_level_with_nas(self, sort): From 9da523c29bf51a46a7bd68967f486d8535499659 Mon Sep 17 00:00:00 2001 From: Nicolas Lepleux Date: Thu, 12 Dec 2019 16:25:10 +0100 Subject: [PATCH 3/9] whatsnew commit --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 doc/source/whatsnew/v1.0.0.rst diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst old mode 100644 new mode 100755 index 29060a93923eb..34fd0f892400c --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -815,7 +815,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`) - Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`) - Bug in :meth:`DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrectly casting results to the original dtype (:issue:`29641`) -- +- Bug in :meth:`DataFrame.groupby` when using axis=1 and having a single level columns index Reshaping ^^^^^^^^^ From 7651314123d738e37daeb821d0c024acb4601648 Mon Sep 17 00:00:00 2001 From: nlepleux Date: Thu, 12 Dec 2019 21:33:31 +0100 Subject: [PATCH 4/9] Update pandas/core/groupby/grouper.py Thanks to Tom for the tip Co-Authored-By: Tom Augspurger --- pandas/core/groupby/grouper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index b98ec356a3e3b..4d4d7cdec978a 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -491,7 +491,7 @@ def get_grouper( raise ValueError("multiple levels only valid with MultiIndex") if isinstance(level, str): - axis_name = obj._get_axis_name(axis) + if obj._get_axis(axis) != level: if getattr(obj, axis_name).name != level: raise ValueError( f"level name {level} is not the name of the {axis_name}" From c9649ed46f5b364b839b4b982135504ae40bee5a Mon Sep 17 00:00:00 2001 From: Nicolas Lepleux Date: Fri, 13 Dec 2019 09:23:59 +0100 Subject: [PATCH 5/9] Suggested changes for pull request. I still need to merge with master for CI issues --- : | 95 +++++++++++++++++++++++++++ doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/groupby/grouper.py | 5 +- pandas/tests/groupby/test_grouping.py | 3 +- 4 files changed, 99 insertions(+), 6 deletions(-) create mode 100644 : diff --git a/: b/: new file mode 100644 index 0000000000000..bf3d4714582c1 --- /dev/null +++ b/: @@ -0,0 +1,95 @@ +name: pandas-dev +channels: + - https://conda.anaconda.org/conda-forge/ +dependencies: + # required + - numpy>=1.15 + - python=3.6 + - python-dateutil>=2.6.1 + - pytz + + # benchmarks + - asv + + # building + - cython>=0.29.13 + + # code checks + - black=19.10b0 + - cpplint + - flake8 + - flake8-comprehensions>=3.1.0 # used by flake8, linting of unnecessary comprehensions + - flake8-rst>=0.6.0,<=0.7.0 # linting of code blocks in rst files + - isort # check that imports are in the right order + - mypy=0.730 + - pycodestyle # used by flake8 + + # documentation + - gitpython # obtain contributors from git for whatsnew + - sphinx + - numpydoc>=0.9.0 + + # documentation (jupyter notebooks) + - nbconvert>=5.4.1 + - nbsphinx + - pandoc + # Dask and its dependencies + - dask-core + - toolz>=0.7.3 + - fsspec>=0.5.1 + - partd>=0.3.10 + - cloudpickle>=0.2.1 + + # web (jinja2 is also needed, but it's also an optional pandas dependency) + - markdown + - feedparser + - pyyaml + - requests + + # testing + - boto3 + - botocore>=1.11 + - hypothesis>=3.82 + - moto # mock S3 + - pytest>=5.0.1 + - pytest-cov + - pytest-xdist>=1.21 + - seaborn + - statsmodels + + # unused (required indirectly may be?) + - ipywidgets + - nbformat + - notebook>=5.7.5 + - pip + + # optional + - blosc + - bottleneck>=1.2.1 + - ipykernel + - ipython>=5.6.0 + - jinja2 # pandas.Styler + - matplotlib>=2.2.2 # pandas.plotting, Series.plot, DataFrame.plot + - numexpr>=2.6.8 + - scipy>=1.1 + + # optional for io + - beautifulsoup4>=4.6.0 # pandas.read_html + - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet + - html5lib # pandas.read_html + - lxml # pandas.read_html + - openpyxl<=3.0.1 # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile + - pyarrow>=0.13.1 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather + - pyqt>=5.9.2 # pandas.read_clipboard + - pytables>=3.4.2 # pandas.read_hdf, DataFrame.to_hdf + - python-snappy # required by pyarrow + - s3fs # pandas.read_csv... when using 's3://...' path + - sqlalchemy # pandas.read_sql, DataFrame.to_sql + - xarray # DataFrame.to_xarray + - xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile + - xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile + - xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile + - odfpy # pandas.read_excel + - pyreadstat # pandas.read_spss + - pip: + - git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 34fd0f892400c..df24e2a1b901a 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -815,7 +815,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`) - Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`) - Bug in :meth:`DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrectly casting results to the original dtype (:issue:`29641`) -- Bug in :meth:`DataFrame.groupby` when using axis=1 and having a single level columns index +- Bug in :meth:`DataFrame.groupby` when using axis=1 and having a single level columns index (:issue:`30208`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 4d4d7cdec978a..d35805e117d50 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -491,10 +491,9 @@ def get_grouper( raise ValueError("multiple levels only valid with MultiIndex") if isinstance(level, str): - if obj._get_axis(axis) != level: - if getattr(obj, axis_name).name != level: + if obj._get_axis(axis).name != level: raise ValueError( - f"level name {level} is not the name of the {axis_name}" + f"level name {level} is not the name of the {obj._get_axis_name(axis)}" ) elif level > 0 or level < -1: raise ValueError("level > 0 or level < -1 only valid with MultiIndex") diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index f62aab9f8513b..f2af397357e4f 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -501,13 +501,12 @@ def test_groupby_level(self, sort, mframe, df): with pytest.raises(ValueError, match=msg): df.groupby(level=1) - @pytest.mark.parametrize("axis", [0, 1]) def test_groupby_level_index_names(self, axis): # GH4014 this used to raise ValueError since 'exp'>1 (in py2) df = DataFrame({"exp": ["A"] * 3 + ["B"] * 3, "var1": range(6)}).set_index( "exp" ) - if axis: + if axis in (1, "columns"): df = df.T df.groupby(level="exp", axis=axis) msg = f"level name foo is not the name of the {df._get_axis_name(axis)}" From 9798ad7a654d71ba99bf18328dfdaa5e11e0e23e Mon Sep 17 00:00:00 2001 From: Nicolas Lepleux Date: Fri, 13 Dec 2019 09:36:06 +0100 Subject: [PATCH 6/9] pep8 --- pandas/core/groupby/grouper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index d35805e117d50..18b08f191770f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -493,7 +493,8 @@ def get_grouper( if isinstance(level, str): if obj._get_axis(axis).name != level: raise ValueError( - f"level name {level} is not the name of the {obj._get_axis_name(axis)}" + f"level name {level} is not the name " + f"of the {obj._get_axis_name(axis)}" ) elif level > 0 or level < -1: raise ValueError("level > 0 or level < -1 only valid with MultiIndex") From eb7446922f66678ef057dcc11742fad36fbaf042 Mon Sep 17 00:00:00 2001 From: nlepleux Date: Fri, 13 Dec 2019 16:23:03 +0100 Subject: [PATCH 7/9] Delete : --- : | 95 --------------------------------------------------------------- 1 file changed, 95 deletions(-) delete mode 100644 : diff --git a/: b/: deleted file mode 100644 index bf3d4714582c1..0000000000000 --- a/: +++ /dev/null @@ -1,95 +0,0 @@ -name: pandas-dev -channels: - - https://conda.anaconda.org/conda-forge/ -dependencies: - # required - - numpy>=1.15 - - python=3.6 - - python-dateutil>=2.6.1 - - pytz - - # benchmarks - - asv - - # building - - cython>=0.29.13 - - # code checks - - black=19.10b0 - - cpplint - - flake8 - - flake8-comprehensions>=3.1.0 # used by flake8, linting of unnecessary comprehensions - - flake8-rst>=0.6.0,<=0.7.0 # linting of code blocks in rst files - - isort # check that imports are in the right order - - mypy=0.730 - - pycodestyle # used by flake8 - - # documentation - - gitpython # obtain contributors from git for whatsnew - - sphinx - - numpydoc>=0.9.0 - - # documentation (jupyter notebooks) - - nbconvert>=5.4.1 - - nbsphinx - - pandoc - # Dask and its dependencies - - dask-core - - toolz>=0.7.3 - - fsspec>=0.5.1 - - partd>=0.3.10 - - cloudpickle>=0.2.1 - - # web (jinja2 is also needed, but it's also an optional pandas dependency) - - markdown - - feedparser - - pyyaml - - requests - - # testing - - boto3 - - botocore>=1.11 - - hypothesis>=3.82 - - moto # mock S3 - - pytest>=5.0.1 - - pytest-cov - - pytest-xdist>=1.21 - - seaborn - - statsmodels - - # unused (required indirectly may be?) - - ipywidgets - - nbformat - - notebook>=5.7.5 - - pip - - # optional - - blosc - - bottleneck>=1.2.1 - - ipykernel - - ipython>=5.6.0 - - jinja2 # pandas.Styler - - matplotlib>=2.2.2 # pandas.plotting, Series.plot, DataFrame.plot - - numexpr>=2.6.8 - - scipy>=1.1 - - # optional for io - - beautifulsoup4>=4.6.0 # pandas.read_html - - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet - - html5lib # pandas.read_html - - lxml # pandas.read_html - - openpyxl<=3.0.1 # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - - pyarrow>=0.13.1 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather - - pyqt>=5.9.2 # pandas.read_clipboard - - pytables>=3.4.2 # pandas.read_hdf, DataFrame.to_hdf - - python-snappy # required by pyarrow - - s3fs # pandas.read_csv... when using 's3://...' path - - sqlalchemy # pandas.read_sql, DataFrame.to_sql - - xarray # DataFrame.to_xarray - - xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - - xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - - xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - - odfpy # pandas.read_excel - - pyreadstat # pandas.read_spss - - pip: - - git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master From 3b0f1e7a93cc0af4bc7ee47f0c66dd7aa20af4ed Mon Sep 17 00:00:00 2001 From: Nicolas Lepleux Date: Tue, 17 Dec 2019 19:29:20 +0100 Subject: [PATCH 8/9] Suggested fix for 30253 --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/groupby/generic.py | 7 +++++++ pandas/tests/groupby/test_groupby.py | 19 +++++++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0f0d86e271061..e12e718d8840a 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -825,6 +825,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`) - Bug in :meth:`DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrectly casting results to the original dtype (:issue:`29641`) - Bug in :meth:`DataFrame.groupby` when using axis=1 and having a single level columns index (:issue:`30208`) +- Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ec183fcfc154a..4365a1af05228 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1818,10 +1818,17 @@ def groupby_series(obj, col=None): # Try to consolidate with normal wrapping functions from pandas.core.reshape.concat import concat + axis_number = obj._get_axis_number(self.axis) + if axis_number: + obj = obj.T + results = [groupby_series(content, label) for label, content in obj.items()] results = concat(results, axis=1) results.columns.names = obj.columns.names + if axis_number: + results = results.T + if not self.as_index: results.index = ibase.default_index(len(results)) return results diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 93d4dc6046735..a753fe204de2e 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1994,3 +1994,22 @@ def test_dup_labels_output_shape(groupby_func, idx): assert result.shape == (1, 2) tm.assert_index_equal(result.columns, idx) + + +def test_groupby_crash_on_nunique(axis): + # Fix following 30253 + df = pd.DataFrame( + {("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]}, + index=pd.bdate_range("20191212", "20191213"), + ) + df.columns.names = ["COL1", "COL2"] + + axis_number = df._get_axis_number(axis) + if not axis_number: + df = df.T + + nunique_t = df.T.groupby(level=0, axis=axis).nunique().T + + nunique = df.groupby(axis=int(not axis_number), level=0).nunique() + + pd.testing.assert_frame_equal(nunique_t, nunique) From b5af5d66fe07b5f62bcba332a9579f5c504d8b30 Mon Sep 17 00:00:00 2001 From: Nicolas Lepleux Date: Wed, 18 Dec 2019 10:39:52 +0100 Subject: [PATCH 9/9] Follow up on #30253 --- pandas/core/groupby/generic.py | 14 +++++++++----- pandas/tests/groupby/test_groupby.py | 14 ++++++-------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 19283883021d5..eaa4f51c155a9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1814,16 +1814,20 @@ def groupby_series(obj, col=None): from pandas.core.reshape.concat import concat axis_number = obj._get_axis_number(self.axis) - if axis_number: - obj = obj.T + other_axis = int(not axis_number) + if axis_number == 0: + iter_func = obj.items + else: + iter_func = obj.iterrows - results = [groupby_series(content, label) for label, content in obj.items()] + results = [groupby_series(content, label) for label, content in iter_func()] results = concat(results, axis=1) - results.columns.names = obj.columns.names - if axis_number: + if axis_number == 1: results = results.T + results._get_axis(other_axis).names = obj._get_axis(other_axis).names + if not self.as_index: results.index = ibase.default_index(len(results)) return results diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a753fe204de2e..8f88f68c69f2b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1998,18 +1998,16 @@ def test_dup_labels_output_shape(groupby_func, idx): def test_groupby_crash_on_nunique(axis): # Fix following 30253 - df = pd.DataFrame( - {("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]}, - index=pd.bdate_range("20191212", "20191213"), - ) - df.columns.names = ["COL1", "COL2"] + df = pd.DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]}) axis_number = df._get_axis_number(axis) if not axis_number: df = df.T - nunique_t = df.T.groupby(level=0, axis=axis).nunique().T + result = df.groupby(axis=axis_number, level=0).nunique() - nunique = df.groupby(axis=int(not axis_number), level=0).nunique() + expected = pd.DataFrame({"A": [1, 2], "D": [1, 1]}) + if not axis_number: + expected = expected.T - pd.testing.assert_frame_equal(nunique_t, nunique) + tm.assert_frame_equal(result, expected)