"Duplicated level name" when using groupby with two different attributes of a datetime #21250

mattharrison · 2018-05-29T22:56:48Z

Code Sample, a copy-pastable example if possible

from io import StringIO
import pandas as pd

data = '''Date,Amount
10/30/2010,54
11/20/2010,53'''
df = pd.read_csv(StringIO(data), parse_dates=[0])

df.groupby([df.Date.dt.year, df.Date.dt.month]).mean()

Problem description

This worked prior to 0.23 and seems like a regression. I now get the following error:


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _cython_agg_blocks(self, how, alt, numeric_only, min_count)
   4017                 result, _ = self.grouper.aggregate(
-> 4018                     block.values, how, axis=agg_axis, min_count=min_count)
   4019             except NotImplementedError:

~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in aggregate(self, values, how, axis, min_count)
   2626         return self._cython_operation('aggregate', values, how, axis,
-> 2627                                       min_count=min_count)
   2628 

~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _cython_operation(self, kind, values, how, axis, min_count, **kwargs)
   2532                                           "supported for the 'how' argument")
-> 2533             out_shape = (self.ngroups,) + values.shape[1:]
   2534 

pandas/_libs/properties.pyx in pandas._libs.properties.CachedProperty.__get__()

~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in ngroups(self)
   2361     def ngroups(self):
-> 2362         return len(self.result_index)
   2363 

pandas/_libs/properties.pyx in pandas._libs.properties.CachedProperty.__get__()

~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in result_index(self)
   2380                             verify_integrity=False,
-> 2381                             names=self.names)
   2382         return result

~/.env/36/lib/python3.6/site-packages/pandas/core/indexes/multi.py in __new__(cls, levels, labels, sortorder, names, dtype, copy, name, verify_integrity, _set_identity)
    231             # handles name validation
--> 232             result._set_names(names)
    233 

~/.env/36/lib/python3.6/site-packages/pandas/core/indexes/multi.py in _set_names(self, names, level, validate)
    694                         'level {}, is already used for level '
--> 695                         '{}.'.format(name, l, used[name]))
    696 

ValueError: Duplicated level name: "Date", assigned to level 1, is already used for level 0.

During handling of the above exception, another exception occurred:

UnboundLocalError                         Traceback (most recent call last)
~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in mean(self, *args, **kwargs)
   1305         try:
-> 1306             return self._cython_agg_general('mean', **kwargs)
   1307         except GroupByError:

~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _cython_agg_general(self, how, alt, numeric_only, min_count)
   3973         new_items, new_blocks = self._cython_agg_blocks(
-> 3974             how, alt=alt, numeric_only=numeric_only, min_count=min_count)
   3975         return self._wrap_agged_blocks(new_items, new_blocks)

~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _cython_agg_blocks(self, how, alt, numeric_only, min_count)
   4038                 # see if we can cast the block back to the original dtype
-> 4039                 result = block._try_coerce_and_cast_result(result)
   4040                 newb = block.make_block(result)

UnboundLocalError: local variable 'result' referenced before assignment

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-13-4b644a75a9ce> in <module>()
     10 df.dtypes
     11 
---> 12 df.groupby([df.Date.dt.year, df.Date.dt.month]).mean()

~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in mean(self, *args, **kwargs)
   1310             with _group_selection_context(self):
   1311                 f = lambda x: x.mean(axis=self.axis, **kwargs)
-> 1312                 return self._python_agg_general(f)
   1313 
   1314     @Substitution(name='groupby')

~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _python_agg_general(self, func, *args, **kwargs)
   1086                 output[name] = self._try_cast(values[mask], result)
   1087 
-> 1088         return self._wrap_aggregated_output(output)
   1089 
   1090     def _wrap_applied_output(self, *args, **kwargs):

~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _wrap_aggregated_output(self, output, names)
   4728             result = result._consolidate()
   4729         else:
-> 4730             index = self.grouper.result_index
   4731             result = DataFrame(output, index=index, columns=output_keys)
   4732 

pandas/_libs/properties.pyx in pandas._libs.properties.CachedProperty.__get__()

~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in result_index(self)
   2379                             labels=labels,
   2380                             verify_integrity=False,
-> 2381                             names=self.names)
   2382         return result
   2383 

~/.env/36/lib/python3.6/site-packages/pandas/core/indexes/multi.py in __new__(cls, levels, labels, sortorder, names, dtype, copy, name, verify_integrity, _set_identity)
    230         if names is not None:
    231             # handles name validation
--> 232             result._set_names(names)
    233 
    234         if sortorder is not None:

~/.env/36/lib/python3.6/site-packages/pandas/core/indexes/multi.py in _set_names(self, names, level, validate)
    693                         'Duplicated level name: "{}", assigned to '
    694                         'level {}, is already used for level '
--> 695                         '{}.'.format(name, l, used[name]))
    696 
    697             self.levels[l].rename(name, inplace=True)

ValueError: Duplicated level name: "Date", assigned to level 1, is already used for level 0.

Expected Output

           Amount
Date Date        
2010 10        54
     11        53

Output of `pd.show_versions()`

INSTALLED VERSIONS
------------------
commit: None
python: 3.6.4.final.0
python-bits: 64
OS: Darwin
OS-release: 16.5.0
machine: x86_64
processor: i386
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8

pandas: 0.23.0
pytest: 3.0.6
pip: 10.0.1
setuptools: 39.2.0
Cython: 0.27.3
numpy: 1.14.3
scipy: 1.1.0
pyarrow: None
xarray: None
IPython: 6.2.1
sphinx: 1.7.5
patsy: 0.5.0
dateutil: 2.7.3
pytz: 2018.4
blosc: None
bottleneck: None
tables: None
numexpr: None
feather: None
matplotlib: 2.2.2
openpyxl: 2.4.7
xlrd: 1.0.0
xlwt: 1.2.0
xlsxwriter: None
lxml: 3.7.2
bs4: 4.6.0
html5lib: 0.9999999
sqlalchemy: 1.2.5
pymysql: None
psycopg2: 2.7.4 (dt dec pq3 ext lo64)
jinja2: 2.10
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None

The text was updated successfully, but these errors were encountered:

jschendel · 2018-05-30T00:27:17Z

Thanks for the report. This is a dupe of #21075 (see comments for a workaround), which was closed in favor of #19029.

mattharrison · 2018-05-30T01:01:52Z

(My bad with the search-fu.) It seems to me that pulling off a datetime attribute should change the column name. I see the PR that tacks on 1, 2, etc., but wouldn't it be better to tack on "_month", "_hour", etc.?

jschendel · 2018-05-30T23:42:26Z

No worries. Would rather have two dupes than no issue being reported at all!

For the workaround, I was referring to #21075 (comment), or in your case:

df.groupby([df.Date.dt.year.rename('year'), df.Date.dt.month.rename('month')]).mean()

I suspect the PR is tacking on 0,1,... since that handles things most generically, and it corresponds to the level enumeration of a MultiIndex.

As to why something like df.Date.dt.year doesn't automatically change the name? I don't know for sure, but I suspect it's a combination of:

- I don't know of a well-agreed-upon convention for how to rename (rename entirely, add suffixes, etc.)
- There's a rename method that's easily chainable to explicitly accommodate any desired renaming
- It's probably best to retain existing behavior in the absence of a clearly better alternative

That being said, feel free to open a new issue if you feel strongly about automatically renaming these things.

jschendel added the labels Groupby, Regression (functionality that used to work in a prior pandas version), Duplicate Report (duplicate issue or pull request), and MultiIndex on May 30, 2018

jschendel closed this as completed May 30, 2018

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

"Duplicated level name" when using groupby with two different attributes of a datetime #21250

"Duplicated level name" when using groupby with two different attributes of a datetime #21250

mattharrison commented May 29, 2018

jschendel commented May 30, 2018

mattharrison commented May 30, 2018

jschendel commented May 30, 2018

"Duplicated level name" when using groupby with two different attributes of a datetime #21250

"Duplicated level name" when using groupby with two different attributes of a datetime #21250

Comments

mattharrison commented May 29, 2018

Code Sample, a copy-pastable example if possible

Problem description

Expected Output

Output of pd.show_versions()

jschendel commented May 30, 2018

mattharrison commented May 30, 2018

jschendel commented May 30, 2018

Output of `pd.show_versions()`