You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
import numpy as np
import pandas as pd


def gen_one_df(start=0):
    """Build the 5-row example frame and return its rows from ``start`` on.

    The global NumPy random generator is re-seeded with 0 on every call, so
    repeated calls produce identical ``data1``/``data2`` values.

    Parameters
    ----------
    start : int, default 0
        Positional offset of the first row to keep (``frame[start:]``).

    Returns
    -------
    pandas.DataFrame
        Columns ``key1``/``key2`` (strings) and ``data1``/``data2`` (floats).
    """
    np.random.seed(0)
    frame = pd.DataFrame({
        'key1': ['a', 'a', 'b', 'b', 'a'],
        'key2': ['one', 'two', 'one', 'two', 'one'],
        # data1 is drawn before data2; the order determines the values.
        'data1': np.random.randn(5),
        'data2': np.random.randn(5),
    })
    return frame[start:]
df = gen_one_df()
print(df)

# Group the columns (axis=1) by the string name of each column's dtype,
# e.g. 'float64' and 'object' for this frame.
grouped = df.groupby([str(x) for x in df.dtypes], axis=1)

# Show each dtype group: its key, then the sub-frame of matching columns.
for x, y in grouped:
    print(x)
    print(y)
def return_new_data(x):
    """Ignore the group ``x`` and return a fresh full frame from ``gen_one_df(0)``.

    The returned frame therefore has the same index as the frame that
    ``grouped`` was built from.
    """
    # return pd.Series(range(5), index=range(count, count+5))
    # return df.reindex(range(count, 5))
    return gen_one_df(0)
    # this works, probably due to index of the return value not being the same or whatever.
    # return gen_one_df(1)


# Per the report, on pandas 0.22.0 this call raises
# "ValueError: cannot reindex from a duplicate axis".
print(grouped.apply(return_new_data))
Problem description
Here, as a trivial test of the group-apply functionality for DataFrames, I first group the frame's columns by their datatypes. Then, when applying, I simply return two identical dataframes (see return_new_data), which I expect to be concatenated together along the columns (with the group names as a level of the column MultiIndex). However, this fails when the returned dataframes have the same index as the frame on which the GroupBy object is based.
I think this issue is probably caused by the large amount of "well-educated guessing" in the code, and I think this should be added to #13056.
Expected Output
# original dataframe.
data1 data2 key1 key2
0 1.764052 -0.977278 a one
1 0.400157 0.950088 a two
2 0.978738 -0.151357 b one
3 2.240893 -0.103219 b two
4 1.867558 0.410599 a one
# each group.
float64
data1 data2
0 1.764052 -0.977278
1 0.400157 0.950088
2 0.978738 -0.151357
3 2.240893 -0.103219
4 1.867558 0.410599
object
key1 key2
0 a one
1 a two
2 b one
3 b two
4 a one
# expected output.
float64 object
data1 data2 key1 key2 data1 data2 key1 key2
0 1.764052 -0.977278 a one 1.764052 -0.977278 a one
1 0.400157 0.950088 a two 0.400157 0.950088 a two
2 0.978738 -0.151357 b one 0.978738 -0.151357 b one
3 2.240893 -0.103219 b two 2.240893 -0.103219 b two
4 1.867558 0.410599 a one 1.867558 0.410599 a one
actual output:
ValueError Traceback (most recent call last)
<ipython-input-1-d64daf6b7b92> in <module>()
26 # return gen_one_df(1)
27
---> 28 print(grouped.apply(return_new_data))
~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/groupby.py in apply(self, func, *args, **kwargs)
803 # ignore SettingWithCopy here in case the user mutates
804 with option_context('mode.chained_assignment', None):
--> 805 return self._python_apply_general(f)
806
807 def _python_apply_general(self, f):
~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/groupby.py in _python_apply_general(self, f)
812 keys,
813 values,
--> 814 not_indexed_same=mutated or self.mutated)
815
816 def _iterate_slices(self):
~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/groupby.py in _wrap_applied_output(self, keys, values, not_indexed_same)
3854 elif isinstance(v, DataFrame):
3855 return self._concat_objects(keys, values,
-> 3856 not_indexed_same=not_indexed_same)
3857 elif self.grouper.groupings is not None:
3858 if len(self.grouper.groupings) > 1:
~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/groupby.py in _concat_objects(self, keys, values, not_indexed_same)
991 result = result.take(indexer, axis=self.axis)
992 else:
--> 993 result = result.reindex(ax, axis=self.axis)
994
995 elif self.group_keys:
~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
125 @wraps(func)
126 def wrapper(*args, **kwargs):
--> 127 return func(*args, **kwargs)
128
129 if not PY2:
[16/388]
~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
2933 kwargs.pop('axis', None)
2934 kwargs.pop('labels', None)
-> 2935 return super(DataFrame, self).reindex(**kwargs)
2936
2937 @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
3021 # perform the reindex on the axes
3022 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3023 fill_value, copy).__finalize__(self)
3024
3025 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
2863 if columns is not None:
2864 frame = frame._reindex_columns(columns, method, copy, level,
-> 2865 fill_value, limit, tolerance)
2866
2867 index = axes['index']
~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/frame.py in _reindex_columns(self, new_columns, method, copy, level, fill_value, limit, tolerance)
2888 return self._reindex_with_indexers({1: [new_columns, indexer]},
2889 copy=copy, fill_value=fill_value,
-> 2890 allow_dups=False)
2891
2892 def _reindex_multi(self, axes, copy, fill_value):
~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
3143 fill_value=fill_value,
3144 allow_dups=allow_dups,
-> 3145 copy=copy)
3146
3147 if copy and new_data is self._data:
~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
4137 # some axes don't allow reindexing with dups
4138 if not allow_dups:
-> 4139 self.axes[axis]._can_reindex(indexer)
4140
4141 if axis >= self.ndim:
~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/indexes/base.py in _can_reindex(self, indexer)
2942 # trying to reindex on an axis with duplicates
2943 if not self.is_unique and len(indexer):
-> 2944 raise ValueError("cannot reindex from a duplicate axis")
2945
2946 def reindex(self, target, method=None, level=None, limit=None,
ValueError: cannot reindex from a duplicate axis
I added this to the other issue. Fixing it would require an effort to write the appropriate test cases and then fix the behavior systematically. If you want to submit xfailed tests, that would be helpful.
Code Sample, a copy-pastable example if possible
Problem description
Here, as a trivial test of the group-apply functionality for DataFrames, I first group the frame's columns by their datatypes. Then, when `apply`ing, I simply return two identical dataframes (see `return_new_data`), which I expect to be concatenated together along the columns (with the group names as a level of the column MultiIndex). However, this fails when the returned dataframes have the same index as the frame on which the `GroupBy` object is based.
I think this issue is probably caused by the large amount of "well-educated guessing" in the code, and I think this should be added to #13056.
Expected Output
actual output:
Output of
pd.show_versions()
INSTALLED VERSIONS
commit: None
python: 3.6.4.final.0
python-bits: 64
OS: Linux
OS-release: 2.6.32-573.3.1.el6.x86_64
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.iso885915
LOCALE: en_US.ISO8859-15
pandas: 0.22.0
pytest: 3.4.0
pip: 9.0.1
setuptools: 38.4.0
Cython: None
numpy: 1.13.3
scipy: 1.0.0
pyarrow: None
xarray: None
IPython: 6.2.1
sphinx: None
patsy: 0.5.0
dateutil: 2.6.1
pytz: 2017.3
blosc: None
bottleneck: None
tables: None
numexpr: None
feather: None
matplotlib: 2.1.2
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
lxml: None
bs4: None
html5lib: 0.9999999
sqlalchemy: None
pymysql: None
psycopg2: None
jinja2: 2.10
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None
The text was updated successfully, but these errors were encountered: