Skip to content

sparse = True in pd.get_dummies triggers TypeError at further manipulation #19313

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
TMorville opened this issue Jan 19, 2018 · 1 comment
Closed
Labels
Duplicate Report Duplicate issue or pull request

Comments

@TMorville
Copy link

import pandas as pd
import numpy as np  # not used in this snippet
from sklearn.datasets import load_iris
# Minimal reproduction: drop a column by name from a get_dummies(sparse=True) result.
iris = load_iris()
data = iris.data  # not used below; the DataFrame is built from iris.data directly
column_names = iris.feature_names

df = pd.DataFrame(iris.data, columns = column_names)

print(df.head(n=5))
# sparse = True is the trigger: per the report, removing it makes the drop() below succeed.
df_dummy = pd.get_dummies(df, sparse = True, columns = ['petal length (cm)'] )

print(list(df), df_dummy.shape)
# Reported to raise TypeError on pandas 0.21.0 (see the traceback in the report).
X = df_dummy.drop(['sepal length (cm)'], axis = 1)

Problem description

I can't perform any column-name-based operation on the dataframe when using sparse = True, such as dropping a column, as reproduced above. The full error print is in the details section below.

Expected Output

The expected output can be seen by removing sparse = True from df_dummy = pd.get_dummies(df, sparse = True, columns = ['petal length (cm)'] ), and then running the subsequent lines.

Output of pd.show_versions()

INSTALLED VERSIONS

commit: None

pandas: 0.21.0
pytest: 3.2.1
pip: 9.0.1
setuptools: 36.5.0.post20170921
Cython: 0.26.1
numpy: 1.12.1
scipy: 1.0.0
pyarrow: None
xarray: None
IPython: 6.1.0
sphinx: 1.6.3
patsy: 0.4.1
dateutil: 2.6.1
pytz: 2017.2
blosc: None
bottleneck: 1.2.1
tables: 3.4.2
numexpr: 2.6.2
feather: None
matplotlib: 2.1.0
openpyxl: 2.4.8
xlrd: 1.1.0
xlwt: 1.2.0
xlsxwriter: 1.0.2
lxml: 4.1.0
bs4: 4.6.0
html5lib: 0.999999999
sqlalchemy: 1.1.13
pymysql: None
psycopg2: None
jinja2: 2.9.6
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None
In [ ]:

COMPLETE ERROR PRINT


TypeError Traceback (most recent call last)
in ()
15 print(list(df), df_dummy.shape)
16
---> 17 X = df_dummy.drop(['sepal length (cm)'], axis = 1)

/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
2528 for axis, labels in axes.items():
2529 if labels is not None:
-> 2530 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
2531
2532 if inplace:

/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors)
2561 else:
2562 new_axis = axis.drop(labels, errors=errors)
-> 2563 dropped = self.reindex(**{axis_name: new_axis})
2564 try:
2565 dropped.axes[axis_].set_names(axis.names, inplace=True)

/anaconda3/lib/python3.6/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
125 @wraps(func)
126 def wrapper(*args, **kwargs):
--> 127 return func(*args, **kwargs)
128
129 if not PY2:

/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
2933 kwargs.pop('axis', None)
2934 kwargs.pop('labels', None)
-> 2935 return super(DataFrame, self).reindex(**kwargs)
2936
2937 @appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)

/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
3021 # perform the reindex on the axes
3022 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3023 fill_value, copy).__finalize__(self)
3024
3025 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,

/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
2863 if columns is not None:
2864 frame = frame._reindex_columns(columns, method, copy, level,
-> 2865 fill_value, limit, tolerance)
2866
2867 index = axes['index']

/anaconda3/lib/python3.6/site-packages/pandas/core/sparse/frame.py in _reindex_columns(self, columns, method, copy, level, fill_value, limit, takeable)
701 return self._constructor(
702 sdict, index=self.index, columns=columns,
--> 703 default_fill_value=self._default_fill_value).__finalize__(self)
704
705 def _reindex_with_indexers(self, reindexers, method=None, fill_value=None,

/anaconda3/lib/python3.6/site-packages/pandas/core/sparse/frame.py in __init__(self, data, index, columns, default_kind, default_fill_value, dtype, copy)
89 fill_value=default_fill_value)
90 elif isinstance(data, dict):
---> 91 mgr = self._init_dict(data, index, columns, dtype=dtype)
92 elif isinstance(data, (np.ndarray, list)):
93 mgr = self._init_matrix(data, index, columns, dtype=dtype)

/anaconda3/lib/python3.6/site-packages/pandas/core/sparse/frame.py in _init_dict(self, data, index, columns, dtype)
168 sdict.update((c, nan_arr) for c in columns if c not in sdict)
169
--> 170 return to_manager(sdict, columns, index)
171
172 def _init_matrix(self, data, index, columns, dtype=None):

/anaconda3/lib/python3.6/site-packages/pandas/core/sparse/frame.py in to_manager(sdf, columns, index)
896
897 return create_block_manager_from_arrays(
--> 898 [sdf[c] for c in columns], columns, axes)
899
900

/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in create_block_manager_from_arrays(arrays, names, axes)
4630
4631 try:
-> 4632 blocks = form_blocks(arrays, names, axes)
4633 mgr = BlockManager(blocks, axes)
4634 mgr._consolidate_inplace()

/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in form_blocks(arrays, names, axes)
4726
4727 if len(sparse_items) > 0:
-> 4728 sparse_blocks = _sparse_blockify(sparse_items)
4729 blocks.extend(sparse_blocks)
4730

/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in _sparse_blockify(tuples, dtype)
4788 array = _maybe_to_sparse(array)
4789 block = make_block(array, klass=SparseBlock, fastpath=True,
-> 4790 placement=[i])
4791 new_blocks.append(block)
4792

/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in make_block(values, placement, klass, ndim, dtype, fastpath)
2950 placement=placement, dtype=dtype)
2951
-> 2952 return klass(values, ndim=ndim, fastpath=fastpath, placement=placement)
2953
2954 # TODO: flexible with index=None and/or items=None

/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in __init__(self, values, placement, ndim, fastpath, **kwargs)
1701
1702 if not isinstance(values, self._holder):
-> 1703 raise TypeError("values must be {0}".format(self._holder.__name__))
1704
1705 self.values = values

@TomAugspurger
Copy link
Contributor

I think this is a duplicate of #18686

Basically, get_dummies(..., sparse=True) converts everything to sparse, not just the newly created columns.

@TomAugspurger TomAugspurger added the Duplicate Report Duplicate issue or pull request label Jan 19, 2018
@TomAugspurger TomAugspurger added this to the No action milestone Jan 19, 2018
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Duplicate Report Duplicate issue or pull request
Projects
None yet
Development

No branches or pull requests

2 participants