sparse = True in pd.get_dummies triggers TypeError at further manipulation #19313

TMorville · 2018-01-19T16:46:15Z

import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

iris = load_iris()
data = iris.data
column_names = iris.feature_names

df = pd.DataFrame(iris.data, columns = column_names)

print(df.head(n=5))

df_dummy = pd.get_dummies(df, sparse = True, columns = ['petal length (cm)'] )

print(list(df), df_dummy.shape)

X = df_dummy.drop(['sepal length (cm)'], axis = 1)

Problem description

I can't perform any column-name-based operation on the DataFrame returned by `pd.get_dummies` when `sparse=True` is used — for example, dropping a column, as reproduced above. The full error traceback is given in the details section below.

Expected Output

The expected output can be seen by removing `sparse = True` from `df_dummy = pd.get_dummies(df, sparse = True, columns = ['petal length (cm)'] )` and then running the subsequent lines.

Output of `pd.show_versions()`

INSTALLED VERSIONS

commit: None

pandas: 0.21.0
pytest: 3.2.1
pip: 9.0.1
setuptools: 36.5.0.post20170921
Cython: 0.26.1
numpy: 1.12.1
scipy: 1.0.0
pyarrow: None
xarray: None
IPython: 6.1.0
sphinx: 1.6.3
patsy: 0.4.1
dateutil: 2.6.1
pytz: 2017.2
blosc: None
bottleneck: 1.2.1
tables: 3.4.2
numexpr: 2.6.2
feather: None
matplotlib: 2.1.0
openpyxl: 2.4.8
xlrd: 1.1.0
xlwt: 1.2.0
xlsxwriter: 1.0.2
lxml: 4.1.0
bs4: 4.6.0
html5lib: 0.999999999
sqlalchemy: 1.1.13
pymysql: None
psycopg2: None
jinja2: 2.9.6
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None
In [ ]:

COMPLETE ERROR PRINT

TypeError Traceback (most recent call last)
in ()
15 print(list(df), df_dummy.shape)
16
---> 17 X = df_dummy.drop(['sepal length (cm)'], axis = 1)

/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
2528 for axis, labels in axes.items():
2529 if labels is not None:
-> 2530 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
2531
2532 if inplace:

/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors)
2561 else:
2562 new_axis = axis.drop(labels, errors=errors)
-> 2563 dropped = self.reindex(**{axis_name: new_axis})
2564 try:
2565 dropped.axes[axis].set_names(axis.names, inplace=True)

/anaconda3/lib/python3.6/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
125 @wraps(func)
126 def wrapper(*args, **kwargs):
--> 127 return func(*args, **kwargs)
128
129 if not PY2:

/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
2933 kwargs.pop('axis', None)
2934 kwargs.pop('labels', None)
-> 2935 return super(DataFrame, self).reindex(**kwargs)
2936
2937 @appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)

/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
3021 # perform the reindex on the axes
3022 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3023 fill_value, copy).__finalize__(self)
3024
3025 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,

/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
2863 if columns is not None:
2864 frame = frame._reindex_columns(columns, method, copy, level,
-> 2865 fill_value, limit, tolerance)
2866
2867 index = axes['index']

/anaconda3/lib/python3.6/site-packages/pandas/core/sparse/frame.py in _reindex_columns(self, columns, method, copy, level, fill_value, limit, takeable)
701 return self._constructor(
702 sdict, index=self.index, columns=columns,
--> 703 default_fill_value=self._default_fill_value).__finalize__(self)
704
705 def _reindex_with_indexers(self, reindexers, method=None, fill_value=None,

/anaconda3/lib/python3.6/site-packages/pandas/core/sparse/frame.py in __init__(self, data, index, columns, default_kind, default_fill_value, dtype, copy)
89 fill_value=default_fill_value)
90 elif isinstance(data, dict):
---> 91 mgr = self._init_dict(data, index, columns, dtype=dtype)
92 elif isinstance(data, (np.ndarray, list)):
93 mgr = self._init_matrix(data, index, columns, dtype=dtype)

/anaconda3/lib/python3.6/site-packages/pandas/core/sparse/frame.py in _init_dict(self, data, index, columns, dtype)
168 sdict.update((c, nan_arr) for c in columns if c not in sdict)
169
--> 170 return to_manager(sdict, columns, index)
171
172 def _init_matrix(self, data, index, columns, dtype=None):

/anaconda3/lib/python3.6/site-packages/pandas/core/sparse/frame.py in to_manager(sdf, columns, index)
896
897 return create_block_manager_from_arrays(
--> 898 [sdf[c] for c in columns], columns, axes)
899
900

/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in create_block_manager_from_arrays(arrays, names, axes)
4630
4631 try:
-> 4632 blocks = form_blocks(arrays, names, axes)
4633 mgr = BlockManager(blocks, axes)
4634 mgr._consolidate_inplace()

/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in form_blocks(arrays, names, axes)
4726
4727 if len(sparse_items) > 0:
-> 4728 sparse_blocks = _sparse_blockify(sparse_items)
4729 blocks.extend(sparse_blocks)
4730

/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in _sparse_blockify(tuples, dtype)
4788 array = _maybe_to_sparse(array)
4789 block = make_block(array, klass=SparseBlock, fastpath=True,
-> 4790 placement=[i])
4791 new_blocks.append(block)
4792

/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in make_block(values, placement, klass, ndim, dtype, fastpath)
2950 placement=placement, dtype=dtype)
2951
-> 2952 return klass(values, ndim=ndim, fastpath=fastpath, placement=placement)
2953
2954 # TODO: flexible with index=None and/or items=None

/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in __init__(self, values, placement, ndim, fastpath, **kwargs)
1701
1702 if not isinstance(values, self._holder):
-> 1703 raise TypeError("values must be {0}".format(self._holder.__name__))
1704
1705 self.values = values

The text was updated successfully, but these errors were encountered:

TomAugspurger · 2018-01-19T18:32:12Z

I think this is a duplicate of #18686

Basically, get_dummies(..., sparse=True) converts everything to sparse, not just the newly created columns.

TomAugspurger closed this as completed Jan 19, 2018

TomAugspurger added the Duplicate Report Duplicate issue or pull request label Jan 19, 2018

TomAugspurger added this to the No action milestone Jan 19, 2018

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

sparse = True in pd.get_dummies triggers TypeError at further manipulation #19313

sparse = True in pd.get_dummies triggers TypeError at further manipulation #19313

TMorville commented Jan 19, 2018

INSTALLED VERSIONS

COMPLETE ERROR PRINT

TomAugspurger commented Jan 19, 2018

sparse = True in pd.get_dummies triggers TypeError at further manipulation #19313

sparse = True in pd.get_dummies triggers TypeError at further manipulation #19313

Comments

TMorville commented Jan 19, 2018

Problem description

Expected Output

Output of pd.show_versions()

INSTALLED VERSIONS

COMPLETE ERROR PRINT

TomAugspurger commented Jan 19, 2018

Output of `pd.show_versions()`