Skip to content

Commit 2ef4ed1

Browse files
committed
BUG: fix issue with sparse concatting
This was originally reported in :issue:`18686` and :issue:`18914`. The problem is that get_dummies with sparse=True returns a SparseDataFrame containing both sparse and dense columns. That is not what we want: the result should be a regular DataFrame with sparse and dense columns. The fix is in pandas.core.dtypes.concat, in the function that chooses the result (factory) class: it now returns SparseDataFrame only when all blocks are sparse.
1 parent c19bdc9 commit 2ef4ed1

File tree

5 files changed

+26
-18
lines changed

5 files changed

+26
-18
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,7 @@ Reshaping
363363
- Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`)
364364
- Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`)
365365
- Bug in :func:`Dataframe.pivot_table` which fails when the ``aggfunc`` arg is of type string. The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`)
366+
- Bug in :func:`concat` where concatenating sparse and dense series returned a ``SparseDataFrame`` instead of a ``DataFrame`` (:issue:`18914`, :issue:`18686`, :issue:`16874`)
366367

367368

368369
Numeric

pandas/core/dtypes/concat.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,10 @@ def _get_series_result_type(result, objs=None):
8989
def _get_frame_result_type(result, objs):
9090
"""
9191
return appropriate class of DataFrame-like concat
92-
if any block is SparseBlock, return SparseDataFrame
92+
if all blocks are SparseBlock, return SparseDataFrame
9393
otherwise, return 1st obj
9494
"""
95-
if any(b.is_sparse for b in result.blocks):
95+
if result.blocks and all(b.is_sparse for b in result.blocks):
9696
from pandas.core.sparse.api import SparseDataFrame
9797
return SparseDataFrame
9898
else:

pandas/core/sparse/series.py

-1
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,6 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block',
168168
if index is None:
169169
index = data.index.view()
170170
else:
171-
172171
data = data.reindex(index, copy=False)
173172

174173
else:

pandas/tests/reshape/test_reshape.py

+9
Original file line numberDiff line numberDiff line change
@@ -454,6 +454,15 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
454454

455455
tm.assert_frame_equal(result, expected)
456456

457+
@pytest.mark.parametrize('sparse', [True, False])
458+
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
459+
# GH18914
460+
df = DataFrame.from_items([('GDP', [1, 2]), ('Nation', ['AB', 'CD'])])
461+
df = get_dummies(df, columns=['Nation'], sparse=sparse)
462+
df2 = df.reindex(columns=['GDP'])
463+
464+
tm.assert_frame_equal(df[['GDP']], df2)
465+
457466

458467
class TestCategoricalReshape(object):
459468

pandas/tests/sparse/test_combine_concat.py

+14-15
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# pylint: disable-msg=E1101,W0612
2+
import pytest
23

34
import numpy as np
45
import pandas as pd
@@ -317,37 +318,35 @@ def test_concat_axis1(self):
317318
assert isinstance(res, pd.SparseDataFrame)
318319
tm.assert_frame_equal(res.to_dense(), exp)
319320

320-
def test_concat_sparse_dense(self):
321-
sparse = self.dense1.to_sparse()
322321

322+
@pytest.mark.parametrize('fill_value', [None, 0])
323+
def test_concat_sparse_dense(self, fill_value):
324+
sparse = self.dense1.to_sparse(fill_value=fill_value)
323325
res = pd.concat([sparse, self.dense2])
324326
exp = pd.concat([self.dense1, self.dense2])
325-
assert isinstance(res, pd.SparseDataFrame)
326-
tm.assert_frame_equal(res.to_dense(), exp)
327327

328-
res = pd.concat([self.dense2, sparse])
329-
exp = pd.concat([self.dense2, self.dense1])
330-
assert isinstance(res, pd.SparseDataFrame)
331-
tm.assert_frame_equal(res.to_dense(), exp)
332-
333-
sparse = self.dense1.to_sparse(fill_value=0)
334-
335-
res = pd.concat([sparse, self.dense2])
336-
exp = pd.concat([self.dense1, self.dense2])
337328
assert isinstance(res, pd.SparseDataFrame)
338329
tm.assert_frame_equal(res.to_dense(), exp)
339330

340331
res = pd.concat([self.dense2, sparse])
341332
exp = pd.concat([self.dense2, self.dense1])
333+
342334
assert isinstance(res, pd.SparseDataFrame)
343335
tm.assert_frame_equal(res.to_dense(), exp)
344336

345337
res = pd.concat([self.dense3, sparse], axis=1)
346338
exp = pd.concat([self.dense3, self.dense1], axis=1)
347-
assert isinstance(res, pd.SparseDataFrame)
339+
# See GH18914 and #18686 for why this should be
340+
# A DataFrame
341+
assert isinstance(res, pd.DataFrame)
342+
for column in self.dense3.columns:
343+
tm.assert_series_equal(res[column], exp[column])
344+
348345
tm.assert_frame_equal(res, exp)
349346

350347
res = pd.concat([sparse, self.dense3], axis=1)
351348
exp = pd.concat([self.dense1, self.dense3], axis=1)
352-
assert isinstance(res, pd.SparseDataFrame)
349+
assert isinstance(res, pd.DataFrame)
350+
for column in self.dense3.columns:
351+
tm.assert_series_equal(res[column], exp[column])
353352
tm.assert_frame_equal(res, exp)

0 commit comments

Comments
 (0)