Skip to content

Commit d3851ac

Browse files
hexgnu and jreback
authored and committed
BUG: fix issue with concat creating SparseFrame if not all series are sparse. (#18924)
1 parent 113f788 commit d3851ac

File tree

6 files changed

+71
-38
lines changed

6 files changed

+71
-38
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,7 @@ Reshaping
540540
- Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`)
541541
- Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`)
542542
- Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`)
543+
- Bug in :func:`concat` where concatenating sparse and dense series returned a ``SparseDataFrame`` instead of a ``DataFrame`` (:issue:`18914`, :issue:`18686`, and :issue:`16874`)
543544
-
544545

545546

pandas/core/dtypes/concat.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
_TD_DTYPE)
2020
from pandas.core.dtypes.generic import (
2121
ABCDatetimeIndex, ABCTimedeltaIndex,
22-
ABCPeriodIndex, ABCRangeIndex)
22+
ABCPeriodIndex, ABCRangeIndex, ABCSparseDataFrame)
2323

2424

2525
def get_dtype_kinds(l):
@@ -89,14 +89,16 @@ def _get_series_result_type(result, objs=None):
8989
def _get_frame_result_type(result, objs):
9090
"""
9191
return appropriate class of DataFrame-like concat
92-
if any block is SparseBlock, return SparseDataFrame
92+
if all blocks are SparseBlock, return SparseDataFrame
9393
otherwise, return the 1st obj that is not a SparseDataFrame
9494
"""
95-
if any(b.is_sparse for b in result.blocks):
95+
96+
if result.blocks and all(b.is_sparse for b in result.blocks):
9697
from pandas.core.sparse.api import SparseDataFrame
9798
return SparseDataFrame
9899
else:
99-
return objs[0]
100+
return next(obj for obj in objs if not isinstance(obj,
101+
ABCSparseDataFrame))
100102

101103

102104
def _concat_compat(to_concat, axis=0):

pandas/core/dtypes/generic.py

+2
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ def _check(cls, inst):
4343

4444
ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series", ))
4545
ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe", ))
46+
ABCSparseDataFrame = create_pandas_abc_type("ABCSparseDataFrame", "_subtyp",
47+
("sparse_frame", ))
4648
ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",))
4749
ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp",
4850
('sparse_series',

pandas/tests/dtypes/test_generic.py

+2
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ class TestABCClasses(object):
1818
df = pd.DataFrame({'names': ['a', 'b', 'c']}, index=multi_index)
1919
sparse_series = pd.Series([1, 2, 3]).to_sparse()
2020
sparse_array = pd.SparseArray(np.random.randn(10))
21+
sparse_frame = pd.SparseDataFrame({'a': [1, -1, None]})
2122

2223
def test_abc_types(self):
2324
assert isinstance(pd.Index(['a', 'b', 'c']), gt.ABCIndex)
@@ -37,6 +38,7 @@ def test_abc_types(self):
3738
assert isinstance(self.df.to_panel(), gt.ABCPanel)
3839
assert isinstance(self.sparse_series, gt.ABCSparseSeries)
3940
assert isinstance(self.sparse_array, gt.ABCSparseArray)
41+
assert isinstance(self.sparse_frame, gt.ABCSparseDataFrame)
4042
assert isinstance(self.categorical, gt.ABCCategorical)
4143
assert isinstance(pd.Period('2012', freq='A-DEC'), gt.ABCPeriod)
4244

pandas/tests/reshape/test_reshape.py

+9
Original file line numberDiff line numberDiff line change
@@ -454,6 +454,15 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
454454

455455
tm.assert_frame_equal(result, expected)
456456

457+
@pytest.mark.parametrize('sparse', [True, False])
458+
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
459+
# GH18914
460+
df = DataFrame.from_items([('GDP', [1, 2]), ('Nation', ['AB', 'CD'])])
461+
df = get_dummies(df, columns=['Nation'], sparse=sparse)
462+
df2 = df.reindex(columns=['GDP'])
463+
464+
tm.assert_frame_equal(df[['GDP']], df2)
465+
457466

458467
class TestCategoricalReshape(object):
459468

pandas/tests/sparse/test_combine_concat.py

+51-34
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
# pylint: disable-msg=E1101,W0612
2+
import pytest
23

34
import numpy as np
45
import pandas as pd
56
import pandas.util.testing as tm
7+
import itertools
68

79

810
class TestSparseSeriesConcat(object):
@@ -317,37 +319,52 @@ def test_concat_axis1(self):
317319
assert isinstance(res, pd.SparseDataFrame)
318320
tm.assert_frame_equal(res.to_dense(), exp)
319321

320-
def test_concat_sparse_dense(self):
321-
sparse = self.dense1.to_sparse()
322-
323-
res = pd.concat([sparse, self.dense2])
324-
exp = pd.concat([self.dense1, self.dense2])
325-
assert isinstance(res, pd.SparseDataFrame)
326-
tm.assert_frame_equal(res.to_dense(), exp)
327-
328-
res = pd.concat([self.dense2, sparse])
329-
exp = pd.concat([self.dense2, self.dense1])
330-
assert isinstance(res, pd.SparseDataFrame)
331-
tm.assert_frame_equal(res.to_dense(), exp)
332-
333-
sparse = self.dense1.to_sparse(fill_value=0)
334-
335-
res = pd.concat([sparse, self.dense2])
336-
exp = pd.concat([self.dense1, self.dense2])
337-
assert isinstance(res, pd.SparseDataFrame)
338-
tm.assert_frame_equal(res.to_dense(), exp)
339-
340-
res = pd.concat([self.dense2, sparse])
341-
exp = pd.concat([self.dense2, self.dense1])
342-
assert isinstance(res, pd.SparseDataFrame)
343-
tm.assert_frame_equal(res.to_dense(), exp)
344-
345-
res = pd.concat([self.dense3, sparse], axis=1)
346-
exp = pd.concat([self.dense3, self.dense1], axis=1)
347-
assert isinstance(res, pd.SparseDataFrame)
348-
tm.assert_frame_equal(res, exp)
349-
350-
res = pd.concat([sparse, self.dense3], axis=1)
351-
exp = pd.concat([self.dense1, self.dense3], axis=1)
352-
assert isinstance(res, pd.SparseDataFrame)
353-
tm.assert_frame_equal(res, exp)
322+
@pytest.mark.parametrize('fill_value,sparse_idx,dense_idx',
323+
itertools.product([None, 0, 1, np.nan],
324+
[0, 1],
325+
[1, 0]))
326+
def test_concat_sparse_dense_rows(self, fill_value, sparse_idx, dense_idx):
327+
frames = [self.dense1, self.dense2]
328+
sparse_frame = [frames[dense_idx],
329+
frames[sparse_idx].to_sparse(fill_value=fill_value)]
330+
dense_frame = [frames[dense_idx], frames[sparse_idx]]
331+
332+
# This will try both directions sparse + dense and dense + sparse
333+
for _ in range(2):
334+
res = pd.concat(sparse_frame)
335+
exp = pd.concat(dense_frame)
336+
337+
assert isinstance(res, pd.SparseDataFrame)
338+
tm.assert_frame_equal(res.to_dense(), exp)
339+
340+
sparse_frame = sparse_frame[::-1]
341+
dense_frame = dense_frame[::-1]
342+
343+
@pytest.mark.parametrize('fill_value,sparse_idx,dense_idx',
344+
itertools.product([None, 0, 1, np.nan],
345+
[0, 1],
346+
[1, 0]))
347+
def test_concat_sparse_dense_cols(self, fill_value, sparse_idx, dense_idx):
348+
# See GH16874, GH18914 and GH18686 for why this should be a DataFrame
349+
350+
frames = [self.dense1, self.dense3]
351+
352+
sparse_frame = [frames[dense_idx],
353+
frames[sparse_idx].to_sparse(fill_value=fill_value)]
354+
dense_frame = [frames[dense_idx], frames[sparse_idx]]
355+
356+
# This will try both directions sparse + dense and dense + sparse
357+
for _ in range(2):
358+
res = pd.concat(sparse_frame, axis=1)
359+
exp = pd.concat(dense_frame, axis=1)
360+
361+
for column in frames[dense_idx].columns:
362+
if dense_idx == sparse_idx:
363+
tm.assert_frame_equal(res[column], exp[column])
364+
else:
365+
tm.assert_series_equal(res[column], exp[column])
366+
367+
tm.assert_frame_equal(res, exp)
368+
369+
sparse_frame = sparse_frame[::-1]
370+
dense_frame = dense_frame[::-1]

0 commit comments

Comments
 (0)