wip

TomAugspurger · TomAugspurger · commit e159ef205e6d · 2018-08-16T06:29:12.000-05:00
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -335,6 +335,7 @@ This has some notable changes
 - ``SparseArray.astype(np.dtype)`` will create a dense NumPy array. To keep astype to a SparseArray with a different subdtype, use ``.astype(sparse_dtype)`` or a string like ``.astype('Sparse[float32]')``.
 - Setting ``SparseArray.fill_value`` to a fill value with a different dtype is now allowed.
 - Bug in ``SparseArray.nbytes`` under-reporting its memory usage by not including the size of its sparse index.
+- The result of concatenating a ``SparseSeries`` and a dense ``Series`` is a ``Series`` with sparse dtype.
 
 .. _whatsnew_0240.api.datetimelike.normalize:
 
diff --git a/foo.csv b/foo.csv
@@ -0,0 +1,4 @@
+1,
+2, 1.23, 4.56
+3, 1.24, 4.57
+4, 1.25, 4.58
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -2080,6 +2080,7 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
     concat_plan = combine_concat_plans(concat_plans, concat_axis)
     blocks = []
 
+    import pdb; pdb.set_trace()
     for placement, join_units in concat_plan:
 
         if len(join_units) == 1 and not join_units[0].indexers:
diff --git a/pandas/tests/sparse/test_combine_concat.py b/pandas/tests/sparse/test_combine_concat.py
@@ -38,8 +38,7 @@ class TestSparseSeriesConcat(object):
 
     @pytest.mark.parametrize('kind', [
         'integer',
-        pytest.param('block',
-                     marks=pytest.mark.xfail(reason='Broken', strict="TODO")),
+        'block',
     ])
     def test_concat(self, kind):
         val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
@@ -51,15 +50,15 @@ def test_concat(self, kind):
         res = pd.concat([sparse1, sparse2])
         exp = pd.concat([pd.Series(val1), pd.Series(val2)])
         exp = pd.SparseSeries(exp, kind=kind)
-        tm.assert_sp_series_equal(res, exp)
+        tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True)
 
         sparse1 = pd.SparseSeries(val1, fill_value=0, name='x', kind=kind)
         sparse2 = pd.SparseSeries(val2, fill_value=0, name='y', kind=kind)
 
         res = pd.concat([sparse1, sparse2])
         exp = pd.concat([pd.Series(val1), pd.Series(val2)])
         exp = pd.SparseSeries(exp, fill_value=0, kind=kind)
-        tm.assert_sp_series_equal(res, exp)
+        tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True)
 
     def test_concat_axis1(self):
         val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
@@ -127,10 +126,8 @@ def test_concat_different_kind(self):
         tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True)
 
     @pytest.mark.parametrize('kind', [
-        pytest.param('integer',
-                     marks=pytest.mark.xfail(reason="Return Series[Sparse]")),
-        pytest.param('block',
-                     marks=pytest.mark.xfail(reason='Broken', strict="TODO")),
+        'integer',
+        'block',
     ])
     def test_concat_sparse_dense(self, kind):
         # use first input's fill_value
@@ -147,27 +144,43 @@ def test_concat_sparse_dense(self, kind):
 
         res = pd.concat([dense, sparse, dense])
         exp = pd.concat([dense, pd.Series(val1), dense])
-        exp = pd.SparseSeries(exp, kind=kind)
-        tm.assert_sp_series_equal(res, exp)
+        # XXX: changed from SparseSeries to Series[sparse]
+        exp = pd.Series(
+            pd.SparseArray(exp, kind=kind),
+            index=exp.index,
+            name=exp.name,
+        )
+        tm.assert_series_equal(res, exp)
 
         sparse = pd.SparseSeries(val1, name='x', kind=kind, fill_value=0)
         dense = pd.Series(val2, name='y')
 
         res = pd.concat([sparse, dense])
+        # XXX: changed from SparseSeries to Series[sparse]
         exp = pd.concat([pd.Series(val1), dense])
-        exp = pd.SparseSeries(exp, kind=kind, fill_value=0)
-        tm.assert_sp_series_equal(res, exp)
+        exp = pd.Series(
+            pd.SparseArray(exp, kind=kind, fill_value=0),
+            index=exp.index,
+            name=exp.name,
+        )
+        tm.assert_series_equal(res, exp)
 
         res = pd.concat([dense, sparse, dense])
         exp = pd.concat([dense, pd.Series(val1), dense])
-        exp = pd.SparseSeries(exp, kind=kind, fill_value=0)
-        tm.assert_sp_series_equal(res, exp)
+        # XXX: changed from SparseSeries to Series[sparse]
+        exp = pd.Series(
+            pd.SparseArray(exp, kind=kind, fill_value=0),
+            index = exp.index,
+            name = exp.name,
+        )
+        tm.assert_series_equal(res, exp)
 
     @pytest.mark.xfail(reason="Correct result is unclear.", strict=True)
     def test_concat_mixed_dtypes(self):
         # Concatenating sparse, regular, and categorical.
         # Who should "win" in the dtype determination?
         # This test assumes that sparse wins.
+        # At the moment, the result is just object dtype.
         df1 = pd.DataFrame({"A": pd.SparseArray([1, 2, 3])})
         df2 = pd.DataFrame({"A": [1, 2, 3]})
         df3 = pd.DataFrame({"A": ['a', 'b', 'c']}).astype('category')
@@ -273,54 +286,54 @@ def test_concat_different_columns_sort_warns(self):
 
     def test_concat_different_columns(self):
         # fill_value = np.nan
-        sparse = self.dense1.to_sparse()
-        sparse3 = self.dense3.to_sparse()
-
-        res = pd.concat([sparse, sparse3], sort=True)
-        exp = pd.concat([self.dense1, self.dense3], sort=True).to_sparse()
-        tm.assert_sp_frame_equal(res, exp, check_kind=False)
-
-        res = pd.concat([sparse3, sparse], sort=True)
-        exp = pd.concat([self.dense3, self.dense1], sort=True).to_sparse()
-        exp._default_fill_value = np.nan
-        tm.assert_sp_frame_equal(res, exp, check_kind=False)
+        # sparse = self.dense1.to_sparse()
+        # sparse3 = self.dense3.to_sparse()
 
-        # fill_value = 0
+        # res = pd.concat([sparse, sparse3], sort=True)
+        # exp = pd.concat([self.dense1, self.dense3], sort=True).to_sparse()
+        # tm.assert_sp_frame_equal(res, exp, check_kind=False)
+        #
+        # res = pd.concat([sparse3, sparse], sort=True)
+        # exp = pd.concat([self.dense3, self.dense1], sort=True).to_sparse()
+        # exp._default_fill_value = np.nan
+        # tm.assert_sp_frame_equal(res, exp, check_kind=False)
+        #
+        # # fill_value = 0
         sparse = self.dense1.to_sparse(fill_value=0)
         sparse3 = self.dense3.to_sparse(fill_value=0)
 
         # this test is buggy. from here on out
         # exp doesn't handle C (all NaN) correctly.
         # We correctly don't have any sparse values since the
         # values are all NaN, and the fill_value is 0.
-        raise pytest.xfail("Test is buggy.")
-        # res = pd.concat([sparse, sparse3], sort=True)
-        # exp = (pd.concat([self.dense1, self.dense3], sort=True)
-        #          .to_sparse(fill_value=0))
-        # exp._default_fill_value = np.nan
+        # raise pytest.xfail("Test is buggy.")
+        res = pd.concat([sparse, sparse3], sort=True)
+        exp = (pd.concat([self.dense1, self.dense3], sort=True)
+                 .to_sparse(fill_value=0))
+        exp._default_fill_value = np.nan
 
-        # tm.assert_sp_frame_equal(res, exp, check_kind=False,
-        #                          consolidate_block_indices=True)
+        tm.assert_sp_frame_equal(res, exp, check_kind=False,
+                                 consolidate_block_indices=True)
 
-        # res = pd.concat([sparse3, sparse], sort=True)
-        # exp = (pd.concat([self.dense3, self.dense1], sort=True)
-        #          .to_sparse(fill_value=0))
-        # exp._default_fill_value = np.nan
-        # tm.assert_sp_frame_equal(res, exp, check_kind=False)
-        #
-        # # different fill values
-        # sparse = self.dense1.to_sparse()
-        # sparse3 = self.dense3.to_sparse(fill_value=0)
-        # # each columns keeps its fill_value, thus compare in dense
-        # res = pd.concat([sparse, sparse3], sort=True)
-        # exp = pd.concat([self.dense1, self.dense3], sort=True)
-        # assert isinstance(res, pd.SparseDataFrame)
-        # tm.assert_frame_equal(res.to_dense(), exp)
-        #
-        # res = pd.concat([sparse3, sparse], sort=True)
-        # exp = pd.concat([self.dense3, self.dense1], sort=True)
-        # assert isinstance(res, pd.SparseDataFrame)
-        # tm.assert_frame_equal(res.to_dense(), exp)
+        res = pd.concat([sparse3, sparse], sort=True)
+        exp = (pd.concat([self.dense3, self.dense1], sort=True)
+                 .to_sparse(fill_value=0))
+        exp._default_fill_value = np.nan
+        tm.assert_sp_frame_equal(res, exp, check_kind=False)
+
+        # different fill values
+        sparse = self.dense1.to_sparse()
+        sparse3 = self.dense3.to_sparse(fill_value=0)
+        # each column keeps its fill_value, so compare in dense form
+        res = pd.concat([sparse, sparse3], sort=True)
+        exp = pd.concat([self.dense1, self.dense3], sort=True)
+        assert isinstance(res, pd.SparseDataFrame)
+        tm.assert_frame_equal(res.to_dense(), exp)
+
+        res = pd.concat([sparse3, sparse], sort=True)
+        exp = pd.concat([self.dense3, self.dense1], sort=True)
+        assert isinstance(res, pd.SparseDataFrame)
+        tm.assert_frame_equal(res.to_dense(), exp)
 
     def test_concat_series(self):
         # fill_value = np.nan
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
@@ -1564,7 +1564,7 @@ def assert_sp_array_equal(left, right, check_dtype=True, check_kind=True,
         left_index = left.sp_index
         right_index = right.sp_index
 
-    if consolidate_block_indices:
+    if consolidate_block_indices and left.kind == 'block':
         # we'll probably remove this hack...
         left_index = left_index.to_int_index().to_block_index()
         right_index = right_index.to_int_index().to_block_index()