Skip to content

Commit e159ef2

Browse files
committed
wip
1 parent c35c7c2 commit e159ef2

File tree

5 files changed

+72
-53
lines changed

5 files changed

+72
-53
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,7 @@ This has some notable changes
335335
- ``SparseArray.astype(np.dtype)`` will create a dense NumPy array. To get a ``SparseArray`` with a different subdtype instead, use ``.astype(sparse_dtype)`` or a string like ``.astype('Sparse[float32]')``.
336336
- Setting ``SparseArray.fill_value`` to a fill value with a different dtype is now allowed.
337337
- Bug in ``SparseArray.nbytes`` under-reporting its memory usage by not including the size of its sparse index.
338+
- The result of concatenating a ``SparseSeries`` and a dense ``Series`` is now a ``Series`` with sparse dtype, rather than a ``SparseSeries``.
338339

339340
.. _whatsnew_0240.api.datetimelike.normalize:
340341

foo.csv

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
1,
2+
2, 1.23, 4.56
3+
3, 1.24, 4.57
4+
4, 1.25, 4.58

pandas/core/internals/managers.py

+1
Original file line numberDiff line numberDiff line change
@@ -2080,6 +2080,7 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
20802080
concat_plan = combine_concat_plans(concat_plans, concat_axis)
20812081
blocks = []
20822082

2083+
import pdb; pdb.set_trace()
20832084
for placement, join_units in concat_plan:
20842085

20852086
if len(join_units) == 1 and not join_units[0].indexers:

pandas/tests/sparse/test_combine_concat.py

+65-52
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,7 @@ class TestSparseSeriesConcat(object):
3838

3939
@pytest.mark.parametrize('kind', [
4040
'integer',
41-
pytest.param('block',
42-
marks=pytest.mark.xfail(reason='Broken', strict="TODO")),
41+
'block',
4342
])
4443
def test_concat(self, kind):
4544
val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
@@ -51,15 +50,15 @@ def test_concat(self, kind):
5150
res = pd.concat([sparse1, sparse2])
5251
exp = pd.concat([pd.Series(val1), pd.Series(val2)])
5352
exp = pd.SparseSeries(exp, kind=kind)
54-
tm.assert_sp_series_equal(res, exp)
53+
tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True)
5554

5655
sparse1 = pd.SparseSeries(val1, fill_value=0, name='x', kind=kind)
5756
sparse2 = pd.SparseSeries(val2, fill_value=0, name='y', kind=kind)
5857

5958
res = pd.concat([sparse1, sparse2])
6059
exp = pd.concat([pd.Series(val1), pd.Series(val2)])
6160
exp = pd.SparseSeries(exp, fill_value=0, kind=kind)
62-
tm.assert_sp_series_equal(res, exp)
61+
tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True)
6362

6463
def test_concat_axis1(self):
6564
val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
@@ -127,10 +126,8 @@ def test_concat_different_kind(self):
127126
tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True)
128127

129128
@pytest.mark.parametrize('kind', [
130-
pytest.param('integer',
131-
marks=pytest.mark.xfail(reason="Return Series[Sparse]")),
132-
pytest.param('block',
133-
marks=pytest.mark.xfail(reason='Broken', strict="TODO")),
129+
'integer',
130+
'block',
134131
])
135132
def test_concat_sparse_dense(self, kind):
136133
# use first input's fill_value
@@ -147,27 +144,43 @@ def test_concat_sparse_dense(self, kind):
147144

148145
res = pd.concat([dense, sparse, dense])
149146
exp = pd.concat([dense, pd.Series(val1), dense])
150-
exp = pd.SparseSeries(exp, kind=kind)
151-
tm.assert_sp_series_equal(res, exp)
147+
# XXX: changed from SparseSeries to Series[sparse]
148+
exp = pd.Series(
149+
pd.SparseArray(exp, kind=kind),
150+
index=exp.index,
151+
name=exp.name,
152+
)
153+
tm.assert_series_equal(res, exp)
152154

153155
sparse = pd.SparseSeries(val1, name='x', kind=kind, fill_value=0)
154156
dense = pd.Series(val2, name='y')
155157

156158
res = pd.concat([sparse, dense])
159+
# XXX: changed from SparseSeries to Series[sparse]
157160
exp = pd.concat([pd.Series(val1), dense])
158-
exp = pd.SparseSeries(exp, kind=kind, fill_value=0)
159-
tm.assert_sp_series_equal(res, exp)
161+
exp = pd.Series(
162+
pd.SparseArray(exp, kind=kind, fill_value=0),
163+
index=exp.index,
164+
name=exp.name,
165+
)
166+
tm.assert_series_equal(res, exp)
160167

161168
res = pd.concat([dense, sparse, dense])
162169
exp = pd.concat([dense, pd.Series(val1), dense])
163-
exp = pd.SparseSeries(exp, kind=kind, fill_value=0)
164-
tm.assert_sp_series_equal(res, exp)
170+
# XXX: changed from SparseSeries to Series[sparse]
171+
exp = pd.Series(
172+
pd.SparseArray(exp, kind=kind, fill_value=0),
173+
index = exp.index,
174+
name = exp.name,
175+
)
176+
tm.assert_series_equal(res, exp)
165177

166178
@pytest.mark.xfail(reason="Correct result is unclear.", strict=True)
167179
def test_concat_mixed_dtypes(self):
168180
# Concatenating sparse, regular, and categorical.
169181
# Who should "win" in the dtype determination?
170182
# This test assumes that sparse wins.
183+
# At the moment, the result is just object dtype.
171184
df1 = pd.DataFrame({"A": pd.SparseArray([1, 2, 3])})
172185
df2 = pd.DataFrame({"A": [1, 2, 3]})
173186
df3 = pd.DataFrame({"A": ['a', 'b', 'c']}).astype('category')
@@ -273,54 +286,54 @@ def test_concat_different_columns_sort_warns(self):
273286

274287
def test_concat_different_columns(self):
275288
# fill_value = np.nan
276-
sparse = self.dense1.to_sparse()
277-
sparse3 = self.dense3.to_sparse()
278-
279-
res = pd.concat([sparse, sparse3], sort=True)
280-
exp = pd.concat([self.dense1, self.dense3], sort=True).to_sparse()
281-
tm.assert_sp_frame_equal(res, exp, check_kind=False)
282-
283-
res = pd.concat([sparse3, sparse], sort=True)
284-
exp = pd.concat([self.dense3, self.dense1], sort=True).to_sparse()
285-
exp._default_fill_value = np.nan
286-
tm.assert_sp_frame_equal(res, exp, check_kind=False)
289+
# sparse = self.dense1.to_sparse()
290+
# sparse3 = self.dense3.to_sparse()
287291

288-
# fill_value = 0
292+
# res = pd.concat([sparse, sparse3], sort=True)
293+
# exp = pd.concat([self.dense1, self.dense3], sort=True).to_sparse()
294+
# tm.assert_sp_frame_equal(res, exp, check_kind=False)
295+
#
296+
# res = pd.concat([sparse3, sparse], sort=True)
297+
# exp = pd.concat([self.dense3, self.dense1], sort=True).to_sparse()
298+
# exp._default_fill_value = np.nan
299+
# tm.assert_sp_frame_equal(res, exp, check_kind=False)
300+
#
301+
# # fill_value = 0
289302
sparse = self.dense1.to_sparse(fill_value=0)
290303
sparse3 = self.dense3.to_sparse(fill_value=0)
291304

292305
# this test is buggy. from here on out
293306
# exp doesn't handle C (all NaN) correctly.
294307
# We correctly don't have any sparse values since the
295308
# values are all NaN, and the fill_value is 0.
296-
raise pytest.xfail("Test is buggy.")
297-
# res = pd.concat([sparse, sparse3], sort=True)
298-
# exp = (pd.concat([self.dense1, self.dense3], sort=True)
299-
# .to_sparse(fill_value=0))
300-
# exp._default_fill_value = np.nan
309+
# raise pytest.xfail("Test is buggy.")
310+
res = pd.concat([sparse, sparse3], sort=True)
311+
exp = (pd.concat([self.dense1, self.dense3], sort=True)
312+
.to_sparse(fill_value=0))
313+
exp._default_fill_value = np.nan
301314

302-
# tm.assert_sp_frame_equal(res, exp, check_kind=False,
303-
# consolidate_block_indices=True)
315+
tm.assert_sp_frame_equal(res, exp, check_kind=False,
316+
consolidate_block_indices=True)
304317

305-
# res = pd.concat([sparse3, sparse], sort=True)
306-
# exp = (pd.concat([self.dense3, self.dense1], sort=True)
307-
# .to_sparse(fill_value=0))
308-
# exp._default_fill_value = np.nan
309-
# tm.assert_sp_frame_equal(res, exp, check_kind=False)
310-
#
311-
# # different fill values
312-
# sparse = self.dense1.to_sparse()
313-
# sparse3 = self.dense3.to_sparse(fill_value=0)
314-
# # each columns keeps its fill_value, thus compare in dense
315-
# res = pd.concat([sparse, sparse3], sort=True)
316-
# exp = pd.concat([self.dense1, self.dense3], sort=True)
317-
# assert isinstance(res, pd.SparseDataFrame)
318-
# tm.assert_frame_equal(res.to_dense(), exp)
319-
#
320-
# res = pd.concat([sparse3, sparse], sort=True)
321-
# exp = pd.concat([self.dense3, self.dense1], sort=True)
322-
# assert isinstance(res, pd.SparseDataFrame)
323-
# tm.assert_frame_equal(res.to_dense(), exp)
318+
res = pd.concat([sparse3, sparse], sort=True)
319+
exp = (pd.concat([self.dense3, self.dense1], sort=True)
320+
.to_sparse(fill_value=0))
321+
exp._default_fill_value = np.nan
322+
tm.assert_sp_frame_equal(res, exp, check_kind=False)
323+
324+
# different fill values
325+
sparse = self.dense1.to_sparse()
326+
sparse3 = self.dense3.to_sparse(fill_value=0)
327+
# each column keeps its own fill_value, thus compare in dense
328+
res = pd.concat([sparse, sparse3], sort=True)
329+
exp = pd.concat([self.dense1, self.dense3], sort=True)
330+
assert isinstance(res, pd.SparseDataFrame)
331+
tm.assert_frame_equal(res.to_dense(), exp)
332+
333+
res = pd.concat([sparse3, sparse], sort=True)
334+
exp = pd.concat([self.dense3, self.dense1], sort=True)
335+
assert isinstance(res, pd.SparseDataFrame)
336+
tm.assert_frame_equal(res.to_dense(), exp)
324337

325338
def test_concat_series(self):
326339
# fill_value = np.nan

pandas/util/testing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1564,7 +1564,7 @@ def assert_sp_array_equal(left, right, check_dtype=True, check_kind=True,
15641564
left_index = left.sp_index
15651565
right_index = right.sp_index
15661566

1567-
if consolidate_block_indices:
1567+
if consolidate_block_indices and left.kind == 'block':
15681568
# we'll probably remove this hack...
15691569
left_index = left_index.to_int_index().to_block_index()
15701570
right_index = right_index.to_int_index().to_block_index()

0 commit comments

Comments
 (0)