Skip to content

Commit 4c1c3a9

Browse files
TomAugspurger authored and Pingviinituutti committed
BUG/PERF: Sparse get_dummies uses concat (pandas-dev#24372)
1 parent 1ab873c commit 4c1c3a9

File tree

9 files changed

+46
-29
lines changed

9 files changed

+46
-29
lines changed

asv_bench/benchmarks/join_merge.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def setup(self, axis):
5050
self.empty_right = [df, DataFrame()]
5151

5252
def time_concat_series(self, axis):
53-
concat(self.series, axis=axis)
53+
concat(self.series, axis=axis, sort=False)
5454

5555
def time_concat_small_frames(self, axis):
5656
concat(self.small_frames, axis=axis)

asv_bench/benchmarks/panel_ctor.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import warnings
22
from datetime import datetime, timedelta
33

4-
from pandas import DataFrame, Panel, DatetimeIndex, date_range
4+
from pandas import DataFrame, Panel, date_range
55

66

77
class DifferentIndexes(object):
@@ -23,9 +23,9 @@ def time_from_dict(self):
2323
class SameIndexes(object):
2424

2525
def setup(self):
26-
idx = DatetimeIndex(start=datetime(1990, 1, 1),
27-
end=datetime(2012, 1, 1),
28-
freq='D')
26+
idx = date_range(start=datetime(1990, 1, 1),
27+
end=datetime(2012, 1, 1),
28+
freq='D')
2929
df = DataFrame({'a': 0, 'b': 1, 'c': 2}, index=idx)
3030
self.data_frames = dict(enumerate([df] * 100))
3131

@@ -40,10 +40,10 @@ def setup(self):
4040
start = datetime(1990, 1, 1)
4141
end = datetime(2012, 1, 1)
4242
df1 = DataFrame({'a': 0, 'b': 1, 'c': 2},
43-
index=DatetimeIndex(start=start, end=end, freq='D'))
43+
index=date_range(start=start, end=end, freq='D'))
4444
end += timedelta(days=1)
4545
df2 = DataFrame({'a': 0, 'b': 1, 'c': 2},
46-
index=DatetimeIndex(start=start, end=end, freq='D'))
46+
index=date_range(start=start, end=end, freq='D'))
4747
dfs = [df1] * 50 + [df2] * 50
4848
self.data_frames = dict(enumerate(dfs))
4949

asv_bench/benchmarks/reindex.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
import numpy as np
22
import pandas.util.testing as tm
3-
from pandas import (DataFrame, Series, DatetimeIndex, MultiIndex, Index,
3+
from pandas import (DataFrame, Series, MultiIndex, Index,
44
date_range)
55
from .pandas_vb_common import lib
66

77

88
class Reindex(object):
99

1010
def setup(self):
11-
rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min')
11+
rng = date_range(start='1/1/1970', periods=10000, freq='1min')
1212
self.df = DataFrame(np.random.rand(10000, 10), index=rng,
1313
columns=range(10))
1414
self.df['foo'] = 'bar'

asv_bench/benchmarks/timedelta.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import datetime
22

33
import numpy as np
4-
from pandas import Series, timedelta_range, to_timedelta, Timestamp, \
5-
Timedelta, TimedeltaIndex, DataFrame
4+
5+
from pandas import (
6+
DataFrame, Series, Timedelta, Timestamp, timedelta_range, to_timedelta)
67

78

89
class TimedeltaConstructor(object):
@@ -122,8 +123,8 @@ def time_timedelta_nanoseconds(self, series):
122123
class TimedeltaIndexing(object):
123124

124125
def setup(self):
125-
self.index = TimedeltaIndex(start='1985', periods=1000, freq='D')
126-
self.index2 = TimedeltaIndex(start='1986', periods=1000, freq='D')
126+
self.index = timedelta_range(start='1985', periods=1000, freq='D')
127+
self.index2 = timedelta_range(start='1986', periods=1000, freq='D')
127128
self.series = Series(range(1000), index=self.index)
128129
self.timedelta = self.index[500]
129130

asv_bench/benchmarks/timestamp.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import datetime
22

3-
from pandas import Timestamp
4-
import pytz
53
import dateutil
4+
import pytz
5+
6+
from pandas import Timestamp
67

78

89
class TimestampConstruction(object):
@@ -46,7 +47,7 @@ def time_dayofweek(self, tz, freq):
4647
self.ts.dayofweek
4748

4849
def time_weekday_name(self, tz, freq):
49-
self.ts.weekday_name
50+
self.ts.day_name
5051

5152
def time_dayofyear(self, tz, freq):
5253
self.ts.dayofyear

doc/source/whatsnew/v0.24.0.rst

+3-1
Original file line numberDiff line numberDiff line change
@@ -647,7 +647,7 @@ changes were made:
647647
* The ``out`` and ``mode`` parameters are no longer accepted (previously, this raised if they were specified).
648648
* Passing a scalar for ``indices`` is no longer allowed.
649649

650-
- The result of concatenating a mix of sparse and dense Series is a Series with sparse values, rather than a ``SparseSeries``.
650+
- The result of :func:`concat` with a mix of sparse and dense Series is a Series with sparse values, rather than a ``SparseSeries``.
651651
- ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer support combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray.
652652
- Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed.
653653
- ``DataFrame[column]`` is now a :class:`Series` with sparse values, rather than a :class:`SparseSeries`, when slicing a single column with sparse values (:issue:`23559`).
@@ -1104,6 +1104,7 @@ Other API Changes
11041104
- The order of the arguments of :func:`DataFrame.to_html` and :func:`DataFrame.to_string` is rearranged to be consistent with each other. (:issue:`23614`)
11051105
- :meth:`CategoricalIndex.reindex` now raises a ``ValueError`` if the target index is non-unique and not equal to the current index. It previously only raised if the target index was not of a categorical dtype (:issue:`23963`).
11061106
- :func:`Series.to_list` and :func:`Index.to_list` are now aliases of ``Series.tolist`` and ``Index.tolist``, respectively (:issue:`8826`)
1107+
- The result of ``SparseSeries.unstack`` is now a :class:`DataFrame` with sparse values, rather than a :class:`SparseDataFrame` (:issue:`24372`).
11071108

11081109
.. _whatsnew_0240.deprecations:
11091110

@@ -1616,6 +1617,7 @@ Sparse
16161617
- Bug in :meth:`SparseArray.unique` not returning the unique values (:issue:`19595`)
16171618
- Bug in :meth:`SparseArray.nonzero` and :meth:`SparseDataFrame.dropna` returning shifted/incorrect results (:issue:`21172`)
16181619
- Bug in :meth:`DataFrame.apply` where dtypes would lose sparseness (:issue:`23744`)
1620+
- Bug in :func:`concat` when concatenating a list of :class:`Series` with all-sparse values changing the ``fill_value`` and converting to a dense Series (:issue:`24371`)
16191621

16201622
Style
16211623
^^^^^

pandas/core/dtypes/concat.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -66,19 +66,19 @@ def _get_series_result_type(result, objs=None):
6666
return appropriate class of Series concat
6767
input is either dict or array-like
6868
"""
69+
from pandas import SparseSeries, SparseDataFrame, DataFrame
70+
6971
# concat Series with axis 1
7072
if isinstance(result, dict):
7173
# concat Series with axis 1
72-
if all(is_sparse(c) for c in compat.itervalues(result)):
73-
from pandas.core.sparse.api import SparseDataFrame
74+
if all(isinstance(c, (SparseSeries, SparseDataFrame))
75+
for c in compat.itervalues(result)):
7476
return SparseDataFrame
7577
else:
76-
from pandas.core.frame import DataFrame
7778
return DataFrame
7879

7980
# otherwise it is a SingleBlockManager (axis = 0)
8081
if result._block.is_sparse:
81-
from pandas.core.sparse.api import SparseSeries
8282
return SparseSeries
8383
else:
8484
return objs[0]._constructor

pandas/core/reshape/reshape.py

+16-7
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111

1212
from pandas.core.dtypes.cast import maybe_promote
1313
from pandas.core.dtypes.common import (
14-
ensure_platform_int, is_bool_dtype, is_extension_array_dtype, is_list_like,
15-
is_object_dtype, needs_i8_conversion)
14+
ensure_platform_int, is_bool_dtype, is_extension_array_dtype,
15+
is_integer_dtype, is_list_like, is_object_dtype, needs_i8_conversion)
1616
from pandas.core.dtypes.missing import notna
1717

1818
from pandas import compat
@@ -853,6 +853,7 @@ def check_len(item, name):
853853

854854
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
855855
sparse=False, drop_first=False, dtype=None):
856+
from pandas.core.reshape.concat import concat
856857
# Series avoids inconsistent NaN handling
857858
codes, levels = _factorize_from_iterable(Series(data))
858859

@@ -909,7 +910,15 @@ def _make_col_name(prefix, prefix_sep, level):
909910
index = None
910911

911912
if sparse:
912-
sparse_series = {}
913+
914+
if is_integer_dtype(dtype):
915+
fill_value = 0
916+
elif dtype == bool:
917+
fill_value = False
918+
else:
919+
fill_value = 0.0
920+
921+
sparse_series = []
913922
N = len(data)
914923
sp_indices = [[] for _ in range(len(dummy_cols))]
915924
mask = codes != -1
@@ -926,12 +935,12 @@ def _make_col_name(prefix, prefix_sep, level):
926935
dummy_cols = dummy_cols[1:]
927936
for col, ixs in zip(dummy_cols, sp_indices):
928937
sarr = SparseArray(np.ones(len(ixs), dtype=dtype),
929-
sparse_index=IntIndex(N, ixs), fill_value=0,
938+
sparse_index=IntIndex(N, ixs),
939+
fill_value=fill_value,
930940
dtype=dtype)
931-
sparse_series[col] = Series(data=sarr, index=index)
941+
sparse_series.append(Series(data=sarr, index=index, name=col))
932942

933-
out = DataFrame(sparse_series, index=index, columns=dummy_cols,
934-
dtype=dtype)
943+
out = concat(sparse_series, axis=1, copy=False)
935944
return out
936945

937946
else:

pandas/tests/sparse/test_reshape.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,8 @@ def test_sparse_frame_unstack(sparse_df):
3535

3636
def test_sparse_series_unstack(sparse_df, multi_index3):
3737
frame = pd.SparseSeries(np.ones(3), index=multi_index3).unstack()
38-
tm.assert_sp_frame_equal(frame, sparse_df)
38+
39+
arr = np.array([1, np.nan, np.nan])
40+
arrays = {i: pd.SparseArray(np.roll(arr, i)) for i in range(3)}
41+
expected = pd.DataFrame(arrays)
42+
tm.assert_frame_equal(frame, expected)

0 commit comments

Comments
 (0)