Skip to content

BUG/PERF: Sparse get_dummies uses concat #24372

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Dec 21, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/join_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def setup(self, axis):
self.empty_right = [df, DataFrame()]

def time_concat_series(self, axis):
concat(self.series, axis=axis)
concat(self.series, axis=axis, sort=False)

def time_concat_small_frames(self, axis):
concat(self.small_frames, axis=axis)
Expand Down
12 changes: 6 additions & 6 deletions asv_bench/benchmarks/panel_ctor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import warnings
from datetime import datetime, timedelta

from pandas import DataFrame, Panel, DatetimeIndex, date_range
from pandas import DataFrame, Panel, date_range


class DifferentIndexes(object):
Expand All @@ -23,9 +23,9 @@ def time_from_dict(self):
class SameIndexes(object):

def setup(self):
idx = DatetimeIndex(start=datetime(1990, 1, 1),
end=datetime(2012, 1, 1),
freq='D')
idx = date_range(start=datetime(1990, 1, 1),
end=datetime(2012, 1, 1),
freq='D')
df = DataFrame({'a': 0, 'b': 1, 'c': 2}, index=idx)
self.data_frames = dict(enumerate([df] * 100))

Expand All @@ -40,10 +40,10 @@ def setup(self):
start = datetime(1990, 1, 1)
end = datetime(2012, 1, 1)
df1 = DataFrame({'a': 0, 'b': 1, 'c': 2},
index=DatetimeIndex(start=start, end=end, freq='D'))
index=date_range(start=start, end=end, freq='D'))
end += timedelta(days=1)
df2 = DataFrame({'a': 0, 'b': 1, 'c': 2},
index=DatetimeIndex(start=start, end=end, freq='D'))
index=date_range(start=start, end=end, freq='D'))
dfs = [df1] * 50 + [df2] * 50
self.data_frames = dict(enumerate(dfs))

Expand Down
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/reindex.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import numpy as np
import pandas.util.testing as tm
from pandas import (DataFrame, Series, DatetimeIndex, MultiIndex, Index,
from pandas import (DataFrame, Series, MultiIndex, Index,
date_range)
from .pandas_vb_common import lib


class Reindex(object):

def setup(self):
rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min')
rng = date_range(start='1/1/1970', periods=10000, freq='1min')
self.df = DataFrame(np.random.rand(10000, 10), index=rng,
columns=range(10))
self.df['foo'] = 'bar'
Expand Down
9 changes: 5 additions & 4 deletions asv_bench/benchmarks/timedelta.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import datetime

import numpy as np
from pandas import Series, timedelta_range, to_timedelta, Timestamp, \
Timedelta, TimedeltaIndex, DataFrame

from pandas import (
DataFrame, Series, Timedelta, Timestamp, timedelta_range, to_timedelta)


class TimedeltaConstructor(object):
Expand Down Expand Up @@ -122,8 +123,8 @@ def time_timedelta_nanoseconds(self, series):
class TimedeltaIndexing(object):

def setup(self):
self.index = TimedeltaIndex(start='1985', periods=1000, freq='D')
self.index2 = TimedeltaIndex(start='1986', periods=1000, freq='D')
self.index = timedelta_range(start='1985', periods=1000, freq='D')
self.index2 = timedelta_range(start='1986', periods=1000, freq='D')
self.series = Series(range(1000), index=self.index)
self.timedelta = self.index[500]

Expand Down
7 changes: 4 additions & 3 deletions asv_bench/benchmarks/timestamp.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import datetime

from pandas import Timestamp
import pytz
import dateutil
import pytz

from pandas import Timestamp


class TimestampConstruction(object):
Expand Down Expand Up @@ -46,7 +47,7 @@ def time_dayofweek(self, tz, freq):
self.ts.dayofweek

def time_weekday_name(self, tz, freq):
self.ts.weekday_name
self.ts.day_name

def time_dayofyear(self, tz, freq):
self.ts.dayofyear
Expand Down
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,7 @@ changes were made:
* The ``out`` and ``mode`` parameters are no longer accepted (previously, this raised if they were specified).
* Passing a scalar for ``indices`` is no longer allowed.

- The result of concatenating a mix of sparse and dense Series is a Series with sparse values, rather than a ``SparseSeries``.
- The result of :func:`concat` with a mix of sparse and dense Series is a Series with sparse values, rather than a ``SparseSeries``.
- ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer support combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray.
- Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed.
- ``DataFrame[column]`` is now a :class:`Series` with sparse values, rather than a :class:`SparseSeries`, when slicing a single column with sparse values (:issue:`23559`).
Expand Down Expand Up @@ -1103,6 +1103,7 @@ Other API Changes
- The order of the arguments of :func:`DataFrame.to_html` and :func:`DataFrame.to_string` is rearranged to be consistent with each other. (:issue:`23614`)
- :meth:`CategoricalIndex.reindex` now raises a ``ValueError`` if the target index is non-unique and not equal to the current index. It previously only raised if the target index was not of a categorical dtype (:issue:`23963`).
- :func:`Series.to_list` and :func:`Index.to_list` are now aliases of ``Series.tolist`` respectively ``Index.tolist`` (:issue:`8826`)
- The result of ``SparseSeries.unstack`` is now a :class:`DataFrame` with sparse values, rather than a :class:`SparseDataFrame` (:issue:`24372`).

.. _whatsnew_0240.deprecations:

Expand Down Expand Up @@ -1613,6 +1614,7 @@ Sparse
- Bug in :meth:`SparseArray.unique` not returning the unique values (:issue:`19595`)
- Bug in :meth:`SparseArray.nonzero` and :meth:`SparseDataFrame.dropna` returning shifted/incorrect results (:issue:`21172`)
- Bug in :meth:`DataFrame.apply` where dtypes would lose sparseness (:issue:`23744`)
- Bug in :func:`concat` when concatenating a list of :class:`Series` with all-sparse values changing the ``fill_value`` and converting to a dense Series (:issue:`24371`)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When the input to concat is a List[Series[Sparse]], we now return a DataFrame with sparse values. Previously this was a dense DataFrame (probably a bug), so it isn't API breaking.


Style
^^^^^
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,19 +66,19 @@ def _get_series_result_type(result, objs=None):
return appropriate class of Series concat
input is either dict or array-like
"""
from pandas import SparseSeries, SparseDataFrame, DataFrame

# concat Series with axis 1
if isinstance(result, dict):
# concat Series with axis 1
if all(is_sparse(c) for c in compat.itervalues(result)):
from pandas.core.sparse.api import SparseDataFrame
if all(isinstance(c, (SparseSeries, SparseDataFrame))
for c in compat.itervalues(result)):
return SparseDataFrame
else:
from pandas.core.frame import DataFrame
return DataFrame

# otherwise it is a SingleBlockManager (axis = 0)
if result._block.is_sparse:
from pandas.core.sparse.api import SparseSeries
return SparseSeries
else:
return objs[0]._constructor
Expand Down
23 changes: 16 additions & 7 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@

from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.common import (
ensure_platform_int, is_bool_dtype, is_extension_array_dtype, is_list_like,
is_object_dtype, needs_i8_conversion)
ensure_platform_int, is_bool_dtype, is_extension_array_dtype,
is_integer_dtype, is_list_like, is_object_dtype, needs_i8_conversion)
from pandas.core.dtypes.missing import notna

from pandas import compat
Expand Down Expand Up @@ -853,6 +853,7 @@ def check_len(item, name):

def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
sparse=False, drop_first=False, dtype=None):
from pandas.core.reshape.concat import concat
# Series avoids inconsistent NaN handling
codes, levels = _factorize_from_iterable(Series(data))

Expand Down Expand Up @@ -909,7 +910,15 @@ def _make_col_name(prefix, prefix_sep, level):
index = None

if sparse:
sparse_series = {}

if is_integer_dtype(dtype):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we have a routine in pandas.core.dtypes.missing for this already

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

na_value_for_dtype, or something else? We need something a little different, since we want the 0 value for each dtype.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we have that too let’s try to not reinvent the wheel

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't find a function like this in any of the dtypes modules.

fill_value = 0
elif dtype == bool:
fill_value = False
else:
fill_value = 0.0

sparse_series = []
N = len(data)
sp_indices = [[] for _ in range(len(dummy_cols))]
mask = codes != -1
Expand All @@ -926,12 +935,12 @@ def _make_col_name(prefix, prefix_sep, level):
dummy_cols = dummy_cols[1:]
for col, ixs in zip(dummy_cols, sp_indices):
sarr = SparseArray(np.ones(len(ixs), dtype=dtype),
sparse_index=IntIndex(N, ixs), fill_value=0,
sparse_index=IntIndex(N, ixs),
fill_value=fill_value,
dtype=dtype)
sparse_series[col] = Series(data=sarr, index=index)
sparse_series.append(Series(data=sarr, index=index, name=col))

out = DataFrame(sparse_series, index=index, columns=dummy_cols,
dtype=dtype)
out = concat(sparse_series, axis=1, copy=False)
return out

else:
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/sparse/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,8 @@ def test_sparse_frame_unstack(sparse_df):

def test_sparse_series_unstack(sparse_df, multi_index3):
frame = pd.SparseSeries(np.ones(3), index=multi_index3).unstack()
tm.assert_sp_frame_equal(frame, sparse_df)

arr = np.array([1, np.nan, np.nan])
arrays = {i: pd.SparseArray(np.roll(arr, i)) for i in range(3)}
expected = pd.DataFrame(arrays)
tm.assert_frame_equal(frame, expected)