Skip to content

Commit 26782a9

Browse files
sinhrks authored and jreback committed
BUG: Sparse concat results in dense
closes #12844 closes #10536
1 parent bac68d6 commit 26782a9

File tree

13 files changed

+474
-289
lines changed

13 files changed

+474
-289
lines changed

doc/source/whatsnew/v0.18.1.txt

+1
Original file line number | Diff line number | Diff line change
@@ -123,6 +123,7 @@ These changes conform sparse handling to return the correct types and work to ma
123123
- Bug in ``SparseArray.to_frame()`` results in ``DataFrame``, rather than ``SparseDataFrame`` (:issue:`9850`)
124124
- Bug in ``SparseArray.to_dense()`` does not preserve ``dtype`` (:issue:`10648`)
125125
- Bug in ``SparseArray.to_dense()`` incorrectly handle ``fill_value`` (:issue:`12797`)
126+
- Bug in ``pd.concat()`` of ``SparseSeries`` results in dense (:issue:`10536`)
126127

127128
.. _whatsnew_0181.api:
128129

pandas/core/categorical.py

+1 -57
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@
1818
from pandas.core.common import (
1919
ABCSeries, ABCIndexClass, ABCCategoricalIndex, isnull, notnull,
2020
is_dtype_equal, is_categorical_dtype, is_integer_dtype,
21-
_possibly_infer_to_datetimelike, get_dtype_kinds, is_list_like,
21+
_possibly_infer_to_datetimelike, is_list_like,
2222
is_sequence, is_null_slice, is_bool, _ensure_object, _ensure_int64,
2323
_coerce_indexer_dtype)
2424
from pandas.types.api import CategoricalDtype
@@ -1897,59 +1897,3 @@ def _convert_to_list_like(list_like):
18971897
else:
18981898
# is this reached?
18991899
return [list_like]
1900-
1901-
1902-
def _concat_compat(to_concat, axis=0):
1903-
"""Concatenate an object/categorical array of arrays, each of which is a
1904-
single dtype
1905-
1906-
Parameters
1907-
----------
1908-
to_concat : array of arrays
1909-
axis : int
1910-
Axis to provide concatenation in the current implementation this is
1911-
always 0, e.g. we only have 1D categoricals
1912-
1913-
Returns
1914-
-------
1915-
Categorical
1916-
A single array, preserving the combined dtypes
1917-
"""
1918-
1919-
def convert_categorical(x):
1920-
# coerce to object dtype
1921-
if is_categorical_dtype(x.dtype):
1922-
return x.get_values()
1923-
return x.ravel()
1924-
1925-
if get_dtype_kinds(to_concat) - set(['object', 'category']):
1926-
# convert to object type and perform a regular concat
1927-
from pandas.core.common import _concat_compat
1928-
return _concat_compat([np.array(x, copy=False, dtype=object)
1929-
for x in to_concat], axis=0)
1930-
1931-
# we could have object blocks and categoricals here
1932-
# if we only have a single categoricals then combine everything
1933-
# else its a non-compat categorical
1934-
categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]
1935-
1936-
# validate the categories
1937-
categories = categoricals[0]
1938-
rawcats = categories.categories
1939-
for x in categoricals[1:]:
1940-
if not categories.is_dtype_equal(x):
1941-
raise ValueError("incompatible categories in categorical concat")
1942-
1943-
# we've already checked that all categoricals are the same, so if their
1944-
# length is equal to the input then we have all the same categories
1945-
if len(categoricals) == len(to_concat):
1946-
# concating numeric types is much faster than concating object types
1947-
# and fastpath takes a shorter path through the constructor
1948-
return Categorical(np.concatenate([x.codes for x in to_concat],
1949-
axis=0),
1950-
rawcats, ordered=categoricals[0].ordered,
1951-
fastpath=True)
1952-
else:
1953-
concatted = np.concatenate(list(map(convert_categorical, to_concat)),
1954-
axis=0)
1955-
return Categorical(concatted, rawcats)

pandas/core/common.py

-102
Original file line number | Diff line number | Diff line change
@@ -1918,108 +1918,6 @@ def _all_none(*args):
19181918
return True
19191919

19201920

1921-
def get_dtype_kinds(l):
1922-
"""
1923-
Parameters
1924-
----------
1925-
l : list of arrays
1926-
1927-
Returns
1928-
-------
1929-
a set of kinds that exist in this list of arrays
1930-
"""
1931-
1932-
typs = set()
1933-
for arr in l:
1934-
1935-
dtype = arr.dtype
1936-
if is_categorical_dtype(dtype):
1937-
typ = 'category'
1938-
elif is_sparse(arr):
1939-
typ = 'sparse'
1940-
elif is_datetimetz(arr):
1941-
typ = 'datetimetz'
1942-
elif is_datetime64_dtype(dtype):
1943-
typ = 'datetime'
1944-
elif is_timedelta64_dtype(dtype):
1945-
typ = 'timedelta'
1946-
elif is_object_dtype(dtype):
1947-
typ = 'object'
1948-
elif is_bool_dtype(dtype):
1949-
typ = 'bool'
1950-
else:
1951-
typ = dtype.kind
1952-
typs.add(typ)
1953-
return typs
1954-
1955-
1956-
def _concat_compat(to_concat, axis=0):
1957-
"""
1958-
provide concatenation of an array of arrays each of which is a single
1959-
'normalized' dtypes (in that for example, if it's object, then it is a
1960-
non-datetimelike and provide a combined dtype for the resulting array that
1961-
preserves the overall dtype if possible)
1962-
1963-
Parameters
1964-
----------
1965-
to_concat : array of arrays
1966-
axis : axis to provide concatenation
1967-
1968-
Returns
1969-
-------
1970-
a single array, preserving the combined dtypes
1971-
"""
1972-
1973-
# filter empty arrays
1974-
# 1-d dtypes always are included here
1975-
def is_nonempty(x):
1976-
try:
1977-
return x.shape[axis] > 0
1978-
except Exception:
1979-
return True
1980-
1981-
nonempty = [x for x in to_concat if is_nonempty(x)]
1982-
1983-
# If all arrays are empty, there's nothing to convert, just short-cut to
1984-
# the concatenation, #3121.
1985-
#
1986-
# Creating an empty array directly is tempting, but the winnings would be
1987-
# marginal given that it would still require shape & dtype calculation and
1988-
# np.concatenate which has them both implemented is compiled.
1989-
1990-
typs = get_dtype_kinds(to_concat)
1991-
1992-
# these are mandated to handle empties as well
1993-
if 'datetime' in typs or 'datetimetz' in typs or 'timedelta' in typs:
1994-
from pandas.tseries.common import _concat_compat
1995-
return _concat_compat(to_concat, axis=axis, typs=typs)
1996-
1997-
elif 'sparse' in typs:
1998-
from pandas.sparse.array import _concat_compat
1999-
return _concat_compat(to_concat, axis=axis)
2000-
2001-
elif 'category' in typs:
2002-
from pandas.core.categorical import _concat_compat
2003-
return _concat_compat(to_concat, axis=axis)
2004-
2005-
if not nonempty:
2006-
# we have all empties, but may need to coerce the result dtype to
2007-
# object if we have non-numeric type operands (numpy would otherwise
2008-
# cast this to float)
2009-
typs = get_dtype_kinds(to_concat)
2010-
if len(typs) != 1:
2011-
2012-
if (not len(typs - set(['i', 'u', 'f'])) or
2013-
not len(typs - set(['bool', 'i', 'u']))):
2014-
# let numpy coerce
2015-
pass
2016-
else:
2017-
# coerce to object
2018-
to_concat = [x.astype('object') for x in to_concat]
2019-
2020-
return np.concatenate(to_concat, axis=axis)
2021-
2022-
20231921
def _where_compat(mask, arr1, arr2):
20241922
if arr1.dtype == _NS_DTYPE and arr2.dtype == _NS_DTYPE:
20251923
new_vals = np.where(mask, arr1.view('i8'), arr2.view('i8'))

pandas/core/internals.py

+2 -1
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@
3030
from pandas.tseries.index import DatetimeIndex
3131
from pandas.formats.printing import pprint_thing
3232
import pandas.core.common as com
33+
import pandas.types.concat as _concat
3334
import pandas.core.missing as missing
3435
import pandas.core.convert as convert
3536
from pandas.sparse.array import _maybe_to_sparse, SparseArray
@@ -4646,7 +4647,7 @@ def concatenate_join_units(join_units, concat_axis, copy):
46464647
if copy and concat_values.base is not None:
46474648
concat_values = concat_values.copy()
46484649
else:
4649-
concat_values = com._concat_compat(to_concat, axis=concat_axis)
4650+
concat_values = _concat._concat_compat(to_concat, axis=concat_axis)
46504651

46514652
return concat_values
46524653

pandas/core/reshape.py

+3 -1
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
from pandas.core.groupby import get_group_index, _compress_group_index
1919

2020
import pandas.core.common as com
21+
import pandas.types.concat as _concat
2122
import pandas.core.algorithms as algos
2223
import pandas.algos as _algos
2324

@@ -848,7 +849,8 @@ def lreshape(data, groups, dropna=True, label=None):
848849
pivot_cols = []
849850

850851
for target, names in zip(keys, values):
851-
mdata[target] = com._concat_compat([data[col].values for col in names])
852+
to_concat = [data[col].values for col in names]
853+
mdata[target] = _concat._concat_compat(to_concat)
852854
pivot_cols.append(target)
853855

854856
for col in id_cols:

pandas/indexes/base.py

+2 -1
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
from pandas.util.decorators import (Appender, Substitution, cache_readonly,
1818
deprecate, deprecate_kwarg)
1919
import pandas.core.common as com
20+
import pandas.types.concat as _concat
2021
import pandas.core.missing as missing
2122
import pandas.core.algorithms as algos
2223
from pandas.formats.printing import pprint_thing
@@ -1713,7 +1714,7 @@ def union(self, other):
17131714
if len(indexer) > 0:
17141715
other_diff = algos.take_nd(other._values, indexer,
17151716
allow_fill=False)
1716-
result = com._concat_compat((self.values, other_diff))
1717+
result = _concat._concat_compat((self.values, other_diff))
17171718

17181719
try:
17191720
self.values[0] < other_diff[0]

pandas/sparse/array.py

-43
Original file line number | Diff line number | Diff line change
@@ -563,46 +563,3 @@ def _make_index(length, indices, kind):
563563

564564
ops.add_special_arithmetic_methods(SparseArray, arith_method=_arith_method,
565565
use_numexpr=False)
566-
567-
568-
def _concat_compat(to_concat, axis=0):
569-
"""
570-
provide concatenation of an sparse/dense array of arrays each of which is a
571-
single dtype
572-
573-
Parameters
574-
----------
575-
to_concat : array of arrays
576-
axis : axis to provide concatenation
577-
578-
Returns
579-
-------
580-
a single array, preserving the combined dtypes
581-
"""
582-
583-
def convert_sparse(x, axis):
584-
# coerce to native type
585-
if isinstance(x, SparseArray):
586-
x = x.get_values()
587-
x = x.ravel()
588-
if axis > 0:
589-
x = np.atleast_2d(x)
590-
return x
591-
592-
typs = com.get_dtype_kinds(to_concat)
593-
594-
# we have more than one type here, so densify and regular concat
595-
to_concat = [convert_sparse(x, axis) for x in to_concat]
596-
result = np.concatenate(to_concat, axis=axis)
597-
598-
if not len(typs - set(['sparse', 'f', 'i'])):
599-
600-
# we can remain sparse
601-
result = SparseArray(result.ravel())
602-
603-
else:
604-
605-
# coerce to object if needed
606-
result = result.astype('object')
607-
608-
return result

0 commit comments

Comments
 (0)