diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index c6642c5216262..1811f7de69d63 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -120,6 +120,7 @@ These changes conform sparse handling to return the correct types and work to ma - Bug in ``SparseArray.to_frame()`` results in ``DataFrame``, rather than ``SparseDataFrame`` (:issue:`9850`) - Bug in ``SparseArray.to_dense()`` does not preserve ``dtype`` (:issue:`10648`) - Bug in ``SparseArray.to_dense()`` incorrectly handle ``fill_value`` (:issue:`12797`) +- Bug in ``pd.concat()`` of ``SparseSeries`` results in dense (:issue:`10536`) .. _whatsnew_0181.api: diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 986f7ad55361a..811befc636c1f 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -18,7 +18,7 @@ from pandas.core.common import ( ABCSeries, ABCIndexClass, ABCCategoricalIndex, isnull, notnull, is_dtype_equal, is_categorical_dtype, is_integer_dtype, - _possibly_infer_to_datetimelike, get_dtype_kinds, is_list_like, + _possibly_infer_to_datetimelike, is_list_like, is_sequence, is_null_slice, is_bool, _ensure_object, _ensure_int64, _coerce_indexer_dtype) from pandas.types.api import CategoricalDtype @@ -1873,59 +1873,3 @@ def _convert_to_list_like(list_like): else: # is this reached? return [list_like] - - -def _concat_compat(to_concat, axis=0): - """Concatenate an object/categorical array of arrays, each of which is a - single dtype - - Parameters - ---------- - to_concat : array of arrays - axis : int - Axis to provide concatenation in the current implementation this is - always 0, e.g. 
we only have 1D categoricals - - Returns - ------- - Categorical - A single array, preserving the combined dtypes - """ - - def convert_categorical(x): - # coerce to object dtype - if is_categorical_dtype(x.dtype): - return x.get_values() - return x.ravel() - - if get_dtype_kinds(to_concat) - set(['object', 'category']): - # convert to object type and perform a regular concat - from pandas.core.common import _concat_compat - return _concat_compat([np.array(x, copy=False, dtype=object) - for x in to_concat], axis=0) - - # we could have object blocks and categoricals here - # if we only have a single categoricals then combine everything - # else its a non-compat categorical - categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)] - - # validate the categories - categories = categoricals[0] - rawcats = categories.categories - for x in categoricals[1:]: - if not categories.is_dtype_equal(x): - raise ValueError("incompatible categories in categorical concat") - - # we've already checked that all categoricals are the same, so if their - # length is equal to the input then we have all the same categories - if len(categoricals) == len(to_concat): - # concating numeric types is much faster than concating object types - # and fastpath takes a shorter path through the constructor - return Categorical(np.concatenate([x.codes for x in to_concat], - axis=0), - rawcats, ordered=categoricals[0].ordered, - fastpath=True) - else: - concatted = np.concatenate(list(map(convert_categorical, to_concat)), - axis=0) - return Categorical(concatted, rawcats) diff --git a/pandas/core/common.py b/pandas/core/common.py index c0f47a48a46a8..7ea9223b6106e 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1918,108 +1918,6 @@ def _all_none(*args): return True -def get_dtype_kinds(l): - """ - Parameters - ---------- - l : list of arrays - - Returns - ------- - a set of kinds that exist in this list of arrays - """ - - typs = set() - for arr in l: - - dtype = 
arr.dtype - if is_categorical_dtype(dtype): - typ = 'category' - elif is_sparse(arr): - typ = 'sparse' - elif is_datetimetz(arr): - typ = 'datetimetz' - elif is_datetime64_dtype(dtype): - typ = 'datetime' - elif is_timedelta64_dtype(dtype): - typ = 'timedelta' - elif is_object_dtype(dtype): - typ = 'object' - elif is_bool_dtype(dtype): - typ = 'bool' - else: - typ = dtype.kind - typs.add(typ) - return typs - - -def _concat_compat(to_concat, axis=0): - """ - provide concatenation of an array of arrays each of which is a single - 'normalized' dtypes (in that for example, if it's object, then it is a - non-datetimelike and provide a combined dtype for the resulting array that - preserves the overall dtype if possible) - - Parameters - ---------- - to_concat : array of arrays - axis : axis to provide concatenation - - Returns - ------- - a single array, preserving the combined dtypes - """ - - # filter empty arrays - # 1-d dtypes always are included here - def is_nonempty(x): - try: - return x.shape[axis] > 0 - except Exception: - return True - - nonempty = [x for x in to_concat if is_nonempty(x)] - - # If all arrays are empty, there's nothing to convert, just short-cut to - # the concatenation, #3121. - # - # Creating an empty array directly is tempting, but the winnings would be - # marginal given that it would still require shape & dtype calculation and - # np.concatenate which has them both implemented is compiled. 
- - typs = get_dtype_kinds(to_concat) - - # these are mandated to handle empties as well - if 'datetime' in typs or 'datetimetz' in typs or 'timedelta' in typs: - from pandas.tseries.common import _concat_compat - return _concat_compat(to_concat, axis=axis, typs=typs) - - elif 'sparse' in typs: - from pandas.sparse.array import _concat_compat - return _concat_compat(to_concat, axis=axis) - - elif 'category' in typs: - from pandas.core.categorical import _concat_compat - return _concat_compat(to_concat, axis=axis) - - if not nonempty: - # we have all empties, but may need to coerce the result dtype to - # object if we have non-numeric type operands (numpy would otherwise - # cast this to float) - typs = get_dtype_kinds(to_concat) - if len(typs) != 1: - - if (not len(typs - set(['i', 'u', 'f'])) or - not len(typs - set(['bool', 'i', 'u']))): - # let numpy coerce - pass - else: - # coerce to object - to_concat = [x.astype('object') for x in to_concat] - - return np.concatenate(to_concat, axis=axis) - - def _where_compat(mask, arr1, arr2): if arr1.dtype == _NS_DTYPE and arr2.dtype == _NS_DTYPE: new_vals = np.where(mask, arr1.view('i8'), arr2.view('i8')) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 463a2da529b5d..987e83a9a40b4 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -30,6 +30,7 @@ from pandas.tseries.index import DatetimeIndex from pandas.formats.printing import pprint_thing import pandas.core.common as com +import pandas.types.concat as _concat import pandas.core.missing as missing import pandas.core.convert as convert from pandas.sparse.array import _maybe_to_sparse, SparseArray @@ -4646,7 +4647,7 @@ def concatenate_join_units(join_units, concat_axis, copy): if copy and concat_values.base is not None: concat_values = concat_values.copy() else: - concat_values = com._concat_compat(to_concat, axis=concat_axis) + concat_values = _concat._concat_compat(to_concat, axis=concat_axis) return concat_values diff --git 
a/pandas/core/reshape.py b/pandas/core/reshape.py index 5c775f8a0d937..7e0c094aec4c2 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -18,6 +18,7 @@ from pandas.core.groupby import get_group_index, _compress_group_index import pandas.core.common as com +import pandas.types.concat as _concat import pandas.core.algorithms as algos import pandas.algos as _algos @@ -848,7 +849,8 @@ def lreshape(data, groups, dropna=True, label=None): pivot_cols = [] for target, names in zip(keys, values): - mdata[target] = com._concat_compat([data[col].values for col in names]) + to_concat = [data[col].values for col in names] + mdata[target] = _concat._concat_compat(to_concat) pivot_cols.append(target) for col in id_cols: diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 644b6720dfaac..f924300678565 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -17,6 +17,7 @@ from pandas.util.decorators import (Appender, Substitution, cache_readonly, deprecate, deprecate_kwarg) import pandas.core.common as com +import pandas.types.concat as _concat import pandas.core.missing as missing import pandas.core.algorithms as algos from pandas.formats.printing import pprint_thing @@ -1713,7 +1714,7 @@ def union(self, other): if len(indexer) > 0: other_diff = algos.take_nd(other._values, indexer, allow_fill=False) - result = com._concat_compat((self.values, other_diff)) + result = _concat._concat_compat((self.values, other_diff)) try: self.values[0] < other_diff[0] diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index 92eb2a9230c3b..a4b8d43996387 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -574,46 +574,3 @@ def _make_index(length, indices, kind): ops.add_special_arithmetic_methods(SparseArray, arith_method=_arith_method, use_numexpr=False) - - -def _concat_compat(to_concat, axis=0): - """ - provide concatenation of an sparse/dense array of arrays each of which is a - single dtype - - Parameters - ---------- - 
to_concat : array of arrays - axis : axis to provide concatenation - - Returns - ------- - a single array, preserving the combined dtypes - """ - - def convert_sparse(x, axis): - # coerce to native type - if isinstance(x, SparseArray): - x = x.get_values() - x = x.ravel() - if axis > 0: - x = np.atleast_2d(x) - return x - - typs = com.get_dtype_kinds(to_concat) - - # we have more than one type here, so densify and regular concat - to_concat = [convert_sparse(x, axis) for x in to_concat] - result = np.concatenate(to_concat, axis=axis) - - if not len(typs - set(['sparse', 'f', 'i'])): - - # we can remain sparse - result = SparseArray(result.ravel()) - - else: - - # coerce to object if needed - result = result.astype('object') - - return result diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index ef0860f3bd980..1d5b90c19decb 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -951,6 +951,120 @@ def _check_results_to_coo(results, check): assert_equal(il, il_result) assert_equal(jl, jl_result) + def test_concat(self): + val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) + val2 = np.array([3, np.nan, 4, 0, 0]) + + for kind in ['integer', 'block']: + sparse1 = pd.SparseSeries(val1, name='x', kind=kind) + sparse2 = pd.SparseSeries(val2, name='y', kind=kind) + + res = pd.concat([sparse1, sparse2]) + exp = pd.concat([pd.Series(val1), pd.Series(val2)]) + exp = pd.SparseSeries(exp, kind=kind) + tm.assert_sp_series_equal(res, exp) + + sparse1 = pd.SparseSeries(val1, fill_value=0, name='x', kind=kind) + sparse2 = pd.SparseSeries(val2, fill_value=0, name='y', kind=kind) + + res = pd.concat([sparse1, sparse2]) + exp = pd.concat([pd.Series(val1), pd.Series(val2)]) + exp = pd.SparseSeries(exp, fill_value=0, kind=kind) + tm.assert_sp_series_equal(res, exp) + + def test_concat_axis1(self): + val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) + val2 = np.array([3, np.nan, 4, 0, 0]) + + sparse1 = 
pd.SparseSeries(val1, name='x') + sparse2 = pd.SparseSeries(val2, name='y') + + res = pd.concat([sparse1, sparse2], axis=1) + exp = pd.concat([pd.Series(val1, name='x'), + pd.Series(val2, name='y')], axis=1) + exp = pd.SparseDataFrame(exp) + tm.assert_sp_frame_equal(res, exp) + + def test_concat_different_fill(self): + val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) + val2 = np.array([3, np.nan, 4, 0, 0]) + + for kind in ['integer', 'block']: + sparse1 = pd.SparseSeries(val1, name='x', kind=kind) + sparse2 = pd.SparseSeries(val2, name='y', kind=kind, fill_value=0) + + res = pd.concat([sparse1, sparse2]) + exp = pd.concat([pd.Series(val1), pd.Series(val2)]) + exp = pd.SparseSeries(exp, kind=kind) + tm.assert_sp_series_equal(res, exp) + + res = pd.concat([sparse2, sparse1]) + exp = pd.concat([pd.Series(val2), pd.Series(val1)]) + exp = pd.SparseSeries(exp, kind=kind, fill_value=0) + tm.assert_sp_series_equal(res, exp) + + def test_concat_axis1_different_fill(self): + val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) + val2 = np.array([3, np.nan, 4, 0, 0]) + + sparse1 = pd.SparseSeries(val1, name='x') + sparse2 = pd.SparseSeries(val2, name='y', fill_value=0) + + res = pd.concat([sparse1, sparse2], axis=1) + exp = pd.concat([pd.Series(val1, name='x'), + pd.Series(val2, name='y')], axis=1) + self.assertIsInstance(res, pd.SparseDataFrame) + tm.assert_frame_equal(res.to_dense(), exp) + + def test_concat_different_kind(self): + val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) + val2 = np.array([3, np.nan, 4, 0, 0]) + + sparse1 = pd.SparseSeries(val1, name='x', kind='integer') + sparse2 = pd.SparseSeries(val2, name='y', kind='block', fill_value=0) + + res = pd.concat([sparse1, sparse2]) + exp = pd.concat([pd.Series(val1), pd.Series(val2)]) + exp = pd.SparseSeries(exp, kind='integer') + tm.assert_sp_series_equal(res, exp) + + res = pd.concat([sparse2, sparse1]) + exp = pd.concat([pd.Series(val2), pd.Series(val1)]) + exp = pd.SparseSeries(exp, kind='block', fill_value=0) 
+ tm.assert_sp_series_equal(res, exp) + + def test_concat_sparse_dense(self): + # use first input's fill_value + val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) + val2 = np.array([3, np.nan, 4, 0, 0]) + + for kind in ['integer', 'block']: + sparse = pd.SparseSeries(val1, name='x', kind=kind) + dense = pd.Series(val2, name='y') + + res = pd.concat([sparse, dense]) + exp = pd.concat([pd.Series(val1), dense]) + exp = pd.SparseSeries(exp, kind=kind) + tm.assert_sp_series_equal(res, exp) + + res = pd.concat([dense, sparse, dense]) + exp = pd.concat([dense, pd.Series(val1), dense]) + exp = pd.SparseSeries(exp, kind=kind) + tm.assert_sp_series_equal(res, exp) + + sparse = pd.SparseSeries(val1, name='x', kind=kind, fill_value=0) + dense = pd.Series(val2, name='y') + + res = pd.concat([sparse, dense]) + exp = pd.concat([pd.Series(val1), dense]) + exp = pd.SparseSeries(exp, kind=kind, fill_value=0) + tm.assert_sp_series_equal(res, exp) + + res = pd.concat([dense, sparse, dense]) + exp = pd.concat([dense, pd.Series(val1), dense]) + exp = pd.SparseSeries(exp, kind=kind, fill_value=0) + tm.assert_sp_series_equal(res, exp) + def _dense_series_compare(s, f): result = f(s) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index ed4583a23255b..84a431393b0bf 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -19,6 +19,7 @@ import pandas.core.algorithms as algos import pandas.core.common as com +import pandas.types.concat as _concat import pandas.algos as _algos import pandas.hashtable as _hash @@ -986,21 +987,24 @@ def get_result(self): values = [x._values for x in non_empties] else: values = [x._values for x in self.objs] - new_data = com._concat_compat(values) + new_data = _concat._concat_compat(values) name = com._consensus_name_attr(self.objs) - return (Series(new_data, index=self.new_axes[0], - name=name, - dtype=new_data.dtype) + cons = _concat._get_series_result_type(new_data) + + return (cons(new_data, index=self.new_axes[0], + name=name, 
dtype=new_data.dtype) .__finalize__(self, method='concat')) # combine as columns in a frame else: data = dict(zip(range(len(self.objs)), self.objs)) + cons = _concat._get_series_result_type(data) + index, columns = self.new_axes - tmpdf = DataFrame(data, index=index) - tmpdf.columns = columns - return tmpdf.__finalize__(self, method='concat') + df = cons(data, index=index) + df.columns = columns + return df.__finalize__(self, method='concat') # combine block managers else: @@ -1019,9 +1023,10 @@ def get_result(self): mgrs_indexers.append((obj._data, indexers)) - new_data = concatenate_block_managers( - mgrs_indexers, self.new_axes, - concat_axis=self.axis, copy=self.copy) + new_data = concatenate_block_managers(mgrs_indexers, + self.new_axes, + concat_axis=self.axis, + copy=self.copy) if not self.copy: new_data._consolidate_inplace() diff --git a/pandas/tseries/common.py b/pandas/tseries/common.py index c4f100eb8f4d3..8937e83c7009a 100644 --- a/pandas/tseries/common.py +++ b/pandas/tseries/common.py @@ -15,8 +15,7 @@ is_datetime_arraylike, is_integer_dtype, is_list_like, is_datetime64_dtype, is_datetime64tz_dtype, - is_timedelta64_dtype, is_categorical_dtype, - get_dtype_kinds) + is_timedelta64_dtype, is_categorical_dtype) def is_datetimelike(data): @@ -238,71 +237,3 @@ class CombinedDatetimelikeProperties(DatetimeProperties, TimedeltaProperties): # the Series.dt class property. For Series objects, .dt will always be one # of the more specific classes above. 
__doc__ = DatetimeProperties.__doc__ - - -def _concat_compat(to_concat, axis=0, typs=None): - """ - provide concatenation of an datetimelike array of arrays each of which is a - single M8[ns], datetimet64[ns, tz] or m8[ns] dtype - - Parameters - ---------- - to_concat : array of arrays - axis : axis to provide concatenation - - Returns - ------- - a single array, preserving the combined dtypes - """ - - def convert_to_pydatetime(x, axis): - # coerce to an object dtype - - # if dtype is of datetimetz or timezone - if x.dtype.kind == _NS_DTYPE.kind: - if getattr(x, 'tz', None) is not None: - x = x.asobject.values - else: - shape = x.shape - x = tslib.ints_to_pydatetime(x.view(np.int64).ravel()) - x = x.reshape(shape) - - elif x.dtype == _TD_DTYPE: - shape = x.shape - x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel()) - x = x.reshape(shape) - - if axis == 1: - x = np.atleast_2d(x) - return x - - if typs is None: - typs = get_dtype_kinds(to_concat) - - # must be single dtype - if len(typs) == 1: - - if 'datetimetz' in typs: - # datetime with no tz should be stored as "datetime" in typs, - # thus no need to care - - # we require ALL of the same tz for datetimetz - tzs = set([x.tz for x in to_concat]) - if len(tzs) == 1: - return DatetimeIndex(np.concatenate([x.tz_localize(None).asi8 - for x in to_concat]), - tz=list(tzs)[0]) - - elif 'datetime' in typs: - new_values = np.concatenate([x.view(np.int64) for x in to_concat], - axis=axis) - return new_values.view(_NS_DTYPE) - - elif 'timedelta' in typs: - new_values = np.concatenate([x.view(np.int64) for x in to_concat], - axis=axis) - return new_values.view(_TD_DTYPE) - - # need to coerce to object - to_concat = [convert_to_pydatetime(x, axis) for x in to_concat] - return np.concatenate(to_concat, axis=axis) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index dc40387cc365f..94344c5a03d50 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -26,6 +26,7 @@ from pandas.util.decorators 
import (Appender, cache_readonly, deprecate_kwarg, Substitution) import pandas.core.common as com +import pandas.types.concat as _concat import pandas.tseries.offsets as offsets import pandas.tseries.tools as tools @@ -1151,7 +1152,7 @@ def _fast_union(self, other): if left_end < right_end: loc = right.searchsorted(left_end, side='right') right_chunk = right.values[loc:] - dates = com._concat_compat((left.values, right_chunk)) + dates = _concat._concat_compat((left.values, right_chunk)) return self._shallow_copy(dates) else: return left @@ -2219,7 +2220,7 @@ def _process_concat_data(to_concat, name): # well, technically not a "class" anymore...oh well klass = DatetimeIndex._simple_new kwargs = {'tz': tz} - concat = com._concat_compat + concat = _concat._concat_compat else: for i, x in enumerate(to_concat): if isinstance(x, DatetimeIndex): diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 56012a8c4ad6a..423ccea7d4673 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -12,6 +12,7 @@ from pandas.tseries.frequencies import to_offset from pandas.core.base import _shared_docs import pandas.core.common as com +import pandas.types.concat as _concat from pandas.util.decorators import Appender, Substitution from pandas.tseries.base import TimelikeOps, DatetimeIndexOpsMixin from pandas.tseries.timedeltas import (to_timedelta, @@ -514,7 +515,7 @@ def append(self, other): break to_concat = self._ensure_compat_concat(to_concat) - return Index(com._concat_compat(to_concat), name=name) + return Index(_concat._concat_compat(to_concat), name=name) def join(self, other, how='left', level=None, return_indexers=False): """ @@ -585,7 +586,7 @@ def _fast_union(self, other): if left_end < right_end: loc = right.searchsorted(left_end, side='right') right_chunk = right.values[loc:] - dates = com._concat_compat((left.values, right_chunk)) + dates = _concat._concat_compat((left.values, right_chunk)) return self._shallow_copy(dates) else: return left diff --git 
a/pandas/types/concat.py b/pandas/types/concat.py new file mode 100644 index 0000000000000..228c48041c0f8 --- /dev/null +++ b/pandas/types/concat.py @@ -0,0 +1,329 @@ +""" +Utility functions related to concat +""" + +import numpy as np +import pandas.core.common as com +import pandas.tslib as tslib +from pandas import compat +from pandas.compat import map + + +def get_dtype_kinds(l): + """ + Parameters + ---------- + l : list of arrays + + Returns + ------- + a set of kinds that exist in this list of arrays + """ + + typs = set() + for arr in l: + + dtype = arr.dtype + if com.is_categorical_dtype(dtype): + typ = 'category' + elif com.is_sparse(arr): + typ = 'sparse' + elif com.is_datetimetz(arr): + typ = 'datetimetz' + elif com.is_datetime64_dtype(dtype): + typ = 'datetime' + elif com.is_timedelta64_dtype(dtype): + typ = 'timedelta' + elif com.is_object_dtype(dtype): + typ = 'object' + elif com.is_bool_dtype(dtype): + typ = 'bool' + else: + typ = dtype.kind + typs.add(typ) + return typs + + +def _get_series_result_type(result): + """ + return appropriate class of Series concat + input is either dict or array-like + """ + if isinstance(result, dict): + # concat Series with axis 1 + if all(com.is_sparse(c) for c in compat.itervalues(result)): + from pandas.sparse.api import SparseDataFrame + return SparseDataFrame + else: + from pandas.core.frame import DataFrame + return DataFrame + + elif com.is_sparse(result): + # concat Series with axis 0 + from pandas.sparse.api import SparseSeries + return SparseSeries + else: + from pandas.core.series import Series + return Series + + +def _concat_compat(to_concat, axis=0): + """ + provide concatenation of an array of arrays each of which is a single + 'normalized' dtypes (in that for example, if it's object, then it is a + non-datetimelike and provide a combined dtype for the resulting array that + preserves the overall dtype if possible) + + Parameters + ---------- + to_concat : array of arrays + axis : axis to provide
concatenation + + Returns + ------- + a single array, preserving the combined dtypes + """ + + # filter empty arrays + # 1-d dtypes always are included here + def is_nonempty(x): + try: + return x.shape[axis] > 0 + except Exception: + return True + + nonempty = [x for x in to_concat if is_nonempty(x)] + + # If all arrays are empty, there's nothing to convert, just short-cut to + # the concatenation, #3121. + # + # Creating an empty array directly is tempting, but the winnings would be + # marginal given that it would still require shape & dtype calculation and + # np.concatenate which has them both implemented is compiled. + + typs = get_dtype_kinds(to_concat) + + # these are mandated to handle empties as well + if 'datetime' in typs or 'datetimetz' in typs or 'timedelta' in typs: + return _concat_datetime(to_concat, axis=axis, typs=typs) + + elif 'sparse' in typs: + return _concat_sparse(to_concat, axis=axis, typs=typs) + + elif 'category' in typs: + return _concat_categorical(to_concat, axis=axis) + + if not nonempty: + # we have all empties, but may need to coerce the result dtype to + # object if we have non-numeric type operands (numpy would otherwise + # cast this to float) + typs = get_dtype_kinds(to_concat) + if len(typs) != 1: + + if (not len(typs - set(['i', 'u', 'f'])) or + not len(typs - set(['bool', 'i', 'u']))): + # let numpy coerce + pass + else: + # coerce to object + to_concat = [x.astype('object') for x in to_concat] + + return np.concatenate(to_concat, axis=axis) + + +def _concat_categorical(to_concat, axis=0): + """Concatenate an object/categorical array of arrays, each of which is a + single dtype + + Parameters + ---------- + to_concat : array of arrays + axis : int + Axis to provide concatenation in the current implementation this is + always 0, e.g. 
we only have 1D categoricals + + Returns + ------- + Categorical + A single array, preserving the combined dtypes + """ + + from pandas.core.categorical import Categorical + + def convert_categorical(x): + # coerce to object dtype + if com.is_categorical_dtype(x.dtype): + return x.get_values() + return x.ravel() + + if get_dtype_kinds(to_concat) - set(['object', 'category']): + # convert to object type and perform a regular concat + return _concat_compat([np.array(x, copy=False, dtype=object) + for x in to_concat], axis=0) + + # we could have object blocks and categoricals here + # if we only have a single categoricals then combine everything + # else its a non-compat categorical + categoricals = [x for x in to_concat if com.is_categorical_dtype(x.dtype)] + + # validate the categories + categories = categoricals[0] + rawcats = categories.categories + for x in categoricals[1:]: + if not categories.is_dtype_equal(x): + raise ValueError("incompatible categories in categorical concat") + + # we've already checked that all categoricals are the same, so if their + # length is equal to the input then we have all the same categories + if len(categoricals) == len(to_concat): + # concating numeric types is much faster than concating object types + # and fastpath takes a shorter path through the constructor + return Categorical(np.concatenate([x.codes for x in to_concat], + axis=0), + rawcats, ordered=categoricals[0].ordered, + fastpath=True) + else: + concatted = np.concatenate(list(map(convert_categorical, to_concat)), + axis=0) + return Categorical(concatted, rawcats) + + +def _concat_datetime(to_concat, axis=0, typs=None): + """ + provide concatenation of an datetimelike array of arrays each of which is a + single M8[ns], datetimet64[ns, tz] or m8[ns] dtype + + Parameters + ---------- + to_concat : array of arrays + axis : axis to provide concatenation + typs : set of to_concat dtypes + + Returns + ------- + a single array, preserving the combined dtypes + """ + + def 
convert_to_pydatetime(x, axis): + # coerce to an object dtype + + # if dtype is of datetimetz or timezone + if x.dtype.kind == com._NS_DTYPE.kind: + if getattr(x, 'tz', None) is not None: + x = x.asobject.values + else: + shape = x.shape + x = tslib.ints_to_pydatetime(x.view(np.int64).ravel()) + x = x.reshape(shape) + + elif x.dtype == com._TD_DTYPE: + shape = x.shape + x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel()) + x = x.reshape(shape) + + if axis == 1: + x = np.atleast_2d(x) + return x + + if typs is None: + typs = get_dtype_kinds(to_concat) + + # must be single dtype + if len(typs) == 1: + + if 'datetimetz' in typs: + # datetime with no tz should be stored as "datetime" in typs, + # thus no need to care + + # we require ALL of the same tz for datetimetz + tzs = set([x.tz for x in to_concat]) + if len(tzs) == 1: + from pandas.tseries.index import DatetimeIndex + new_values = np.concatenate([x.tz_localize(None).asi8 + for x in to_concat]) + return DatetimeIndex(new_values, tz=list(tzs)[0]) + + elif 'datetime' in typs: + new_values = np.concatenate([x.view(np.int64) for x in to_concat], + axis=axis) + return new_values.view(com._NS_DTYPE) + + elif 'timedelta' in typs: + new_values = np.concatenate([x.view(np.int64) for x in to_concat], + axis=axis) + return new_values.view(com._TD_DTYPE) + + # need to coerce to object + to_concat = [convert_to_pydatetime(x, axis) for x in to_concat] + return np.concatenate(to_concat, axis=axis) + + +def _concat_sparse(to_concat, axis=0, typs=None): + """ + provide concatenation of an sparse/dense array of arrays each of which is a + single dtype + + Parameters + ---------- + to_concat : array of arrays + axis : axis to provide concatenation + typs : set of to_concat dtypes + + Returns + ------- + a single array, preserving the combined dtypes + """ + + from pandas.sparse.array import SparseArray, _make_index + + def convert_sparse(x, axis): + # coerce to native type + if isinstance(x, SparseArray): + x = x.get_values() + 
x = x.ravel() + if axis > 0: + x = np.atleast_2d(x) + return x + + if typs is None: + typs = get_dtype_kinds(to_concat) + + if len(typs) == 1: + # concat input as it is if all inputs are sparse + # and have the same fill_value + fill_values = set(c.fill_value for c in to_concat) + if len(fill_values) == 1: + sp_values = [c.sp_values for c in to_concat] + indexes = [c.sp_index.to_int_index() for c in to_concat] + + indices = [] + loc = 0 + for idx in indexes: + indices.append(idx.indices + loc) + loc += idx.length + sp_values = np.concatenate(sp_values) + indices = np.concatenate(indices) + sp_index = _make_index(loc, indices, kind=to_concat[0].sp_index) + + return SparseArray(sp_values, sparse_index=sp_index, + fill_value=to_concat[0].fill_value) + + # input may be sparse / dense mixed and may have different fill_value + # input must contain at least one sparse array + sparses = [c for c in to_concat if com.is_sparse(c)] + fill_values = [c.fill_value for c in sparses] + sp_indexes = [c.sp_index for c in sparses] + + # densify and regular concat + to_concat = [convert_sparse(x, axis) for x in to_concat] + result = np.concatenate(to_concat, axis=axis) + + if not len(typs - set(['sparse', 'f', 'i'])): + # sparsify if inputs are sparse and dense numerics + # first sparse input's fill_value and SparseIndex is used + result = SparseArray(result.ravel(), fill_value=fill_values[0], + kind=sp_indexes[0]) + else: + # coerce to object if needed + result = result.astype('object') + return result