diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 7d79367cef1e2..ab33407e417a8 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -108,6 +108,7 @@ API changes - ``read_csv`` no longer allows a combination of strings and integers for the ``usecols`` parameter (:issue:`12678`) - ``pd.show_versions()`` now includes ``pandas_datareader`` version (:issue:`12740`) - Provide a proper ``__name__`` and ``__qualname__`` attributes for generic functions (:issue:`12021`) +- ``pd.concat(ignore_index=True)`` now uses ``RangeIndex`` as default (:issue:`12695`) .. _whatsnew_0181.apply_resample: @@ -233,6 +234,7 @@ Bug Fixes - Bug in ``concat`` raises ``AttributeError`` when input data contains tz-aware datetime and timedelta (:issue:`12620`) +- Bug in ``concat`` doesn't handle empty ``Series`` properly (:issue:`11082`) - Bug in ``pivot_table`` when ``margins=True`` and ``dropna=True`` where nulls still contributed to margin count (:issue:`12577`) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 52be7444f445a..ed4583a23255b 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -15,7 +15,7 @@ from pandas.core.internals import (items_overlap_with_suffix, concatenate_block_managers) from pandas.util.decorators import Appender, Substitution -from pandas.core.common import ABCSeries, isnull +from pandas.core.common import ABCSeries import pandas.core.algorithms as algos import pandas.core.common as com @@ -906,13 +906,14 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, break else: - # filter out the empties - # if we have not multi-index possibiltes - df = DataFrame([obj.shape for obj in objs]).sum(1) - non_empties = df[df != 0] + # filter out the empties if we have no multi-index possibilities + # note: keep empty Series as they affect the result columns / name + non_empties = [obj for obj in objs + if sum(obj.shape) > 0 or isinstance(obj, Series)] + if (len(non_empties) and 
(keys is None and names is None and levels is None and join_axes is None)): - objs = [objs[i] for i in non_empties.index] + objs = non_empties sample = objs[0] if sample is None: @@ -979,7 +980,14 @@ def get_result(self): # stack blocks if self.axis == 0: - new_data = com._concat_compat([x._values for x in self.objs]) + # concat only non-empty Series to keep dtype where possible + non_empties = [x for x in self.objs if len(x) > 0] + if len(non_empties) > 0: + values = [x._values for x in non_empties] + else: + values = [x._values for x in self.objs] + new_data = com._concat_compat(values) + name = com._consensus_name_attr(self.objs) return (Series(new_data, index=self.new_axes[0], name=name, @@ -991,18 +999,6 @@ def get_result(self): data = dict(zip(range(len(self.objs)), self.objs)) index, columns = self.new_axes tmpdf = DataFrame(data, index=index) - # checks if the column variable already stores valid column - # names (because set via the 'key' argument in the 'concat' - # function call. If that's not the case, use the series names - # as column names - if (columns.equals(Index(np.arange(len(self.objs)))) and - not self.ignore_index): - columns = np.array([data[i].name - for i in range(len(data))], - dtype='object') - indexer = isnull(columns) - if indexer.any(): - columns[indexer] = np.arange(len(indexer[indexer])) tmpdf.columns = columns return tmpdf.__finalize__(self, method='concat') @@ -1082,32 +1078,34 @@ def _get_concat_axis(self): if self.axis == 0: indexes = [x.index for x in self.objs] elif self.ignore_index: - idx = Index(np.arange(len(self.objs))) - idx.is_unique = True # arange is always unique + idx = com._default_index(len(self.objs)) return idx elif self.keys is None: - names = [] - for x in self.objs: + names = [None] * len(self.objs) + num = 0 + has_names = False + for i, x in enumerate(self.objs): if not isinstance(x, Series): raise TypeError("Cannot concatenate type 'Series' " "with object of type " "%r" % type(x).__name__) if x.name is not None: - 
names.append(x.name) + names[i] = x.name + has_names = True else: - idx = Index(np.arange(len(self.objs))) - idx.is_unique = True - return idx - - return Index(names) + names[i] = num + num += 1 + if has_names: + return Index(names) + else: + return com._default_index(len(self.objs)) else: return _ensure_index(self.keys) else: indexes = [x._data.axes[self.axis] for x in self.objs] if self.ignore_index: - idx = Index(np.arange(sum(len(i) for i in indexes))) - idx.is_unique = True + idx = com._default_index(sum(len(i) for i in indexes)) return idx if self.keys is None: diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 6d5370bedf65a..17ab6ae96baf8 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -1252,6 +1252,66 @@ def test_concat_period_series(self): tm.assert_series_equal(result, expected) self.assertEqual(result.dtype, 'object') + def test_concat_empty_series(self): + # GH 11082 + s1 = pd.Series([1, 2, 3], name='x') + s2 = pd.Series(name='y') + res = pd.concat([s1, s2], axis=1) + exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]}) + tm.assert_frame_equal(res, exp) + + s1 = pd.Series([1, 2, 3], name='x') + s2 = pd.Series(name='y') + res = pd.concat([s1, s2], axis=0) + # name will be reset + exp = pd.Series([1, 2, 3]) + tm.assert_series_equal(res, exp) + + # empty Series with no name + s1 = pd.Series([1, 2, 3], name='x') + s2 = pd.Series(name=None) + res = pd.concat([s1, s2], axis=1) + exp = pd.DataFrame({'x': [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, + columns=['x', 0]) + tm.assert_frame_equal(res, exp) + + def test_default_index(self): + # is_series and ignore_index + s1 = pd.Series([1, 2, 3], name='x') + s2 = pd.Series([4, 5, 6], name='y') + res = pd.concat([s1, s2], axis=1, ignore_index=True) + self.assertIsInstance(res.columns, pd.RangeIndex) + exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) + # use check_index_type=True to check the result has + # RangeIndex (default index) 
+ tm.assert_frame_equal(res, exp, check_index_type=True, + check_column_type=True) + + # is_series and all inputs have no names + s1 = pd.Series([1, 2, 3]) + s2 = pd.Series([4, 5, 6]) + res = pd.concat([s1, s2], axis=1, ignore_index=False) + self.assertIsInstance(res.columns, pd.RangeIndex) + exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) + exp.columns = pd.RangeIndex(2) + tm.assert_frame_equal(res, exp, check_index_type=True, + check_column_type=True) + + # is_dataframe and ignore_index + df1 = pd.DataFrame({'A': [1, 2], 'B': [5, 6]}) + df2 = pd.DataFrame({'A': [3, 4], 'B': [7, 8]}) + + res = pd.concat([df1, df2], axis=0, ignore_index=True) + exp = pd.DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], + columns=['A', 'B']) + tm.assert_frame_equal(res, exp, check_index_type=True, + check_column_type=True) + + res = pd.concat([df1, df2], axis=1, ignore_index=True) + exp = pd.DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]]) + tm.assert_frame_equal(res, exp, check_index_type=True, + check_column_type=True) + def test_indicator(self): # PR #10054. xref #7412 and closes #8790. df1 = DataFrame({'col1': [0, 1], 'col_left': [