Skip to content

Commit c6b21b4

Browse files
committed
API: concat will now concatenate mixed Series and DataFrames using the Series name
or numbering columns as needed (GH2385)
1 parent 62e5651 commit c6b21b4

File tree

6 files changed

+156
-26
lines changed

6 files changed

+156
-26
lines changed

doc/source/merging.rst

+27
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,33 @@ This is also a valid argument to ``DataFrame.append``:
213213
214214
df1.append(df2, ignore_index=True)
215215
216+
.. _merging.mixed_ndims:
217+
218+
Concatenating with mixed ndims
219+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
220+
221+
You can concatenate a mix of Series and DataFrames. The
222+
Series will be transformed to DataFrames with the column name as
223+
the name of the Series.
224+
225+
.. ipython:: python
226+
227+
df1 = DataFrame(randn(6, 4), columns=['A', 'B', 'C', 'D'])
228+
s1 = Series(randn(6), name='foo')
229+
concat([df1, s1],axis=1)
230+
231+
If unnamed Series are passed they will be numbered consecutively.
232+
233+
.. ipython:: python
234+
235+
s2 = Series(randn(6))
236+
concat([df1, s2, s2, s2],axis=1)
237+
238+
Passing ``ignore_index=True`` will drop all name references.
239+
240+
.. ipython:: python
241+
242+
concat([df1, s1],axis=1,ignore_index=True)
216243
217244
More concatenating with group keys
218245
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ API Changes
9898
- The top-level :func:`pandas.eval` function does not allow you to use the
9999
``'@'`` prefix and provides you with an error message telling you so.
100100
- ``NameResolutionError`` was removed because it isn't necessary anymore.
101+
- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
102+
or numbering columns as needed (:issue:`2385`)
101103

102104
Experimental Features
103105
~~~~~~~~~~~~~~~~~~~~~

doc/source/v0.14.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ API changes
6666
- The top-level :func:`pandas.eval` function does not allow you to use the
6767
``'@'`` prefix and provides you with an error message telling you so.
6868
- ``NameResolutionError`` was removed because it isn't necessary anymore.
69+
- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
70+
or numbering columns as needed (:issue:`2385`). See :ref:`the docs <merging.mixed_ndims>`.
6971

7072
MultiIndexing Using Slicers
7173
~~~~~~~~~~~~~~~~~~~~~~~~~~~

pandas/tools/merge.py

+61-15
Original file line numberDiff line numberDiff line change
@@ -970,40 +970,86 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
970970
objs = clean_objs
971971
keys = clean_keys
972972

973-
# consolidate data
974-
self.objs = []
973+
if len(objs) == 0:
974+
raise Exception('All objects passed were None')
975+
976+
# consolidate data & figure out what our result ndim is going to be
977+
ndims = set()
975978
for obj in objs:
976979
if not isinstance(obj, NDFrame):
977980
raise TypeError("cannot concatenate a non-NDFrame object")
978981

979-
# skip completely empty
980-
if not np.sum(obj.shape):
981-
continue
982-
983982
# consolidate
984983
obj.consolidate(inplace=True)
985-
self.objs.append(obj)
984+
ndims.add(obj.ndim)
985+
986+
# get the sample
987+
# want the highest ndim that we have, and must be non-empty
988+
# unless all objs are empty
989+
sample = None
990+
if len(ndims) > 1:
991+
max_ndim = max(ndims)
992+
for obj in objs:
993+
if obj.ndim == max_ndim and np.sum(obj.shape):
994+
sample = obj
995+
break
986996

987-
if len(self.objs) == 0:
988-
raise Exception('All objects passed were None')
989-
990-
# need the first as a sample non-empty as a sample
991-
sample = next(obj for obj in self.objs if np.prod(obj.shape))
997+
else:
998+
# filter out the empties
999+
# if we have no multi-index possibilities
1000+
df = DataFrame([ obj.shape for obj in objs ]).sum(1)
1001+
non_empties = df[df!=0]
1002+
if len(non_empties) and (keys is None and names is None and levels is None and join_axes is None):
1003+
objs = [ objs[i] for i in non_empties.index ]
1004+
sample = objs[0]
1005+
1006+
if sample is None:
1007+
sample = objs[0]
1008+
self.objs = objs
9921009

9931010
# Need to flip BlockManager axis in the DataFrame special case
994-
if isinstance(sample, DataFrame):
1011+
self._is_frame = isinstance(sample, DataFrame)
1012+
if self._is_frame:
9951013
axis = 1 if axis == 0 else 0
9961014

9971015
self._is_series = isinstance(sample, ABCSeries)
9981016
if not 0 <= axis <= sample.ndim:
9991017
raise AssertionError("axis must be between 0 and {0}, "
10001018
"input was {1}".format(sample.ndim, axis))
10011019

1020+
# if we have mixed ndims, then convert to highest ndim
1021+
# creating column numbers as needed
1022+
if len(ndims) > 1:
1023+
current_column = 0
1024+
max_ndim = sample.ndim
1025+
self.objs, objs = [], self.objs
1026+
for obj in objs:
1027+
1028+
ndim = obj.ndim
1029+
if ndim == max_ndim:
1030+
pass
1031+
1032+
elif ndim != max_ndim-1:
1033+
raise ValueError("cannot concatenate unaligned mixed "
1034+
"dimensional NDFrame objects")
1035+
1036+
else:
1037+
name = getattr(obj,'name',None)
1038+
if ignore_index or name is None:
1039+
name = current_column
1040+
current_column += 1
1041+
1042+
# doing a row-wise concatenation so need everything
1043+
# to line up
1044+
if self._is_frame and axis == 1:
1045+
name = 0
1046+
obj = sample._constructor({ name : obj })
1047+
1048+
self.objs.append(obj)
1049+
10021050
# note: this is the BlockManager axis (since DataFrame is transposed)
10031051
self.axis = axis
1004-
10051052
self.join_axes = join_axes
1006-
10071053
self.keys = keys
10081054
self.names = names
10091055
self.levels = levels

pandas/tools/tests/test_merge.py

+62-9
Original file line numberDiff line numberDiff line change
@@ -1657,11 +1657,73 @@ def test_handle_empty_objects(self):
16571657
# GH3259
16581658
df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s'))
16591659
empty = DataFrame()
1660+
result = concat([df,empty],axis=1)
1661+
assert_frame_equal(result, df)
1662+
result = concat([empty,df],axis=1)
1663+
assert_frame_equal(result, df)
1664+
16601665
result = concat([df,empty])
16611666
assert_frame_equal(result, df)
16621667
result = concat([empty,df])
16631668
assert_frame_equal(result, df)
16641669

1670+
def test_concat_mixed_objs(self):
1671+
1672+
# concat mixed series/frames
1673+
# GH2385
1674+
1675+
# axis 1
1676+
index=date_range('01-Jan-2013', periods=10, freq='H')
1677+
arr = np.arange(10, dtype='int64')
1678+
s1 = Series(arr, index=index)
1679+
s2 = Series(arr, index=index)
1680+
df = DataFrame(arr.reshape(-1,1), index=index)
1681+
1682+
expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 0])
1683+
result = concat([df,df], axis=1)
1684+
assert_frame_equal(result, expected)
1685+
1686+
expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 1])
1687+
result = concat([s1,s2], axis=1)
1688+
assert_frame_equal(result, expected)
1689+
1690+
expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2])
1691+
result = concat([s1,s2,s1], axis=1)
1692+
assert_frame_equal(result, expected)
1693+
1694+
expected = DataFrame(np.repeat(arr,5).reshape(-1,5), index=index, columns = [0, 0, 1, 2, 3])
1695+
result = concat([s1,df,s2,s2,s1], axis=1)
1696+
assert_frame_equal(result, expected)
1697+
1698+
# with names
1699+
s1.name = 'foo'
1700+
expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 0])
1701+
result = concat([s1,df,s2], axis=1)
1702+
assert_frame_equal(result, expected)
1703+
1704+
s2.name = 'bar'
1705+
expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 'bar'])
1706+
result = concat([s1,df,s2], axis=1)
1707+
assert_frame_equal(result, expected)
1708+
1709+
# ignore index
1710+
expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2])
1711+
result = concat([s1,df,s2], axis=1, ignore_index=True)
1712+
assert_frame_equal(result, expected)
1713+
1714+
# axis 0
1715+
expected = DataFrame(np.tile(arr,3).reshape(-1,1), index=index.tolist() * 3, columns = [0])
1716+
result = concat([s1,df,s2])
1717+
assert_frame_equal(result, expected)
1718+
1719+
expected = DataFrame(np.tile(arr,3).reshape(-1,1), columns = [0])
1720+
result = concat([s1,df,s2], ignore_index=True)
1721+
assert_frame_equal(result, expected)
1722+
1723+
# invalid concatenation of mixed dims
1724+
panel = tm.makePanel()
1725+
self.assertRaises(ValueError, lambda : concat([panel,s1],axis=1))
1726+
16651727
def test_panel_join(self):
16661728
panel = tm.makePanel()
16671729
tm.add_nans(panel)
@@ -1991,15 +2053,6 @@ def test_concat_invalid_first_argument(self):
19912053
# generator ok though
19922054
concat(DataFrame(np.random.rand(5,5)) for _ in range(3))
19932055

1994-
def test_concat_mixed_types_fails(self):
1995-
df = DataFrame(randn(10, 1))
1996-
1997-
with tm.assertRaisesRegexp(TypeError, "Cannot concatenate.+"):
1998-
concat([df[0], df], axis=1)
1999-
2000-
with tm.assertRaisesRegexp(TypeError, "Cannot concatenate.+"):
2001-
concat([df, df[0]], axis=1)
2002-
20032056
class TestOrderedMerge(tm.TestCase):
20042057

20052058
def setUp(self):

vb_suite/join_merge.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -195,9 +195,9 @@ def sample(values, k):
195195
empty = DataFrame()
196196
"""
197197

198-
concat_empty_frames1 = Benchmark('concat([df,empty)', setup,
198+
concat_empty_frames1 = Benchmark('concat([df,empty])', setup,
199199
start_date=datetime(2012, 1, 1))
200-
concat_empty_frames2 = Benchmark('concat([empty,df)', setup,
200+
concat_empty_frames2 = Benchmark('concat([empty,df])', setup,
201201
start_date=datetime(2012, 1, 1))
202202

203203

0 commit comments

Comments
 (0)