Skip to content

Commit af63b99

Browse files
committed
Merge pull request #6438 from jreback/concat
PERF/API: concat improvements
2 parents 2983b69 + c6b21b4 commit af63b99

File tree

7 files changed

+190
-22
lines changed

7 files changed

+190
-22
lines changed

doc/source/merging.rst

+27
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,33 @@ This is also a valid argument to ``DataFrame.append``:
213213
214214
df1.append(df2, ignore_index=True)
215215
216+
.. _merging.mixed_ndims:
217+
218+
Concatenating with mixed ndims
219+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
220+
221+
You can concatenate a mix of Series and DataFrames. The
222+
Series will be transformed to DataFrames with the column name as
223+
the name of the Series.
224+
225+
.. ipython:: python
226+
227+
df1 = DataFrame(randn(6, 4), columns=['A', 'B', 'C', 'D'])
228+
s1 = Series(randn(6), name='foo')
229+
concat([df1, s1],axis=1)
230+
231+
If unnamed Series are passed they will be numbered consecutively.
232+
233+
.. ipython:: python
234+
235+
s2 = Series(randn(6))
236+
concat([df1, s2, s2, s2],axis=1)
237+
238+
Passing ``ignore_index=True`` will drop all name references.
239+
240+
.. ipython:: python
241+
242+
concat([df1, s1],axis=1,ignore_index=True)
216243
217244
More concatenating with group keys
218245
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

doc/source/release.rst

+3
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ API Changes
9898
- The top-level :func:`pandas.eval` function does not allow you to use the
9999
``'@'`` prefix and provides you with an error message telling you so.
100100
- ``NameResolutionError`` was removed because it isn't necessary anymore.
101+
- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
102+
or numbering columns as needed (:issue:`2385`)
101103

102104
Experimental Features
103105
~~~~~~~~~~~~~~~~~~~~~
@@ -166,6 +168,7 @@ Bug Fixes
166168
- Bug in ``Series.reindex`` when specifying a ``method`` with some nan values was inconsistent (noted on a resample) (:issue:`6418`)
167169
- Bug in :meth:`DataFrame.replace` where nested dicts were erroneously
168170
depending on the order of dictionary keys and values (:issue:`5338`).
171+
- Perf issue in concatting with empty objects (:issue:`3259`)
169172

170173
pandas 0.13.1
171174
-------------

doc/source/v0.14.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ API changes
6666
- The top-level :func:`pandas.eval` function does not allow you to use the
6767
``'@'`` prefix and provides you with an error message telling you so.
6868
- ``NameResolutionError`` was removed because it isn't necessary anymore.
69+
- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
70+
or numbering columns as needed (:issue:`2385`). See :ref:`the docs <merging.mixed_ndims>`
6971

7072
MultiIndexing Using Slicers
7173
~~~~~~~~~~~~~~~~~~~~~~~~~~~

pandas/core/groupby.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -2209,10 +2209,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
22092209

22102210
# make Nones an empty object
22112211
if com._count_not_none(*values) != len(values):
2212-
v = None
2213-
for v in values:
2214-
if v is not None:
2215-
break
2212+
v = next(v for v in values if v is not None)
22162213
if v is None:
22172214
return DataFrame()
22182215
elif isinstance(v, NDFrame):

pandas/tools/merge.py

+64-9
Original file line numberDiff line numberDiff line change
@@ -957,7 +957,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
957957
objs = [objs[k] for k in keys]
958958

959959
if keys is None:
960-
objs = [obj for obj in objs if obj is not None]
960+
objs = [obj for obj in objs if obj is not None ]
961961
else:
962962
# #1649
963963
clean_keys = []
@@ -973,28 +973,83 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
973973
if len(objs) == 0:
974974
raise Exception('All objects passed were None')
975975

976-
# consolidate data
976+
# consolidate data & figure out what our result ndim is going to be
977+
ndims = set()
977978
for obj in objs:
978-
if isinstance(obj, NDFrame):
979-
obj.consolidate(inplace=True)
980-
self.objs = objs
979+
if not isinstance(obj, NDFrame):
980+
raise TypeError("cannot concatenate a non-NDFrame object")
981+
982+
# consolidate
983+
obj.consolidate(inplace=True)
984+
ndims.add(obj.ndim)
985+
986+
# get the sample
987+
# want the highest ndim that we have, and must be non-empty
988+
# unless all objs are empty
989+
sample = None
990+
if len(ndims) > 1:
991+
max_ndim = max(ndims)
992+
for obj in objs:
993+
if obj.ndim == max_ndim and np.sum(obj.shape):
994+
sample = obj
995+
break
981996

982-
sample = objs[0]
997+
else:
998+
# filter out the empties
999+
# if we have no multi-index possibilities
1000+
df = DataFrame([ obj.shape for obj in objs ]).sum(1)
1001+
non_empties = df[df!=0]
1002+
if len(non_empties) and (keys is None and names is None and levels is None and join_axes is None):
1003+
objs = [ objs[i] for i in non_empties.index ]
1004+
sample = objs[0]
1005+
1006+
if sample is None:
1007+
sample = objs[0]
1008+
self.objs = objs
9831009

9841010
# Need to flip BlockManager axis in the DataFrame special case
985-
if isinstance(sample, DataFrame):
1011+
self._is_frame = isinstance(sample, DataFrame)
1012+
if self._is_frame:
9861013
axis = 1 if axis == 0 else 0
9871014

9881015
self._is_series = isinstance(sample, ABCSeries)
9891016
if not 0 <= axis <= sample.ndim:
9901017
raise AssertionError("axis must be between 0 and {0}, "
9911018
"input was {1}".format(sample.ndim, axis))
9921019

1020+
# if we have mixed ndims, then convert to highest ndim
1021+
# creating column numbers as needed
1022+
if len(ndims) > 1:
1023+
current_column = 0
1024+
max_ndim = sample.ndim
1025+
self.objs, objs = [], self.objs
1026+
for obj in objs:
1027+
1028+
ndim = obj.ndim
1029+
if ndim == max_ndim:
1030+
pass
1031+
1032+
elif ndim != max_ndim-1:
1033+
raise ValueError("cannot concatenate unaligned mixed "
1034+
"dimensional NDFrame objects")
1035+
1036+
else:
1037+
name = getattr(obj,'name',None)
1038+
if ignore_index or name is None:
1039+
name = current_column
1040+
current_column += 1
1041+
1042+
# doing a row-wise concatenation so need everything
1043+
# to line up
1044+
if self._is_frame and axis == 1:
1045+
name = 0
1046+
obj = sample._constructor({ name : obj })
1047+
1048+
self.objs.append(obj)
1049+
9931050
# note: this is the BlockManager axis (since DataFrame is transposed)
9941051
self.axis = axis
995-
9961052
self.join_axes = join_axes
997-
9981053
self.keys = keys
9991054
self.names = names
10001055
self.levels = levels

pandas/tools/tests/test_merge.py

+78-9
Original file line numberDiff line numberDiff line change
@@ -1653,6 +1653,77 @@ def test_handle_empty_objects(self):
16531653

16541654
tm.assert_frame_equal(concatted, expected)
16551655

1656+
# empty as first element with time series
1657+
# GH3259
1658+
df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s'))
1659+
empty = DataFrame()
1660+
result = concat([df,empty],axis=1)
1661+
assert_frame_equal(result, df)
1662+
result = concat([empty,df],axis=1)
1663+
assert_frame_equal(result, df)
1664+
1665+
result = concat([df,empty])
1666+
assert_frame_equal(result, df)
1667+
result = concat([empty,df])
1668+
assert_frame_equal(result, df)
1669+
1670+
def test_concat_mixed_objs(self):
1671+
1672+
# concat mixed series/frames
1673+
# GH2385
1674+
1675+
# axis 1
1676+
index=date_range('01-Jan-2013', periods=10, freq='H')
1677+
arr = np.arange(10, dtype='int64')
1678+
s1 = Series(arr, index=index)
1679+
s2 = Series(arr, index=index)
1680+
df = DataFrame(arr.reshape(-1,1), index=index)
1681+
1682+
expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 0])
1683+
result = concat([df,df], axis=1)
1684+
assert_frame_equal(result, expected)
1685+
1686+
expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 1])
1687+
result = concat([s1,s2], axis=1)
1688+
assert_frame_equal(result, expected)
1689+
1690+
expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2])
1691+
result = concat([s1,s2,s1], axis=1)
1692+
assert_frame_equal(result, expected)
1693+
1694+
expected = DataFrame(np.repeat(arr,5).reshape(-1,5), index=index, columns = [0, 0, 1, 2, 3])
1695+
result = concat([s1,df,s2,s2,s1], axis=1)
1696+
assert_frame_equal(result, expected)
1697+
1698+
# with names
1699+
s1.name = 'foo'
1700+
expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 0])
1701+
result = concat([s1,df,s2], axis=1)
1702+
assert_frame_equal(result, expected)
1703+
1704+
s2.name = 'bar'
1705+
expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 'bar'])
1706+
result = concat([s1,df,s2], axis=1)
1707+
assert_frame_equal(result, expected)
1708+
1709+
# ignore index
1710+
expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2])
1711+
result = concat([s1,df,s2], axis=1, ignore_index=True)
1712+
assert_frame_equal(result, expected)
1713+
1714+
# axis 0
1715+
expected = DataFrame(np.tile(arr,3).reshape(-1,1), index=index.tolist() * 3, columns = [0])
1716+
result = concat([s1,df,s2])
1717+
assert_frame_equal(result, expected)
1718+
1719+
expected = DataFrame(np.tile(arr,3).reshape(-1,1), columns = [0])
1720+
result = concat([s1,df,s2], ignore_index=True)
1721+
assert_frame_equal(result, expected)
1722+
1723+
# invalid concatenation of mixed dims
1724+
panel = tm.makePanel()
1725+
self.assertRaises(ValueError, lambda : concat([panel,s1],axis=1))
1726+
16561727
def test_panel_join(self):
16571728
panel = tm.makePanel()
16581729
tm.add_nans(panel)
@@ -1967,6 +2038,13 @@ def test_concat_series_axis1_same_names_ignore_index(self):
19672038
result = concat([s1, s2], axis=1, ignore_index=True)
19682039
self.assertTrue(np.array_equal(result.columns, [0, 1]))
19692040

2041+
def test_concat_invalid(self):
2042+
2043+
# trying to concat a ndframe with a non-ndframe
2044+
df1 = mkdf(10, 2)
2045+
for obj in [1, dict(), [1, 2], (1, 2) ]:
2046+
self.assertRaises(TypeError, lambda x: concat([ df1, obj ]))
2047+
19702048
def test_concat_invalid_first_argument(self):
19712049
df1 = mkdf(10, 2)
19722050
df2 = mkdf(10, 2)
@@ -1975,15 +2053,6 @@ def test_concat_invalid_first_argument(self):
19752053
# generator ok though
19762054
concat(DataFrame(np.random.rand(5,5)) for _ in range(3))
19772055

1978-
def test_concat_mixed_types_fails(self):
1979-
df = DataFrame(randn(10, 1))
1980-
1981-
with tm.assertRaisesRegexp(TypeError, "Cannot concatenate.+"):
1982-
concat([df[0], df], axis=1)
1983-
1984-
with tm.assertRaisesRegexp(TypeError, "Cannot concatenate.+"):
1985-
concat([df, df[0]], axis=1)
1986-
19872056
class TestOrderedMerge(tm.TestCase):
19882057

19892058
def setUp(self):

vb_suite/join_merge.py

+15
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,21 @@ def sample(values, k):
186186
concat_small_frames = Benchmark('concat([df] * 1000)', setup,
187187
start_date=datetime(2012, 1, 1))
188188

189+
190+
#----------------------------------------------------------------------
191+
# Concat empty
192+
193+
setup = common_setup + """
194+
df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s'))
195+
empty = DataFrame()
196+
"""
197+
198+
concat_empty_frames1 = Benchmark('concat([df,empty])', setup,
199+
start_date=datetime(2012, 1, 1))
200+
concat_empty_frames2 = Benchmark('concat([empty,df])', setup,
201+
start_date=datetime(2012, 1, 1))
202+
203+
189204
#----------------------------------------------------------------------
190205
# Ordered merge
191206

0 commit comments

Comments
 (0)