Skip to content

PERF/API: concat improvements #6438

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 21, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions doc/source/merging.rst
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,33 @@ This is also a valid argument to ``DataFrame.append``:

df1.append(df2, ignore_index=True)

.. _merging.mixed_ndims:

Concatenating with mixed ndims
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

You can concatenate a mix of Series and DataFrames. The
Series will be transformed to DataFrames with the column name as
the name of the Series.

.. ipython:: python

df1 = DataFrame(randn(6, 4), columns=['A', 'B', 'C', 'D'])
s1 = Series(randn(6), name='foo')
concat([df1, s1],axis=1)

If unnamed Series are passed they will be numbered consecutively.

.. ipython:: python

s2 = Series(randn(6))
concat([df1, s2, s2, s2],axis=1)

Passing ``ignore_index=True`` will drop all name references.

.. ipython:: python

concat([df1, s1],axis=1,ignore_index=True)

More concatenating with group keys
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
3 changes: 3 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ API Changes
- The top-level :func:`pandas.eval` function does not allow you to use the
``'@'`` prefix and provides you with an error message telling you so.
- ``NameResolutionError`` was removed because it isn't necessary anymore.
- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
or numbering columns as needed (:issue:`2385`)

Experimental Features
~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -166,6 +168,7 @@ Bug Fixes
- Bug in ``Series.reindex`` when specifying a ``method`` with some nan values was inconsistent (noted on a resample) (:issue:`6418`)
- Bug in :meth:`DataFrame.replace` where nested dicts were erroneously
depending on the order of dictionary keys and values (:issue:`5338`).
- Perf issue in concatenating with empty objects (:issue:`3259`)

pandas 0.13.1
-------------
Expand Down
2 changes: 2 additions & 0 deletions doc/source/v0.14.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ API changes
- The top-level :func:`pandas.eval` function does not allow you to use the
``'@'`` prefix and provides you with an error message telling you so.
- ``NameResolutionError`` was removed because it isn't necessary anymore.
- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
or numbering columns as needed (:issue:`2385`). See :ref:`the docs <merging.mixed_ndims>`.

MultiIndexing Using Slicers
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
5 changes: 1 addition & 4 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2209,10 +2209,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):

# make Nones an empty object
if com._count_not_none(*values) != len(values):
v = None
for v in values:
if v is not None:
break
v = next(v for v in values if v is not None)
if v is None:
return DataFrame()
elif isinstance(v, NDFrame):
Expand Down
73 changes: 64 additions & 9 deletions pandas/tools/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -957,7 +957,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
objs = [objs[k] for k in keys]

if keys is None:
objs = [obj for obj in objs if obj is not None]
objs = [obj for obj in objs if obj is not None ]
else:
# #1649
clean_keys = []
Expand All @@ -973,28 +973,83 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
if len(objs) == 0:
raise Exception('All objects passed were None')

# consolidate data
# consolidate data & figure out what our result ndim is going to be
ndims = set()
for obj in objs:
if isinstance(obj, NDFrame):
obj.consolidate(inplace=True)
self.objs = objs
if not isinstance(obj, NDFrame):
raise TypeError("cannot concatenate a non-NDFrame object")

# consolidate
obj.consolidate(inplace=True)
ndims.add(obj.ndim)

# get the sample
# want the highest ndim that we have, and must be non-empty
# unless all objs are empty
sample = None
if len(ndims) > 1:
max_ndim = max(ndims)
for obj in objs:
if obj.ndim == max_ndim and np.sum(obj.shape):
sample = obj
break

sample = objs[0]
else:
# filter out the empties
# if we have no multi-index possibilities
df = DataFrame([ obj.shape for obj in objs ]).sum(1)
non_empties = df[df!=0]
if len(non_empties) and (keys is None and names is None and levels is None and join_axes is None):
objs = [ objs[i] for i in non_empties.index ]
sample = objs[0]

if sample is None:
sample = objs[0]
self.objs = objs

# Need to flip BlockManager axis in the DataFrame special case
if isinstance(sample, DataFrame):
self._is_frame = isinstance(sample, DataFrame)
if self._is_frame:
axis = 1 if axis == 0 else 0

self._is_series = isinstance(sample, ABCSeries)
if not 0 <= axis <= sample.ndim:
raise AssertionError("axis must be between 0 and {0}, "
"input was {1}".format(sample.ndim, axis))

# if we have mixed ndims, then convert to highest ndim
# creating column numbers as needed
if len(ndims) > 1:
current_column = 0
max_ndim = sample.ndim
self.objs, objs = [], self.objs
for obj in objs:

ndim = obj.ndim
if ndim == max_ndim:
pass

elif ndim != max_ndim-1:
raise ValueError("cannot concatenate unaligned mixed "
"dimensional NDFrame objects")

else:
name = getattr(obj,'name',None)
if ignore_index or name is None:
name = current_column
current_column += 1

# doing a row-wise concatenation so need everything
# to line up
if self._is_frame and axis == 1:
name = 0
obj = sample._constructor({ name : obj })

self.objs.append(obj)

# note: this is the BlockManager axis (since DataFrame is transposed)
self.axis = axis

self.join_axes = join_axes

self.keys = keys
self.names = names
self.levels = levels
Expand Down
87 changes: 78 additions & 9 deletions pandas/tools/tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1653,6 +1653,77 @@ def test_handle_empty_objects(self):

tm.assert_frame_equal(concatted, expected)

# empty as first element with time series
# GH3259
df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s'))
empty = DataFrame()
result = concat([df,empty],axis=1)
assert_frame_equal(result, df)
result = concat([empty,df],axis=1)
assert_frame_equal(result, df)

result = concat([df,empty])
assert_frame_equal(result, df)
result = concat([empty,df])
assert_frame_equal(result, df)

def test_concat_mixed_objs(self):

# concat mixed series/frames
# GH2385

# axis 1
index=date_range('01-Jan-2013', periods=10, freq='H')
arr = np.arange(10, dtype='int64')
s1 = Series(arr, index=index)
s2 = Series(arr, index=index)
df = DataFrame(arr.reshape(-1,1), index=index)

# two frames: each keeps its own column label, giving duplicate 0 labels
expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 0])
result = concat([df,df], axis=1)
assert_frame_equal(result, expected)

# unnamed Series only: columns are numbered consecutively
expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 1])
result = concat([s1,s2], axis=1)
assert_frame_equal(result, expected)

expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2])
result = concat([s1,s2,s1], axis=1)
assert_frame_equal(result, expected)

# mixed: the frame keeps its label 0 while the unnamed Series are
# numbered 0, 1, 2, 3 in the order they appear
expected = DataFrame(np.repeat(arr,5).reshape(-1,5), index=index, columns = [0, 0, 1, 2, 3])
result = concat([s1,df,s2,s2,s1], axis=1)
assert_frame_equal(result, expected)

# with names
s1.name = 'foo'
expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 0])
result = concat([s1,df,s2], axis=1)
assert_frame_equal(result, expected)

s2.name = 'bar'
expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 'bar'])
result = concat([s1,df,s2], axis=1)
assert_frame_equal(result, expected)

# ignore index
expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2])
result = concat([s1,df,s2], axis=1, ignore_index=True)
assert_frame_equal(result, expected)

# axis 0
# row-wise: the Series (even the named ones) land in the frame's single column 0
expected = DataFrame(np.tile(arr,3).reshape(-1,1), index=index.tolist() * 3, columns = [0])
result = concat([s1,df,s2])
assert_frame_equal(result, expected)

expected = DataFrame(np.tile(arr,3).reshape(-1,1), columns = [0])
result = concat([s1,df,s2], ignore_index=True)
assert_frame_equal(result, expected)

# invalid concatenation of mixed dims
panel = tm.makePanel()
self.assertRaises(ValueError, lambda : concat([panel,s1],axis=1))

def test_panel_join(self):
panel = tm.makePanel()
tm.add_nans(panel)
Expand Down Expand Up @@ -1967,6 +2038,13 @@ def test_concat_series_axis1_same_names_ignore_index(self):
result = concat([s1, s2], axis=1, ignore_index=True)
self.assertTrue(np.array_equal(result.columns, [0, 1]))

def test_concat_invalid(self):
    """concat must raise TypeError when a non-NDFrame object is mixed in."""

    # trying to concat a ndframe with a non-ndframe
    df1 = mkdf(10, 2)
    for obj in [1, dict(), [1, 2], (1, 2)]:
        # Hand assertRaises the callable and its argument directly.  A
        # one-argument lambda would itself raise TypeError when invoked
        # with no arguments, so concat would never actually be exercised
        # and the test would pass vacuously.
        self.assertRaises(TypeError, concat, [df1, obj])

def test_concat_invalid_first_argument(self):
df1 = mkdf(10, 2)
df2 = mkdf(10, 2)
Expand All @@ -1975,15 +2053,6 @@ def test_concat_invalid_first_argument(self):
# generator ok though
concat(DataFrame(np.random.rand(5,5)) for _ in range(3))

def test_concat_mixed_types_fails(self):
df = DataFrame(randn(10, 1))

with tm.assertRaisesRegexp(TypeError, "Cannot concatenate.+"):
concat([df[0], df], axis=1)

with tm.assertRaisesRegexp(TypeError, "Cannot concatenate.+"):
concat([df, df[0]], axis=1)

class TestOrderedMerge(tm.TestCase):

def setUp(self):
Expand Down
15 changes: 15 additions & 0 deletions vb_suite/join_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,21 @@ def sample(values, k):
concat_small_frames = Benchmark('concat([df] * 1000)', setup,
start_date=datetime(2012, 1, 1))


#----------------------------------------------------------------------
# Concat empty
#
# Benchmarks for concatenating a 10000-row, datetime-indexed frame with an
# empty DataFrame, in both argument orders (see GH3259).

setup = common_setup + """
df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s'))
empty = DataFrame()
"""

# non-empty frame first, empty frame second
concat_empty_frames1 = Benchmark('concat([df,empty])', setup,
start_date=datetime(2012, 1, 1))
# empty frame first, non-empty frame second
concat_empty_frames2 = Benchmark('concat([empty,df])', setup,
start_date=datetime(2012, 1, 1))


#----------------------------------------------------------------------
# Ordered merge

Expand Down