Skip to content

Arguably better handling of input data in constructor for DataFrame (fix for #4297) #4317

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 35 additions & 13 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5680,49 +5680,71 @@ def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
return create_block_manager_from_arrays(arrays, arr_names, axes)

def extract_index(data):
# Slightly misleading name.
# Indexes are only extracted for elements in the iterable
# `data` inheriting from Series.
from pandas.core.index import _union_indexes

index = None
if len(data) == 0:
index = Index([])
elif len(data) > 0 and index is None:
elif len(data) > 0:
raw_lengths = []
indexes = []

have_raw_arrays = False
have_series = False
have_dicts = False
have_mappings = False

# Loop over the element, such as vectors `v` corresponding
# to columns in the DataFrame
for v in data:
if isinstance(v, Series):
have_series = True
indexes.append(v.index)
elif isinstance(v, dict):
have_dicts = True
indexes.append(v.keys())
elif isinstance(v, (list, tuple, np.ndarray)):
have_raw_arrays = True
raw_lengths.append(len(v))
else:
# When an OrderedDict, the mapping aspect
# is given priority, although there is a warning when
# mixture of sequences and mapping. The unit tests
# show that this is the desired behaviour.
# Also, shouldn't a `bytes` object be considered a scalar ?
is_mapping = isinstance(v, collections.Mapping)
is_sequence = (isinstance(v, collections.Sequence) or \
_is_sequence(v)) and not isinstance(v, basestring)
if is_mapping:
have_mappings = True
indexes.append(v.keys())
elif is_sequence:
# This is a sequence-but-not-a-string
# Although strings have a __len__,
# they will be considered scalar.
have_raw_arrays = True
raw_lengths.append(len(v))
else:
# Item v is silently ignored,
# as it is not anything an index can be inferred
# from.
pass

if not indexes and not raw_lengths:
raise ValueError('If using all scalar values, you must must pass'
raise ValueError('If using all scalar values, you must pass'
' an index')

if have_series or have_dicts:
if have_series or have_mappings:
index = _union_indexes(indexes)

if have_raw_arrays:
lengths = list(set(raw_lengths))
if len(lengths) > 1:
raise ValueError('arrays must all be same length')
raise ValueError('Arrays must all be same length')

if have_dicts:
if have_mappings:
raise ValueError('Mixing dicts with non-Series may lead to '
'ambiguous ordering.')

if have_series:
if lengths[0] != len(index):
msg = ('array length %d does not match index length %d'
msg = ('Array length %d does not match index length %d'
% (lengths[0], len(index)))
raise ValueError(msg)
else:
Expand Down
13 changes: 12 additions & 1 deletion pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2236,7 +2236,7 @@ def testit():

def testit():
DataFrame({'a': False, 'b': True})
assertRaisesRegexp(ValueError, 'If using all scalar values, you must must pass an index', testit)
assertRaisesRegexp(ValueError, 'If using all scalar values, you must pass an index', testit)

def test_insert_error_msmgs(self):

Expand Down Expand Up @@ -2774,6 +2774,17 @@ def test_constructor_from_items(self):
# pass some columns
recons = DataFrame.from_items(items, columns=['C', 'B', 'A'])
assert_frame_equal(recons, self.frame.ix[:, ['C', 'B', 'A']])
# not any column either a dict, a list, a tuple, or a numpy.ndarray
import array
recons_ar = DataFrame.from_items([('A', array.array('i', range(10)))])
recons_rg = DataFrame.from_items([('A', range(10))])
recons_np = DataFrame.from_items([('A', np.array(range(10)))])
self.assertEquals(tuple(recons_ar['A']),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably better to use assert_frame_equal(recons_ar, recons_rg) here, where assert_frame_equal is from pandas.util.testing.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed.
I'll have to put all this on hold until I have time again (possibly for as long as a few weeks).

In the unlikely case someone is in a hurry to see this merged, or irrevocably rejected, he/she should feel free to edit further.

tuple(recons_rg['A']))
self.assertEquals(tuple(recons_ar['A']),
tuple(recons_np['A']))



# orient='index'

Expand Down