Skip to content

Commit 731c9a8

Browse files
committed
BUG: Fix initialization of DataFrame from dict with NaN as key
closes pandas-dev#18455
1 parent d163de7 commit 731c9a8

File tree

6 files changed: +79 additions, -39 deletions

pandas/core/frame.py

+17 -32
Original file line numberDiff line numberDiff line change
@@ -416,44 +416,29 @@ def _init_dict(self, data, index, columns, dtype=None):
416416
Needs to handle a lot of exceptional cases.
417417
"""
418418
if columns is not None:
419-
columns = _ensure_index(columns)
419+
arrays = Series(data, index=columns, dtype=object)
420+
data_names = arrays.index
420421

421-
# GH10856
422-
# raise ValueError if only scalars in dict
422+
missing = arrays.isnull()
423423
if index is None:
424-
extract_index(list(data.values()))
425-
426-
# prefilter if columns passed
427-
data = {k: v for k, v in compat.iteritems(data) if k in columns}
428-
429-
if index is None:
430-
index = extract_index(list(data.values()))
431-
424+
# GH10856
425+
# raise ValueError if only scalars in dict
426+
index = extract_index(arrays[~missing].tolist())
432427
else:
433428
index = _ensure_index(index)
434429

435-
arrays = []
436-
data_names = []
437-
for k in columns:
438-
if k not in data:
439-
# no obvious "empty" int column
440-
if dtype is not None and issubclass(dtype.type,
441-
np.integer):
442-
continue
443-
444-
if dtype is None:
445-
# 1783
446-
v = np.empty(len(index), dtype=object)
447-
elif np.issubdtype(dtype, np.flexible):
448-
v = np.empty(len(index), dtype=object)
449-
else:
450-
v = np.empty(len(index), dtype=dtype)
451-
452-
v.fill(np.nan)
430+
# no obvious "empty" int column
431+
if missing.any() and not (dtype is not None and
432+
issubclass(dtype.type, np.integer)):
433+
if dtype is None or np.issubdtype(dtype, np.flexible):
434+
# 1783
435+
nan_dtype = object
453436
else:
454-
v = data[k]
455-
data_names.append(k)
456-
arrays.append(v)
437+
nan_dtype = dtype
438+
v = np.empty(len(index), dtype=nan_dtype)
439+
v.fill(np.nan)
440+
arrays.loc[missing] = [v] * missing.sum()
441+
arrays = arrays.tolist()
457442

458443
else:
459444
keys = list(data.keys())

pandas/core/generic.py

-1
Original file line numberDiff line numberDiff line change
@@ -6185,7 +6185,6 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
61856185
if not is_bool_dtype(dt):
61866186
raise ValueError(msg.format(dtype=dt))
61876187

6188-
cond = cond.astype(bool, copy=False)
61896188
cond = -cond if inplace else cond
61906189

61916190
# try to align with other

pandas/core/series.py

+5 -1
Original file line numberDiff line numberDiff line change
@@ -3208,7 +3208,11 @@ def _try_cast(arr, take_fast_path):
32083208
return subarr
32093209

32103210
elif isinstance(data, (list, tuple)) and len(data) > 0:
3211-
if dtype is not None:
3211+
if all(is_list_like(item) for item in data):
3212+
# Ensure nested lists are not interpreted as further dimensions:
3213+
subarr = np.empty(len(data), dtype='object')
3214+
subarr[:] = data
3215+
elif dtype is not None:
32123216
try:
32133217
subarr = _try_cast(data, False)
32143218
except Exception:

pandas/tests/frame/test_constructors.py

+46 -4
Original file line numberDiff line numberDiff line change
@@ -275,8 +275,50 @@ def test_constructor_dict(self):
275275
with tm.assert_raises_regex(ValueError, msg):
276276
DataFrame({'a': 0.7}, columns=['a'])
277277

278-
with tm.assert_raises_regex(ValueError, msg):
279-
DataFrame({'a': 0.7}, columns=['b'])
278+
# No reason to raise if item is not used:
279+
result = DataFrame({'a': 0.7}, columns=['b'])
280+
expected = DataFrame(columns=['b'])
281+
tm.assert_frame_equal(result, expected)
282+
283+
@pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
284+
def test_constructor_dict_nan_key(self, value):
285+
# GH 18455
286+
cols = [1, value, 3]
287+
idx = ['a', value]
288+
values = [[0, 3], [1, 4], [2, 5]]
289+
data = {cols[c]: pd.Series(values[c], index=idx) for c in range(3)}
290+
result = pd.DataFrame(data).sort_values(1).sort_values('a', axis=1)
291+
expected = pd.DataFrame(np.arange(6).reshape(2, 3),
292+
index=idx, columns=cols)
293+
tm.assert_frame_equal(result, expected)
294+
295+
result = pd.DataFrame(data, index=idx).sort_values('a', axis=1)
296+
tm.assert_frame_equal(result, expected)
297+
298+
result = pd.DataFrame(data, index=idx, columns=cols)
299+
tm.assert_frame_equal(result, expected)
300+
301+
@pytest.mark.xfail(reason='GH 18485 comparison fails on MI with NaNs)')
302+
@pytest.mark.parametrize("value", [np.nan, None, float('nan')])
303+
def test_constructor_dict_nan_tuple_key(self, value):
304+
# GH 18455
305+
cols = Index([(11, 21), (value, 22), (13, value)])
306+
idx = Index([('a', value), (value, 2)])
307+
values = [[0, 3], [1, 4], [2, 5]]
308+
data = {cols[c]: pd.Series(values[c], index=idx) for c in range(3)}
309+
result = (DataFrame(data)
310+
.sort_values((11, 21))
311+
.sort_values(('a', value), axis=1))
312+
expected = pd.DataFrame(np.arange(6).reshape(2, 3),
313+
index=idx, columns=cols)
314+
tm.assert_frame_equal(result, expected)
315+
316+
result = pd.DataFrame(data, index=idx).sort_values(('a', value),
317+
axis=1)
318+
tm.assert_frame_equal(result, expected)
319+
320+
result = pd.DataFrame(data, index=idx, columns=cols)
321+
tm.assert_frame_equal(result, expected)
280322

281323
def test_constructor_multi_index(self):
282324
# GH 4078
@@ -723,15 +765,15 @@ def test_constructor_corner(self):
723765

724766
# does not error but ends up float
725767
df = DataFrame(index=lrange(10), columns=['a', 'b'], dtype=int)
726-
assert df.values.dtype == np.object_
768+
assert df.values.dtype == np.dtype('float64')
727769

728770
# #1783 empty dtype object
729771
df = DataFrame({}, columns=['foo', 'bar'])
730772
assert df.values.dtype == np.object_
731773

732774
df = DataFrame({'b': 1}, index=lrange(10), columns=list('abc'),
733775
dtype=int)
734-
assert df.values.dtype == np.object_
776+
assert df.values.dtype == np.dtype('float64')
735777

736778
def test_constructor_scalar_inference(self):
737779
data = {'int': 1, 'bool': True,

pandas/tests/io/test_excel.py

+1 -1
Original file line numberDiff line numberDiff line change
@@ -516,7 +516,7 @@ def test_read_one_empty_col_with_header(self):
516516
)
517517
expected_header_none = DataFrame(pd.Series([0], dtype='int64'))
518518
tm.assert_frame_equal(actual_header_none, expected_header_none)
519-
expected_header_zero = DataFrame(columns=[0], dtype='int64')
519+
expected_header_zero = DataFrame(columns=[0])
520520
tm.assert_frame_equal(actual_header_zero, expected_header_zero)
521521

522522
def test_set_column_names_in_parameter(self):

pandas/tests/series/test_constructors.py

+10
Original file line numberDiff line numberDiff line change
@@ -713,6 +713,16 @@ def test_constructor_set(self):
713713
values = frozenset(values)
714714
pytest.raises(TypeError, Series, values)
715715

716+
@pytest.mark.parametrize('klass', [list, tuple, set, Series])
717+
def test_constructor_complex_values(self, klass):
718+
d = {1: klass([1, 2, 3]), 3: klass([4, 5, 6]), 5: klass([7, 8, 9])}
719+
result = Series(d).sort_index()
720+
721+
exp_data = np.empty(len(d), dtype='object')
722+
exp_data[:] = [d[i] for i in [1, 3, 5]]
723+
expected = Series(exp_data, index=pd.Index([1, 3, 5]))
724+
tm.assert_series_equal(result, expected)
725+
716726
def test_fromDict(self):
717727
data = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
718728

0 commit comments

Comments
 (0)