Skip to content

Commit 6f9b502

Browse files
committed
BUG: Fix initialization of DataFrame from dict with NaN as key
closes #18455
1 parent de39a15 commit 6f9b502

File tree

7 files changed

+76
-43
lines changed

7 files changed

+76
-43
lines changed

doc/source/whatsnew/v0.23.0.txt

+2
Original file line number | Diff line number | Diff line change
@@ -591,3 +591,5 @@ Other
591591
^^^^^
592592

593593
- Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`)
594+
- Fixed construction of a :class:`DataFrame` from a ``dict`` containing ``NaN`` as key (:issue:`18455`)
595+
- Suppressed error in the construction of a :class:`DataFrame` from a ``dict`` containing scalar values when the corresponding keys are not included in the passed index (:issue:`18600`)

pandas/core/frame.py

+17 -34
Original file line number | Diff line number | Diff line change
@@ -418,44 +418,29 @@ def _init_dict(self, data, index, columns, dtype=None):
418418
Needs to handle a lot of exceptional cases.
419419
"""
420420
if columns is not None:
421-
columns = _ensure_index(columns)
421+
arrays = Series(data, index=columns, dtype=object)
422+
data_names = arrays.index
422423

423-
# GH10856
424-
# raise ValueError if only scalars in dict
424+
missing = arrays.isnull()
425425
if index is None:
426-
extract_index(list(data.values()))
427-
428-
# prefilter if columns passed
429-
data = {k: v for k, v in compat.iteritems(data) if k in columns}
430-
431-
if index is None:
432-
index = extract_index(list(data.values()))
433-
426+
# GH10856
427+
# raise ValueError if only scalars in dict
428+
index = extract_index(arrays[~missing].tolist())
434429
else:
435430
index = _ensure_index(index)
436431

437-
arrays = []
438-
data_names = []
439-
for k in columns:
440-
if k not in data:
441-
# no obvious "empty" int column
442-
if dtype is not None and issubclass(dtype.type,
443-
np.integer):
444-
continue
445-
446-
if dtype is None:
447-
# 1783
448-
v = np.empty(len(index), dtype=object)
449-
elif np.issubdtype(dtype, np.flexible):
450-
v = np.empty(len(index), dtype=object)
451-
else:
452-
v = np.empty(len(index), dtype=dtype)
453-
454-
v.fill(np.nan)
432+
# no obvious "empty" int column
433+
if missing.any() and not (dtype is not None and
434+
issubclass(dtype.type, np.integer)):
435+
if dtype is None or np.issubdtype(dtype, np.flexible):
436+
# 1783
437+
nan_dtype = object
455438
else:
456-
v = data[k]
457-
data_names.append(k)
458-
arrays.append(v)
439+
nan_dtype = dtype
440+
v = np.empty(len(index), dtype=nan_dtype)
441+
v.fill(np.nan)
442+
arrays.loc[missing] = [v] * missing.sum()
443+
arrays = arrays.tolist()
459444

460445
else:
461446
keys = list(data.keys())
@@ -6042,8 +6027,6 @@ def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
60426027
# figure out the index, if necessary
60436028
if index is None:
60446029
index = extract_index(arrays)
6045-
else:
6046-
index = _ensure_index(index)
60476030

60486031
# don't force copy because getting jammed in an ndarray anyway
60496032
arrays = _homogenize(arrays, index, dtype)

pandas/core/generic.py

-1
Original file line number | Diff line number | Diff line change
@@ -6468,7 +6468,6 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
64686468
if not is_bool_dtype(dt):
64696469
raise ValueError(msg.format(dtype=dt))
64706470

6471-
cond = cond.astype(bool, copy=False)
64726471
cond = -cond if inplace else cond
64736472

64746473
# try to align with other

pandas/core/internals.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -4767,7 +4767,7 @@ def form_blocks(arrays, names, axes):
47674767
items_dict = defaultdict(list)
47684768
extra_locs = []
47694769

4770-
names_idx = Index(names)
4770+
names_idx = _ensure_index(names)
47714771
if names_idx.equals(axes[0]):
47724772
names_indexer = np.arange(len(names_idx))
47734773
else:

pandas/core/series.py

+10 -2
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,8 @@
3636
maybe_upcast, infer_dtype_from_scalar,
3737
maybe_convert_platform,
3838
maybe_cast_to_datetime, maybe_castable,
39-
construct_1d_arraylike_from_scalar)
39+
construct_1d_arraylike_from_scalar,
40+
construct_1d_object_array_from_listlike)
4041
from pandas.core.dtypes.missing import isna, notna, remove_na_arraylike
4142

4243
from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
@@ -268,6 +269,7 @@ def _init_dict(self, data, index=None, dtype=None):
268269
# raises KeyError), so we iterate the entire dict, and align
269270
if data:
270271
keys, values = zip(*compat.iteritems(data))
272+
values = list(values)
271273
else:
272274
keys, values = [], []
273275

@@ -3166,7 +3168,13 @@ def _try_cast(arr, take_fast_path):
31663168
try:
31673169
subarr = maybe_cast_to_datetime(arr, dtype)
31683170
if not is_extension_type(subarr):
3169-
subarr = np.array(subarr, dtype=dtype, copy=copy)
3171+
# Take care in creating object arrays (but generators are not
3172+
# supported, hence the __len__ check):
3173+
if dtype == 'object' and (hasattr(subarr, '__len__') and
3174+
not isinstance(subarr, np.ndarray)):
3175+
subarr = construct_1d_object_array_from_listlike(subarr)
3176+
else:
3177+
subarr = np.array(subarr, dtype=dtype, copy=copy)
31703178
except (ValueError, TypeError):
31713179
if is_categorical_dtype(dtype):
31723180
subarr = Categorical(arr, dtype.categories,

pandas/tests/frame/test_constructors.py

+45 -4
Original file line number | Diff line number | Diff line change
@@ -287,8 +287,49 @@ def test_constructor_dict(self):
287287
with tm.assert_raises_regex(ValueError, msg):
288288
DataFrame({'a': 0.7}, columns=['a'])
289289

290-
with tm.assert_raises_regex(ValueError, msg):
291-
DataFrame({'a': 0.7}, columns=['b'])
290+
# No reason to raise if item is not used:
291+
result = DataFrame({'a': 0.7}, columns=['b'])
292+
expected = DataFrame(columns=['b'])
293+
tm.assert_frame_equal(result, expected)
294+
295+
@pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
296+
def test_constructor_dict_nan_key(self, value):
297+
# GH 18455
298+
cols = [1, value, 3]
299+
idx = ['a', value]
300+
values = [[0, 3], [1, 4], [2, 5]]
301+
data = {cols[c]: pd.Series(values[c], index=idx) for c in range(3)}
302+
result = pd.DataFrame(data).sort_values(1).sort_values('a', axis=1)
303+
expected = pd.DataFrame(np.arange(6).reshape(2, 3),
304+
index=idx, columns=cols)
305+
tm.assert_frame_equal(result, expected)
306+
307+
result = pd.DataFrame(data, index=idx).sort_values('a', axis=1)
308+
tm.assert_frame_equal(result, expected)
309+
310+
result = pd.DataFrame(data, index=idx, columns=cols)
311+
tm.assert_frame_equal(result, expected)
312+
313+
@pytest.mark.parametrize("value", [np.nan, None, float('nan')])
314+
def test_constructor_dict_nan_tuple_key(self, value):
315+
# GH 18455
316+
cols = Index([(11, 21), (value, 22), (13, value)])
317+
idx = Index([('a', value), (value, 2)])
318+
values = [[0, 3], [1, 4], [2, 5]]
319+
data = {cols[c]: pd.Series(values[c], index=idx) for c in range(3)}
320+
result = (DataFrame(data)
321+
.sort_values((11, 21))
322+
.sort_values(('a', value), axis=1))
323+
expected = pd.DataFrame(np.arange(6).reshape(2, 3),
324+
index=idx, columns=cols)
325+
tm.assert_frame_equal(result, expected)
326+
327+
result = pd.DataFrame(data, index=idx).sort_values(('a', value),
328+
axis=1)
329+
tm.assert_frame_equal(result, expected)
330+
331+
result = pd.DataFrame(data, index=idx, columns=cols)
332+
tm.assert_frame_equal(result, expected)
292333

293334
def test_constructor_multi_index(self):
294335
# GH 4078
@@ -735,15 +776,15 @@ def test_constructor_corner(self):
735776

736777
# does not error but ends up float
737778
df = DataFrame(index=lrange(10), columns=['a', 'b'], dtype=int)
738-
assert df.values.dtype == np.object_
779+
assert df.values.dtype == np.dtype('float64')
739780

740781
# #1783 empty dtype object
741782
df = DataFrame({}, columns=['foo', 'bar'])
742783
assert df.values.dtype == np.object_
743784

744785
df = DataFrame({'b': 1}, index=lrange(10), columns=list('abc'),
745786
dtype=int)
746-
assert df.values.dtype == np.object_
787+
assert df.values.dtype == np.dtype('float64')
747788

748789
def test_constructor_scalar_inference(self):
749790
data = {'int': 1, 'bool': True,

pandas/tests/io/test_excel.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -511,7 +511,7 @@ def test_read_one_empty_col_with_header(self):
511511
)
512512
expected_header_none = DataFrame(pd.Series([0], dtype='int64'))
513513
tm.assert_frame_equal(actual_header_none, expected_header_none)
514-
expected_header_zero = DataFrame(columns=[0], dtype='int64')
514+
expected_header_zero = DataFrame(columns=[0])
515515
tm.assert_frame_equal(actual_header_zero, expected_header_zero)
516516

517517
def test_set_column_names_in_parameter(self):

0 commit comments

Comments
 (0)