Commit fe556d6

BUG: Fix initialization of DataFrame from dict with NaN as key
closes #18455 closes #19646
1 parent d2ab407 commit fe556d6

File tree

7 files changed: +79 -43 lines changed

doc/source/whatsnew/v0.23.0.txt (+4)

@@ -1135,6 +1135,10 @@ Reshaping
 - Bug in :func:`DataFrame.unstack` which casts int to float if ``columns`` is a ``MultiIndex`` with unused levels (:issue:`17845`)
 - Bug in :func:`DataFrame.unstack` which raises an error if ``index`` is a ``MultiIndex`` with unused labels on the unstacked level (:issue:`18562`)
 - Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`)
+- Fixed construction of a :class:`DataFrame` from a ``dict`` containing ``NaN`` as key (:issue:`18455`)
+- Suppressed error in the construction of a :class:`DataFrame` from a ``dict`` containing scalar values when the corresponding keys are not included in the passed index (:issue:`18600`)
+- Fixed (changed from ``object`` to ``float64``) dtype of :class:`DataFrame` initialized with axes, no data, and ``dtype=int`` (:issue:`19646`)
 - Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`)
 - Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`)
 - Bug in :func:`DataFrame.pivot_table` which fails when the ``aggfunc`` arg is of type string. The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`)
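
The three new entries above describe user-visible changes. A rough sketch of the post-commit behavior (mirroring the tests added in this commit, not taken verbatim from them):

    import numpy as np
    import pandas as pd

    # GH 18455: a dict with NaN among its keys can now be used to build a DataFrame
    idx = ['a', np.nan]
    data = {1: pd.Series([0, 3], index=idx),
            np.nan: pd.Series([1, 4], index=idx),
            3: pd.Series([2, 5], index=idx)}
    df = pd.DataFrame(data, index=idx, columns=[1, np.nan, 3])

    # GH 18600: a scalar value whose key is not in the passed columns no longer raises
    pd.DataFrame({'a': 0.7}, columns=['b'])   # empty frame with the single column 'b'

    # GH 19646: axes plus dtype=int but no data now produce float64 values, not object
    empty = pd.DataFrame(index=range(10), columns=['a', 'b'], dtype=int)
    assert empty.values.dtype == np.dtype('float64')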

pandas/core/frame.py (+16 -34)

@@ -27,6 +27,7 @@
 from pandas.core.dtypes.cast import (
     maybe_upcast,
     cast_scalar_to_array,
+    construct_1d_arraylike_from_scalar,
     maybe_cast_to_datetime,
     maybe_infer_to_datetimelike,
     maybe_convert_platform,
@@ -429,44 +430,27 @@ def _init_dict(self, data, index, columns, dtype=None):
         Needs to handle a lot of exceptional cases.
         """
         if columns is not None:
-            columns = _ensure_index(columns)
+            arrays = Series(data, index=columns, dtype=object)
+            data_names = arrays.index
 
-            # GH10856
-            # raise ValueError if only scalars in dict
+            missing = arrays.isnull()
             if index is None:
-                extract_index(list(data.values()))
-
-            # prefilter if columns passed
-            data = {k: v for k, v in compat.iteritems(data) if k in columns}
-
-            if index is None:
-                index = extract_index(list(data.values()))
-
+                # GH10856
+                # raise ValueError if only scalars in dict
+                index = extract_index(arrays[~missing])
             else:
                 index = _ensure_index(index)
 
-            arrays = []
-            data_names = []
-            for k in columns:
-                if k not in data:
-                    # no obvious "empty" int column
-                    if dtype is not None and issubclass(dtype.type,
-                                                        np.integer):
-                        continue
-
-                    if dtype is None:
-                        # 1783
-                        v = np.empty(len(index), dtype=object)
-                    elif np.issubdtype(dtype, np.flexible):
-                        v = np.empty(len(index), dtype=object)
-                    else:
-                        v = np.empty(len(index), dtype=dtype)
-
-                    v.fill(np.nan)
+            # no obvious "empty" int column
+            if missing.any() and not is_integer_dtype(dtype):
+                if dtype is None or np.issubdtype(dtype, np.flexible):
+                    # 1783
+                    nan_dtype = object
                 else:
-                    v = data[k]
-                data_names.append(k)
-                arrays.append(v)
+                    nan_dtype = dtype
+                v = construct_1d_arraylike_from_scalar(np.nan, len(index),
+                                                       nan_dtype)
+                arrays.loc[missing] = [v] * missing.sum()
 
         else:
             keys = com._dict_keys_to_ordered_list(data)
@@ -7253,8 +7237,6 @@ def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
     # figure out the index, if necessary
     if index is None:
         index = extract_index(arrays)
-    else:
-        index = _ensure_index(index)
 
     # don't force copy because getting jammed in an ndarray anyway
     arrays = _homogenize(arrays, index, dtype)
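
The heart of the change is in _init_dict: instead of pre-filtering the dict with a ``k in columns`` test and then looping over the columns (both of which are unreliable when a key is NaN), the dict is wrapped in an object-dtype Series indexed by the requested columns, so pandas' own alignment machinery matches the keys (including NaN), and any column still missing afterwards is filled with an all-NaN array built by construct_1d_arraylike_from_scalar. A standalone sketch of that alignment step (align_dict_to_columns is a hypothetical helper, not part of pandas):

    import numpy as np
    import pandas as pd

    def align_dict_to_columns(data, columns):
        # Wrap the dict in an object-dtype Series keyed by the requested
        # columns; alignment matches NaN keys correctly, and requested
        # columns with no entry in the dict simply come out as NaN.
        arrays = pd.Series(data, index=columns, dtype=object)
        missing = arrays.isnull()
        return arrays, missing

    cols = [1, np.nan, 3, 'absent']
    data = {1: [0, 1], np.nan: [2, 3], 3: [4, 5]}
    arrays, missing = align_dict_to_columns(data, cols)
    print(missing.tolist())   # [False, False, False, True] -> only 'absent' needs a NaN fill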

pandas/core/generic.py (-1)

@@ -7341,7 +7341,6 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
                 if not is_bool_dtype(dt):
                     raise ValueError(msg.format(dtype=dt))
 
-        cond = cond.astype(bool, copy=False)
         cond = -cond if inplace else cond
 
         # try to align with other

pandas/core/internals.py (+1 -1)

@@ -4841,7 +4841,7 @@ def form_blocks(arrays, names, axes):
     items_dict = defaultdict(list)
     extra_locs = []
 
-    names_idx = Index(names)
+    names_idx = _ensure_index(names)
     if names_idx.equals(axes[0]):
         names_indexer = np.arange(len(names_idx))
     else:
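
_ensure_index returns its argument unchanged when it is already an Index, whereas Index(...) builds a new one and applies its usual inference, for example promoting a list of tuples to a MultiIndex. Since form_blocks can now receive the columns Index produced by the new _init_dict (possibly holding tuple or NaN labels), passing it through untouched is the safer choice. The sketch below only illustrates those two behaviors; it is not the pandas implementation, and the reasoning above is an inference, not spelled out in the commit:

    import numpy as np
    import pandas as pd

    def ensure_index_sketch(index_like):
        # Mirrors the contract of pandas' internal _ensure_index: an existing
        # Index is returned as-is, anything else is wrapped in Index(...).
        if isinstance(index_like, pd.Index):
            return index_like
        return pd.Index(index_like)

    labels = [(11, 21), (np.nan, 22)]
    assert isinstance(pd.Index(labels), pd.MultiIndex)    # Index() promotes tuples
    flat = pd.Index(labels, tupleize_cols=False)          # flat Index of tuple labels
    assert ensure_index_sketch(flat) is flat              # left untouched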

pandas/core/series.py (+11 -2)

@@ -24,6 +24,7 @@
     is_extension_array_dtype,
     is_datetime64tz_dtype,
     is_timedelta64_dtype,
+    is_object_dtype,
     is_list_like,
     is_hashable,
     is_iterator,
@@ -38,7 +39,8 @@
     maybe_upcast, infer_dtype_from_scalar,
     maybe_convert_platform,
     maybe_cast_to_datetime, maybe_castable,
-    construct_1d_arraylike_from_scalar)
+    construct_1d_arraylike_from_scalar,
+    construct_1d_object_array_from_listlike)
 from pandas.core.dtypes.missing import isna, notna, remove_na_arraylike
 
 from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
@@ -297,6 +299,7 @@ def _init_dict(self, data, index=None, dtype=None):
         # raises KeyError), so we iterate the entire dict, and align
         if data:
             keys, values = zip(*compat.iteritems(data))
+            values = list(values)
         else:
             keys, values = [], []
 
@@ -4042,7 +4045,13 @@ def _try_cast(arr, take_fast_path):
 
         try:
             subarr = maybe_cast_to_datetime(arr, dtype)
-            if not is_extension_type(subarr):
+            # Take care in creating object arrays (but iterators are not
+            # supported):
+            if is_object_dtype(dtype) and (is_list_like(subarr) and
+                                           not (is_iterator(subarr) or
+                                                isinstance(subarr, np.ndarray))):
+                subarr = construct_1d_object_array_from_listlike(subarr)
+            elif not is_extension_type(subarr):
                 subarr = np.array(subarr, dtype=dtype, copy=copy)
         except (ValueError, TypeError):
            if is_categorical_dtype(dtype):
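
The _try_cast change matters for object dtype: np.array() applied to a list of equal-length list-likes (tuples, lists, even Series) produces a 2-D array, whereas construct_1d_object_array_from_listlike keeps each element intact in a 1-D object array. A standalone sketch of that helper's effect (to_1d_object_array is an illustrative stand-in, not the pandas function itself):

    import numpy as np

    def to_1d_object_array(values):
        # Pre-allocate a 1-D object array and fill it by slice assignment so
        # NumPy does not interpret the nested tuples as a second dimension.
        result = np.empty(len(values), dtype=object)
        result[:] = values
        return result

    tuples = [(11, 21), (np.nan, 22), (13, np.nan)]
    assert np.array(tuples, dtype=object).shape == (3, 2)   # tuples flattened into 2-D
    assert to_1d_object_array(tuples).shape == (3,)         # tuples kept as elements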

pandas/tests/frame/test_constructors.py (+46 -4)

@@ -287,8 +287,50 @@ def test_constructor_dict(self):
         with tm.assert_raises_regex(ValueError, msg):
             DataFrame({'a': 0.7}, columns=['a'])
 
-        with tm.assert_raises_regex(ValueError, msg):
-            DataFrame({'a': 0.7}, columns=['b'])
+    @pytest.mark.parametrize("scalar", [2, np.nan, None, 'D'])
+    def test_constructor_invalid_items_unused(self, scalar):
+        # No error if invalid (scalar) value is in fact not used:
+        result = DataFrame({'a': scalar}, columns=['b'])
+        expected = DataFrame(columns=['b'])
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
+    def test_constructor_dict_nan_key(self, value):
+        # GH 18455
+        cols = [1, value, 3]
+        idx = ['a', value]
+        values = [[0, 3], [1, 4], [2, 5]]
+        data = {cols[c]: Series(values[c], index=idx) for c in range(3)}
+        result = DataFrame(data).sort_values(1).sort_values('a', axis=1)
+        expected = DataFrame(np.arange(6, dtype='int64').reshape(2, 3),
+                             index=idx, columns=cols)
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(data, index=idx).sort_values('a', axis=1)
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(data, index=idx, columns=cols)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("value", [np.nan, None, float('nan')])
+    def test_constructor_dict_nan_tuple_key(self, value):
+        # GH 18455
+        cols = Index([(11, 21), (value, 22), (13, value)])
+        idx = Index([('a', value), (value, 2)])
+        values = [[0, 3], [1, 4], [2, 5]]
+        data = {cols[c]: Series(values[c], index=idx) for c in range(3)}
+        result = (DataFrame(data)
+                  .sort_values((11, 21))
+                  .sort_values(('a', value), axis=1))
+        expected = DataFrame(np.arange(6, dtype='int64').reshape(2, 3),
+                             index=idx, columns=cols)
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(data, index=idx).sort_values(('a', value), axis=1)
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(data, index=idx, columns=cols)
+        tm.assert_frame_equal(result, expected)
 
     @pytest.mark.skipif(not PY36, reason='Insertion order for Python>=3.6')
     def test_constructor_dict_order_insertion(self):
@@ -753,15 +795,15 @@ def test_constructor_corner(self):
 
         # does not error but ends up float
         df = DataFrame(index=lrange(10), columns=['a', 'b'], dtype=int)
-        assert df.values.dtype == np.object_
+        assert df.values.dtype == np.dtype('float64')
 
         # #1783 empty dtype object
         df = DataFrame({}, columns=['foo', 'bar'])
         assert df.values.dtype == np.object_
 
        df = DataFrame({'b': 1}, index=lrange(10), columns=list('abc'),
                        dtype=int)
-        assert df.values.dtype == np.object_
+        assert df.values.dtype == np.dtype('float64')
 
     def test_constructor_scalar_inference(self):
         data = {'int': 1, 'bool': True,

pandas/tests/io/test_excel.py (+1 -1)

@@ -461,7 +461,7 @@ def test_read_one_empty_col_with_header(self, ext):
         )
         expected_header_none = DataFrame(pd.Series([0], dtype='int64'))
         tm.assert_frame_equal(actual_header_none, expected_header_none)
-        expected_header_zero = DataFrame(columns=[0], dtype='int64')
+        expected_header_zero = DataFrame(columns=[0])
         tm.assert_frame_equal(actual_header_zero, expected_header_zero)
 
     @td.skip_if_no('openpyxl')
