Skip to content

Commit d270bbb

Browse files
toobaz authored and jreback committed
Construction of Series from dict containing NaN as key (#18496)
closes #18480 closes #18515
1 parent f7df0ff commit d270bbb

File tree

9 files changed

+96
-65
lines changed

9 files changed

+96
-65
lines changed

doc/source/whatsnew/v0.22.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ Other API Changes
9999

100100
- :func:`Series.astype` and :func:`Index.astype` with an incompatible dtype will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`18231`)
101101
- ``Series`` construction with an ``object`` dtyped tz-aware datetime and ``dtype=object`` specified will now return an ``object`` dtyped ``Series``; previously this would infer the datetime dtype (:issue:`18231`)
102+
- A :class:`Series` of ``dtype=category`` constructed from an empty ``dict`` will now have categories of ``dtype=object`` rather than ``dtype=float64``, consistent with the case in which an empty list is passed (:issue:`18515`)
102103
- ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`)
103104
- All-NaN levels in a ``MultiIndex`` are now assigned ``float`` rather than ``object`` dtype, promoting consistency with ``Index`` (:issue:`17929`).
104105
- :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`)
@@ -242,5 +243,6 @@ Other
242243

243244
- Improved error message when attempting to use a Python keyword as an identifier in a numexpr query (:issue:`18221`)
244245
- Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values would result in a Series whose dtype is tz-aware instead of object (:issue:`16406`)
246+
- Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`)
245247
- Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`)
246248
-

pandas/core/base.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -874,9 +874,8 @@ def _map_values(self, mapper, na_action=None):
874874
# convert to a Series for efficiency.
875875
# we specify the keys here to handle the
876876
# possibility that they are tuples
877-
from pandas import Series, Index
878-
index = Index(mapper, tupleize_cols=False)
879-
mapper = Series(mapper, index=index)
877+
from pandas import Series
878+
mapper = Series(mapper)
880879

881880
if isinstance(mapper, ABCSeries):
882881
# Since values were input this means we came from either

pandas/core/indexes/base.py

-21
Original file line numberDiff line numberDiff line change
@@ -2822,27 +2822,6 @@ def get_indexer_for(self, target, **kwargs):
28222822
indexer, _ = self.get_indexer_non_unique(target, **kwargs)
28232823
return indexer
28242824

2825-
_index_shared_docs['_get_values_from_dict'] = """
2826-
Return the values of the input dictionary in the order the keys are
2827-
in the index. np.nan is returned for index values not in the
2828-
dictionary.
2829-
2830-
Parameters
2831-
----------
2832-
data : dict
2833-
The dictionary from which to extract the values
2834-
2835-
Returns
2836-
-------
2837-
np.array
2838-
2839-
"""
2840-
2841-
@Appender(_index_shared_docs['_get_values_from_dict'])
2842-
def _get_values_from_dict(self, data):
2843-
return lib.fast_multiget(data, self.values,
2844-
default=np.nan)
2845-
28462825
def _maybe_promote(self, other):
28472826
# A hack, but it works
28482827
from pandas.core.indexes.datetimes import DatetimeIndex

pandas/core/indexes/datetimelike.py

-8
Original file line numberDiff line numberDiff line change
@@ -700,14 +700,6 @@ def __rsub__(self, other):
700700
def _add_delta(self, other):
701701
return NotImplemented
702702

703-
@Appender(_index_shared_docs['_get_values_from_dict'])
704-
def _get_values_from_dict(self, data):
705-
if len(data):
706-
return np.array([data.get(i, np.nan)
707-
for i in self.asobject.values])
708-
709-
return np.array([np.nan])
710-
711703
def _add_delta_td(self, other):
712704
# add a delta of a timedeltalike
713705
# return the i8 result view

pandas/core/indexes/datetimes.py

-11
Original file line numberDiff line numberDiff line change
@@ -1457,17 +1457,6 @@ def get_value_maybe_box(self, series, key):
14571457
key, tz=self.tz)
14581458
return _maybe_box(self, values, series, key)
14591459

1460-
@Appender(_index_shared_docs['_get_values_from_dict'])
1461-
def _get_values_from_dict(self, data):
1462-
if len(data):
1463-
# coerce back to datetime objects for lookup
1464-
data = com._dict_compat(data)
1465-
return lib.fast_multiget(data,
1466-
self.asobject.values,
1467-
default=np.nan)
1468-
1469-
return np.array([np.nan])
1470-
14711460
def get_loc(self, key, method=None, tolerance=None):
14721461
"""
14731462
Get integer location for requested label

pandas/core/series.py

+42-13
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@
4242
_default_index,
4343
_asarray_tuplesafe,
4444
_values_from_object,
45-
_try_sort,
4645
_maybe_match_name,
4746
SettingWithCopyError,
4847
_maybe_box_datetimelike,
@@ -198,18 +197,9 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
198197
data = data.reindex(index, copy=copy)
199198
data = data._data
200199
elif isinstance(data, dict):
201-
if index is None:
202-
if isinstance(data, OrderedDict):
203-
index = Index(data)
204-
else:
205-
index = Index(_try_sort(data))
206-
207-
try:
208-
data = index._get_values_from_dict(data)
209-
except TypeError:
210-
data = ([data.get(i, np.nan) for i in index]
211-
if data else np.nan)
212-
200+
data, index = self._init_dict(data, index, dtype)
201+
dtype = None
202+
copy = False
213203
elif isinstance(data, SingleBlockManager):
214204
if index is None:
215205
index = data.index
@@ -257,6 +247,45 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
257247
self.name = name
258248
self._set_axis(0, index, fastpath=True)
259249

250+
def _init_dict(self, data, index=None, dtype=None):
251+
"""
252+
Derive the "_data" and "index" attributes of a new Series from a
253+
dictionary input.
254+
255+
Parameters
256+
----------
257+
data : dict or dict-like
258+
Data used to populate the new Series
259+
index : Index or index-like, default None
260+
index for the new Series: if None, use dict keys
261+
dtype : dtype, default None
262+
dtype for the new Series: if None, infer from data
263+
264+
Returns
265+
-------
266+
_data : BlockManager for the new Series
267+
index : index for the new Series
268+
"""
269+
# Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')]
270+
# raises KeyError), so we iterate the entire dict, and align
271+
if data:
272+
keys, values = zip(*compat.iteritems(data))
273+
else:
274+
keys, values = [], []
275+
276+
# Input is now list-like, so rely on "standard" construction:
277+
s = Series(values, index=keys, dtype=dtype)
278+
279+
# Now we just make sure the order is respected, if any
280+
if index is not None:
281+
s = s.reindex(index, copy=False)
282+
elif not isinstance(data, OrderedDict):
283+
try:
284+
s = s.sort_index()
285+
except TypeError:
286+
pass
287+
return s._data, s.index
288+
260289
@classmethod
261290
def from_array(cls, arr, index=None, name=None, dtype=None, copy=False,
262291
fastpath=False):

pandas/tests/series/test_apply.py

+1
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,7 @@ def test_map_dict_with_tuple_keys(self):
422422
converted to a multi-index, preventing tuple values
423423
from being mapped properly.
424424
"""
425+
# GH 18496
425426
df = pd.DataFrame({'a': [(1, ), (2, ), (3, 4), (5, 6)]})
426427
label_mappings = {(1, ): 'A', (2, ): 'B', (3, 4): 'A', (5, 6): 'B'}
427428

pandas/tests/series/test_combine_concat.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,8 @@ def test_concat_empty_series_dtypes(self):
181181
# categorical
182182
assert pd.concat([Series(dtype='category'),
183183
Series(dtype='category')]).dtype == 'category'
184-
assert pd.concat([Series(dtype='category'),
184+
# GH 18515
185+
assert pd.concat([Series(np.array([]), dtype='category'),
185186
Series(dtype='float64')]).dtype == 'float64'
186187
assert pd.concat([Series(dtype='category'),
187188
Series(dtype='object')]).dtype == 'object'

pandas/tests/series/test_constructors.py

+47-8
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import pytest
55

66
from datetime import datetime, timedelta
7+
from collections import OrderedDict
78

89
from numpy import nan
910
import numpy as np
@@ -79,17 +80,42 @@ def test_constructor(self):
7980
m = MultiIndex.from_arrays([[1, 2], [3, 4]])
8081
pytest.raises(NotImplementedError, Series, m)
8182

82-
def test_constructor_empty(self):
83+
@pytest.mark.parametrize('input_class', [list, dict, OrderedDict])
84+
def test_constructor_empty(self, input_class):
8385
empty = Series()
84-
empty2 = Series([])
86+
empty2 = Series(input_class())
8587

86-
# the are Index() and RangeIndex() which don't compare type equal
88+
# these are Index() and RangeIndex() which don't compare type equal
8789
# but are just .equals
8890
assert_series_equal(empty, empty2, check_index_type=False)
8991

90-
empty = Series(index=lrange(10))
91-
empty2 = Series(np.nan, index=lrange(10))
92-
assert_series_equal(empty, empty2)
92+
# With explicit dtype:
93+
empty = Series(dtype='float64')
94+
empty2 = Series(input_class(), dtype='float64')
95+
assert_series_equal(empty, empty2, check_index_type=False)
96+
97+
# GH 18515 : with dtype=category:
98+
empty = Series(dtype='category')
99+
empty2 = Series(input_class(), dtype='category')
100+
assert_series_equal(empty, empty2, check_index_type=False)
101+
102+
if input_class is not list:
103+
# With index:
104+
empty = Series(index=lrange(10))
105+
empty2 = Series(input_class(), index=lrange(10))
106+
assert_series_equal(empty, empty2)
107+
108+
# With index and dtype float64:
109+
empty = Series(np.nan, index=lrange(10))
110+
empty2 = Series(input_class(), index=lrange(10), dtype='float64')
111+
assert_series_equal(empty, empty2)
112+
113+
@pytest.mark.parametrize('input_arg', [np.nan, float('nan')])
114+
def test_constructor_nan(self, input_arg):
115+
empty = Series(dtype='float64', index=lrange(10))
116+
empty2 = Series(input_arg, index=lrange(10))
117+
118+
assert_series_equal(empty, empty2, check_index_type=False)
93119

94120
def test_constructor_series(self):
95121
index1 = ['d', 'b', 'a', 'c']
@@ -625,6 +651,21 @@ def test_constructor_dict(self):
625651
expected.iloc[1] = 1
626652
assert_series_equal(result, expected)
627653

654+
@pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
655+
def test_constructor_dict_nan_key(self, value):
656+
# GH 18480
657+
d = {1: 'a', value: 'b', float('nan'): 'c', 4: 'd'}
658+
result = Series(d).sort_values()
659+
expected = Series(['a', 'b', 'c', 'd'], index=[1, value, np.nan, 4])
660+
assert_series_equal(result, expected)
661+
662+
# MultiIndex:
663+
d = {(1, 1): 'a', (2, np.nan): 'b', (3, value): 'c'}
664+
result = Series(d).sort_values()
665+
expected = Series(['a', 'b', 'c'],
666+
index=Index([(1, 1), (2, np.nan), (3, value)]))
667+
assert_series_equal(result, expected)
668+
628669
def test_constructor_dict_datetime64_index(self):
629670
# GH 9456
630671

@@ -658,8 +699,6 @@ def test_constructor_tuple_of_tuples(self):
658699
s = Series(data)
659700
assert tuple(s) == data
660701

661-
@pytest.mark.xfail(reason='GH 18480 (Series initialization from dict with '
662-
'NaN keys')
663702
def test_constructor_dict_of_tuples(self):
664703
data = {(1, 2): 3,
665704
(None, 5): 6}

0 commit comments

Comments
 (0)