Skip to content

Commit c2ed35a

Browse files
committed
Merge pull request #5018 from jreback/column_nan
BUG: Fix a bug when indexing np.nan via loc/iloc (GH5016)
2 parents 1f00335 + 96e1bbc commit c2ed35a

File tree

7 files changed

+101
-8
lines changed

7 files changed

+101
-8
lines changed

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,7 @@ Bug Fixes
497497
- Fixed wrong index name during read_csv if using usecols. Applies to c parser only. (:issue:`4201`)
498498
- ``Timestamp`` objects can now appear in the left hand side of a comparison
499499
operation with a ``Series`` or ``DataFrame`` object (:issue:`4982`).
500+
- Fix a bug when indexing with ``np.nan`` via ``iloc/loc`` (:issue:`5016`)
500501

501502
pandas 0.12.0
502503
-------------

pandas/core/index.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,7 @@ def _convert_scalar_indexer(self, key, typ=None):
424424
def to_int():
425425
ikey = int(key)
426426
if ikey != key:
427-
self._convert_indexer_error(key, 'label')
427+
return self._convert_indexer_error(key, 'label')
428428
return ikey
429429

430430
if typ == 'iloc':

pandas/core/indexing.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
# pylint: disable=W0223
22

33
from datetime import datetime
4-
from pandas.core.common import _asarray_tuplesafe, is_list_like
54
from pandas.core.index import Index, MultiIndex, _ensure_index
65
from pandas.compat import range, zip
76
import pandas.compat as compat
87
import pandas.core.common as com
98
from pandas.core.common import (_is_bool_indexer, is_integer_dtype,
9+
_asarray_tuplesafe, is_list_like, isnull,
1010
ABCSeries, ABCDataFrame, ABCPanel)
1111
import pandas.lib as lib
1212

@@ -979,12 +979,20 @@ def _has_valid_type(self, key, axis):
979979
else:
980980

981981
def error():
982+
if isnull(key):
983+
raise ValueError("cannot use label indexing with a null key")
982984
raise KeyError("the label [%s] is not in the [%s]" % (key,self.obj._get_axis_name(axis)))
983985

984-
key = self._convert_scalar_indexer(key, axis)
985986
try:
987+
key = self._convert_scalar_indexer(key, axis)
986988
if not key in ax:
987989
error()
990+
except (TypeError) as e:
991+
992+
# python 3 type errors should be raised
993+
if 'unorderable' in str(e): # pragma: no cover
994+
error()
995+
raise
988996
except:
989997
error()
990998

pandas/core/internals.py

+35-4
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,13 @@ def ref_locs(self):
9797
indexer = self.ref_items.get_indexer(self.items)
9898
indexer = com._ensure_platform_int(indexer)
9999
if (indexer == -1).any():
100-
raise AssertionError('Some block items were not in block '
101-
'ref_items')
100+
101+
# this means that we have nan's in our block
102+
try:
103+
indexer[indexer == -1] = np.arange(len(self.items))[isnull(self.items)]
104+
except:
105+
raise AssertionError('Some block items were not in block '
106+
'ref_items')
102107

103108
self._ref_locs = indexer
104109
return self._ref_locs
@@ -2500,9 +2505,18 @@ def _consolidate_inplace(self):
25002505

25012506
def get(self, item):
25022507
if self.items.is_unique:
2508+
2509+
if isnull(item):
2510+
indexer = np.arange(len(self.items))[isnull(self.items)]
2511+
return self.get_for_nan_indexer(indexer)
2512+
25032513
_, block = self._find_block(item)
25042514
return block.get(item)
25052515
else:
2516+
2517+
if isnull(item):
2518+
raise ValueError("cannot label index with a null key")
2519+
25062520
indexer = self.items.get_loc(item)
25072521
ref_locs = np.array(self._set_ref_locs())
25082522

@@ -2528,14 +2542,31 @@ def get(self, item):
25282542

25292543
def iget(self, i):
25302544
item = self.items[i]
2545+
2546+
# unique
25312547
if self.items.is_unique:
2532-
return self.get(item)
2548+
if notnull(item):
2549+
return self.get(item)
2550+
return self.get_for_nan_indexer(i)
25332551

2534-
# compute the duplicative indexer if needed
25352552
ref_locs = self._set_ref_locs()
25362553
b, loc = ref_locs[i]
25372554
return b.iget(loc)
25382555

2556+
def get_for_nan_indexer(self, indexer):
2557+
2558+
# allow a single nan location indexer
2559+
if not np.isscalar(indexer):
2560+
if len(indexer) == 1:
2561+
indexer = indexer.item()
2562+
else:
2563+
raise ValueError("cannot label index with a null key")
2564+
2565+
# take a nan indexer and return the values
2566+
ref_locs = self._set_ref_locs(do_refs='force')
2567+
b, loc = ref_locs[indexer]
2568+
return b.iget(loc)
2569+
25392570
def get_scalar(self, tup):
25402571
"""
25412572
Retrieve single item

pandas/core/series.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1053,10 +1053,10 @@ def __setitem__(self, key, value):
10531053
except TypeError as e:
10541054
if isinstance(key, tuple) and not isinstance(self.index, MultiIndex):
10551055
raise ValueError("Can only tuple-index with a MultiIndex")
1056+
10561057
# python 3 type errors should be raised
10571058
if 'unorderable' in str(e): # pragma: no cover
10581059
raise IndexError(key)
1059-
# Could not hash item
10601060

10611061
if _is_bool_indexer(key):
10621062
key = _check_bool_indexer(self.index, key)

pandas/hashtable.pyx

+16
Original file line numberDiff line numberDiff line change
@@ -643,6 +643,8 @@ cdef class Float64HashTable(HashTable):
643643

644644
return uniques.to_array()
645645

646+
na_sentinel = object
647+
646648
cdef class PyObjectHashTable(HashTable):
647649
# cdef kh_pymap_t *table
648650

@@ -660,6 +662,8 @@ cdef class PyObjectHashTable(HashTable):
660662
def __contains__(self, object key):
661663
cdef khiter_t k
662664
hash(key)
665+
if key != key or key is None:
666+
key = na_sentinel
663667
k = kh_get_pymap(self.table, <PyObject*>key)
664668
return k != self.table.n_buckets
665669

@@ -669,6 +673,8 @@ cdef class PyObjectHashTable(HashTable):
669673

670674
cpdef get_item(self, object val):
671675
cdef khiter_t k
676+
if val != val or val is None:
677+
val = na_sentinel
672678
k = kh_get_pymap(self.table, <PyObject*>val)
673679
if k != self.table.n_buckets:
674680
return self.table.vals[k]
@@ -677,6 +683,8 @@ cdef class PyObjectHashTable(HashTable):
677683

678684
def get_iter_test(self, object key, Py_ssize_t iterations):
679685
cdef Py_ssize_t i, val
686+
if key != key or key is None:
687+
key = na_sentinel
680688
for i in range(iterations):
681689
k = kh_get_pymap(self.table, <PyObject*>key)
682690
if k != self.table.n_buckets:
@@ -689,6 +697,8 @@ cdef class PyObjectHashTable(HashTable):
689697
char* buf
690698

691699
hash(key)
700+
if key != key or key is None:
701+
key = na_sentinel
692702
k = kh_put_pymap(self.table, <PyObject*>key, &ret)
693703
# self.table.keys[k] = key
694704
if kh_exist_pymap(self.table, k):
@@ -706,6 +716,9 @@ cdef class PyObjectHashTable(HashTable):
706716
for i in range(n):
707717
val = values[i]
708718
hash(val)
719+
if val != val or val is None:
720+
val = na_sentinel
721+
709722
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
710723
self.table.vals[k] = i
711724

@@ -720,6 +733,9 @@ cdef class PyObjectHashTable(HashTable):
720733
for i in range(n):
721734
val = values[i]
722735
hash(val)
736+
if val != val or val is None:
737+
val = na_sentinel
738+
723739
k = kh_get_pymap(self.table, <PyObject*>val)
724740
if k != self.table.n_buckets:
725741
locs[i] = self.table.vals[k]

pandas/tests/test_frame.py

+37
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,8 @@ def test_setitem_clear_caches(self):
642642
def test_setitem_None(self):
643643
# GH #766
644644
self.frame[None] = self.frame['A']
645+
assert_series_equal(self.frame.iloc[:,-1], self.frame['A'])
646+
assert_series_equal(self.frame.loc[:,None], self.frame['A'])
645647
assert_series_equal(self.frame[None], self.frame['A'])
646648
repr(self.frame)
647649

@@ -4475,6 +4477,41 @@ def test_constructor_lists_to_object_dtype(self):
44754477
self.assert_(d['a'].dtype == np.object_)
44764478
self.assert_(d['a'][1] is False)
44774479

4480+
def test_constructor_with_nas(self):
4481+
# GH 5016
4482+
# na's in indices
4483+
4484+
def check(df):
4485+
for i in range(len(df.columns)):
4486+
df.iloc[:,i]
4487+
4488+
# allow single nans to succeed
4489+
indexer = np.arange(len(df.columns))[isnull(df.columns)]
4490+
4491+
if len(indexer) == 1:
4492+
assert_series_equal(df.iloc[:,indexer[0]],df.loc[:,np.nan])
4493+
4494+
4495+
# multiple nans should fail
4496+
else:
4497+
4498+
def f():
4499+
df.loc[:,np.nan]
4500+
self.assertRaises(ValueError, f)
4501+
4502+
4503+
df = DataFrame([[1,2,3],[4,5,6]], index=[1,np.nan])
4504+
check(df)
4505+
4506+
df = DataFrame([[1,2,3],[4,5,6]], columns=[1.1,2.2,np.nan])
4507+
check(df)
4508+
4509+
df = DataFrame([[0,1,2,3],[4,5,6,7]], columns=[np.nan,1.1,2.2,np.nan])
4510+
check(df)
4511+
4512+
df = DataFrame([[0.0,1,2,3.0],[4,5,6,7]], columns=[np.nan,1.1,2.2,np.nan])
4513+
check(df)
4514+
44784515
def test_logical_with_nas(self):
44794516
d = DataFrame({'a': [np.nan, False], 'b': [True, True]})
44804517

0 commit comments

Comments
 (0)