Skip to content

Commit bcdb3e9

Browse files
committed
ENH: Add Index.fillna
1 parent f44a83a commit bcdb3e9

File tree

10 files changed

+412
-63
lines changed

10 files changed

+412
-63
lines changed

doc/source/indexing.rst

+25
Original file line numberDiff line numberDiff line change
@@ -1367,6 +1367,31 @@ with duplicates dropped.
13671367
idx1.sym_diff(idx2)
13681368
idx1 ^ idx2
13691369
1370+
Missing values
1371+
~~~~~~~~~~~~~~
1372+
1373+
.. _indexing.missing:
1374+
1375+
.. versionadded:: 0.17.1
1376+
1377+
.. important::
1378+
1379+
Even though ``Index`` can hold missing values (``NaN``), doing so should be avoided
1380+
if you do not want any unexpected results. For example, some operations
1381+
exclude missing values implicitly.
1382+
1383+
``Index.fillna`` fills missing values with the specified scalar value.
1384+
1385+
.. ipython:: python
1386+
1387+
idx1 = pd.Index([1, np.nan, 3, 4])
1388+
idx1
1389+
idx1.fillna(2)
1390+
1391+
idx2 = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')])
1392+
idx2
1393+
idx2.fillna(pd.Timestamp('2011-01-02'))
1394+
13701395
Set / Reset Index
13711396
-----------------
13721397

doc/source/whatsnew/v0.17.1.txt

+6
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@ Enhancements
2626
- ``DataFrame`` now uses the fields of a ``namedtuple`` as columns, if columns are not supplied (:issue:`11181`)
2727
- Improve the error message displayed in :func:`pandas.io.gbq.to_gbq` when the DataFrame does not match the schema of the destination table (:issue:`11359`)
2828

29+
- ``Index`` now has a ``fillna`` method (:issue:`10089`)
30+
31+
.. ipython:: python
32+
33+
pd.Index([1, np.nan, 3]).fillna(2)
34+
2935
.. _whatsnew_0171.api:
3036

3137
API changes

pandas/core/index.py

+98-27
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
from pandas.compat import range, zip, lrange, lzip, u, map
1616
from pandas import compat
1717
from pandas.core import algorithms
18-
from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate
18+
from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, PandasDelegate
19+
import pandas.core.base as base
1920
from pandas.util.decorators import (Appender, Substitution, cache_readonly,
2021
deprecate, deprecate_kwarg)
2122
import pandas.core.common as com
@@ -29,8 +30,6 @@
2930
from pandas.io.common import PerformanceWarning
3031

3132

32-
33-
3433
# simplify
3534
default_pprint = lambda x, max_seq_items=None: com.pprint_thing(x,
3635
escape_chars=('\t', '\r', '\n'),
@@ -45,6 +44,7 @@
4544

4645
_index_doc_kwargs = dict(klass='Index', inplace='',
4746
duplicated='np.array')
47+
_index_shared_docs = dict()
4848

4949

5050
def _try_get_item(x):
@@ -108,6 +108,7 @@ class Index(IndexOpsMixin, PandasObject):
108108
_allow_datetime_index_ops = False
109109
_allow_period_index_ops = False
110110
_is_numeric_dtype = False
111+
_can_hold_na = True
111112

112113
_engine_type = _index.ObjectEngine
113114

@@ -1236,6 +1237,43 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None):
12361237
taken = self.values.take(indices)
12371238
return self._shallow_copy(taken)
12381239

1240+
@cache_readonly
def _isnan(self):
    """ return, for each value, whether it is nan """
    if not self._can_hold_na:
        # shouldn't reach to this condition by checking hasnans beforehand;
        # an index that cannot hold NA gets an all-False mask of its length
        return np.zeros(len(self), dtype=np.bool_)
    return isnull(self)
1250+
1251+
@cache_readonly
def _nan_idxs(self):
    """ return the positions at which ``_isnan`` is set """
    if not self._can_hold_na:
        # no missing values possible: empty int64 position array
        return np.array([], dtype=np.int64)
    # nonzero() yields a 1-tuple here; unpack its only member
    (positions,) = self._isnan.nonzero()
    return positions
1258+
1259+
@cache_readonly
def hasnans(self):
    """ return if I have any nans; enables various perf speedups """
    # indexes that cannot hold NA answer False without touching _isnan
    return self._isnan.any() if self._can_hold_na else False
1266+
1267+
def _convert_for_op(self, value):
1268+
""" Convert value to be insertable to ndarray """
1269+
return value
1270+
1271+
def _assert_can_do_op(self, value):
    """
    Check value is valid for scalar op.

    Raises
    ------
    TypeError
        If ``lib.isscalar(value)`` is false; the message names the
        type that was passed.
    """
    if lib.isscalar(value):
        return
    type_name = type(value).__name__
    raise TypeError("'value' must be a scalar, passed: {0}".format(type_name))
1276+
12391277
def putmask(self, mask, value):
12401278
"""
12411279
return a new Index of the values set with the mask
@@ -1245,8 +1283,12 @@ def putmask(self, mask, value):
12451283
numpy.ndarray.putmask
12461284
"""
12471285
values = self.values.copy()
1248-
np.putmask(values, mask, value)
1249-
return self._shallow_copy(values)
1286+
try:
1287+
np.putmask(values, mask, self._convert_for_op(value))
1288+
return self._shallow_copy(values)
1289+
except (ValueError, TypeError):
1290+
# coerces to object
1291+
return self.astype(object).putmask(mask, value)
12501292

12511293
def format(self, name=False, formatter=None, **kwargs):
12521294
"""
@@ -2766,15 +2808,45 @@ def drop(self, labels, errors='raise'):
27662808
return self.delete(indexer)
27672809

27682810
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
2769-
@Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs)
2811+
@Appender(base._shared_docs['drop_duplicates'] % _index_doc_kwargs)
27702812
def drop_duplicates(self, keep='first'):
27712813
return super(Index, self).drop_duplicates(keep=keep)
27722814

27732815
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
2774-
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
2816+
@Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
27752817
def duplicated(self, keep='first'):
27762818
return super(Index, self).duplicated(keep=keep)
27772819

2820+
# Shared docstring text for the ``fillna`` methods; attached via ``Appender``.
_index_shared_docs['fillna'] = """
    Fill NA/NaN values with the specified value

    Parameters
    ----------
    value : scalar
        Scalar value to use to fill holes (e.g. 0).
        This value cannot be list-like.
    downcast : dict, default is None
        a dict of item->dtype of what to downcast if possible,
        or the string 'infer' which will try to downcast to an appropriate
        equal type (e.g. float64 to int64 if possible)

    Returns
    -------
    filled : Index
    """
2837+
2838+
@Appender(_index_shared_docs['fillna'])
def fillna(self, value=None, downcast=None):
    # Validate first: non-scalar fill values raise TypeError.
    self._assert_can_do_op(value)

    if not self.hasnans:
        # nothing to fill; hand back a shallow copy
        return self._shallow_copy()

    if downcast is not None:
        # Previously a non-None ``downcast`` fell through to the shallow-copy
        # return, silently handing back the index with its NaNs unfilled.
        # Fail loudly instead of returning wrong data.
        raise NotImplementedError(
            "{0}.fillna does not support 'downcast' argument values "
            "other than 'None'.".format(type(self).__name__))

    result = self.putmask(self._isnan, value)
    # no need to care metadata other than name
    # (the original note was truncated; presumably an index holding
    # missing values cannot carry a freq -- TODO confirm)
    return Index(result, name=self.name)
2849+
27782850
def _evaluate_with_timedelta_like(self, other, op, opstr):
27792851
raise TypeError("can only perform ops with timedelta like values")
27802852

@@ -3200,6 +3272,16 @@ def __array__(self, dtype=None):
32003272
""" the array interface, return my values """
32013273
return np.array(self._data, dtype=dtype)
32023274

3275+
@cache_readonly
def _isnan(self):
    """ return if each value is nan (code of -1 in the underlying data) """
    missing_code = -1
    return self._data.codes == missing_code
3279+
3280+
@Appender(_index_shared_docs['fillna'])
def fillna(self, value, downcast=None):
    # scalar check first; ``downcast`` is accepted but not used here
    self._assert_can_do_op(value)
    # delegate the fill to the underlying data, then re-wrap keeping the name
    filled = self._data.fillna(value)
    return CategoricalIndex(filled, name=self.name)
3284+
32033285
def argsort(self, *args, **kwargs):
32043286
return self.values.argsort(*args, **kwargs)
32053287

@@ -3214,7 +3296,7 @@ def is_unique(self):
32143296
return not self.duplicated().any()
32153297

32163298
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
3217-
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
3299+
@Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
32183300
def duplicated(self, keep='first'):
32193301
from pandas.hashtable import duplicated_int64
32203302
return duplicated_int64(self.codes.astype('i8'), keep)
@@ -3612,6 +3694,8 @@ class Int64Index(NumericIndex):
36123694
_inner_indexer = _algos.inner_join_indexer_int64
36133695
_outer_indexer = _algos.outer_join_indexer_int64
36143696

3697+
_can_hold_na = False
3698+
36153699
_engine_type = _index.Int64Engine
36163700

36173701
def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs):
@@ -3646,11 +3730,6 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, *
36463730
def inferred_type(self):
36473731
return 'integer'
36483732

3649-
@cache_readonly
3650-
def hasnans(self):
3651-
# by definition
3652-
return False
3653-
36543733
@property
36553734
def asi8(self):
36563735
# do not cache or you'll create a memory leak
@@ -3872,19 +3951,6 @@ def is_all_dates(self):
38723951
"""
38733952
return False
38743953

3875-
@cache_readonly
3876-
def _nan_idxs(self):
3877-
w, = self._isnan.nonzero()
3878-
return w
3879-
3880-
@cache_readonly
3881-
def _isnan(self):
3882-
return np.isnan(self.values)
3883-
3884-
@cache_readonly
3885-
def hasnans(self):
3886-
return self._isnan.any()
3887-
38883954
@cache_readonly
38893955
def is_unique(self):
38903956
return super(Float64Index, self).is_unique and self._nan_idxs.size < 2
@@ -4409,7 +4475,7 @@ def is_unique(self):
44094475
return not self.duplicated().any()
44104476

44114477
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
4412-
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
4478+
@Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
44134479
def duplicated(self, keep='first'):
44144480
from pandas.core.groupby import get_group_index
44154481
from pandas.hashtable import duplicated_int64
@@ -4419,6 +4485,11 @@ def duplicated(self, keep='first'):
44194485

44204486
return duplicated_int64(ids, keep)
44214487

4488+
@Appender(_index_shared_docs['fillna'])
def fillna(self, value=None, downcast=None):
    # isnull is not implemented for MultiIndex, so filling always raises
    # regardless of the arguments passed
    message = 'isnull is not defined for MultiIndex'
    raise NotImplementedError(message)
4492+
44224493
def get_value(self, series, key):
44234494
# somewhat broken encapsulation
44244495
from pandas.core.indexing import maybe_droplevels

pandas/src/period.pyx

+5-5
Original file line numberDiff line numberDiff line change
@@ -452,7 +452,7 @@ def extract_ordinals(ndarray[object] values, freq):
452452
p = values[i]
453453
ordinals[i] = p.ordinal
454454
if p.freqstr != freqstr:
455-
raise ValueError("%s is wrong freq" % p)
455+
raise ValueError(_DIFFERENT_FREQ_INDEX.format(freqstr, p.freqstr))
456456

457457
return ordinals
458458

@@ -624,8 +624,8 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps,
624624
return result
625625

626626

627-
_DIFFERENT_FREQ_ERROR = "Input has different freq={1} from Period(freq={0})"
628-
627+
_DIFFERENT_FREQ = "Input has different freq={1} from Period(freq={0})"
628+
_DIFFERENT_FREQ_INDEX = "Input has different freq={1} from PeriodIndex(freq={0})"
629629

630630
cdef class Period(object):
631631
"""
@@ -766,7 +766,7 @@ cdef class Period(object):
766766
if isinstance(other, Period):
767767
from pandas.tseries.frequencies import get_freq_code as _gfc
768768
if other.freq != self.freq:
769-
msg = _DIFFERENT_FREQ_ERROR.format(self.freqstr, other.freqstr)
769+
msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr)
770770
raise ValueError(msg)
771771
if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT:
772772
return _nat_scalar_rules[op]
@@ -807,7 +807,7 @@ cdef class Period(object):
807807
else:
808808
ordinal = self.ordinal + other.n
809809
return Period(ordinal=ordinal, freq=self.freq)
810-
msg = _DIFFERENT_FREQ_ERROR.format(self.freqstr, other.freqstr)
810+
msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr)
811811
raise ValueError(msg)
812812
else: # pragma no cover
813813
return NotImplemented

0 commit comments

Comments
 (0)