diff --git a/doc/source/api.rst b/doc/source/api.rst index ca95a739ed661..c5b83e4af6999 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -465,6 +465,7 @@ Missing data handling DataFrame.dropna DataFrame.fillna + DataFrame.replace Reshaping, sorting, transposing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -492,7 +493,6 @@ Combining / joining / merging DataFrame.append DataFrame.join DataFrame.merge - DataFrame.replace DataFrame.update Time series-related diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index 133d83513041e..70db8abf3c503 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -334,6 +334,133 @@ missing and interpolate over them: ser.replace([1, 2, 3], method='pad') +String/Regular Expression Replacement +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + Python strings prefixed with the ``r`` character such as ``r'hello world'`` + are so-called "raw" strings. They have different semantics regarding + backslashes than strings without this prefix. Backslashes in raw strings + will be interpreted as an escaped backslash, e.g., ``r'\' == '\\'``. You + should `read about them + `_ + if this is unclear. + +Replace the '.' with ``nan`` (str -> str) + +.. ipython:: python + + from numpy.random import rand, randn + from numpy import nan + from pandas import DataFrame + d = {'a': range(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} + df = DataFrame(d) + df.replace('.', nan) + +Now do it with a regular expression that removes surrounding whitespace +(regex -> regex) + +.. ipython:: python + + df.replace(r'\s*\.\s*', nan, regex=True) + +Replace a few different values (list -> list) + +.. ipython:: python + + df.replace(['a', '.'], ['b', nan]) + +list of regex -> list of regex + +.. ipython:: python + + df.replace([r'\.', r'(a)'], ['dot', '\1stuff'], regex=True) + +Only search in column ``'b'`` (dict -> dict) + +.. 
ipython:: python + + df.replace({'b': '.'}, {'b': nan}) + +Same as the previous example, but use a regular expression for +searching instead (dict of regex -> dict) + +.. ipython:: python + + df.replace({'b': r'\s*\.\s*'}, {'b': nan}, regex=True) + +You can pass nested dictionaries of regular expressions that use ``regex=True`` + +.. ipython:: python + + df.replace({'b': {'b': r''}}, regex=True) + +or you can pass the nested dictionary like so + +.. ipython:: python + + df.replace(regex={'b': {'b': r'\s*\.\s*'}}) + +You can also use the group of a regular expression match when replacing (dict +of regex -> dict of regex), this works for lists as well + +.. ipython:: python + + df.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, regex=True) + +You can pass a list of regular expressions, of which those that match +will be replaced with a scalar (list of regex -> regex) + +.. ipython:: python + + df.replace([r'\s*\.\*', r'a|b'], nan, regex=True) + +All of the regular expression examples can also be passed with the +``to_replace`` argument as the ``regex`` argument. In this case the ``value`` +argument must be passed explicitly by name or ``regex`` must be a nested +dictionary. The previous example, in this case, would then be + +.. ipython:: python + + df.replace(regex=[r'\s*\.\*', r'a|b'], value=nan) + +This can be convenient if you do not want to pass ``regex=True`` every time you +want to use a regular expression. + +.. note:: + + Anywhere in the above ``replace`` examples that you see a regular expression + a compiled regular expression is valid as well. + +Numeric Replacement +^^^^^^^^^^^^^^^^^^^ + +Similar to ``DataFrame.fillna`` + +.. ipython:: python + + from numpy.random import rand, randn + from numpy import nan + from pandas import DataFrame + from pandas.util.testing import assert_frame_equal + df = DataFrame(randn(10, 2)) + df[rand(df.shape[0]) > 0.5] = 1.5 + df.replace(1.5, nan) + +Replacing more than one value via lists works as well + +.. 
ipython:: python + + df00 = df.values[0, 0] + df.replace([1.5, df00], [nan, 'a']) + df[1].dtype + +You can also operate on the DataFrame in place + +.. ipython:: python + + df.replace(1.5, nan, inplace=True) Missing data casting rules and indexing --------------------------------------- diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt index 4d983905f9aaa..c16eb64631198 100644 --- a/doc/source/v0.11.1.txt +++ b/doc/source/v0.11.1.txt @@ -55,6 +55,9 @@ Enhancements - ``fillna`` methods now raise a ``TypeError`` if the ``value`` parameter is a list or tuple. - Added module for reading and writing Stata files: pandas.io.stata (GH1512_) + - ``DataFrame.replace()`` now allows regular expressions on contained + ``Series`` with object dtype. See the examples section in the regular docs + and the generated documentation for the method for more details. See the `full release notes `__ or issue tracker @@ -70,3 +73,4 @@ on GitHub for a complete list. .. _GH3590: https://github.com/pydata/pandas/issues/3590 .. _GH3435: https://github.com/pydata/pandas/issues/3435 .. _GH1512: https://github.com/pydata/pandas/issues/1512 +.. 
_GH2285: https://github.com/pydata/pandas/issues/2285 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 73f789a9425c6..39742557ccc56 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -17,6 +17,7 @@ import operator import sys import collections +import itertools from numpy import nan as NA import numpy as np @@ -32,7 +33,8 @@ _maybe_convert_indices) from pandas.core.internals import (BlockManager, create_block_manager_from_arrays, - create_block_manager_from_blocks) + create_block_manager_from_blocks, + _re_compilable) from pandas.core.series import Series, _radd_compat import pandas.core.expressions as expressions from pandas.compat.scipy import scoreatpercentile as _quantile @@ -3431,17 +3433,46 @@ def bfill(self, axis=0, inplace=False, limit=None): return self.fillna(method='bfill', axis=axis, inplace=inplace, limit=limit) - def replace(self, to_replace, value=None, method='pad', axis=0, - inplace=False, limit=None): - """ - Replace values given in 'to_replace' with 'value' or using 'method' + def replace(self, to_replace=None, value=None, method='pad', axis=0, + inplace=False, limit=None, regex=False, infer_types=False): + """Replace values given in 'to_replace' with 'value' or using 'method'. Parameters ---------- - value : scalar or dict, default None + to_replace : str, regex, list, dict, Series, numeric, or None + * str or regex: + - str: string exactly matching `to_replace` will be replaced + with `value` + - regex: regexs matching `to_replace` will be replaced with + `value` + * list of str, regex, or numeric: + - First, if `to_replace` and `value` are both lists, they + **must** be the same length. + - Second, if ``regex=True`` then all of the strings in **both** + lists will be interpreted as regexs otherwise they will match + directly. This doesn't matter much for `value` since there + are only a few possible substitution regexes you can use. + - str and regex rules apply as above. 
+ * dict: + - Nested dictionaries, e.g., {'a': {'b': nan}}, are read as + follows: look in column 'a' for the value 'b' and replace it + with nan. You can nest regular expressions as well. Note that + column names (the top-level dictionary keys in a nested + dictionary) **cannot** be regular expressions. + - Keys map to column names and values map to substitution + values. You can treat this as a special case of passing two + lists except that you are specifying the column to search in. + * None: + - This means that the ``regex`` argument must be a string, + compiled regular expression, or list, dict, ndarray or Series + of such elements. If `value` is also ``None`` then this + **must** be a nested dictionary or ``Series``. + See the examples section for examples of each of these. + value : scalar, dict, list, str, regex, default None Value to use to fill holes (e.g. 0), alternately a dict of values specifying which value to use for each column (columns not in the - dict will not be filled) + dict will not be filled). Regular expressions, strings and lists or + dicts of such objects are also allowed. method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad' Method to use for filling holes in reindexed Series pad / ffill: propagate last valid observation forward to next valid @@ -3456,23 +3487,91 @@ def replace(self, to_replace, value=None, method='pad', axis=0, a reference to the filled object, which is self if inplace=True limit : int, default None Maximum size gap to forward or backward fill + regex : bool or same types as `to_replace`, default False + Whether to interpret `to_replace` and/or `value` as regular + expressions. If this is ``True`` then `to_replace` *must* be a + string. Otherwise, `to_replace` must be ``None`` because this + parameter will be interpreted as a regular expression or a list, + dict, or array of regular expressions. + infer_types : bool, default False + If ``True`` attempt to convert object blocks to a better dtype. 
See also -------- - reindex, asfreq + reindex, asfreq, fillna, interpolate Returns ------- filled : DataFrame - """ + + Raises + ------ + AssertionError + * If `regex` is not a ``bool`` and `to_replace` is not ``None``. + TypeError + * If `to_replace` is a ``dict`` and `value` is not a ``list``, + ``dict``, ``ndarray``, or ``Series`` + * If `to_replace` is ``None`` and `regex` is not compilable into a + regular expression or is a list, dict, ndarray, or Series. + ValueError + * If `to_replace` and `value` are ``list`` s or ``ndarray`` s, but + they are not the same length. + + Notes + ----- + * Regex substitution is performed under the hood with ``re.sub``. The + rules for substitution for ``re.sub`` are the same. + * Regular expressions will only substitute on strings, meaning you + cannot provide, for example, a regular expression matching floating + point numbers and expect the columns in your frame that have a + numeric dtype to be matched. However, if those floating point numbers + *are* strings, then you can do this. + * This method has *a lot* of options. You are encouraged to experiment + and play with this method to gain intuition about how it works. 
+ """ + if not isinstance(regex, bool) and to_replace is not None: + raise AssertionError("'to_replace' must be 'None' if 'regex' is " + "not a bool") self._consolidate_inplace() axis = self._get_axis_number(axis) + method = com._clean_fill_method(method) if value is None: - return self._interpolate(to_replace, method, axis, inplace, limit) + if not isinstance(to_replace, (dict, Series)): + if not isinstance(regex, (dict, Series)): + raise TypeError('If "to_replace" and "value" are both None' + ' then regex must be a mapping') + to_replace = regex + regex = True + + items = to_replace.items() + keys, values = itertools.izip(*items) + + are_mappings = [isinstance(v, (dict, Series)) for v in values] + + if any(are_mappings): + if not all(are_mappings): + raise TypeError("If a nested mapping is passed, all values" + " of the top level mapping must be " + "mappings") + # passed a nested dict/Series + to_rep_dict = {} + value_dict = {} + + for k, v in items: + to_rep_dict[k] = v.keys() + value_dict[k] = v.values() + + to_replace, value = to_rep_dict, value_dict + else: + to_replace, value = keys, values + + return self.replace(to_replace, value, method=method, axis=axis, + inplace=inplace, limit=limit, regex=regex, + infer_types=infer_types) else: - if len(self.columns) == 0: + if not len(self.columns): return self new_data = self._data @@ -3483,17 +3582,20 @@ def replace(self, to_replace, value=None, method='pad', axis=0, if c in value and c in self: new_data = new_data.replace(src, value[c], filter=[ c ], - inplace=inplace) + inplace=inplace, + regex=regex) - elif not isinstance(value, (list, np.ndarray)): + elif not isinstance(value, (list, np.ndarray)): # {'A': NA} -> 0 new_data = self._data for k, src in to_replace.iteritems(): if k in self: new_data = new_data.replace(src, value, filter = [ k ], - inplace=inplace) + inplace=inplace, + regex=regex) else: - raise ValueError('Fill value must be scalar or dict or Series') + raise TypeError('Fill value must be scalar, 
dict, or ' + 'Series') elif isinstance(to_replace, (list, np.ndarray)): # [NA, ''] -> [0, 'missing'] @@ -3504,63 +3606,93 @@ def replace(self, to_replace, value=None, method='pad', axis=0, (len(to_replace), len(value))) new_data = self._data.replace_list(to_replace, value, - inplace=inplace) + inplace=inplace, + regex=regex) else: # [NA, ''] -> 0 new_data = self._data.replace(to_replace, value, - inplace=inplace) - + inplace=inplace, regex=regex) + elif to_replace is None: + if not (_re_compilable(regex) or + isinstance(regex, (list, dict, np.ndarray, Series))): + raise TypeError("'regex' must be a string or a compiled " + "regular expression or a list or dict of " + "strings or regular expressions, you " + "passed a {0}".format(type(regex))) + return self.replace(regex, value, method=method, axis=axis, + inplace=inplace, limit=limit, regex=True, + infer_types=infer_types) else: # dest iterable dict-like if isinstance(value, (dict, Series)): # NA -> {'A' : 0, 'B' : -1} - new_data = self._data + for k, v in value.iteritems(): if k in self: new_data = new_data.replace(to_replace, v, filter=[ k ], - inplace=inplace) + inplace=inplace, + regex=regex) elif not isinstance(value, (list, np.ndarray)): # NA -> 0 new_data = self._data.replace(to_replace, value, - inplace=inplace) + inplace=inplace, regex=regex) else: - raise ValueError('Invalid to_replace type: %s' % - type(to_replace)) # pragma: no cover + raise TypeError('Invalid "to_replace" type: ' + '{0}'.format(type(to_replace))) # pragma: no cover + if infer_types: + new_data = new_data.convert() if inplace: self._data = new_data else: return self._constructor(new_data) - def _interpolate(self, to_replace, method, axis, inplace, limit): + def interpolate(self, to_replace, method='pad', axis=0, inplace=False, + limit=None): + """Interpolate values according to different methods. 
+ + Parameters + ---------- + to_replace : dict, Series + method : str + axis : int + inplace : bool + limit : int, default None + + Returns + ------- + frame : interpolated + + See Also + -------- + reindex, replace, fillna + """ if self._is_mixed_type and axis == 1: return self.T.replace(to_replace, method=method, limit=limit).T method = com._clean_fill_method(method) if isinstance(to_replace, (dict, Series)): - if axis == 1: - return self.T.replace(to_replace, method=method, - limit=limit).T - - rs = self if inplace else self.copy() - for k, v in to_replace.iteritems(): - if k in rs: - rs[k].replace(v, method=method, limit=limit, - inplace=True) - return rs if not inplace else None - + if axis == 0: + return self.replace(to_replace, method=method, inplace=inplace, + limit=limit, axis=axis) + elif axis == 1: + obj = self.T + if inplace: + obj.replace(to_replace, method=method, limit=limit, + inplace=inplace, axis=0) + return obj.T + return obj.replace(to_replace, method=method, limit=limit, + inplace=inplace, axis=0).T + else: + raise ValueError('Invalid value for axis') else: - - new_data = self._data.interpolate(method = method, - axis = axis, - limit = limit, - inplace = inplace, - missing = to_replace, - coerce = False) + new_data = self._data.interpolate(method=method, axis=axis, + limit=limit, inplace=inplace, + missing=to_replace, coerce=False) if inplace: self._data = new_data diff --git a/pandas/core/internals.py b/pandas/core/internals.py index d058d20427ad7..849776940512e 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1,5 +1,7 @@ import itertools +import re from datetime import datetime +import collections from numpy import nan import numpy as np @@ -16,6 +18,10 @@ from pandas.util import py3compat +def _re_compilable(ex): + return isinstance(ex, (basestring, re._pattern_type)) + + class Block(object): """ Canonical n-dimensional unit of homogeneous dtype contained in a pandas @@ -318,9 +324,12 @@ def to_native_types(self, 
slicer=None, na_rep='', **kwargs): values[mask] = na_rep return values.tolist() - def replace(self, to_replace, value, inplace=False, filter=None): - """ replace the to_replace value with value, possible to create new blocks here - this is just a call to putmask """ + def replace(self, to_replace, value, inplace=False, filter=None, + regex=False): + """ replace the to_replace value with value, possible to create new + blocks here this is just a call to putmask. regex is not used here. + It is used in ObjectBlocks. It is here for API + compatibility.""" mask = com.mask_missing(self.values, to_replace) if filter is not None: for i, item in enumerate(self.items): @@ -750,6 +759,101 @@ def should_store(self, value): (np.integer, np.floating, np.complexfloating, np.datetime64, np.bool_)) + def replace(self, to_replace, value, inplace=False, filter=None, + regex=False): + blk = [self] + to_rep_is_list = (isinstance(to_replace, collections.Iterable) and not + isinstance(to_replace, basestring)) + value_is_list = (isinstance(value, collections.Iterable) and not + isinstance(to_replace, basestring)) + both_lists = to_rep_is_list and value_is_list + either_list = to_rep_is_list or value_is_list + + if not either_list and not regex: + blk = super(ObjectBlock, self).replace(to_replace, value, + inplace=inplace, + filter=filter, regex=regex) + elif both_lists and regex: + for to_rep, v in itertools.izip(to_replace, value): + blk[0], = blk[0]._replace_single(to_rep, v, inplace=inplace, + filter=filter, regex=regex) + elif to_rep_is_list and regex: + for to_rep in to_replace: + blk[0], = blk[0]._replace_single(to_rep, value, + inplace=inplace, + filter=filter, regex=regex) + else: + blk[0], = blk[0]._replace_single(to_replace, value, + inplace=inplace, filter=filter, + regex=regex) + return blk + + def _replace_single(self, to_replace, value, inplace=False, filter=None, + regex=False): + # to_replace is regex compilable + to_rep_re = _re_compilable(to_replace) + + # regex is 
regex compilable + regex_re = _re_compilable(regex) + + if to_rep_re and regex_re: + raise AssertionError('only one of to_replace and regex can be ' + 'regex compilable') + + if regex_re: + to_replace = regex + + regex = regex_re or to_rep_re + + # try to get the pattern attribute (compiled re) or it's a string + try: + pattern = to_replace.pattern + except AttributeError: + pattern = to_replace + + # if the pattern is not empty and to_replace is either a string or a + # regex + if regex and pattern: + rx = re.compile(to_replace) + else: + # if the thing to replace is not a string or compiled regex call + # the superclass method -> to_replace is some kind of object + return super(ObjectBlock, self).replace(to_replace, value, + inplace=inplace, + filter=filter, regex=regex) + + new_values = self.values if inplace else self.values.copy() + + # deal with replacing values with objects (strings) that match but + # whose replacement is not a string (numeric, nan, object) + if isnull(value) or not isinstance(value, basestring): + def re_replacer(s): + try: + return value if rx.search(s) is not None else s + except TypeError: + return s + else: + # value is guaranteed to be a string here, s can be either a string + # or null if it's null it gets returned + def re_replacer(s): + try: + return rx.sub(value, s) + except TypeError: + return s + + f = np.vectorize(re_replacer, otypes=[self.dtype]) + + try: + filt = map(self.items.get_loc, filter) + except TypeError: + filt = slice(None) + + new_values[filt] = f(new_values[filt]) + + return [self if inplace else make_block(new_values, self.items, + self.ref_items, fastpath=True)] + + class DatetimeBlock(Block): _can_hold_na = True @@ -1136,7 +1240,9 @@ def _verify_integrity(self): if len(self.items) != tot_items: raise AssertionError('Number of manager items must equal union of ' - 'block items') + 'block items\n# manager items: {0}, # ' + 'tot_items: {1}'.format(len(self.items), + tot_items)) def apply(self, f, *args, 
**kwargs): """ iterate over the blocks, collect and create a new block manager @@ -1203,7 +1309,7 @@ def convert(self, *args, **kwargs): def replace(self, *args, **kwargs): return self.apply('replace', *args, **kwargs) - def replace_list(self, src_lst, dest_lst, inplace=False): + def replace_list(self, src_lst, dest_lst, inplace=False, regex=False): """ do a list replace """ # figure out our mask a-priori to avoid repeated replacements @@ -1220,16 +1326,20 @@ def comp(s): # its possible to get multiple result blocks here # replace ALWAYS will return a list rb = [ blk if inplace else blk.copy() ] - for i, d in enumerate(dest_lst): + for i, (s, d) in enumerate(zip(src_lst, dest_lst)): new_rb = [] for b in rb: - # get our mask for this element, sized to this - # particular block - m = masks[i][b.ref_locs] - if m.any(): - new_rb.extend(b.putmask(m, d, inplace=True)) + if b.dtype == np.object_: + new_rb.extend(b.replace(s, d, inplace=inplace, + regex=regex)) else: - new_rb.append(b) + # get our mask for this element, sized to this + # particular block + m = masks[i][b.ref_locs] + if m.any(): + new_rb.extend(b.putmask(m, d, inplace=True)) + else: + new_rb.append(b) rb = new_rb result_blocks.extend(rb) @@ -2165,7 +2275,6 @@ def _lcd_dtype(l): else: return _lcd_dtype(counts[FloatBlock]) - def _consolidate(blocks, items): """ Merge blocks having same dtype diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index de49eca7dab1c..8e48ef094c419 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4,7 +4,7 @@ from StringIO import StringIO import cPickle as pickle import operator -import os +import re import unittest import nose @@ -6131,9 +6131,8 @@ def test_replace_inplace(self): res = tsframe.replace(nan, 0, inplace=True) assert_frame_equal(tsframe, self.tsframe.fillna(0)) - tsframe = self.tsframe.copy() - res = tsframe.replace(nan, method='pad', inplace=True) - assert_frame_equal(tsframe, self.tsframe.fillna(method='pad')) + 
self.assertRaises(TypeError, self.tsframe.replace, nan, method='pad', + inplace=True) # mixed type self.mixed_frame['foo'][5:20] = nan @@ -6144,9 +6143,499 @@ def test_replace_inplace(self): assert_frame_equal(result, expected) tsframe = self.tsframe.copy() - res = tsframe.replace([nan], [0], inplace=True) + tsframe.replace([nan], [0], inplace=True) assert_frame_equal(tsframe, self.tsframe.fillna(0)) + def test_regex_replace_scalar(self): + obj = {'a': list('ab..'), 'b': list('efgh')} + dfobj = DataFrame(obj) + mix = {'a': range(4), 'b': list('ab..')} + dfmix = DataFrame(mix) + + ### simplest cases + ## regex -> value + # obj frame + res = dfobj.replace(r'\s*\.\s*', nan, regex=True) + assert_frame_equal(dfobj, res.fillna('.')) + + # mixed + res = dfmix.replace(r'\s*\.\s*', nan, regex=True) + assert_frame_equal(dfmix, res.fillna('.')) + + ## regex -> regex + # obj frame + res = dfobj.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True) + objc = obj.copy() + objc['a'] = ['a', 'b', '...', '...'] + expec = DataFrame(objc) + assert_frame_equal(res, expec) + + # with mixed + res = dfmix.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True) + mixc = mix.copy() + mixc['b'] = ['a', 'b', '...', '...'] + expec = DataFrame(mixc) + assert_frame_equal(res, expec) + + # everything with compiled regexs as well + res = dfobj.replace(re.compile(r'\s*\.\s*'), nan, regex=True) + assert_frame_equal(dfobj, res.fillna('.')) + + # mixed + res = dfmix.replace(re.compile(r'\s*\.\s*'), nan, regex=True) + assert_frame_equal(dfmix, res.fillna('.')) + + ## regex -> regex + # obj frame + res = dfobj.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1') + objc = obj.copy() + objc['a'] = ['a', 'b', '...', '...'] + expec = DataFrame(objc) + assert_frame_equal(res, expec) + + # with mixed + res = dfmix.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1') + mixc = mix.copy() + mixc['b'] = ['a', 'b', '...', '...'] + expec = DataFrame(mixc) + assert_frame_equal(res, expec) + + res = dfmix.replace(regex=re.compile(r'\s*(\.)\s*'), 
value=r'\1\1\1') + mixc = mix.copy() + mixc['b'] = ['a', 'b', '...', '...'] + expec = DataFrame(mixc) + assert_frame_equal(res, expec) + + res = dfmix.replace(regex=r'\s*(\.)\s*', value=r'\1\1\1') + mixc = mix.copy() + mixc['b'] = ['a', 'b', '...', '...'] + expec = DataFrame(mixc) + assert_frame_equal(res, expec) + + def test_regex_replace_scalar_inplace(self): + obj = {'a': list('ab..'), 'b': list('efgh')} + dfobj = DataFrame(obj) + mix = {'a': range(4), 'b': list('ab..')} + dfmix = DataFrame(mix) + + ### simplest cases + ## regex -> value + # obj frame + res = dfobj.copy() + res.replace(r'\s*\.\s*', nan, regex=True, inplace=True) + assert_frame_equal(dfobj, res.fillna('.')) + + # mixed + res = dfmix.copy() + res.replace(r'\s*\.\s*', nan, regex=True, inplace=True) + assert_frame_equal(dfmix, res.fillna('.')) + + ## regex -> regex + # obj frame + res = dfobj.copy() + res.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True, inplace=True) + objc = obj.copy() + objc['a'] = ['a', 'b', '...', '...'] + expec = DataFrame(objc) + assert_frame_equal(res, expec) + + # with mixed + res = dfmix.copy() + res.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True, inplace=True) + mixc = mix.copy() + mixc['b'] = ['a', 'b', '...', '...'] + expec = DataFrame(mixc) + assert_frame_equal(res, expec) + + # everything with compiled regexs as well + res = dfobj.copy() + res.replace(re.compile(r'\s*\.\s*'), nan, regex=True, inplace=True) + assert_frame_equal(dfobj, res.fillna('.')) + + # mixed + res = dfmix.copy() + res.replace(re.compile(r'\s*\.\s*'), nan, regex=True, inplace=True) + assert_frame_equal(dfmix, res.fillna('.')) + + ## regex -> regex + # obj frame + res = dfobj.copy() + res.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1', regex=True, + inplace=True) + objc = obj.copy() + objc['a'] = ['a', 'b', '...', '...'] + expec = DataFrame(objc) + assert_frame_equal(res, expec) + + # with mixed + res = dfmix.copy() + res.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1', regex=True, + inplace=True) + mixc = 
mix.copy() + mixc['b'] = ['a', 'b', '...', '...'] + expec = DataFrame(mixc) + assert_frame_equal(res, expec) + + res = dfobj.copy() + res.replace(regex=r'\s*\.\s*', value=nan, inplace=True) + assert_frame_equal(dfobj, res.fillna('.')) + + # mixed + res = dfmix.copy() + res.replace(regex=r'\s*\.\s*', value=nan, inplace=True) + assert_frame_equal(dfmix, res.fillna('.')) + + ## regex -> regex + # obj frame + res = dfobj.copy() + res.replace(regex=r'\s*(\.)\s*', value=r'\1\1\1', inplace=True) + objc = obj.copy() + objc['a'] = ['a', 'b', '...', '...'] + expec = DataFrame(objc) + assert_frame_equal(res, expec) + + # with mixed + res = dfmix.copy() + res.replace(regex=r'\s*(\.)\s*', value=r'\1\1\1', inplace=True) + mixc = mix.copy() + mixc['b'] = ['a', 'b', '...', '...'] + expec = DataFrame(mixc) + assert_frame_equal(res, expec) + + # everything with compiled regexs as well + res = dfobj.copy() + res.replace(regex=re.compile(r'\s*\.\s*'), value=nan, inplace=True) + assert_frame_equal(dfobj, res.fillna('.')) + + # mixed + res = dfmix.copy() + res.replace(regex=re.compile(r'\s*\.\s*'), value=nan, inplace=True) + assert_frame_equal(dfmix, res.fillna('.')) + + ## regex -> regex + # obj frame + res = dfobj.copy() + res.replace(regex=re.compile(r'\s*(\.)\s*'), value=r'\1\1\1', + inplace=True) + objc = obj.copy() + objc['a'] = ['a', 'b', '...', '...'] + expec = DataFrame(objc) + assert_frame_equal(res, expec) + + # with mixed + res = dfmix.copy() + res.replace(regex=re.compile(r'\s*(\.)\s*'), value=r'\1\1\1', + inplace=True) + mixc = mix.copy() + mixc['b'] = ['a', 'b', '...', '...'] + expec = DataFrame(mixc) + assert_frame_equal(res, expec) + + def test_regex_replace_list_obj(self): + obj = {'a': list('ab..'), 'b': list('efgh'), 'c': list('helo')} + dfobj = DataFrame(obj) + + ## lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] + to_replace_res = [r'\s*\.\s*', r'e|f|g'] + values = [nan, 'crap'] + res = dfobj.replace(to_replace_res, values, 
regex=True) + expec = DataFrame({'a': ['a', 'b', nan, nan], 'b': ['crap'] * 3 + + ['h'], 'c': ['h', 'crap', 'l', 'o']}) + assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + to_replace_res = [r'\s*(\.)\s*', r'(e|f|g)'] + values = [r'\1\1', r'\1_crap'] + res = dfobj.replace(to_replace_res, values, regex=True) + expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['e_crap', + 'f_crap', + 'g_crap', 'h'], + 'c': ['h', 'e_crap', 'l', 'o']}) + + assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + to_replace_res = [r'\s*(\.)\s*', r'e'] + values = [r'\1\1', r'crap'] + res = dfobj.replace(to_replace_res, values, regex=True) + expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['crap', 'f', 'g', + 'h'], + 'c': ['h', 'crap', 'l', 'o']}) + assert_frame_equal(res, expec) + + to_replace_res = [r'\s*(\.)\s*', r'e'] + values = [r'\1\1', r'crap'] + res = dfobj.replace(value=values, regex=to_replace_res) + expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['crap', 'f', 'g', + 'h'], + 'c': ['h', 'crap', 'l', 'o']}) + assert_frame_equal(res, expec) + + def test_regex_replace_list_obj_inplace(self): + ### same as above with inplace=True + ## lists of regexes and values + obj = {'a': list('ab..'), 'b': list('efgh'), 'c': list('helo')} + dfobj = DataFrame(obj) + + ## lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] + to_replace_res = [r'\s*\.\s*', r'e|f|g'] + values = [nan, 'crap'] + res = dfobj.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = DataFrame({'a': ['a', 'b', nan, nan], 'b': ['crap'] * 3 + + ['h'], 'c': ['h', 'crap', 'l', 'o']}) + assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + to_replace_res = [r'\s*(\.)\s*', r'(e|f|g)'] + values = [r'\1\1', r'\1_crap'] + res = dfobj.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = 
DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['e_crap', + 'f_crap', + 'g_crap', 'h'], + 'c': ['h', 'e_crap', 'l', 'o']}) + + assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + to_replace_res = [r'\s*(\.)\s*', r'e'] + values = [r'\1\1', r'crap'] + res = dfobj.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['crap', 'f', 'g', + 'h'], + 'c': ['h', 'crap', 'l', 'o']}) + assert_frame_equal(res, expec) + + to_replace_res = [r'\s*(\.)\s*', r'e'] + values = [r'\1\1', r'crap'] + res = dfobj.copy() + res.replace(value=values, regex=to_replace_res, inplace=True) + expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['crap', 'f', 'g', + 'h'], + 'c': ['h', 'crap', 'l', 'o']}) + assert_frame_equal(res, expec) + + def test_regex_replace_list_mixed(self): + ## mixed frame to make sure this doesn't break things + mix = {'a': range(4), 'b': list('ab..')} + dfmix = DataFrame(mix) + + ## lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] + to_replace_res = [r'\s*\.\s*', r'a'] + values = [nan, 'crap'] + mix2 = {'a': range(4), 'b': list('ab..'), 'c': list('halo')} + dfmix2 = DataFrame(mix2) + res = dfmix2.replace(to_replace_res, values, regex=True) + expec = DataFrame({'a': mix2['a'], 'b': ['crap', 'b', nan, nan], + 'c': ['h', 'crap', 'l', 'o']}) + assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + to_replace_res = [r'\s*(\.)\s*', r'(a|b)'] + values = [r'\1\1', r'\1_crap'] + res = dfmix.replace(to_replace_res, values, regex=True) + expec = DataFrame({'a': mix['a'], 'b': ['a_crap', 'b_crap', '..', + '..']}) + + assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)'] + values = [r'\1\1', r'crap', r'\1_crap'] + res = dfmix.replace(to_replace_res, values, regex=True) 
+ expec = DataFrame({'a': mix['a'], 'b': ['crap', 'b_crap', '..', '..']}) + assert_frame_equal(res, expec) + + to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)'] + values = [r'\1\1', r'crap', r'\1_crap'] + res = dfmix.replace(regex=to_replace_res, value=values) + expec = DataFrame({'a': mix['a'], 'b': ['crap', 'b_crap', '..', '..']}) + assert_frame_equal(res, expec) + + def test_regex_replace_list_mixed_inplace(self): + mix = {'a': range(4), 'b': list('ab..')} + dfmix = DataFrame(mix) + # the same inplace + ## lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] + to_replace_res = [r'\s*\.\s*', r'a'] + values = [nan, 'crap'] + res = dfmix.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = DataFrame({'a': mix['a'], 'b': ['crap', 'b', nan, nan]}) + assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + to_replace_res = [r'\s*(\.)\s*', r'(a|b)'] + values = [r'\1\1', r'\1_crap'] + res = dfmix.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = DataFrame({'a': mix['a'], 'b': ['a_crap', 'b_crap', '..', + '..']}) + + assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)'] + values = [r'\1\1', r'crap', r'\1_crap'] + res = dfmix.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = DataFrame({'a': mix['a'], 'b': ['crap', 'b_crap', '..', '..']}) + assert_frame_equal(res, expec) + + to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)'] + values = [r'\1\1', r'crap', r'\1_crap'] + res = dfmix.copy() + res.replace(regex=to_replace_res, value=values, inplace=True) + expec = DataFrame({'a': mix['a'], 'b': ['crap', 'b_crap', '..', '..']}) + assert_frame_equal(res, expec) + + def test_regex_replace_dict_mixed(self): + mix = {'a': range(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} + dfmix = DataFrame(mix) + + ## dicts + # single 
dict {re1: v1}, search the whole frame + # need test for this... + + # list of dicts {re1: v1, re2: v2, ..., re3: v3}, search the whole + # frame + res = dfmix.replace({'b': r'\s*\.\s*'}, {'b': nan}, regex=True) + res2 = dfmix.copy() + res2.replace({'b': r'\s*\.\s*'}, {'b': nan}, inplace=True, regex=True) + expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', nan, nan], 'c': + mix['c']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + + # list of dicts {re1: re11, re2: re12, ..., reN: re1N}, search the + # whole frame + res = dfmix.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, regex=True) + res2 = dfmix.copy() + res2.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, inplace=True, + regex=True) + expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', '.ty', '.ty'], 'c': + mix['c']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + + res = dfmix.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'}) + res2 = dfmix.copy() + res2.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'}, + inplace=True) + expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', '.ty', '.ty'], 'c': + mix['c']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + + # scalar -> dict + # to_replace regex, {value: value} + res = dfmix.replace('a', {'b': nan}, regex=True) + res2 = dfmix.copy() + res2.replace('a', {'b': nan}, regex=True, inplace=True) + expec = DataFrame({'a': mix['a'], 'b': [nan, 'b', '.', '.'], 'c': + mix['c']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + + res = dfmix.replace('a', {'b': nan}, regex=True) + res2 = dfmix.copy() + res2.replace(regex='a', value={'b': nan}, inplace=True) + expec = DataFrame({'a': mix['a'], 'b': [nan, 'b', '.', '.'], 'c': + mix['c']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + + def test_regex_replace_dict_nested(self): + # nested dicts will not work until this is implemented for Series + mix = {'a': range(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} 
+ dfmix = DataFrame(mix) + res = dfmix.replace({'b': {r'\s*\.\s*': nan}}, regex=True) + res2 = dfmix.copy() + res2.replace({'b': {r'\s*\.\s*': nan}}, inplace=True, regex=True) + print res2 + expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', nan, nan], 'c': + mix['c']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + + def test_regex_replace_list_to_scalar(self): + mix = {'a': range(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} + df = DataFrame(mix) + res = df.replace([r'\s*\.\s*', 'a|b'], nan, regex=True) + res2 = df.copy() + res3 = df.copy() + res2.replace([r'\s*\.\s*', 'a|b'], nan, regex=True, inplace=True) + res3.replace(regex=[r'\s*\.\s*', 'a|b'], value=nan, inplace=True) + expec = DataFrame({'a': mix['a'], 'b': np.array([nan] * 4, object), + 'c': [nan, nan, nan, 'd']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + assert_frame_equal(res3, expec) + + def test_regex_replace_str_to_numeric(self): + # what happens when you try to replace a numeric value with a regex? 
+ mix = {'a': range(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} + df = DataFrame(mix) + res = df.replace(r'\s*\.\s*', 0, regex=True) + res2 = df.copy() + res2.replace(r'\s*\.\s*', 0, inplace=True, regex=True) + res3 = df.copy() + res3.replace(regex=r'\s*\.\s*', value=0, inplace=True) + expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', 0, 0], 'c': + mix['c']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + assert_frame_equal(res3, expec) + + def test_regex_replace_regex_list_to_numeric(self): + mix = {'a': range(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} + df = DataFrame(mix) + res = df.replace([r'\s*\.\s*', 'b'], 0, regex=True) + res2 = df.copy() + res2.replace([r'\s*\.\s*', 'b'], 0, regex=True, inplace=True) + res3 = df.copy() + res3.replace(regex=[r'\s*\.\s*', 'b'], value=0, inplace=True) + expec = DataFrame({'a': mix['a'], 'b': ['a', 0, 0, 0], 'c': ['a', 0, + nan, + 'd']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + assert_frame_equal(res3, expec) + + def test_regex_replace_series_of_regexes(self): + mix = {'a': range(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} + df = DataFrame(mix) + s1 = Series({'b': r'\s*\.\s*'}) + s2 = Series({'b': nan}) + res = df.replace(s1, s2, regex=True) + res2 = df.copy() + res2.replace(s1, s2, inplace=True, regex=True) + res3 = df.copy() + res3.replace(regex=s1, value=s2, inplace=True) + expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', nan, nan], 'c': + mix['c']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + assert_frame_equal(res3, expec) + + def test_regex_replace_numeric_to_object_conversion(self): + mix = {'a': range(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} + df = DataFrame(mix) + res = df.replace(0, 'a') + expec = DataFrame({'a': ['a', 1, 2, 3], 'b': mix['b'], 'c': mix['c']}) + assert_frame_equal(res, expec) + self.assertEqual(res.a.dtype, np.object_) + def test_replace(self): self.tsframe['A'][:5] = nan self.tsframe['A'][-5:] = 
nan @@ -6163,7 +6652,7 @@ def test_replace(self): df = DataFrame(index=['a', 'b']) assert_frame_equal(df, df.replace(5, 7)) - def test_resplace_series_dict(self): + def test_replace_series_dict(self): # from GH 3064 df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}}) result = df.replace(0, {'zero': 0.5, 'one': 1.0}) @@ -6227,48 +6716,32 @@ def test_replace_mixed(self): expected.iloc[1,1] = m[1] assert_frame_equal(result,expected) - def test_replace_interpolate(self): - padded = self.tsframe.replace(nan, method='pad') - assert_frame_equal(padded, self.tsframe.fillna(method='pad')) + def test_interpolate(self): + pass + + def test_replace_value_is_none(self): + self.assertRaises(TypeError, self.tsframe.replace, nan, method='pad') + orig_value = self.tsframe.iloc[0, 0] + orig2 = self.tsframe.iloc[1, 0] - result = self.tsframe.replace(to_replace={'A': nan}, method='pad', + self.tsframe.iloc[0, 0] = nan + self.tsframe.iloc[1, 0] = 1 + + result = self.tsframe.replace(to_replace={nan: 0}, method='pad', axis=1) expected = self.tsframe.T.replace( - to_replace={'A': nan}, method='pad').T + to_replace={nan: 0}, method='pad').T assert_frame_equal(result, expected) - result = self.tsframe.replace(to_replace={'A': nan, 'B': -1e8}, + result = self.tsframe.replace(to_replace={nan: 0, 1: -1e8}, method='bfill') tsframe = self.tsframe.copy() - b = tsframe['B'] - b[b == -1e8] = nan - tsframe['B'] = b - expected = tsframe.fillna(method='bfill') + tsframe.iloc[0, 0] = 0 + tsframe.iloc[1, 0] = -1e8 + expected = tsframe assert_frame_equal(expected, result) - - bfilled = self.tsframe.replace(nan, method='bfill') - assert_frame_equal(bfilled, self.tsframe.fillna(method='bfill')) - - frame = self.tsframe.copy() - frame[frame == 0] = 1 - frame.ix[-5:, 2] = 0 - result = frame.replace([nan, 0], method='pad') - - expected = frame.copy() - expected[expected == 0] = nan - expected = expected.fillna(method='pad') - assert_frame_equal(result, expected) - - result = 
self.mixed_frame.replace(nan, method='pad', axis=1) - expected = self.mixed_frame.fillna(method='pad', axis=1) - assert_frame_equal(result, expected) - - # no nans - self.tsframe['A'][:5] = 1e8 - result = self.tsframe.replace(1e8, method='bfill') - self.tsframe['A'].replace(1e8, nan, inplace=True) - expected = self.tsframe.fillna(method='bfill') - assert_frame_equal(result, expected) + self.tsframe.iloc[0, 0] = orig_value + self.tsframe.iloc[1, 0] = orig2 def test_replace_for_new_dtypes(self): @@ -6351,7 +6824,7 @@ def test_replace_input_formats(self): expected[k] = v.replace(to_rep[k], 0) assert_frame_equal(filled, DataFrame(expected)) - self.assertRaises(ValueError, df.replace, to_rep, [np.nan, 0, '']) + self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, '']) # scalar to dict values = {'A': 0, 'B': -1, 'C': 'missing'} @@ -6389,8 +6862,8 @@ def test_replace_axis(self): zero_filled = self.tsframe.replace(nan, 0, axis=1) assert_frame_equal(zero_filled, self.tsframe.fillna(0, axis=1)) - padded = self.tsframe.replace(nan, method='pad', axis=1) - assert_frame_equal(padded, self.tsframe.fillna(method='pad', axis=1)) + self.assertRaises(TypeError, self.tsframe.replace, method='pad', + axis=1) # mixed type self.mixed_frame['foo'][5:20] = nan @@ -6400,22 +6873,9 @@ def test_replace_axis(self): expected = self.mixed_frame.fillna(value=-1e8, axis=1) assert_frame_equal(result, expected) - def test_replace_limit(self): - padded = self.tsframe.replace(nan, method='pad', limit=2) - assert_frame_equal(padded, self.tsframe.fillna(method='pad', - limit=2)) - bfilled = self.tsframe.replace(nan, method='bfill', limit=2) - assert_frame_equal(padded, self.tsframe.fillna(method='bfill', - limit=2)) - - padded = self.tsframe.replace(nan, method='pad', axis=1, limit=2) - assert_frame_equal(padded, self.tsframe.fillna(method='pad', - axis=1, limit=2)) - - bfill = self.tsframe.replace(nan, method='bfill', axis=1, limit=2) - assert_frame_equal(padded, 
self.tsframe.fillna(method='bfill', - axis=1, limit=2)) + def test_replace_limit(self): + pass def test_combine_multiple_frames_dtypes(self): from pandas import concat