Merge branch 'replace-convert-dtypes-3907' of https://github.com/cpcloud/pandas into cpcloud-replace-convert-dtypes-3907

jreback · jreback · commit e4b8ed48acec · 2013-06-15T08:32:29.000-04:00
Conflicts:
	RELEASE.rst
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -127,6 +127,10 @@ pandas 0.11.1
   - ``DataFrame.interpolate()`` is now deprecated. Please use
     ``DataFrame.fillna()`` and ``DataFrame.replace()`` instead (GH3582_,
     GH3675_, GH3676_).
+  - the ``method`` and ``axis`` arguments of ``DataFrame.replace()`` are
+    deprecated
+  - the ``infer_types`` parameter of ``DataFrame.replace()`` is removed;
+    ``replace()`` now performs dtype conversion by default (GH3907_)
   - Deprecated display.height, display.width is now only a formatting option
     does not control triggering of summary, similar to < 0.11.0.
   - Add the keyword ``allow_duplicates`` to ``DataFrame.insert`` to allow a duplicate column
@@ -141,6 +145,8 @@ pandas 0.11.1
     ``to_pickle`` instance method, ``save`` and ``load`` will give deprecation warning.  
   - the ``method`` and ``axis`` arguments of ``DataFrame.replace()`` are
     deprecated
+  - the ``method`` and ``axis`` arguments of ``DataFrame.replace()`` are
+    deprecated
   - Implement ``__nonzero__`` for ``NDFrame`` objects (GH3691_, GH3696_)
   - ``as_matrix`` with mixed signed and unsigned dtypes will result in 2 x the lcd of the unsigned
     as an int, maxing with ``int64``, to avoid precision issues (GH3733_)
@@ -239,6 +245,8 @@ pandas 0.11.1
   - Fix index name not propogating when using ``loc/ix`` (GH3880_)
   - Fix groupby when applying a custom function resulting in a returned DataFrame was 
     not converting dtypes (GH3911_)
+  - Fixed a bug where ``DataFrame.replace`` with a compiled regular expression
+    in the ``to_replace`` argument wasn't working (GH3907_)
 
 .. _GH3164: https://github.com/pydata/pandas/issues/3164
 .. _GH2786: https://github.com/pydata/pandas/issues/2786
@@ -334,7 +342,8 @@ pandas 0.11.1
 .. _GH3873: https://github.com/pydata/pandas/issues/3873
 .. _GH3877: https://github.com/pydata/pandas/issues/3877
 .. _GH3880: https://github.com/pydata/pandas/issues/3880
 .. _GH3911: https://github.com/pydata/pandas/issues/3911
+.. _GH3907: https://github.com/pydata/pandas/issues/3907
 
 
 pandas 0.11.0
diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt
@@ -98,6 +98,9 @@ API changes
   - the ``method`` and ``axis`` arguments of ``DataFrame.replace()`` are
     deprecated
 
+  - the ``infer_types`` parameter of ``DataFrame.replace()`` is removed;
+    ``replace()`` now performs dtype conversion by default (GH3907_)
+
   - Add the keyword ``allow_duplicates`` to ``DataFrame.insert`` to allow a duplicate column
     to be inserted if ``True``, default is ``False`` (same as prior to 0.11.1) (GH3679_)
   - Implement ``__nonzero__`` for ``NDFrame`` objects (GH3691_, GH3696_)
@@ -356,6 +359,8 @@ Bug Fixes
 
   - ``DataFrame.from_records`` did not accept empty recarrays (GH3682_)
   - ``read_html`` now correctly skips tests (GH3741_)
+  - Fixed a bug where ``DataFrame.replace`` with a compiled regular expression
+    in the ``to_replace`` argument wasn't working (GH3907_)
 
 See the `full release notes
 <https://github.com/pydata/pandas/blob/master/RELEASE.rst>`__ or issue tracker
@@ -410,3 +415,4 @@ on GitHub for a complete list.
 .. _GH3877: https://github.com/pydata/pandas/issues/3877
 .. _GH3659: https://github.com/pydata/pandas/issues/3659
 .. _GH3679: https://github.com/pydata/pandas/issues/3679
+.. _GH3907: https://github.com/pydata/pandas/issues/3907
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -3,6 +3,7 @@
 """
 
 import itertools
+import re
 from datetime import datetime
 
 from numpy.lib.format import read_array, write_array
@@ -1585,8 +1586,21 @@ def is_complex_dtype(arr_or_dtype):
     return issubclass(tipo, np.complexfloating)
 
 
+def is_re(obj):
+    return isinstance(obj, re._pattern_type)
+
+
+def is_re_compilable(obj):
+    try:
+        re.compile(obj)
+    except TypeError:
+        return False
+    else:
+        return True
+
+
 def is_list_like(arg):
-    return hasattr(arg, '__iter__') and not isinstance(arg, basestring) or hasattr(arg,'len')
+    return hasattr(arg, '__iter__') and not isinstance(arg, basestring)
 
 def _is_sequence(x):
     try:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -33,8 +33,7 @@
                                   _maybe_convert_indices)
 from pandas.core.internals import (BlockManager,
                                    create_block_manager_from_arrays,
-                                   create_block_manager_from_blocks,
-                                   _re_compilable)
+                                   create_block_manager_from_blocks)
 from pandas.core.series import Series, _radd_compat
 import pandas.core.expressions as expressions
 from pandas.compat.scipy import scoreatpercentile as _quantile
@@ -3483,7 +3482,7 @@ def bfill(self, axis=0, inplace=False, limit=None):
                            limit=limit)
 
     def replace(self, to_replace=None, value=None, inplace=False, limit=None,
-                regex=False, infer_types=False, method=None, axis=None):
+                regex=False, method=None, axis=None):
         """
         Replace values given in 'to_replace' with 'value'.
 
@@ -3545,8 +3544,6 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
             string. Otherwise, `to_replace` must be ``None`` because this
             parameter will be interpreted as a regular expression or a list,
             dict, or array of regular expressions.
-        infer_types : bool, default True
-            If ``True`` attempt to convert object blocks to a better dtype.
 
         See also
         --------
@@ -3582,7 +3579,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
           and play with this method to gain intuition about how it works.
 
         """
-        if not isinstance(regex, bool) and to_replace is not None:
+        if not com.is_bool(regex) and to_replace is not None:
             raise AssertionError("'to_replace' must be 'None' if 'regex' is "
                                  "not a bool")
         if method is not None:
@@ -3628,8 +3625,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
                 to_replace, value = keys, values
 
             return self.replace(to_replace, value, inplace=inplace,
-                                limit=limit, regex=regex,
-                                infer_types=infer_types)
+                                limit=limit, regex=regex)
         else:
             if not len(self.columns):
                 return self
@@ -3673,14 +3669,14 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
                     new_data = self._data.replace(to_replace, value,
                                                   inplace=inplace, regex=regex)
             elif to_replace is None:
-                if not (_re_compilable(regex) or
+                if not (com.is_re_compilable(regex) or
                         isinstance(regex, (list, dict, np.ndarray, Series))):
                     raise TypeError("'regex' must be a string or a compiled "
                                     "regular expression or a list or dict of "
                                     "strings or regular expressions, you "
                                     "passed a {0}".format(type(regex)))
                 return self.replace(regex, value, inplace=inplace, limit=limit,
-                                    regex=True, infer_types=infer_types)
+                                    regex=True)
             else:
 
                 # dest iterable dict-like
@@ -3701,8 +3697,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
                     raise TypeError('Invalid "to_replace" type: '
                                     '{0}'.format(type(to_replace)))  # pragma: no cover
 
-        if infer_types:
-            new_data = new_data.convert()
+        new_data = new_data.convert(copy=not inplace, convert_numeric=False)
 
         if inplace:
             self._data = new_data
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -1,13 +1,14 @@
 import itertools
 import re
 from datetime import datetime
-import collections
 
 from numpy import nan
 import numpy as np
 
-from pandas.core.common import _possibly_downcast_to_dtype, isnull, _NS_DTYPE, _TD_DTYPE
-from pandas.core.index import Index, MultiIndex, _ensure_index, _handle_legacy_indexes
+from pandas.core.common import (_possibly_downcast_to_dtype, isnull, _NS_DTYPE,
+                                _TD_DTYPE)
+from pandas.core.index import (Index, MultiIndex, _ensure_index,
+                               _handle_legacy_indexes)
 from pandas.core.indexing import _check_slice_bounds, _maybe_convert_indices
 import pandas.core.common as com
 import pandas.lib as lib
@@ -18,10 +19,6 @@
 from pandas.util import py3compat
 
 
-def _re_compilable(ex):
-    return isinstance(ex, (basestring, re._pattern_type))
-
-
 class Block(object):
     """
     Canonical n-dimensional unit of homogeneous dtype contained in a pandas
@@ -744,14 +741,16 @@ def should_store(self, value):
     def replace(self, to_replace, value, inplace=False, filter=None,
                 regex=False):
         blk = [self]
-        to_rep_is_list = (isinstance(to_replace, collections.Iterable) and not
-                          isinstance(to_replace, basestring))
-        value_is_list = (isinstance(value, collections.Iterable) and not
-                         isinstance(to_replace, basestring))
+        to_rep_is_list = com.is_list_like(to_replace)
+        value_is_list = com.is_list_like(value)
         both_lists = to_rep_is_list and value_is_list
         either_list = to_rep_is_list or value_is_list
 
-        if not either_list and not regex:
+        if not either_list and com.is_re(to_replace):
+            blk[0], = blk[0]._replace_single(to_replace, value,
+                                             inplace=inplace, filter=filter,
+                                             regex=True)
+        elif not (either_list or regex):
             blk = super(ObjectBlock, self).replace(to_replace, value,
                                                    inplace=inplace,
                                                    filter=filter, regex=regex)
@@ -773,15 +772,18 @@ def replace(self, to_replace, value, inplace=False, filter=None,
     def _replace_single(self, to_replace, value, inplace=False, filter=None,
                         regex=False):
         # to_replace is regex compilable
-        to_rep_re = _re_compilable(to_replace)
+        to_rep_re = com.is_re_compilable(to_replace)
 
         # regex is regex compilable
-        regex_re = _re_compilable(regex)
+        regex_re = com.is_re_compilable(regex)
 
+        # only one will survive
         if to_rep_re and regex_re:
             raise AssertionError('only one of to_replace and regex can be '
                                  'regex compilable')
 
+        # if regex was passed as something that can be a regex (rather than a
+        # boolean)
         if regex_re:
             to_replace = regex
 
@@ -1668,7 +1670,6 @@ def get(self, item):
                 mgr._consolidate_inplace()
                 return mgr
 
-
     def iget(self, i):
         item = self.items[i]
         if self.items.is_unique:
@@ -1970,7 +1971,6 @@ def reindex_indexer(self, new_axis, indexer, axis=1, fill_value=np.nan):
     def _reindex_indexer_items(self, new_items, indexer, fill_value):
         # TODO: less efficient than I'd like
 
-        is_unique = self.items.is_unique
         item_order = com.take_1d(self.items.values, indexer)
 
         # keep track of what items aren't found anywhere
@@ -2141,7 +2141,6 @@ def rename_axis(self, mapper, axis=1):
 
     def rename_items(self, mapper, copydata=True):
         new_items = Index([mapper(x) for x in self.items])
-        is_unique = new_items.is_unique
 
         new_blocks = []
         for block in self.blocks:
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
@@ -1,5 +1,6 @@
 from datetime import datetime
 import sys
+import re
 
 import nose
 import unittest
@@ -244,6 +245,18 @@ def test_groupby():
         assert v == expected[k]
 
 
+def test_is_list_like():
+    passes = ([], [1], (1,), (1, 2), {'a': 1}, set([1, 'a']), Series([1]),
+              Series([]), Series(['a']).str)
+    fails = (1, '2', object())
+
+    for p in passes:
+        assert com.is_list_like(p)
+
+    for f in fails:
+        assert not com.is_list_like(f)
+
+
 def test_ensure_int32():
     values = np.arange(10, dtype=np.int32)
     result = com._ensure_int32(values)
@@ -288,6 +301,30 @@ def test_ensure_platform_int():
 #         expected = u"\u05d0".encode('utf-8')
 #         assert (result == expected)
 
+
+def test_is_re():
+    passes = re.compile('ad'),
+    fails = 'x', 2, 3, object()
+
+    for p in passes:
+        assert com.is_re(p)
+
+    for f in fails:
+        assert not com.is_re(f)
+
+
+def test_is_recompilable():
+    passes = (r'a', u'x', r'asdf', re.compile('adsf'), ur'\u2233\s*',
+              re.compile(r''))
+    fails = 1, [], object()
+
+    for p in passes:
+        assert com.is_re_compilable(p)
+
+    for f in fails:
+        assert not com.is_re_compilable(f)
+
+
 class TestTake(unittest.TestCase):
 
     _multiprocess_can_split_ = True
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -6696,7 +6696,7 @@ def test_regex_replace_list_to_scalar(self):
         res3 = df.copy()
         res2.replace([r'\s*\.\s*', 'a|b'], nan, regex=True, inplace=True)
         res3.replace(regex=[r'\s*\.\s*', 'a|b'], value=nan, inplace=True)
-        expec = DataFrame({'a': mix['a'], 'b': np.array([nan] * 4, object),
+        expec = DataFrame({'a': mix['a'], 'b': np.array([nan] * 4),
                            'c': [nan, nan, nan, 'd']})
         assert_frame_equal(res, expec)
         assert_frame_equal(res2, expec)
@@ -6772,6 +6772,30 @@ def test_replace(self):
         df = DataFrame(index=['a', 'b'])
         assert_frame_equal(df, df.replace(5, 7))
 
+    def test_replace_list(self):
+        obj = {'a': list('ab..'), 'b': list('efgh'), 'c': list('helo')}
+        dfobj = DataFrame(obj)
+
+        ## lists of regexes and values
+        # list of [v1, v2, ..., vN] -> [v1, v2, ..., vN]
+        to_replace_res = [r'.', r'e']
+        values = [nan, 'crap']
+        res = dfobj.replace(to_replace_res, values)
+        expec = DataFrame({'a': ['a', 'b', nan, nan],
+                           'b': ['crap', 'f', 'g', 'h'], 'c': ['h', 'crap',
+                                                               'l', 'o']})
+        assert_frame_equal(res, expec)
+
+        # list of [v1, v2, ..., vN] -> [v1, v2, ..., vN]
+        to_replace_res = [r'.', r'f']
+        values = [r'..', r'crap']
+        res = dfobj.replace(to_replace_res, values)
+        expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['e', 'crap', 'g',
+                                                              'h'],
+                           'c': ['h', 'e', 'l', 'o']})
+
+        assert_frame_equal(res, expec)
+
     def test_replace_series_dict(self):
         # from GH 3064
         df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}})
@@ -6792,10 +6816,24 @@ def test_replace_series_dict(self):
         result = df.replace(s, df.mean())
         assert_frame_equal(result, expected)
 
+    def test_replace_convert(self):
+        # gh 3907
+        df = DataFrame([['foo', 'bar', 'bah'], ['bar', 'foo', 'bah']])
+        m = {'foo': 1, 'bar': 2, 'bah': 3}
+        rep = df.replace(m)
+        expec = Series([np.int_, np.int_, np.int_])
+        res = rep.dtypes
+        assert_series_equal(expec, res)
+
     def test_replace_mixed(self):
         self.mixed_frame['foo'][5:20] = nan
         self.mixed_frame['A'][-10:] = nan
 
+        result = self.mixed_frame.replace(np.nan, -18)
+        expected = self.mixed_frame.fillna(value=-18)
+        assert_frame_equal(result, expected)
+        assert_frame_equal(result.replace(-18, nan), self.mixed_frame)
+
         result = self.mixed_frame.replace(np.nan, -1e8)
         expected = self.mixed_frame.fillna(value=-1e8)
         assert_frame_equal(result, expected)