BUG: replace of numeric by string / dtype conversion (GH15743)

Carlos Souza · jreback · commit 6f789e15cdd9 · 2017-03-28T14:24:36.000-04:00
closes #15743 Author: Carlos Souza <carlos@udacity.com> Author: Jeff Reback <jeff@reback.net> Closes #15812 from ucals/bug-fix-15743 and squashes the following commits: e6e4971 [Carlos Souza] Adding replace unicode with number and replace mixed types with string tests bd31b2b [Carlos Souza] Resolving merge conflict by incorporating @jreback suggestions 73805ce [Jeff Reback] CLN: add infer_dtype_from_array 45e67e4 [Carlos Souza] Fixing PEP8 line indent 0a98557 [Carlos Souza] BUG: replace of numeric by string fixed 97e1f18 [Carlos Souza] Test e62763c [Carlos Souza] Fixing PEP8 line indent 080c71e [Carlos Souza] BUG: replace of numeric by string fixed 8b463cb [Carlos Souza] Merge remote-tracking branch 'upstream/master' 9fc617b [Carlos Souza] Merge remote-tracking branch 'upstream/master' e12bca7 [Carlos Souza] Sync fork 676a4e5 [Carlos Souza] Test
diff --git a/RELEASE.md b/RELEASE.md
@@ -1,6 +1,6 @@
 Release Notes
 =============
 
-The list of changes to pandas between each release can be found
+The list of changes to Pandas between each release can be found
 [here](http://pandas.pydata.org/pandas-docs/stable/whatsnew.html). For full
 details, see the commit logs at http://github.com/pandas-dev/pandas.
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -884,6 +884,8 @@ Bug Fixes
 - Bug in ``.at`` when selecting from a tz-aware column (:issue:`15822`)
 - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
 - Bug in ``.replace()`` may result in incorrect dtypes. (:issue:`12747`, :issue:`15765`)
+- Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`)
+- Bug in ``Series.replace`` where a string ``to_replace`` value incorrectly matched (and replaced) numeric values (:issue:`15743`)
 
 - Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`)
 
@@ -986,7 +988,6 @@ Bug Fixes
 
 - Bug in ``DataFrame.hist`` where ``plt.tight_layout`` caused an ``AttributeError``  (use ``matplotlib >= 2.0.1``) (:issue:`9351`)
 - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`)
-- Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`)
 - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`)
 - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`)
 - Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`)
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
@@ -9,10 +9,16 @@
 
 from pandas.compat import range, string_types
 from pandas.types.common import (is_numeric_v_string_like,
-                                 is_float_dtype, is_datetime64_dtype,
-                                 is_datetime64tz_dtype, is_integer_dtype,
-                                 _ensure_float64, is_scalar,
-                                 needs_i8_conversion, is_integer)
+                                 is_float_dtype,
+                                 is_datetime64_dtype,
+                                 is_datetime64tz_dtype,
+                                 is_integer_dtype,
+                                 is_scalar,
+                                 is_integer,
+                                 needs_i8_conversion,
+                                 _ensure_float64)
+
+from pandas.types.cast import infer_dtype_from_array
 from pandas.types.missing import isnull
 
 
@@ -21,11 +27,11 @@ def mask_missing(arr, values_to_mask):
     Return a masking array of same size/shape as arr
     with entries equaling any member of values_to_mask set to True
     """
-    if not isinstance(values_to_mask, (list, np.ndarray)):
-        values_to_mask = [values_to_mask]
+    dtype, values_to_mask = infer_dtype_from_array(values_to_mask)
 
     try:
-        values_to_mask = np.array(values_to_mask, dtype=arr.dtype)
+        values_to_mask = np.array(values_to_mask, dtype=dtype)
+
     except Exception:
         values_to_mask = np.array(values_to_mask, dtype=object)
 
@@ -409,7 +415,7 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None,
         if axis != 0:  # pragma: no cover
             raise AssertionError("cannot interpolate on a ndim == 1 with "
                                  "axis != 0")
-        values = values.reshape(tuple((1, ) + values.shape))
+        values = values.reshape(tuple((1,) + values.shape))
 
     if fill_value is None:
         mask = None
@@ -447,7 +453,6 @@ def wrapper(arr, mask, limit=None):
 
 
 def pad_1d(values, limit=None, mask=None, dtype=None):
-
     if dtype is None:
         dtype = values.dtype
     _method = None
@@ -472,7 +477,6 @@ def pad_1d(values, limit=None, mask=None, dtype=None):
 
 
 def backfill_1d(values, limit=None, mask=None, dtype=None):
-
     if dtype is None:
         dtype = values.dtype
     _method = None
@@ -498,7 +502,6 @@ def backfill_1d(values, limit=None, mask=None, dtype=None):
 
 
 def pad_2d(values, limit=None, mask=None, dtype=None):
-
     if dtype is None:
         dtype = values.dtype
     _method = None
@@ -528,7 +531,6 @@ def pad_2d(values, limit=None, mask=None, dtype=None):
 
 
 def backfill_2d(values, limit=None, mask=None, dtype=None):
-
     if dtype is None:
         dtype = values.dtype
     _method = None
diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py
@@ -795,7 +795,7 @@ def test_replace_dtypes(self):
         expected = DataFrame({'datetime64': Index([now] * 3)})
         assert_frame_equal(result, expected)
 
-    def test_replace_input_formats(self):
+    def test_replace_input_formats_listlike(self):
         # both dicts
         to_rep = {'A': np.nan, 'B': 0, 'C': ''}
         values = {'A': 0, 'B': -1, 'C': 'missing'}
@@ -812,15 +812,6 @@ def test_replace_input_formats(self):
                               'C': ['', 'asdf', 'fd']})
         assert_frame_equal(result, expected)
 
-        # dict to scalar
-        filled = df.replace(to_rep, 0)
-        expected = {}
-        for k, v in compat.iteritems(df):
-            expected[k] = v.replace(to_rep[k], 0)
-        assert_frame_equal(filled, DataFrame(expected))
-
-        self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, ''])
-
         # scalar to dict
         values = {'A': 0, 'B': -1, 'C': 'missing'}
         df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5],
@@ -842,6 +833,20 @@ def test_replace_input_formats(self):
 
         self.assertRaises(ValueError, df.replace, to_rep, values[1:])
 
+    def test_replace_input_formats_scalar(self):
+        df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5],
+                        'C': ['', 'asdf', 'fd']})
+
+        # dict to scalar
+        to_rep = {'A': np.nan, 'B': 0, 'C': ''}
+        filled = df.replace(to_rep, 0)
+        expected = {}
+        for k, v in compat.iteritems(df):
+            expected[k] = v.replace(to_rep[k], 0)
+        assert_frame_equal(filled, DataFrame(expected))
+
+        self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, ''])
+
         # list to scalar
         to_rep = [np.nan, 0, '']
         result = df.replace(to_rep, -1)
diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py
@@ -10,7 +10,6 @@
 
 
 class TestSeriesReplace(TestData, tm.TestCase):
-
     def test_replace(self):
         N = 100
         ser = pd.Series(np.random.randn(N))
@@ -227,3 +226,24 @@ def test_replace_with_empty_dictlike(self):
         s = pd.Series(list('abcd'))
         tm.assert_series_equal(s, s.replace(dict()))
         tm.assert_series_equal(s, s.replace(pd.Series([])))
+
+    def test_replace_string_with_number(self):
+        # GH 15743
+        s = pd.Series([1, 2, 3])
+        result = s.replace('2', np.nan)
+        expected = pd.Series([1, 2, 3])
+        tm.assert_series_equal(expected, result)
+
+    def test_replace_unicode_with_number(self):
+        # GH 15743
+        s = pd.Series([1, 2, 3])
+        result = s.replace(u'2', np.nan)
+        expected = pd.Series([1, 2, 3])
+        tm.assert_series_equal(expected, result)
+
+    def test_replace_mixed_types_with_string(self):
+        # GH 15743: mixed int/str values with int and str targets
+        s = pd.Series([1, 2, 3, '4', 4, 5])
+        result = s.replace([2, '4'], np.nan)
+        expected = pd.Series([1, np.nan, 3, np.nan, 4, 5])
+        tm.assert_series_equal(expected, result)
diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py
@@ -5,13 +5,15 @@
 
 """
 
-from datetime import datetime
+import pytest
+from datetime import datetime, timedelta, date
 import numpy as np
 
 from pandas import Timedelta, Timestamp, DatetimeIndex
 from pandas.types.cast import (maybe_downcast_to_dtype,
                                maybe_convert_objects,
                                infer_dtype_from_scalar,
+                               infer_dtype_from_array,
                                maybe_convert_string_to_object,
                                maybe_convert_scalar,
                                find_common_type)
@@ -82,7 +84,7 @@ def test_datetime_with_timezone(self):
         tm.assert_index_equal(res, exp)
 
 
-class TestInferDtype(tm.TestCase):
+class TestInferDtype(object):
 
     def test_infer_dtype_from_scalar(self):
         # Test that _infer_dtype_from_scalar is returning correct dtype for int
@@ -92,44 +94,62 @@ def test_infer_dtype_from_scalar(self):
                        np.int32, np.uint64, np.int64]:
             data = dtypec(12)
             dtype, val = infer_dtype_from_scalar(data)
-            self.assertEqual(dtype, type(data))
+            assert dtype == type(data)
 
         data = 12
         dtype, val = infer_dtype_from_scalar(data)
-        self.assertEqual(dtype, np.int64)
+        assert dtype == np.int64
 
         for dtypec in [np.float16, np.float32, np.float64]:
             data = dtypec(12)
             dtype, val = infer_dtype_from_scalar(data)
-            self.assertEqual(dtype, dtypec)
+            assert dtype == dtypec
 
         data = np.float(12)
         dtype, val = infer_dtype_from_scalar(data)
-        self.assertEqual(dtype, np.float64)
+        assert dtype == np.float64
 
         for data in [True, False]:
             dtype, val = infer_dtype_from_scalar(data)
-            self.assertEqual(dtype, np.bool_)
+            assert dtype == np.bool_
 
         for data in [np.complex64(1), np.complex128(1)]:
             dtype, val = infer_dtype_from_scalar(data)
-            self.assertEqual(dtype, np.complex_)
+            assert dtype == np.complex_
 
-        import datetime
         for data in [np.datetime64(1, 'ns'), Timestamp(1),
-                     datetime.datetime(2000, 1, 1, 0, 0)]:
+                     datetime(2000, 1, 1, 0, 0)]:
             dtype, val = infer_dtype_from_scalar(data)
-            self.assertEqual(dtype, 'M8[ns]')
+            assert dtype == 'M8[ns]'
 
         for data in [np.timedelta64(1, 'ns'), Timedelta(1),
-                     datetime.timedelta(1)]:
+                     timedelta(1)]:
             dtype, val = infer_dtype_from_scalar(data)
-            self.assertEqual(dtype, 'm8[ns]')
+            assert dtype == 'm8[ns]'
 
-        for data in [datetime.date(2000, 1, 1),
+        for data in [date(2000, 1, 1),
                      Timestamp(1, tz='US/Eastern'), 'foo']:
             dtype, val = infer_dtype_from_scalar(data)
-            self.assertEqual(dtype, np.object_)
+            assert dtype == np.object_
+
+    @pytest.mark.parametrize(
+        "arr, expected",
+        [('foo', np.object_),
+         (b'foo', np.object_),
+         (1, np.int_),
+         (1.5, np.float_),
+         ([1], np.int_),
+         (np.array([1]), np.int_),
+         ([np.nan, 1, ''], np.object_),
+         (np.array([[1.0, 2.0]]), np.float_),
+         (Timestamp('20160101'), np.object_),
+         (np.datetime64('2016-01-01'), np.dtype('<M8[D]')),
+         ])
+    def test_infer_dtype_from_array(self, arr, expected):
+
+        # these infer specifically to numpy dtypes
+        dtype, _ = infer_dtype_from_array(arr)
+        assert dtype == expected
 
 
 class TestMaybe(tm.TestCase):
diff --git a/pandas/types/cast.py b/pandas/types/cast.py
@@ -387,6 +387,50 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
     return dtype, val
 
 
+def infer_dtype_from_array(arr):
+    """
+    infer the dtype from a scalar or array
+
+    Parameters
+    ----------
+    arr : scalar or array
+
+    Returns
+    -------
+    tuple (numpy-compat dtype, array)
+
+    Notes
+    -----
+    These infer to numpy dtypes exactly
+    with the exception that mixed / object dtypes
+    are not coerced by stringifying or conversion
+
+    Examples
+    --------
+    >>> np.asarray([1, '1'])
+    array(['1', '1'], dtype='<U21')
+
+    >>> infer_dtype_from_array([1, '1'])
+    (numpy.object_, [1, '1'])
+
+    """
+
+    if isinstance(arr, np.ndarray):
+        return arr.dtype, arr
+
+    if not is_list_like(arr):
+        arr = [arr]
+
+    # avoid np.asarray coercing mixed values (e.g. nan and '') to strings
+    inferred = lib.infer_dtype(arr)
+    if inferred in ['string', 'bytes', 'unicode',
+                    'mixed', 'mixed-integer']:
+        return (np.object_, arr)
+
+    arr = np.asarray(arr)
+    return arr.dtype, arr
+
+
 def maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False):
     """ provide explict type promotion and coercion