Skip to content

Commit 6f789e1

Browse files
Carlos Souza, jreback
Carlos Souza authored and jreback committed
BUG: replace of numeric by string / dtype conversion (GH15743)
closes #15743 Author: Carlos Souza <[email protected]> Author: Jeff Reback <[email protected]> Closes #15812 from ucals/bug-fix-15743 and squashes the following commits: e6e4971 [Carlos Souza] Adding replace unicode with number and replace mixed types with string tests bd31b2b [Carlos Souza] Resolving merge conflict by incorporating @jreback suggestions 73805ce [Jeff Reback] CLN: add infer_dtype_from_array 45e67e4 [Carlos Souza] Fixing PEP8 line indent 0a98557 [Carlos Souza] BUG: replace of numeric by string fixed 97e1f18 [Carlos Souza] Test e62763c [Carlos Souza] Fixing PEP8 line indent 080c71e [Carlos Souza] BUG: replace of numeric by string fixed 8b463cb [Carlos Souza] Merge remote-tracking branch 'upstream/master' 9fc617b [Carlos Souza] Merge remote-tracking branch 'upstream/master' e12bca7 [Carlos Souza] Sync fork 676a4e5 [Carlos Souza] Test
1 parent d96ff29 commit 6f789e1

File tree

7 files changed

+132
-40
lines changed

7 files changed

+132
-40
lines changed

RELEASE.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Release Notes
22
=============
33

4-
The list of changes to pandas between each release can be found
4+
The list of changes to Pandas between each release can be found
55
[here](http://pandas.pydata.org/pandas-docs/stable/whatsnew.html). For full
66
details, see the commit logs at http://github.com/pandas-dev/pandas.

doc/source/whatsnew/v0.20.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -884,6 +884,8 @@ Bug Fixes
884884
- Bug in ``.at`` when selecting from a tz-aware column (:issue:`15822`)
885885
- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
886886
- Bug in ``.replace()`` which may result in incorrect dtypes (:issue:`12747`, :issue:`15765`)
887+
- Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`)
888+
- Bug in ``Series.replace`` where a string ``to_replace`` value was incorrectly matched against numeric values, e.g. ``'2'`` replaced ``2`` (:issue:`15743`)
887889

888890
- Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`)
889891

@@ -986,7 +988,6 @@ Bug Fixes
986988

987989
- Bug in ``DataFrame.hist`` where ``plt.tight_layout`` caused an ``AttributeError`` (use ``matplotlib >= 2.0.1``) (:issue:`9351`)
988990
- Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`)
989-
- Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`)
990991
- Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`)
991992
- Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`)
992993
- Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`)

pandas/core/missing.py

+14-12
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,16 @@
99

1010
from pandas.compat import range, string_types
1111
from pandas.types.common import (is_numeric_v_string_like,
12-
is_float_dtype, is_datetime64_dtype,
13-
is_datetime64tz_dtype, is_integer_dtype,
14-
_ensure_float64, is_scalar,
15-
needs_i8_conversion, is_integer)
12+
is_float_dtype,
13+
is_datetime64_dtype,
14+
is_datetime64tz_dtype,
15+
is_integer_dtype,
16+
is_scalar,
17+
is_integer,
18+
needs_i8_conversion,
19+
_ensure_float64)
20+
21+
from pandas.types.cast import infer_dtype_from_array
1622
from pandas.types.missing import isnull
1723

1824

@@ -21,11 +27,11 @@ def mask_missing(arr, values_to_mask):
2127
Return a masking array of same size/shape as arr
2228
with entries equaling any member of values_to_mask set to True
2329
"""
24-
if not isinstance(values_to_mask, (list, np.ndarray)):
25-
values_to_mask = [values_to_mask]
30+
dtype, values_to_mask = infer_dtype_from_array(values_to_mask)
2631

2732
try:
28-
values_to_mask = np.array(values_to_mask, dtype=arr.dtype)
33+
values_to_mask = np.array(values_to_mask, dtype=dtype)
34+
2935
except Exception:
3036
values_to_mask = np.array(values_to_mask, dtype=object)
3137

@@ -409,7 +415,7 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None,
409415
if axis != 0: # pragma: no cover
410416
raise AssertionError("cannot interpolate on a ndim == 1 with "
411417
"axis != 0")
412-
values = values.reshape(tuple((1, ) + values.shape))
418+
values = values.reshape(tuple((1,) + values.shape))
413419

414420
if fill_value is None:
415421
mask = None
@@ -447,7 +453,6 @@ def wrapper(arr, mask, limit=None):
447453

448454

449455
def pad_1d(values, limit=None, mask=None, dtype=None):
450-
451456
if dtype is None:
452457
dtype = values.dtype
453458
_method = None
@@ -472,7 +477,6 @@ def pad_1d(values, limit=None, mask=None, dtype=None):
472477

473478

474479
def backfill_1d(values, limit=None, mask=None, dtype=None):
475-
476480
if dtype is None:
477481
dtype = values.dtype
478482
_method = None
@@ -498,7 +502,6 @@ def backfill_1d(values, limit=None, mask=None, dtype=None):
498502

499503

500504
def pad_2d(values, limit=None, mask=None, dtype=None):
501-
502505
if dtype is None:
503506
dtype = values.dtype
504507
_method = None
@@ -528,7 +531,6 @@ def pad_2d(values, limit=None, mask=None, dtype=None):
528531

529532

530533
def backfill_2d(values, limit=None, mask=None, dtype=None):
531-
532534
if dtype is None:
533535
dtype = values.dtype
534536
_method = None

pandas/tests/frame/test_replace.py

+15-10
Original file line numberDiff line numberDiff line change
@@ -795,7 +795,7 @@ def test_replace_dtypes(self):
795795
expected = DataFrame({'datetime64': Index([now] * 3)})
796796
assert_frame_equal(result, expected)
797797

798-
def test_replace_input_formats(self):
798+
def test_replace_input_formats_listlike(self):
799799
# both dicts
800800
to_rep = {'A': np.nan, 'B': 0, 'C': ''}
801801
values = {'A': 0, 'B': -1, 'C': 'missing'}
@@ -812,15 +812,6 @@ def test_replace_input_formats(self):
812812
'C': ['', 'asdf', 'fd']})
813813
assert_frame_equal(result, expected)
814814

815-
# dict to scalar
816-
filled = df.replace(to_rep, 0)
817-
expected = {}
818-
for k, v in compat.iteritems(df):
819-
expected[k] = v.replace(to_rep[k], 0)
820-
assert_frame_equal(filled, DataFrame(expected))
821-
822-
self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, ''])
823-
824815
# scalar to dict
825816
values = {'A': 0, 'B': -1, 'C': 'missing'}
826817
df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5],
@@ -842,6 +833,20 @@ def test_replace_input_formats(self):
842833

843834
self.assertRaises(ValueError, df.replace, to_rep, values[1:])
844835

836+
def test_replace_input_formats_scalar(self):
837+
df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5],
838+
'C': ['', 'asdf', 'fd']})
839+
840+
# dict to scalar
841+
to_rep = {'A': np.nan, 'B': 0, 'C': ''}
842+
filled = df.replace(to_rep, 0)
843+
expected = {}
844+
for k, v in compat.iteritems(df):
845+
expected[k] = v.replace(to_rep[k], 0)
846+
assert_frame_equal(filled, DataFrame(expected))
847+
848+
self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, ''])
849+
845850
# list to scalar
846851
to_rep = [np.nan, 0, '']
847852
result = df.replace(to_rep, -1)

pandas/tests/series/test_replace.py

+21-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111

1212
class TestSeriesReplace(TestData, tm.TestCase):
13-
1413
def test_replace(self):
1514
N = 100
1615
ser = pd.Series(np.random.randn(N))
@@ -227,3 +226,24 @@ def test_replace_with_empty_dictlike(self):
227226
s = pd.Series(list('abcd'))
228227
tm.assert_series_equal(s, s.replace(dict()))
229228
tm.assert_series_equal(s, s.replace(pd.Series([])))
229+
230+
def test_replace_string_with_number(self):
231+
# GH 15743
232+
s = pd.Series([1, 2, 3])
233+
result = s.replace('2', np.nan)
234+
expected = pd.Series([1, 2, 3])
235+
tm.assert_series_equal(expected, result)
236+
237+
def test_replace_unicode_with_number(self):
238+
# GH 15743
239+
s = pd.Series([1, 2, 3])
240+
result = s.replace(u'2', np.nan)
241+
expected = pd.Series([1, 2, 3])
242+
tm.assert_series_equal(expected, result)
243+
244+
def test_replace_mixed_types_with_string(self):
245+
# Testing mixed
246+
s = pd.Series([1, 2, 3, '4', 4, 5])
247+
result = s.replace([2, '4'], np.nan)
248+
expected = pd.Series([1, np.nan, 3, np.nan, 4, 5])
249+
tm.assert_series_equal(expected, result)

pandas/tests/types/test_cast.py

+35-15
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,15 @@
55
66
"""
77

8-
from datetime import datetime
8+
import pytest
9+
from datetime import datetime, timedelta, date
910
import numpy as np
1011

1112
from pandas import Timedelta, Timestamp, DatetimeIndex
1213
from pandas.types.cast import (maybe_downcast_to_dtype,
1314
maybe_convert_objects,
1415
infer_dtype_from_scalar,
16+
infer_dtype_from_array,
1517
maybe_convert_string_to_object,
1618
maybe_convert_scalar,
1719
find_common_type)
@@ -82,7 +84,7 @@ def test_datetime_with_timezone(self):
8284
tm.assert_index_equal(res, exp)
8385

8486

85-
class TestInferDtype(tm.TestCase):
87+
class TestInferDtype(object):
8688

8789
def test_infer_dtype_from_scalar(self):
8890
# Test that _infer_dtype_from_scalar is returning correct dtype for int
@@ -92,44 +94,62 @@ def test_infer_dtype_from_scalar(self):
9294
np.int32, np.uint64, np.int64]:
9395
data = dtypec(12)
9496
dtype, val = infer_dtype_from_scalar(data)
95-
self.assertEqual(dtype, type(data))
97+
assert dtype == type(data)
9698

9799
data = 12
98100
dtype, val = infer_dtype_from_scalar(data)
99-
self.assertEqual(dtype, np.int64)
101+
assert dtype == np.int64
100102

101103
for dtypec in [np.float16, np.float32, np.float64]:
102104
data = dtypec(12)
103105
dtype, val = infer_dtype_from_scalar(data)
104-
self.assertEqual(dtype, dtypec)
106+
assert dtype == dtypec
105107

106108
data = np.float(12)
107109
dtype, val = infer_dtype_from_scalar(data)
108-
self.assertEqual(dtype, np.float64)
110+
assert dtype == np.float64
109111

110112
for data in [True, False]:
111113
dtype, val = infer_dtype_from_scalar(data)
112-
self.assertEqual(dtype, np.bool_)
114+
assert dtype == np.bool_
113115

114116
for data in [np.complex64(1), np.complex128(1)]:
115117
dtype, val = infer_dtype_from_scalar(data)
116-
self.assertEqual(dtype, np.complex_)
118+
assert dtype == np.complex_
117119

118-
import datetime
119120
for data in [np.datetime64(1, 'ns'), Timestamp(1),
120-
datetime.datetime(2000, 1, 1, 0, 0)]:
121+
datetime(2000, 1, 1, 0, 0)]:
121122
dtype, val = infer_dtype_from_scalar(data)
122-
self.assertEqual(dtype, 'M8[ns]')
123+
assert dtype == 'M8[ns]'
123124

124125
for data in [np.timedelta64(1, 'ns'), Timedelta(1),
125-
datetime.timedelta(1)]:
126+
timedelta(1)]:
126127
dtype, val = infer_dtype_from_scalar(data)
127-
self.assertEqual(dtype, 'm8[ns]')
128+
assert dtype == 'm8[ns]'
128129

129-
for data in [datetime.date(2000, 1, 1),
130+
for data in [date(2000, 1, 1),
130131
Timestamp(1, tz='US/Eastern'), 'foo']:
131132
dtype, val = infer_dtype_from_scalar(data)
132-
self.assertEqual(dtype, np.object_)
133+
assert dtype == np.object_
134+
135+
@pytest.mark.parametrize(
136+
"arr, expected",
137+
[('foo', np.object_),
138+
(b'foo', np.object_),
139+
(1, np.int_),
140+
(1.5, np.float_),
141+
([1], np.int_),
142+
(np.array([1]), np.int_),
143+
([np.nan, 1, ''], np.object_),
144+
(np.array([[1.0, 2.0]]), np.float_),
145+
(Timestamp('20160101'), np.object_),
146+
(np.datetime64('2016-01-01'), np.dtype('<M8[D]')),
147+
])
148+
def test_infer_dtype_from_array(self, arr, expected):
149+
150+
# these infer specifically to numpy dtypes
151+
dtype, _ = infer_dtype_from_array(arr)
152+
assert dtype == expected
133153

134154

135155
class TestMaybe(tm.TestCase):

pandas/types/cast.py

+44
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,50 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
387387
return dtype, val
388388

389389

390+
def infer_dtype_from_array(arr):
391+
"""
392+
infer the dtype from a scalar or array
393+
394+
Parameters
395+
----------
396+
arr : scalar or array
397+
398+
Returns
399+
-------
400+
tuple (numpy-compat dtype, array)
401+
402+
Notes
403+
-----
404+
These infer to numpy dtypes exactly
405+
with the exception that mixed / object dtypes
406+
are not coerced by stringifying or conversion
407+
408+
Examples
409+
--------
410+
>>> np.asarray([1, '1'])
411+
array(['1', '1'], dtype='<U21')
412+
413+
>>> infer_dtype_from_array([1, '1'])
414+
(numpy.object_, [1, '1'])
415+
416+
"""
417+
418+
if isinstance(arr, np.ndarray):
419+
return arr.dtype, arr
420+
421+
if not is_list_like(arr):
422+
arr = [arr]
423+
424+
# don't force numpy coerce with nan's
425+
inferred = lib.infer_dtype(arr)
426+
if inferred in ['string', 'bytes', 'unicode',
427+
'mixed', 'mixed-integer']:
428+
return (np.object_, arr)
429+
430+
arr = np.asarray(arr)
431+
return arr.dtype, arr
432+
433+
390434
def maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False):
391435
""" provide explicit type promotion and coercion
392436

0 commit comments

Comments
 (0)