From f677011d4396f4cb078c76b5dc90e285571ada83 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 17 Jan 2015 16:32:54 +0900 Subject: [PATCH] BUG: where coerces numeric to str incorrectly --- doc/source/whatsnew/v0.16.0.txt | 2 ++ pandas/core/common.py | 15 ++++++++++++++- pandas/core/generic.py | 6 +++++- pandas/core/internals.py | 4 +++- pandas/tests/test_common.py | 28 ++++++++++++++++++++++++++++ pandas/tests/test_series.py | 22 ++++++++++++++++++++++ 6 files changed, 74 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 2db455272363b..d8fc10dd54e8c 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -187,6 +187,8 @@ Bug Fixes - Bug in read_csv when using skiprows on a file with CR line endings with the c engine. (:issue:`9079`) - isnull now detects ``NaT`` in PeriodIndex (:issue:`9129`) - Bug in groupby ``.nth()`` with a multiple column groupby (:issue:`8979`) +- Bug in ``DataFrame.where`` and ``Series.where`` where numerics were incorrectly coerced to strings (:issue:`9280`) +- Bug in ``DataFrame.where`` and ``Series.where`` raising ``ValueError`` when a string list-like is passed. (:issue:`9280`) - Fixed division by zero error for ``Series.kurt()`` when all values are equal (:issue:`9197`) diff --git a/pandas/core/common.py b/pandas/core/common.py index 143f65ee64e60..f8f5928ca7d51 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -19,7 +19,7 @@ import pandas.lib as lib import pandas.tslib as tslib from pandas import compat -from pandas.compat import StringIO, BytesIO, range, long, u, zip, map +from pandas.compat import StringIO, BytesIO, range, long, u, zip, map, string_types from pandas.core.config import get_option @@ -1322,6 +1322,19 @@ def _possibly_downcast_to_dtype(result, dtype): return result +def _maybe_convert_string_to_object(values): + """ + Convert a string-like scalar or a string-dtype array to object dtype. 
+ This prevents numpy from treating the array as a str dtype. + """ + if isinstance(values, string_types): + values = np.array([values], dtype=object) + elif (isinstance(values, np.ndarray) and + issubclass(values.dtype.type, (np.string_, np.unicode_))): + values = values.astype(object) + return values + + def _lcd_dtypes(a_dtype, b_dtype): """ return the lcd dtype to hold these types """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7fa64e0b4ca91..b2adfae744db7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3292,7 +3292,11 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, if self.ndim == 1: # try to set the same dtype as ourselves - new_other = np.array(other, dtype=self.dtype) + try: + new_other = np.array(other, dtype=self.dtype) + except ValueError: + new_other = np.array(other) + if not (new_other == np.array(other)).all(): other = np.array(other) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index ef33e27d861fd..f4abe05097cff 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -13,7 +13,8 @@ ABCSparseSeries, _infer_dtype_from_scalar, _is_null_datelike_scalar, _maybe_promote, is_timedelta64_dtype, is_datetime64_dtype, - _possibly_infer_to_datetimelike, array_equivalent) + _possibly_infer_to_datetimelike, array_equivalent, + _maybe_convert_string_to_object) from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer) from pandas.core.categorical import Categorical, _maybe_to_categorical, _is_categorical @@ -1052,6 +1053,7 @@ def where(self, other, cond, align=True, raise_on_error=True, values = values.T is_transposed = not is_transposed + other = _maybe_convert_string_to_object(other) # our where function def func(c, v, o): diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 2f57fa593bc40..36d6c39586d97 100644 --- a/pandas/tests/test_common.py +++ 
b/pandas/tests/test_common.py @@ -947,6 +947,34 @@ def test_2d_datetime64(self): tm.assert_almost_equal(result, expected) +class TestMaybe(tm.TestCase): + + def test_maybe_convert_string_to_array(self): + result = com._maybe_convert_string_to_object('x') + tm.assert_numpy_array_equal(result, np.array(['x'], dtype=object)) + self.assertTrue(result.dtype == object) + + result = com._maybe_convert_string_to_object(1) + self.assertEquals(result, 1) + + arr = np.array(['x', 'y'], dtype=str) + result = com._maybe_convert_string_to_object(arr) + tm.assert_numpy_array_equal(result, np.array(['x', 'y'], dtype=object)) + self.assertTrue(result.dtype == object) + + # unicode + arr = np.array(['x', 'y']).astype('U') + result = com._maybe_convert_string_to_object(arr) + tm.assert_numpy_array_equal(result, np.array(['x', 'y'], dtype=object)) + self.assertTrue(result.dtype == object) + + # object + arr = np.array(['x', 2], dtype=object) + result = com._maybe_convert_string_to_object(arr) + tm.assert_numpy_array_equal(result, np.array(['x', 2], dtype=object)) + self.assertTrue(result.dtype == object) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index b67a8c5de1c2d..a5de26da1606a 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1886,6 +1886,28 @@ def test_ix_setitem(self): self.assertEqual(self.series[d1], 4) self.assertEqual(self.series[d2], 6) + def test_where_numeric_with_string(self): + # GH 9280 + s = pd.Series([1, 2, 3]) + w = s.where(s>1, 'X') + + self.assertTrue(isinstance(w[0], str)) + self.assertTrue(isinstance(w[1], int)) + self.assertTrue(isinstance(w[2], int)) + self.assertTrue(w.dtype == 'object') + + w = s.where(s>1, ['X', 'Y', 'Z']) + self.assertTrue(isinstance(w[0], str)) + self.assertTrue(isinstance(w[1], int)) + self.assertTrue(isinstance(w[2], int)) + self.assertTrue(w.dtype == 'object') + + w 
= s.where(s>1, np.array(['X', 'Y', 'Z'])) + self.assertTrue(isinstance(w[0], str)) + self.assertTrue(isinstance(w[1], int)) + self.assertTrue(isinstance(w[2], int)) + self.assertTrue(w.dtype == 'object') + def test_setitem_boolean(self): mask = self.series > self.series.median()