From f8910eb987803097c7e07a827b939708f6ef8b53 Mon Sep 17 00:00:00 2001 From: Kinshuk Dua Date: Thu, 21 Oct 2021 16:42:33 +0530 Subject: [PATCH 1/4] BUG: make `.mean()` raise an exception for strings --- pandas/core/nanops.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 52d2322b11f42..e66a8f9716316 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -44,6 +44,7 @@ is_numeric_dtype, is_object_dtype, is_scalar, + is_string_dtype, is_timedelta64_dtype, needs_i8_conversion, pandas_dtype, @@ -696,7 +697,11 @@ def nanmean( dtype_count = dtype count = _get_counts(values.shape, mask, axis, dtype=dtype_count) - the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) + the_sum = values.sum(axis, dtype=dtype_sum) + if isinstance(the_sum, str) or is_string_dtype(the_sum): + raise TypeError("cannot find the mean of type 'str'") + else: + _ensure_numeric(the_sum) if axis is not None and getattr(the_sum, "ndim", False): count = cast(np.ndarray, count) From adb58ce0851ef562d4f35f0a30efd9ab9aab3c7e Mon Sep 17 00:00:00 2001 From: Kinshuk Dua Date: Fri, 22 Oct 2021 10:14:28 +0530 Subject: [PATCH 2/4] Use `infer_dtype` to detect strings; add new regex for tests --- pandas/core/nanops.py | 7 +++---- pandas/tests/apply/test_invalid_arg.py | 5 ++++- pandas/tests/groupby/aggregate/test_cython.py | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index e66a8f9716316..c72a7af420ca5 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -20,6 +20,7 @@ iNaT, lib, ) +from pandas._libs.lib import infer_dtype from pandas._typing import ( ArrayLike, Dtype, @@ -44,7 +45,6 @@ is_numeric_dtype, is_object_dtype, is_scalar, - is_string_dtype, is_timedelta64_dtype, needs_i8_conversion, pandas_dtype, @@ -698,10 +698,9 @@ def nanmean( count = _get_counts(values.shape, mask, axis, dtype=dtype_count) the_sum = values.sum(axis, 
dtype=dtype_sum) - if isinstance(the_sum, str) or is_string_dtype(the_sum): + if infer_dtype(the_sum) in ("string", "byte", "mixed-integer", "mixed"): raise TypeError("cannot find the mean of type 'str'") - else: - _ensure_numeric(the_sum) + the_sum = _ensure_numeric(the_sum) if axis is not None and getattr(the_sum, "ndim", False): count = cast(np.ndarray, count) diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index b0faeba23a479..9aea08a998207 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -257,7 +257,10 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis): ) def test_agg_cython_table_raises_series(series, func, expected): # GH21224 - msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type" + msg = ( + r"[Cc]ould not convert|can't multiply sequence by non-int of type" + r"|cannot find the mean of type 'str'" + ) with pytest.raises(expected, match=msg): # e.g. 
Series('a b'.split()).cumprod() will raise series.agg(func) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index d9372ba5cbb50..9a4daad61b97a 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -92,7 +92,7 @@ def test_cython_agg_nothing_to_agg(): with pytest.raises(NotImplementedError, match="does not implement"): frame.groupby("a")["b"].mean(numeric_only=True) - with pytest.raises(TypeError, match="Could not convert (foo|bar)*"): + with pytest.raises(TypeError, match="cannot find the mean of*"): frame.groupby("a")["b"].mean() frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) From 1c81cf622e95a89a5cfd0c551a64921352b6aae3 Mon Sep 17 00:00:00 2001 From: Kinshuk Dua Date: Fri, 12 Nov 2021 11:00:27 +0530 Subject: [PATCH 3/4] Add tests for mean with strings --- pandas/tests/frame/test_reductions.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 6402a08ca54a2..54da0f2796d7b 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -524,6 +524,33 @@ def test_mean_mixed_string_decimal(self): expected = Series([2.7, 681.6], index=["A", "C"]) tm.assert_series_equal(result, expected) + def test_mean_string(self): + # https://github.com/pandas-dev/pandas/issues/44008 + # https://github.com/pandas-dev/pandas/issues/34671 + # https://github.com/pandas-dev/pandas/issues/22642 + # https://github.com/pandas-dev/pandas/issues/26927 + # https://github.com/pandas-dev/pandas/issues/13916 + # https://github.com/pandas-dev/pandas/issues/36703 + + df = DataFrame( + { + "A": ["1", "2", "3"], + "B": ["a", "b", "c"], + "C": [1, 2, 3], + "D": ["0", "1", "J"], + } + ) + with tm.assert_produces_warning(FutureWarning, match="Dropping of nuisance"): + result = df.mean() + expected = 
Series([2.0], index=["C"]) + tm.assert_series_equal(result, expected) + msg = "cannot find the mean of type 'str'" + with pytest.raises(TypeError, match=msg): + df.mean(numeric_only=False) + result = df.sum() + expected = Series(["123", "abc", 6, "01J"], index=["A", "B", "C", "D"]) + tm.assert_series_equal(result, expected) + def test_var_std(self, datetime_frame): result = datetime_frame.std(ddof=4) expected = datetime_frame.apply(lambda x: x.std(ddof=4)) From be053cef25f5729eccc94abaf7ac57237a560346 Mon Sep 17 00:00:00 2001 From: Kinshuk Dua Date: Thu, 2 Dec 2021 16:40:11 +0530 Subject: [PATCH 4/4] Add whatsnew;fix conflict --- doc/source/whatsnew/v1.4.0.rst | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index fd7cb6a69d955..4811902eac2dd 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -346,6 +346,39 @@ second column is instead renamed to ``a.2``. res +.. _whatsnew_140.notable_bug_fixes.mean_implicit_conversion_to_numeric: + +Implicit conversion of string to numeric type in mean +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When computing the ``mean`` of a :class:`Series` or :class:`DataFrame` with a string-type value, the elements are concatenated +to a single string then coerced to a numeric type implicitly before computing the mean. This can lead to unexpected results: + +.. code-block:: ipython + + In [5]: df = DataFrame({ + "A": ["1", "2", "3"], + "B": ["0", "1", "J"], + }) + In [6]: df.mean(numeric_only=False) + Out[6]: + A 41.00000+0.00000j + B 0.000000+0.333333j + dtype: complex128 + +Now, an exception will be raised whenever ``mean`` is called on a string-type column or :class:`Series`. + +.. code-block:: ipython + + In [7]: df = DataFrame({ + "A": ["1", "2", "3"], + "B": ["0", "1", "J"], + }) + In [8]: df.mean(numeric_only=False) + Out[8]: + ... + TypeError: cannot find the mean of type 'str' + .. 
_whatsnew_140.notable_bug_fixes.notable_bug_fix3: notable_bug_fix3