diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 47a087d38d146..a1d31e39e4a57 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -405,6 +405,39 @@ raise a ``ValueError`` if the operation could produce a result with more than .. --------------------------------------------------------------------------- +.. _whatsnew_140.notable_bug_fixes.mean_implicit_conversion_to_numeric: + +Implicit conversion of string to numeric type in mean +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, when computing the ``mean`` of a :class:`Series` or :class:`DataFrame` with string-type values, the elements were concatenated +into a single string and then implicitly coerced to a numeric type before computing the mean. This could lead to unexpected results: + +.. code-block:: ipython + + In [5]: df = DataFrame({ + "A": ["1", "2", "3"], + "B": ["0", "1", "J"], + }) + In [6]: df.mean(numeric_only=False) + Out[6]: + A 41.000000+0.000000j + B 0.000000+0.333333j + dtype: complex128 + +Now, an exception will be raised whenever ``mean`` is called on a string-type column or :class:`Series`. + +.. code-block:: ipython + + In [7]: df = DataFrame({ + "A": ["1", "2", "3"], + "B": ["0", "1", "J"], + }) + In [8]: df.mean(numeric_only=False) + Out[8]: + ... + TypeError: cannot find the mean of type 'str' + .. 
_whatsnew_140.notable_bug_fixes.groupby_apply_mutation: groupby.apply consistent transform detection diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 40664f178993e..ac2ff4fe00467 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -20,6 +20,7 @@ iNaT, lib, ) +from pandas._libs.lib import infer_dtype from pandas._typing import ( ArrayLike, Dtype, @@ -695,7 +696,10 @@ def nanmean( dtype_count = dtype count = _get_counts(values.shape, mask, axis, dtype=dtype_count) - the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) + the_sum = values.sum(axis, dtype=dtype_sum) + if infer_dtype(the_sum) in ("string", "byte", "mixed-integer", "mixed"): + raise TypeError("cannot find the mean of type 'str'") + the_sum = _ensure_numeric(the_sum) if axis is not None and getattr(the_sum, "ndim", False): count = cast(np.ndarray, count) diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index 410a8f6bf3965..2a1944e52a5b8 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -257,7 +257,10 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis): ) def test_agg_cython_table_raises_series(series, func, expected): # GH21224 - msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type" + msg = ( + r"[Cc]ould not convert|can't multiply sequence by non-int of type" + r"|cannot find the mean of type 'str'" + ) with pytest.raises(expected, match=msg): # e.g. 
Series('a b'.split()).cumprod() will raise series.agg(func) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 43cf8cccb1784..4c01e9e7d6d7c 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -513,6 +513,33 @@ def test_mean_mixed_string_decimal(self): expected = Series([2.7, 681.6], index=["A", "C"]) tm.assert_series_equal(result, expected) + def test_mean_string(self): + # https://github.com/pandas-dev/pandas/issues/44008 + # https://github.com/pandas-dev/pandas/issues/34671 + # https://github.com/pandas-dev/pandas/issues/22642 + # https://github.com/pandas-dev/pandas/issues/26927 + # https://github.com/pandas-dev/pandas/issues/13916 + # https://github.com/pandas-dev/pandas/issues/36703 + + df = DataFrame( + { + "A": ["1", "2", "3"], + "B": ["a", "b", "c"], + "C": [1, 2, 3], + "D": ["0", "1", "J"], + } + ) + with tm.assert_produces_warning(FutureWarning, match="Dropping of nuisance"): + result = df.mean() + expected = Series([2.0], index=["C"]) + tm.assert_series_equal(result, expected) + msg = "cannot find the mean of type 'str'" + with pytest.raises(TypeError, match=msg): + df.mean(numeric_only=False) + result = df.sum() + expected = Series(["123", "abc", 6, "01J"], index=["A", "B", "C", "D"]) + tm.assert_series_equal(result, expected) + def test_var_std(self, datetime_frame): result = datetime_frame.std(ddof=4) expected = datetime_frame.apply(lambda x: x.std(ddof=4)) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index d9372ba5cbb50..9a4daad61b97a 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -92,7 +92,7 @@ def test_cython_agg_nothing_to_agg(): with pytest.raises(NotImplementedError, match="does not implement"): frame.groupby("a")["b"].mean(numeric_only=True) - with pytest.raises(TypeError, match="Could not convert (foo|bar)*"): + with 
pytest.raises(TypeError, match="cannot find the mean of*"): frame.groupby("a")["b"].mean() frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})