Skip to content

BUG: make mean() raise an exception for strings #44131

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,39 @@ raise a ``ValueError`` if the operation could produce a result with more than

.. ---------------------------------------------------------------------------

.. _whatsnew_140.notable_bug_fixes.mean_implicit_conversion_to_numeric:

Implicit conversion of string to numeric type in mean
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously, when computing the ``mean`` of a :class:`Series` or :class:`DataFrame` containing string values, the elements were
concatenated into a single string, which was then implicitly coerced to a numeric type before the mean was computed. This could lead to unexpected results:

.. code-block:: ipython

In [5]: df = DataFrame({
"A": ["1", "2", "3"],
"B": ["0", "1", "J"],
})
In [6]: df.mean(numeric_only=False)
Out[6]:
A 41.00000+0.00000j
B 0.000000+0.333333j
dtype: complex128

Now, a ``TypeError`` is raised whenever ``mean`` is called on a string-type column or :class:`Series`.

.. code-block:: ipython

In [7]: df = DataFrame({
"A": ["1", "2", "3"],
"B": ["0", "1", "J"],
})
In [8]: df.mean(numeric_only=False)
...
TypeError: cannot find the mean of type 'str'

.. _whatsnew_140.notable_bug_fixes.groupby_apply_mutation:

groupby.apply consistent transform detection
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
iNaT,
lib,
)
from pandas._libs.lib import infer_dtype
from pandas._typing import (
ArrayLike,
Dtype,
Expand Down Expand Up @@ -695,7 +696,10 @@ def nanmean(
dtype_count = dtype

count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))
the_sum = values.sum(axis, dtype=dtype_sum)
if infer_dtype(the_sum) in ("string", "byte", "mixed-integer", "mixed"):
raise TypeError("cannot find the mean of type 'str'")
the_sum = _ensure_numeric(the_sum)

if axis is not None and getattr(the_sum, "ndim", False):
count = cast(np.ndarray, count)
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/apply/test_invalid_arg.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,10 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis):
)
def test_agg_cython_table_raises_series(series, func, expected):
# GH21224
msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type"
msg = (
r"[Cc]ould not convert|can't multiply sequence by non-int of type"
r"|cannot find the mean of type 'str'"
)
with pytest.raises(expected, match=msg):
# e.g. Series('a b'.split()).cumprod() will raise
series.agg(func)
Expand Down
27 changes: 27 additions & 0 deletions pandas/tests/frame/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,33 @@ def test_mean_mixed_string_decimal(self):
expected = Series([2.7, 681.6], index=["A", "C"])
tm.assert_series_equal(result, expected)

def test_mean_string(self):
    # ``mean`` must not silently coerce string columns to numbers.
    # https://github.com/pandas-dev/pandas/issues/44008
    # https://github.com/pandas-dev/pandas/issues/34671
    # https://github.com/pandas-dev/pandas/issues/22642
    # https://github.com/pandas-dev/pandas/issues/26927
    # https://github.com/pandas-dev/pandas/issues/13916
    # https://github.com/pandas-dev/pandas/issues/36703
    frame = DataFrame(
        {
            "A": ["1", "2", "3"],
            "B": ["a", "b", "c"],
            "C": [1, 2, 3],
            "D": ["0", "1", "J"],
        }
    )

    # Default call: only the integer column "C" is expected in the result,
    # and a nuisance-column FutureWarning must be emitted.
    with tm.assert_produces_warning(FutureWarning, match="Dropping of nuisance"):
        default_mean = frame.mean()
    tm.assert_series_equal(default_mean, Series([2.0], index=["C"]))

    # Explicitly keeping the string columns must raise.
    with pytest.raises(TypeError, match="cannot find the mean of type 'str'"):
        frame.mean(numeric_only=False)

    # ``sum`` is expected to keep concatenating the string columns.
    totals = frame.sum()
    tm.assert_series_equal(
        totals,
        Series(["123", "abc", 6, "01J"], index=["A", "B", "C", "D"]),
    )

def test_var_std(self, datetime_frame):
result = datetime_frame.std(ddof=4)
expected = datetime_frame.apply(lambda x: x.std(ddof=4))
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/aggregate/test_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def test_cython_agg_nothing_to_agg():
with pytest.raises(NotImplementedError, match="does not implement"):
frame.groupby("a")["b"].mean(numeric_only=True)

with pytest.raises(TypeError, match="Could not convert (foo|bar)*"):
with pytest.raises(TypeError, match="cannot find the mean of*"):
frame.groupby("a")["b"].mean()

frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
Expand Down