Skip to content

Commit 1f0fcb3

Browse files
jbrockmendel and mroeschke
authored and
Yi Wei
committed
BUG: mean/median with strings (pandas-dev#52281)
* BUG: converting string to numeric in median, mean
* whatsnew, median test
* troubleshoot builds
* fix arraymanager build
* say in whatsnew we raise TypeError

---------

Co-authored-by: Matthew Roeschke <[email protected]>
1 parent f7ac55c commit 1f0fcb3

File tree

10 files changed

+149
-21
lines changed

10 files changed

+149
-21
lines changed

doc/source/whatsnew/v2.1.0.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,8 +313,11 @@ Timezones
313313

314314
Numeric
315315
^^^^^^^
316+
- Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`)
317+
- Bug in :meth:`Series.mean` and :meth:`DataFrame.mean` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results; these now raise ``TypeError`` (:issue:`36703`, :issue:`44008`)
316318
- Bug in :meth:`DataFrame.corrwith` raising ``NotImplementedError`` for pyarrow-backed dtypes (:issue:`52314`)
317319
- Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`)
320+
- Bug in :meth:`Series.median` and :meth:`DataFrame.median` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results; these now raise ``TypeError`` (:issue:`34671`)
318321
-
319322

320323
Conversion

pandas/core/nanops.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -716,7 +716,8 @@ def nanmean(
716716
dtype_count = dtype
717717

718718
count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
719-
the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))
719+
the_sum = values.sum(axis, dtype=dtype_sum)
720+
the_sum = _ensure_numeric(the_sum)
720721

721722
if axis is not None and getattr(the_sum, "ndim", False):
722723
count = cast(np.ndarray, count)
@@ -775,6 +776,11 @@ def get_median(x, _mask=None):
775776
dtype = values.dtype
776777
values, mask = _get_values(values, skipna, mask=mask, fill_value=0)
777778
if values.dtype.kind != "f":
779+
if values.dtype == object:
780+
# GH#34671 avoid casting strings to numeric
781+
inferred = lib.infer_dtype(values)
782+
if inferred in ["string", "mixed"]:
783+
raise TypeError(f"Cannot convert {values} to numeric")
778784
try:
779785
values = values.astype("f8")
780786
except ValueError as err:
@@ -1659,6 +1665,10 @@ def _ensure_numeric(x):
16591665
if x.dtype.kind in "biu":
16601666
x = x.astype(np.float64)
16611667
elif x.dtype == object:
1668+
inferred = lib.infer_dtype(x)
1669+
if inferred in ["string", "mixed"]:
1670+
# GH#44008, GH#36703 avoid casting e.g. strings to numeric
1671+
raise TypeError(f"Could not convert {x} to numeric")
16621672
try:
16631673
x = x.astype(np.complex128)
16641674
except (TypeError, ValueError):
@@ -1671,6 +1681,9 @@ def _ensure_numeric(x):
16711681
if not np.any(np.imag(x)):
16721682
x = x.real
16731683
elif not (is_float(x) or is_integer(x) or is_complex(x)):
1684+
if isinstance(x, str):
1685+
# GH#44008, GH#36703 avoid casting e.g. strings to numeric
1686+
raise TypeError(f"Could not convert string '{x}' to numeric")
16741687
try:
16751688
x = float(x)
16761689
except (TypeError, ValueError):

pandas/tests/apply/test_invalid_arg.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,9 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis):
244244
def test_agg_cython_table_raises_series(series, func, expected):
245245
# GH21224
246246
msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type"
247+
if func == "median" or func is np.nanmedian or func is np.median:
248+
msg = r"Cannot convert \['a' 'b' 'c'\] to numeric"
249+
247250
with pytest.raises(expected, match=msg):
248251
# e.g. Series('a b'.split()).cumprod() will raise
249252
series.agg(func)

pandas/tests/frame/test_reductions.py

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -169,15 +169,30 @@ def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname):
169169
):
170170
getattr(float_string_frame, opname)(axis=axis)
171171
else:
172-
msg = "|".join(
173-
[
174-
"Could not convert",
175-
"could not convert",
176-
"can't multiply sequence by non-int",
177-
"unsupported operand type",
178-
"not supported between instances of",
179-
]
180-
)
172+
if opname in ["var", "std", "sem", "skew", "kurt"]:
173+
msg = "could not convert string to float: 'bar'"
174+
elif opname == "product":
175+
if axis == 1:
176+
msg = "can't multiply sequence by non-int of type 'float'"
177+
else:
178+
msg = "can't multiply sequence by non-int of type 'str'"
179+
elif opname == "sum":
180+
msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'"
181+
elif opname == "mean":
182+
if axis == 0:
183+
# different message on different builds
184+
msg = "|".join(
185+
[
186+
r"Could not convert \['.*'\] to numeric",
187+
"Could not convert string '(bar){30}' to numeric",
188+
]
189+
)
190+
else:
191+
msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'"
192+
elif opname in ["min", "max"]:
193+
msg = "'[><]=' not supported between instances of 'float' and 'str'"
194+
elif opname == "median":
195+
msg = re.compile(r"Cannot convert \[.*\] to numeric", flags=re.S)
181196
with pytest.raises(TypeError, match=msg):
182197
getattr(float_string_frame, opname)(axis=axis)
183198
if opname != "nunique":
@@ -1759,5 +1774,16 @@ def test_fails_on_non_numeric(kernel):
17591774
"argument must be a string or a real number",
17601775
]
17611776
)
1777+
if kernel == "median":
1778+
# slightly different message on different builds
1779+
msg1 = (
1780+
r"Cannot convert \[\[<class 'object'> <class 'object'> "
1781+
r"<class 'object'>\]\] to numeric"
1782+
)
1783+
msg2 = (
1784+
r"Cannot convert \[<class 'object'> <class 'object'> "
1785+
r"<class 'object'>\] to numeric"
1786+
)
1787+
msg = "|".join([msg1, msg2])
17621788
with pytest.raises(TypeError, match=msg):
17631789
getattr(df, kernel)(*args)

pandas/tests/groupby/test_function.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,8 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
262262
"can't multiply sequence by non-int of type 'str'",
263263
]
264264
)
265+
if method == "median":
266+
msg = r"Cannot convert \['a' 'b'\] to numeric"
265267
with pytest.raises(exception, match=msg):
266268
getattr(gb, method)()
267269
else:
@@ -279,6 +281,8 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
279281
f"Cannot perform {method} with non-ordered Categorical",
280282
]
281283
)
284+
if method == "median":
285+
msg = r"Cannot convert \['a' 'b'\] to numeric"
282286
with pytest.raises(exception, match=msg):
283287
getattr(gb, method)(numeric_only=False)
284288
else:
@@ -1467,6 +1471,8 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys):
14671471
"function is not implemented for this dtype",
14681472
]
14691473
)
1474+
if kernel == "median":
1475+
msg = r"Cannot convert \[<class 'object'> <class 'object'>\] to numeric"
14701476
with pytest.raises(exception, match=msg):
14711477
method(*args, **kwargs)
14721478
elif not has_arg and numeric_only is not lib.no_default:

pandas/tests/groupby/test_groupby.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -655,7 +655,8 @@ def test_frame_multi_key_function_list_partial_failure():
655655

656656
grouped = data.groupby(["A", "B"])
657657
funcs = [np.mean, np.std]
658-
with pytest.raises(TypeError, match="Could not convert dullshinyshiny to numeric"):
658+
msg = "Could not convert string 'dullshinyshiny' to numeric"
659+
with pytest.raises(TypeError, match=msg):
659660
grouped.agg(funcs)
660661

661662

@@ -974,6 +975,8 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):
974975
# columns when numeric_only is False
975976
klass = ValueError if agg_function in ("std", "sem") else TypeError
976977
msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"])
978+
if agg_function == "median":
979+
msg = r"Cannot convert \['one' 'three' 'two'\] to numeric"
977980
with pytest.raises(klass, match=msg):
978981
getattr(grouped, agg_function)(numeric_only=numeric_only)
979982
else:

pandas/tests/groupby/test_raises.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,21 @@ def test_groupby_raises_string(
147147
"idxmin": (TypeError, "'argmin' not allowed for this dtype"),
148148
"last": (None, ""),
149149
"max": (None, ""),
150-
"mean": (TypeError, "Could not convert xy?z?w?t?y?u?i?o? to numeric"),
151-
"median": (TypeError, "could not convert string to float"),
150+
"mean": (
151+
TypeError,
152+
"Could not convert string '(xy|xyzwt|xyz|xztuo)' to numeric",
153+
),
154+
"median": (
155+
TypeError,
156+
"|".join(
157+
[
158+
r"Cannot convert \['x' 'y' 'z'\] to numeric",
159+
r"Cannot convert \['x' 'y'\] to numeric",
160+
r"Cannot convert \['x' 'y' 'z' 'w' 't'\] to numeric",
161+
r"Cannot convert \['x' 'z' 't' 'u' 'o'\] to numeric",
162+
]
163+
),
164+
),
152165
"min": (None, ""),
153166
"ngroup": (None, ""),
154167
"nunique": (None, ""),
@@ -197,7 +210,10 @@ def test_groupby_raises_string_np(
197210

198211
klass, msg = {
199212
np.sum: (None, ""),
200-
np.mean: (TypeError, "Could not convert xy?z?w?t?y?u?i?o? to numeric"),
213+
np.mean: (
214+
TypeError,
215+
"Could not convert string '(xyzwt|xy|xyz|xztuo)' to numeric",
216+
),
201217
}[groupby_func_np]
202218

203219
_call_and_check(klass, msg, how, gb, groupby_func_np, tuple())

pandas/tests/resample/test_resample_api.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -857,8 +857,8 @@ def test_end_and_end_day_origin(
857857
("mean", False, "Could not convert"),
858858
("mean", lib.no_default, "Could not convert"),
859859
("median", True, {"num": [12.5]}),
860-
("median", False, "could not convert"),
861-
("median", lib.no_default, "could not convert"),
860+
("median", False, r"Cannot convert \['cat_1' 'cat_2'\] to numeric"),
861+
("median", lib.no_default, r"Cannot convert \['cat_1' 'cat_2'\] to numeric"),
862862
("std", True, {"num": [10.606601717798213]}),
863863
("std", False, "could not convert string to float"),
864864
("std", lib.no_default, "could not convert string to float"),

pandas/tests/series/test_reductions.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,3 +129,52 @@ def test_validate_stat_keepdims():
129129
)
130130
with pytest.raises(ValueError, match=msg):
131131
np.sum(ser, keepdims=True)
132+
133+
134+
def test_mean_with_convertible_string_raises(using_array_manager):
135+
# GH#44008
136+
ser = Series(["1", "2"])
137+
assert ser.sum() == "12"
138+
msg = "Could not convert string '12' to numeric"
139+
with pytest.raises(TypeError, match=msg):
140+
ser.mean()
141+
142+
df = ser.to_frame()
143+
if not using_array_manager:
144+
msg = r"Could not convert \['12'\] to numeric"
145+
with pytest.raises(TypeError, match=msg):
146+
df.mean()
147+
148+
149+
def test_mean_dont_convert_j_to_complex(using_array_manager):
150+
# GH#36703
151+
df = pd.DataFrame([{"db": "J", "numeric": 123}])
152+
if using_array_manager:
153+
msg = "Could not convert string 'J' to numeric"
154+
else:
155+
msg = r"Could not convert \['J'\] to numeric"
156+
with pytest.raises(TypeError, match=msg):
157+
df.mean()
158+
159+
with pytest.raises(TypeError, match=msg):
160+
df.agg("mean")
161+
162+
msg = "Could not convert string 'J' to numeric"
163+
with pytest.raises(TypeError, match=msg):
164+
df["db"].mean()
165+
with pytest.raises(TypeError, match=msg):
166+
np.mean(df["db"].astype("string").array)
167+
168+
169+
def test_median_with_convertible_string_raises(using_array_manager):
170+
# GH#34671 this _could_ return a string "2", but definitely not float 2.0
171+
msg = r"Cannot convert \['1' '2' '3'\] to numeric"
172+
ser = Series(["1", "2", "3"])
173+
with pytest.raises(TypeError, match=msg):
174+
ser.median()
175+
176+
if not using_array_manager:
177+
msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric"
178+
df = ser.to_frame()
179+
with pytest.raises(TypeError, match=msg):
180+
df.median()

pandas/tests/test_nanops.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -850,7 +850,9 @@ def test_ndarray(self):
850850

851851
# Test convertible string ndarray
852852
s_values = np.array(["1", "2", "3"], dtype=object)
853-
assert np.allclose(nanops._ensure_numeric(s_values), values)
853+
msg = r"Could not convert \['1' '2' '3'\] to numeric"
854+
with pytest.raises(TypeError, match=msg):
855+
nanops._ensure_numeric(s_values)
854856

855857
# Test non-convertible string ndarray
856858
s_values = np.array(["foo", "bar", "baz"], dtype=object)
@@ -859,12 +861,19 @@ def test_ndarray(self):
859861
nanops._ensure_numeric(s_values)
860862

861863
def test_convertable_values(self):
862-
assert np.allclose(nanops._ensure_numeric("1"), 1.0)
863-
assert np.allclose(nanops._ensure_numeric("1.1"), 1.1)
864-
assert np.allclose(nanops._ensure_numeric("1+1j"), 1 + 1j)
864+
with pytest.raises(TypeError, match="Could not convert string '1' to numeric"):
865+
nanops._ensure_numeric("1")
866+
with pytest.raises(
867+
TypeError, match="Could not convert string '1.1' to numeric"
868+
):
869+
nanops._ensure_numeric("1.1")
870+
with pytest.raises(
871+
TypeError, match=r"Could not convert string '1\+1j' to numeric"
872+
):
873+
nanops._ensure_numeric("1+1j")
865874

866875
def test_non_convertable_values(self):
867-
msg = "Could not convert foo to numeric"
876+
msg = "Could not convert string 'foo' to numeric"
868877
with pytest.raises(TypeError, match=msg):
869878
nanops._ensure_numeric("foo")
870879

0 commit comments

Comments
 (0)