Skip to content

Commit 494c108

Browse files
Backport PR pandas-dev#31877: BUG: fix infer_dtype for StringDtype (pandas-dev#31926)
1 parent 0db3112 commit 494c108

File tree

6 files changed

+36
-1
lines changed

6 files changed

+36
-1
lines changed

doc/source/whatsnew/v1.0.2.rst

+2
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,10 @@ Bug fixes
3434

3535
- Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`)
3636

37+
3738
**Experimental dtypes**
3839

40+
- Fixed bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the ``"string"`` dtype (:issue:`31731`).
3941
- Fixed bug in setting values using a slice indexer with string dtype (:issue:`31772`)
4042

4143
.. ---------------------------------------------------------------------------

pandas/_libs/lib.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -961,7 +961,7 @@ _TYPE_MAP = {
961961
'complex64': 'complex',
962962
'complex128': 'complex',
963963
'c': 'complex',
964-
'string': 'bytes',
964+
'string': 'string',
965965
'S': 'bytes',
966966
'U': 'string',
967967
'bool': 'boolean',

pandas/conftest.py

+2
Original file line numberDiff line numberDiff line change
@@ -731,6 +731,7 @@ def any_numpy_dtype(request):
731731
# categoricals are handled separately
732732
_any_skipna_inferred_dtype = [
733733
("string", ["a", np.nan, "c"]),
734+
("string", ["a", pd.NA, "c"]),
734735
("bytes", [b"a", np.nan, b"c"]),
735736
("empty", [np.nan, np.nan, np.nan]),
736737
("empty", []),
@@ -741,6 +742,7 @@ def any_numpy_dtype(request):
741742
("mixed-integer-float", [1, np.nan, 2.0]),
742743
("decimal", [Decimal(1), np.nan, Decimal(2)]),
743744
("boolean", [True, np.nan, False]),
745+
("boolean", [True, pd.NA, False]),
744746
("datetime64", [np.datetime64("2013-01-01"), np.nan, np.datetime64("2018-01-01")]),
745747
("datetime", [pd.Timestamp("20130101"), np.nan, pd.Timestamp("20180101")]),
746748
("date", [date(2013, 1, 1), np.nan, date(2018, 1, 1)]),

pandas/tests/dtypes/test_inference.py

+18
Original file line numberDiff line numberDiff line change
@@ -1200,6 +1200,24 @@ def test_interval(self):
12001200
inferred = lib.infer_dtype(pd.Series(idx), skipna=False)
12011201
assert inferred == "interval"
12021202

1203+
@pytest.mark.parametrize("klass", [pd.array, pd.Series])
1204+
@pytest.mark.parametrize("skipna", [True, False])
1205+
@pytest.mark.parametrize("data", [["a", "b", "c"], ["a", "b", pd.NA]])
1206+
def test_string_dtype(self, data, skipna, klass):
1207+
# StringArray
1208+
val = klass(data, dtype="string")
1209+
inferred = lib.infer_dtype(val, skipna=skipna)
1210+
assert inferred == "string"
1211+
1212+
@pytest.mark.parametrize("klass", [pd.array, pd.Series])
1213+
@pytest.mark.parametrize("skipna", [True, False])
1214+
@pytest.mark.parametrize("data", [[True, False, True], [True, False, pd.NA]])
1215+
def test_boolean_dtype(self, data, skipna, klass):
1216+
# BooleanArray
1217+
val = klass(data, dtype="boolean")
1218+
inferred = lib.infer_dtype(val, skipna=skipna)
1219+
assert inferred == "boolean"
1220+
12031221

12041222
class TestNumberScalar:
12051223
def test_is_number(self):

pandas/tests/series/test_convert_dtypes.py

+9
Original file line numberDiff line numberDiff line change
@@ -246,3 +246,12 @@ def test_convert_dtypes(self, data, maindtype, params, answerdict):
246246

247247
# Make sure original not changed
248248
tm.assert_series_equal(series, copy)
249+
250+
def test_convert_string_dtype(self):
251+
# https://github.com/pandas-dev/pandas/issues/31731 -> converting columns
252+
# that are already string dtype
253+
df = pd.DataFrame(
254+
{"A": ["a", "b", pd.NA], "B": ["ä", "ö", "ü"]}, dtype="string"
255+
)
256+
result = df.convert_dtypes()
257+
tm.assert_frame_equal(df, result)

pandas/tests/test_strings.py

+4
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from pandas._libs import lib
99

10+
import pandas as pd
1011
from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna
1112
import pandas._testing as tm
1213
import pandas.core.strings as strings
@@ -207,6 +208,9 @@ def test_api_per_dtype(self, index_or_series, dtype, any_skipna_inferred_dtype):
207208
box = index_or_series
208209
inferred_dtype, values = any_skipna_inferred_dtype
209210

211+
if dtype == "category" and len(values) and values[1] is pd.NA:
212+
pytest.xfail(reason="Categorical does not yet support pd.NA")
213+
210214
t = box(values, dtype=dtype) # explicit dtype to avoid casting
211215

212216
# TODO: get rid of these xfails

0 commit comments

Comments
 (0)