Skip to content

BUG: fix infer_dtype for StringDtype #31877

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Feb 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.0.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,10 @@ Bug fixes

- Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`)


**Experimental dtypes**

- Fixed bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the ``"string"`` dtype (:issue:`31731`).
- Fixed bug in setting values using a slice indexer with string dtype (:issue:`31772`).

.. ---------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1005,7 +1005,7 @@ _TYPE_MAP = {
'complex64': 'complex',
'complex128': 'complex',
'c': 'complex',
'string': 'bytes',
'string': 'string',
'S': 'bytes',
'U': 'string',
'bool': 'boolean',
Expand Down
2 changes: 2 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -744,6 +744,7 @@ def any_numpy_dtype(request):
# categoricals are handled separately
_any_skipna_inferred_dtype = [
("string", ["a", np.nan, "c"]),
("string", ["a", pd.NA, "c"]),
("bytes", [b"a", np.nan, b"c"]),
("empty", [np.nan, np.nan, np.nan]),
("empty", []),
Expand All @@ -754,6 +755,7 @@ def any_numpy_dtype(request):
("mixed-integer-float", [1, np.nan, 2.0]),
("decimal", [Decimal(1), np.nan, Decimal(2)]),
("boolean", [True, np.nan, False]),
("boolean", [True, pd.NA, False]),
("datetime64", [np.datetime64("2013-01-01"), np.nan, np.datetime64("2018-01-01")]),
("datetime", [pd.Timestamp("20130101"), np.nan, pd.Timestamp("20180101")]),
("date", [date(2013, 1, 1), np.nan, date(2018, 1, 1)]),
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -1200,6 +1200,24 @@ def test_interval(self):
inferred = lib.infer_dtype(pd.Series(idx), skipna=False)
assert inferred == "interval"

@pytest.mark.parametrize("klass", [pd.array, pd.Series])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("data", [["a", "b", "c"], ["a", "b", pd.NA]])
def test_string_dtype(self, data, skipna, klass):
# StringArray
# Values constructed with dtype="string" (through either pd.array or
# pd.Series) must be reported as "string" by lib.infer_dtype, both with
# and without a pd.NA entry and for either skipna setting.
val = klass(data, dtype="string")
inferred = lib.infer_dtype(val, skipna=skipna)
assert inferred == "string"

@pytest.mark.parametrize("klass", [pd.array, pd.Series])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("data", [[True, False, True], [True, False, pd.NA]])
def test_boolean_dtype(self, data, skipna, klass):
# BooleanArray
# Values constructed with dtype="boolean" (through either pd.array or
# pd.Series) must be reported as "boolean" by lib.infer_dtype, both with
# and without a pd.NA entry and for either skipna setting.
val = klass(data, dtype="boolean")
inferred = lib.infer_dtype(val, skipna=skipna)
assert inferred == "boolean"


class TestNumberScalar:
def test_is_number(self):
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/series/methods/test_convert_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,3 +246,12 @@ def test_convert_dtypes(self, data, maindtype, params, answerdict):

# Make sure original not changed
tm.assert_series_equal(series, copy)

def test_convert_string_dtype(self):
# https://github.com/pandas-dev/pandas/issues/31731 -> converting columns
# that are already string dtype
# Column "A" carries a pd.NA entry and column "B" holds non-ASCII strings;
# the whole frame is created with dtype="string".
df = pd.DataFrame(
{"A": ["a", "b", pd.NA], "B": ["ä", "ö", "ü"]}, dtype="string"
)
# convert_dtypes() is expected to hand back a frame equal to the input.
result = df.convert_dtypes()
tm.assert_frame_equal(df, result)
4 changes: 4 additions & 0 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from pandas._libs import lib

import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna
import pandas._testing as tm
import pandas.core.strings as strings
Expand Down Expand Up @@ -207,6 +208,9 @@ def test_api_per_dtype(self, index_or_series, dtype, any_skipna_inferred_dtype):
box = index_or_series
inferred_dtype, values = any_skipna_inferred_dtype

if dtype == "category" and len(values) and values[1] is pd.NA:
pytest.xfail(reason="Categorical does not yet support pd.NA")

t = box(values, dtype=dtype) # explicit dtype to avoid casting

# TODO: get rid of these xfails
Expand Down