Skip to content

String dtype: still return nullable NA-variant in object inference (maybe_convert_objects) if requested #59487

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2699,16 +2699,16 @@ def maybe_convert_objects(ndarray[object] objects,
seen.object_ = True

elif seen.str_:
if using_string_dtype() and is_string_array(objects, skipna=True):
if convert_to_nullable_dtype and is_string_array(objects, skipna=True):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype(na_value=np.nan)
dtype = StringDtype()
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)

elif convert_to_nullable_dtype and is_string_array(objects, skipna=True):
elif using_string_dtype() and is_string_array(objects, skipna=True):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype()
dtype = StringDtype(na_value=np.nan)
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
Comment on lines -2702 to 2712
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Essentially this diff is just switching the order of the if/elif blocks, to first check if we want a nullable dtype and only then check if we want the future default string dtype.


seen.object_ = True
Expand Down
10 changes: 4 additions & 6 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,9 @@
from pandas.core.arrays.integer import IntegerArray
import pandas.core.common as com
from pandas.core.construction import (
array as pd_array,
ensure_wrapped_if_datetimelike,
extract_array,
sanitize_array,
)
from pandas.core.indexers import (
check_array_indexer,
Expand Down Expand Up @@ -667,12 +667,10 @@ def _validate_listlike(self, value, allow_object: bool = False):
msg = self._validation_error_message(value, True)
raise TypeError(msg) from err

# Do type inference if necessary up front (after unpacking
# NumpyExtensionArray)
# Do type inference if necessary up front
# e.g. we passed PeriodIndex.values and got an ndarray of Periods
value = extract_array(value, extract_numpy=True)
value = pd_array(value)
value = extract_array(value, extract_numpy=True)
value = sanitize_array(value, index=None)
value = ensure_wrapped_if_datetimelike(value)
Copy link
Member Author

@jorisvandenbossche jorisvandenbossche Aug 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is essentially the default input sanitization/inference we do for the Series constructor (and other places), so I switched to using that here instead of pd.array(..) (which doesn't use the default dtypes for all types)

(this mostly impacts the name of the type in the error message)


if is_all_strings(value):
# We got a StringArray
Expand Down
13 changes: 3 additions & 10 deletions pandas/tests/arrays/string_/test_string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import numpy as np
import pytest

from pandas.compat import HAS_PYARROW
import pandas.util._test_decorators as td

import pandas as pd
Expand All @@ -27,20 +26,14 @@ def test_eq_all_na():
tm.assert_extension_array_equal(result, expected)


def test_config(string_storage, request, using_infer_string):
if using_infer_string and string_storage == "python" and HAS_PYARROW:
# string storage with na_value=NaN always uses pyarrow if available
# -> does not yet honor the option
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))

def test_config(string_storage):
with pd.option_context("string_storage", string_storage):
assert StringDtype().storage == string_storage
result = pd.array(["a", "b"])
assert result.dtype.storage == string_storage

dtype = StringDtype(
string_storage, na_value=np.nan if using_infer_string else pd.NA
)
# pd.array(..) by default always returns the NA-variant
dtype = StringDtype(string_storage, na_value=pd.NA)
expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype)
tm.assert_equal(result, expected)

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def test_value_counts_inferred(index_or_series, using_infer_string):
else:
exp = np.unique(np.array(s_values, dtype=np.object_))
if using_infer_string:
exp = array(exp)
exp = array(exp, dtype="str")
tm.assert_equal(s.unique(), exp)

assert s.nunique() == 4
Expand Down Expand Up @@ -192,7 +192,7 @@ def test_value_counts_bins(index_or_series, using_infer_string):
else:
exp = np.array(["a", "b", np.nan, "d"], dtype=object)
if using_infer_string:
exp = array(exp)
exp = array(exp, dtype="str")
tm.assert_equal(s.unique(), exp)
assert s.nunique() == 3

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/dtypes/cast/test_construct_ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_construct_1d_ndarray_preserving_na(
):
result = sanitize_array(values, index=None, dtype=dtype)
if using_infer_string and expected.dtype == object and dtype is None:
tm.assert_extension_array_equal(result, pd.array(expected))
tm.assert_extension_array_equal(result, pd.array(expected, dtype="str"))
else:
tm.assert_numpy_array_equal(result, expected)

Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/io/parser/usecols/test_usecols_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import ParserError

from pandas import (
Expand Down Expand Up @@ -531,7 +529,6 @@ def test_usecols_additional_columns_integer_columns(all_parsers):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_usecols_dtype(all_parsers):
parser = all_parsers
data = """
Expand Down
Loading