From 3bb9750bd789eec0d39294fdbbaef22447291128 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 28 Mar 2021 14:52:13 +0100 Subject: [PATCH 1/3] TST: [ArrowStringArray] more parameterised testing - part 1 --- pandas/_libs/lib.pyx | 2 +- pandas/conftest.py | 18 ++++++++++++++++++ pandas/tests/dtypes/test_common.py | 5 ++++- pandas/tests/dtypes/test_inference.py | 4 ++-- pandas/tests/extension/json/array.py | 3 ++- pandas/tests/frame/methods/test_astype.py | 6 ++++++ .../tests/frame/methods/test_combine_first.py | 10 ++++++---- pandas/tests/frame/test_constructors.py | 6 +++--- pandas/tests/tools/test_to_numeric.py | 4 ++-- 9 files changed, 44 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 94a4d586b4f13..9ea10651ac35e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1109,7 +1109,7 @@ _TYPE_MAP = { "complex64": "complex", "complex128": "complex", "c": "complex", - "string": "string", + str: "string", "S": "bytes", "U": "string", "bool": "boolean", diff --git a/pandas/conftest.py b/pandas/conftest.py index f3356d2998ff8..aa43746d0e7d5 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1131,6 +1131,24 @@ def string_dtype(request): return request.param +@pytest.fixture( + params=[ + "string", + pytest.param( + "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + ), + ] +) +def nullable_string_dtype(request): + """ + Parametrized fixture for string dtypes. + + * 'string' + * 'arrow_string' + """ + return request.param + + @pytest.fixture(params=tm.BYTES_DTYPES) def bytes_dtype(request): """ diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 406aec9d4c16e..616f46624bfd7 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -281,7 +281,10 @@ def test_is_string_dtype(): assert com.is_string_dtype(object) assert com.is_string_dtype(np.array(["a", "b"])) assert com.is_string_dtype(pd.StringDtype()) - assert com.is_string_dtype(pd.array(["a", "b"], dtype="string")) + + +def test_is_string_dtype_nullable(nullable_string_dtype): + assert com.is_string_dtype(pd.array(["a", "b"], dtype=nullable_string_dtype)) integer_dtypes: List = [] diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index b3c6015475674..907991b97ead1 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1267,9 +1267,9 @@ def test_interval(self): @pytest.mark.parametrize("klass", [pd.array, Series]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("data", [["a", "b", "c"], ["a", "b", pd.NA]]) - def test_string_dtype(self, data, skipna, klass): + def test_string_dtype(self, data, skipna, klass, nullable_string_dtype): # StringArray - val = klass(data, dtype="string") + val = klass(data, dtype=nullable_string_dtype) inferred = lib.infer_dtype(val, skipna=skipna) assert inferred == "string" diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index a4fedd9a4c5da..d017649d35e87 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -38,6 +38,7 @@ ExtensionDtype, ) from pandas.api.types import is_bool_dtype +from pandas.core.arrays.string_arrow import ArrowStringDtype class JSONDtype(ExtensionDtype): @@ -193,7 +194,7 @@ def astype(self, dtype, copy=True): if copy: return self.copy() return self - elif isinstance(dtype, StringDtype): + elif isinstance(dtype, (StringDtype, ArrowStringDtype)): value = self.astype(str) # numpy doesn'y like nested dicts return dtype.construct_array_type()._from_sequence(value, copy=False) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 161fe7990a327..20c2a25633097 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -567,6 +569,10 @@ def test_astype_empty_dtype_dict(self): "df", [ DataFrame(Series(["x", "y", "z"], dtype="string")), + pytest.param( + DataFrame(Series(["x", "y", "z"], dtype="arrow_string")), + marks=td.skip_if_no("pyarrow", min_version="1.0.0"), + ), DataFrame(Series(["x", "y", "z"], dtype="category")), DataFrame(Series(3 * [Timestamp("2020-01-01", tz="UTC")])), DataFrame(Series(3 * [Interval(0, 1)])), diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index b4d8a53e4b23f..dd91b32c8eb8c 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -381,15 +381,17 @@ def test_combine_first_with_asymmetric_other(self, val): tm.assert_frame_equal(res, exp) - def test_combine_first_string_dtype_only_na(self): + def test_combine_first_string_dtype_only_na(self, nullable_string_dtype): # GH: 37519 - df = DataFrame({"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string") - df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype="string") + df = DataFrame( + {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype + ) + df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype=nullable_string_dtype) df.set_index(["a", "b"], inplace=True) df2.set_index(["a", "b"], inplace=True) result = df.combine_first(df2) expected = DataFrame( - {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string" + {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype ).set_index(["a", "b"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index b76a44b3c86be..a62f2b0426911 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1649,10 +1649,10 @@ def test_constructor_empty_with_string_dtype(self): df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5") tm.assert_frame_equal(df, expected) - def test_constructor_empty_with_string_extension(self): + def test_constructor_empty_with_string_extension(self, nullable_string_dtype): # GH 34915 - expected = DataFrame(index=[], columns=["c1"], dtype="string") - df = DataFrame(columns=["c1"], dtype="string") + expected = DataFrame(index=[], columns=["c1"], dtype=nullable_string_dtype) + df = DataFrame(columns=["c1"], dtype=nullable_string_dtype) tm.assert_frame_equal(df, expected) def test_constructor_single_value(self): diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 65aa189a3e965..30d6436c7e250 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -725,9 +725,9 @@ def test_precision_float_conversion(strrep): (["1", "2", "3.5"], Series([1, 2, 3.5])), ], ) -def test_to_numeric_from_nullable_string(values, expected): +def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected): # https://github.com/pandas-dev/pandas/issues/37262 - s = Series(values, dtype="string") + s = Series(values, dtype=nullable_string_dtype) result = to_numeric(s) tm.assert_series_equal(result, expected) From 98b3a5f13fb082ea00780bb9ddd3135e69706d21 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 29 Mar 2021 16:05:06 +0100 Subject: [PATCH 2/3] revert changes to pandas/tests/frame/methods/test_astype.py --- pandas/tests/frame/methods/test_astype.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 20c2a25633097..161fe7990a327 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,8 +3,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( Categorical, @@ -569,10 +567,6 @@ def test_astype_empty_dtype_dict(self): "df", [ DataFrame(Series(["x", "y", "z"], dtype="string")), - pytest.param( - DataFrame(Series(["x", "y", "z"], dtype="arrow_string")), - marks=td.skip_if_no("pyarrow", min_version="1.0.0"), - ), DataFrame(Series(["x", "y", "z"], dtype="category")), DataFrame(Series(3 * [Timestamp("2020-01-01", tz="UTC")])), DataFrame(Series(3 * [Interval(0, 1)])), From c095cd4d37a6aea8a1a09cf598bb6eb263100c08 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 29 Mar 2021 20:32:08 +0100 Subject: [PATCH 3/3] undo inference change --- pandas/_libs/lib.pyx | 2 +- pandas/tests/dtypes/test_inference.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 9ea10651ac35e..94a4d586b4f13 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1109,7 +1109,7 @@ _TYPE_MAP = { "complex64": "complex", "complex128": "complex", "c": "complex", - str: "string", + "string": "string", "S": "bytes", "U": "string", "bool": "boolean", diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 907991b97ead1..b3c6015475674 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1267,9 +1267,9 @@ def test_interval(self): @pytest.mark.parametrize("klass", [pd.array, Series]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("data", [["a", "b", "c"], ["a", "b", pd.NA]]) - def test_string_dtype(self, data, skipna, klass, nullable_string_dtype): + def test_string_dtype(self, data, skipna, klass): # StringArray - val = klass(data, dtype=nullable_string_dtype) + val = klass(data, dtype="string") inferred = lib.infer_dtype(val, skipna=skipna) assert inferred == "string"