From 3bb9750bd789eec0d39294fdbbaef22447291128 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 28 Mar 2021 14:52:13 +0100 Subject: [PATCH 01/11] TST: [ArrowStringArray] more parameterised testing - part 1 --- pandas/_libs/lib.pyx | 2 +- pandas/conftest.py | 18 ++++++++++++++++++ pandas/tests/dtypes/test_common.py | 5 ++++- pandas/tests/dtypes/test_inference.py | 4 ++-- pandas/tests/extension/json/array.py | 3 ++- pandas/tests/frame/methods/test_astype.py | 6 ++++++ .../tests/frame/methods/test_combine_first.py | 10 ++++++---- pandas/tests/frame/test_constructors.py | 6 +++--- pandas/tests/tools/test_to_numeric.py | 4 ++-- 9 files changed, 44 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 94a4d586b4f13..9ea10651ac35e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1109,7 +1109,7 @@ _TYPE_MAP = { "complex64": "complex", "complex128": "complex", "c": "complex", - "string": "string", + str: "string", "S": "bytes", "U": "string", "bool": "boolean", diff --git a/pandas/conftest.py b/pandas/conftest.py index f3356d2998ff8..aa43746d0e7d5 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1131,6 +1131,24 @@ def string_dtype(request): return request.param +@pytest.fixture( + params=[ + "string", + pytest.param( + "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + ), + ] +) +def nullable_string_dtype(request): + """ + Parametrized fixture for string dtypes. + + * 'string' + * 'arrow_string' + """ + return request.param + + @pytest.fixture(params=tm.BYTES_DTYPES) def bytes_dtype(request): """ diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 406aec9d4c16e..616f46624bfd7 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -281,7 +281,10 @@ def test_is_string_dtype(): assert com.is_string_dtype(object) assert com.is_string_dtype(np.array(["a", "b"])) assert com.is_string_dtype(pd.StringDtype()) - assert com.is_string_dtype(pd.array(["a", "b"], dtype="string")) + + +def test_is_string_dtype_nullable(nullable_string_dtype): + assert com.is_string_dtype(pd.array(["a", "b"], dtype=nullable_string_dtype)) integer_dtypes: List = [] diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index b3c6015475674..907991b97ead1 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1267,9 +1267,9 @@ def test_interval(self): @pytest.mark.parametrize("klass", [pd.array, Series]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("data", [["a", "b", "c"], ["a", "b", pd.NA]]) - def test_string_dtype(self, data, skipna, klass): + def test_string_dtype(self, data, skipna, klass, nullable_string_dtype): # StringArray - val = klass(data, dtype="string") + val = klass(data, dtype=nullable_string_dtype) inferred = lib.infer_dtype(val, skipna=skipna) assert inferred == "string" diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index a4fedd9a4c5da..d017649d35e87 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -38,6 +38,7 @@ ExtensionDtype, ) from pandas.api.types import is_bool_dtype +from pandas.core.arrays.string_arrow import ArrowStringDtype class JSONDtype(ExtensionDtype): @@ -193,7 +194,7 @@ def astype(self, dtype, copy=True): if copy: return self.copy() return self - elif isinstance(dtype, StringDtype): + elif isinstance(dtype, (StringDtype, ArrowStringDtype)): value = self.astype(str) # numpy doesn'y like nested dicts return dtype.construct_array_type()._from_sequence(value, copy=False) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 161fe7990a327..20c2a25633097 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -567,6 +569,10 @@ def test_astype_empty_dtype_dict(self): "df", [ DataFrame(Series(["x", "y", "z"], dtype="string")), + pytest.param( + DataFrame(Series(["x", "y", "z"], dtype="arrow_string")), + marks=td.skip_if_no("pyarrow", min_version="1.0.0"), + ), DataFrame(Series(["x", "y", "z"], dtype="category")), DataFrame(Series(3 * [Timestamp("2020-01-01", tz="UTC")])), DataFrame(Series(3 * [Interval(0, 1)])), diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index b4d8a53e4b23f..dd91b32c8eb8c 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -381,15 +381,17 @@ def test_combine_first_with_asymmetric_other(self, val): tm.assert_frame_equal(res, exp) - def test_combine_first_string_dtype_only_na(self): + def test_combine_first_string_dtype_only_na(self, nullable_string_dtype): # GH: 37519 - df = DataFrame({"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string") - df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype="string") + df = DataFrame( + {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype + ) + df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype=nullable_string_dtype) df.set_index(["a", "b"], inplace=True) df2.set_index(["a", "b"], inplace=True) result = df.combine_first(df2) expected = DataFrame( - {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string" + {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype ).set_index(["a", "b"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index b76a44b3c86be..a62f2b0426911 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1649,10 +1649,10 @@ def test_constructor_empty_with_string_dtype(self): df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5") tm.assert_frame_equal(df, expected) - def test_constructor_empty_with_string_extension(self): + def test_constructor_empty_with_string_extension(self, nullable_string_dtype): # GH 34915 - expected = DataFrame(index=[], columns=["c1"], dtype="string") - df = DataFrame(columns=["c1"], dtype="string") + expected = DataFrame(index=[], columns=["c1"], dtype=nullable_string_dtype) + df = DataFrame(columns=["c1"], dtype=nullable_string_dtype) tm.assert_frame_equal(df, expected) def test_constructor_single_value(self): diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 65aa189a3e965..30d6436c7e250 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -725,9 +725,9 @@ def test_precision_float_conversion(strrep): (["1", "2", "3.5"], Series([1, 2, 3.5])), ], ) -def test_to_numeric_from_nullable_string(values, expected): +def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected): # https://github.com/pandas-dev/pandas/issues/37262 - s = Series(values, dtype="string") + s = Series(values, dtype=nullable_string_dtype) result = to_numeric(s) tm.assert_series_equal(result, expected) From 98b3a5f13fb082ea00780bb9ddd3135e69706d21 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 29 Mar 2021 16:05:06 +0100 Subject: [PATCH 02/11] revert changes to pandas/tests/frame/methods/test_astype.py --- pandas/tests/frame/methods/test_astype.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 20c2a25633097..161fe7990a327 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,8 +3,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( Categorical, @@ -569,10 +567,6 @@ def test_astype_empty_dtype_dict(self): "df", [ DataFrame(Series(["x", "y", "z"], dtype="string")), - pytest.param( - DataFrame(Series(["x", "y", "z"], dtype="arrow_string")), - marks=td.skip_if_no("pyarrow", min_version="1.0.0"), - ), DataFrame(Series(["x", "y", "z"], dtype="category")), DataFrame(Series(3 * [Timestamp("2020-01-01", tz="UTC")])), DataFrame(Series(3 * [Interval(0, 1)])), From c095cd4d37a6aea8a1a09cf598bb6eb263100c08 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 29 Mar 2021 20:32:08 +0100 Subject: [PATCH 03/11] undo inference change --- pandas/_libs/lib.pyx | 2 +- pandas/tests/dtypes/test_inference.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 9ea10651ac35e..94a4d586b4f13 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1109,7 +1109,7 @@ _TYPE_MAP = { "complex64": "complex", "complex128": "complex", "c": "complex", - str: "string", + "string": "string", "S": "bytes", "U": "string", "bool": "boolean", diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 907991b97ead1..b3c6015475674 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1267,9 +1267,9 @@ def test_interval(self): @pytest.mark.parametrize("klass", [pd.array, Series]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("data", [["a", "b", "c"], ["a", "b", pd.NA]]) - def test_string_dtype(self, data, skipna, klass, nullable_string_dtype): + def test_string_dtype(self, data, skipna, klass): # StringArray - val = klass(data, dtype=nullable_string_dtype) + val = klass(data, dtype="string") inferred = lib.infer_dtype(val, skipna=skipna) assert inferred == "string" From f337018c5adcefaf7a5fadec6b13835125b9937d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 1 Apr 2021 14:11:31 +0100 Subject: [PATCH 04/11] undo unrelated changes --- pandas/conftest.py | 2 ++ pandas/tests/dtypes/test_common.py | 5 +---- pandas/tests/extension/base/casting.py | 6 +++--- pandas/tests/extension/json/array.py | 3 +-- pandas/tests/frame/methods/test_combine_first.py | 10 ++++------ pandas/tests/frame/test_constructors.py | 6 +++--- pandas/tests/tools/test_to_numeric.py | 4 ++-- 7 files changed, 16 insertions(+), 20 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index aa43746d0e7d5..03a6b610acdb8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1146,6 +1146,8 @@ def nullable_string_dtype(request): * 'string' * 'arrow_string' """ + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + return request.param diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 616f46624bfd7..406aec9d4c16e 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -281,10 +281,7 @@ def test_is_string_dtype(): assert com.is_string_dtype(object) assert com.is_string_dtype(np.array(["a", "b"])) assert com.is_string_dtype(pd.StringDtype()) - - -def test_is_string_dtype_nullable(nullable_string_dtype): - assert com.is_string_dtype(pd.array(["a", "b"], dtype=nullable_string_dtype)) + assert com.is_string_dtype(pd.array(["a", "b"], dtype="string")) integer_dtypes: List = [] diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 7c5ef5b3b27d3..47f4f7585243d 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -43,10 +43,10 @@ def test_astype_str(self, data): expected = pd.Series([str(x) for x in data[:5]], dtype=str) self.assert_series_equal(result, expected) - def test_astype_string(self, data): + def test_astype_string(self, data, nullable_string_dtype): # GH-33465 - result = pd.Series(data[:5]).astype("string") - expected = pd.Series([str(x) for x in data[:5]], dtype="string") + result = pd.Series(data[:5]).astype(nullable_string_dtype) + expected = pd.Series([str(x) for x in data[:5]], dtype=nullable_string_dtype) self.assert_series_equal(result, expected) def test_to_numpy(self, data): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index a63c849d25a9f..6c1161294dd17 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -39,7 +39,6 @@ ExtensionDtype, ) from pandas.api.types import is_bool_dtype -from pandas.core.arrays.string_arrow import ArrowStringDtype class JSONDtype(ExtensionDtype): @@ -195,7 +194,7 @@ def astype(self, dtype, copy=True): if copy: return self.copy() return self - elif isinstance(dtype, (StringDtype, ArrowStringDtype)): + elif isinstance(dtype, StringDtype): value = self.astype(str) # numpy doesn'y like nested dicts return dtype.construct_array_type()._from_sequence(value, copy=False) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index dd91b32c8eb8c..b4d8a53e4b23f 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -381,17 +381,15 @@ def test_combine_first_with_asymmetric_other(self, val): tm.assert_frame_equal(res, exp) - def test_combine_first_string_dtype_only_na(self, nullable_string_dtype): + def test_combine_first_string_dtype_only_na(self): # GH: 37519 - df = DataFrame( - {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype - ) - df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype=nullable_string_dtype) + df = DataFrame({"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string") + df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype="string") df.set_index(["a", "b"], inplace=True) df2.set_index(["a", "b"], inplace=True) result = df.combine_first(df2) expected = DataFrame( - {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype + {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string" ).set_index(["a", "b"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 869255505eb74..fc8d82b9e00b2 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1654,10 +1654,10 @@ def test_constructor_empty_with_string_dtype(self): df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5") tm.assert_frame_equal(df, expected) - def test_constructor_empty_with_string_extension(self, nullable_string_dtype): + def test_constructor_empty_with_string_extension(self): # GH 34915 - expected = DataFrame(index=[], columns=["c1"], dtype=nullable_string_dtype) - df = DataFrame(columns=["c1"], dtype=nullable_string_dtype) + expected = DataFrame(index=[], columns=["c1"], dtype="string") + df = DataFrame(columns=["c1"], dtype="string") tm.assert_frame_equal(df, expected) def test_constructor_single_value(self): diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 30d6436c7e250..65aa189a3e965 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -725,9 +725,9 @@ def test_precision_float_conversion(strrep): (["1", "2", "3.5"], Series([1, 2, 3.5])), ], ) -def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected): +def test_to_numeric_from_nullable_string(values, expected): # https://github.com/pandas-dev/pandas/issues/37262 - s = Series(values, dtype=nullable_string_dtype) + s = Series(values, dtype="string") result = to_numeric(s) tm.assert_series_equal(result, expected) From d861895d027ce1859f900fa5899bc4f8f5bf1ed2 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 1 Apr 2021 14:54:34 +0100 Subject: [PATCH 05/11] StringArray --- pandas/core/arrays/string_.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0a2893ac49a49..bb4db32aa6e66 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -306,13 +306,16 @@ def __setitem__(self, key, value): super().__setitem__(key, value) def astype(self, dtype, copy=True): + from pandas.core.arrays.string_arrow import ArrowStringDtype + dtype = pandas_dtype(dtype) if is_dtype_equal(dtype, self.dtype): if copy: return self.copy() return self - + elif isinstance(dtype, ArrowStringDtype): + return ArrowStringDtype.construct_array_type()._from_sequence(self) elif isinstance(dtype, _IntegerDtype): arr = self._ndarray.copy() mask = self.isna() From dd59832feddb3e0376e507b83dadfda81c11c936 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 16 Apr 2021 15:08:13 +0100 Subject: [PATCH 06/11] revert changes to StringArray.astype. fixed in #40450 --- pandas/core/arrays/string_.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index a57c16ec9d7c0..f5bfda60f4b67 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -308,16 +308,12 @@ def __setitem__(self, key, value): super().__setitem__(key, value) def astype(self, dtype, copy=True): - from pandas.core.arrays.string_arrow import ArrowStringDtype - dtype = pandas_dtype(dtype) if is_dtype_equal(dtype, self.dtype): if copy: return self.copy() return self - elif isinstance(dtype, ArrowStringDtype): - return ArrowStringDtype.construct_array_type()._from_sequence(self) elif isinstance(dtype, _IntegerDtype): arr = self._ndarray.copy() mask = self.isna() From 8dddaef13df9a7a42fd2e328e77af5f47909a177 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 17 Apr 2021 10:40:33 +0100 Subject: [PATCH 07/11] test_floating.py::TestCasting::test_astype_string[arrow_string-Float32Dtype] --- pandas/core/arrays/string_arrow.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 52bdcd03d3b49..723f6d1c0a0a3 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -227,10 +227,22 @@ def _chk_pyarrow_available(cls) -> None: @classmethod def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False): + from pandas.core.arrays.masked import BaseMaskedArray + cls._chk_pyarrow_available() - # convert non-na-likes to str, and nan-likes to ArrowStringDtype.na_value - scalars = lib.ensure_string_array(scalars, copy=False) - return cls(pa.array(scalars, type=pa.string(), from_pandas=True)) + + if isinstance(scalars, BaseMaskedArray): + # avoid costly conversion to object dtype in ensure_string_array and + # numerical issues with Float32Dtype + na_values = scalars._mask + result = scalars._data + result = lib.ensure_string_array(result, copy=False, convert_na_value=False) + result[na_values] = ArrowStringDtype.na_value + else: + # convert non-na-likes to str, and nan-likes to StringDtype.na_value + result = lib.ensure_string_array(scalars, copy=False) + + return cls(pa.array(result, type=pa.string(), from_pandas=True)) @classmethod def _from_sequence_of_strings( From 770b018eb129a36a091d3aa6d17ebdcc37bf0362 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 17 Apr 2021 11:11:22 +0100 Subject: [PATCH 08/11] test_interval.py::TestCasting::test_astype_string[arrow_string] --- pandas/core/arrays/interval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 8d3a8feb89d67..50e8cc4c82e0d 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -829,6 +829,7 @@ def astype(self, dtype, copy: bool = True): """ from pandas import Index from pandas.core.arrays.string_ import StringDtype + from pandas.core.arrays.string_arrow import ArrowStringDtype if dtype is not None: dtype = pandas_dtype(dtype) @@ -851,7 +852,7 @@ def astype(self, dtype, copy: bool = True): return self._shallow_copy(new_left, new_right) elif is_categorical_dtype(dtype): return Categorical(np.asarray(self), dtype=dtype) - elif isinstance(dtype, StringDtype): + elif isinstance(dtype, (StringDtype, ArrowStringDtype)): return dtype.construct_array_type()._from_sequence(self, copy=False) # TODO: This try/except will be repeated. From 2d6c835dce6d4ff9adb6064640a0760645527ff9 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 17 Apr 2021 12:01:11 +0100 Subject: [PATCH 09/11] tidy diff --- pandas/core/arrays/string_.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f5bfda60f4b67..600aacec9c87a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -314,6 +314,7 @@ def astype(self, dtype, copy=True): if copy: return self.copy() return self + elif isinstance(dtype, _IntegerDtype): arr = self._ndarray.copy() mask = self.isna() From e211b75cf8dd720b4b6081b1190d7d32038303ce Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 29 Apr 2021 14:35:32 +0100 Subject: [PATCH 10/11] copy=copy --- pandas/core/arrays/string_arrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index c6f6361a58a6c..359dc2de2d1a9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -237,11 +237,11 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) # numerical issues with Float32Dtype na_values = scalars._mask result = scalars._data - result = lib.ensure_string_array(result, copy=False, convert_na_value=False) + result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) result[na_values] = ArrowStringDtype.na_value else: # convert non-na-likes to str, and nan-likes to StringDtype.na_value - result = lib.ensure_string_array(scalars, copy=False) + result = lib.ensure_string_array(scalars, copy=copy) return cls(pa.array(result, type=pa.string(), from_pandas=True)) From 45b93c6cb34b8a4450e9b51c87f6eda04a6c15d9 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 29 Apr 2021 15:05:37 +0100 Subject: [PATCH 11/11] pass mask to pyarrow constructor --- pandas/core/arrays/string_arrow.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 359dc2de2d1a9..1692afbf1fc84 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -238,11 +238,10 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) na_values = scalars._mask result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - result[na_values] = ArrowStringDtype.na_value - else: - # convert non-na-likes to str, and nan-likes to StringDtype.na_value - result = lib.ensure_string_array(scalars, copy=copy) + return cls(pa.array(result, mask=na_values, type=pa.string())) + # convert non-na-likes to str + result = lib.ensure_string_array(scalars, copy=copy) return cls(pa.array(result, type=pa.string(), from_pandas=True)) @classmethod