Skip to content

CLN: pass dtype to from_sequence explicitly #56506

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits on Dec 14, 2023 (source and target branch names are missing from this capture)
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -1071,7 +1071,7 @@ def shift(self, periods: int = 1, fill_value: object = None) -> IntervalArray:
fill_value = Index(self._left, copy=False)._na_value
empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1))
else:
empty = self._from_sequence([fill_value] * empty_len)
empty = self._from_sequence([fill_value] * empty_len, dtype=self.dtype)

if periods > 0:
a = empty
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def __len__(self) -> int:
return len(self._pa_array)

@classmethod
def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
from pandas.core.arrays.masked import BaseMaskedArray

_chk_pyarrow_available()
Expand Down
29 changes: 15 additions & 14 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@
ensure_object,
is_bool,
is_bool_dtype,
is_extension_array_dtype,
is_float_dtype,
is_integer,
is_integer_dtype,
Expand Down Expand Up @@ -1385,20 +1384,22 @@ def _maybe_coerce_merge_keys(self) -> None:
if lk.dtype.kind == rk.dtype.kind:
continue

if is_extension_array_dtype(lk.dtype) and not is_extension_array_dtype(
rk.dtype
if isinstance(lk.dtype, ExtensionDtype) and not isinstance(
rk.dtype, ExtensionDtype
):
ct = find_common_type([lk.dtype, rk.dtype])
if is_extension_array_dtype(ct):
rk = ct.construct_array_type()._from_sequence(rk) # type: ignore[union-attr]
if isinstance(ct, ExtensionDtype):
com_cls = ct.construct_array_type()
rk = com_cls._from_sequence(rk, dtype=ct, copy=False)
else:
rk = rk.astype(ct) # type: ignore[arg-type]
elif is_extension_array_dtype(rk.dtype):
rk = rk.astype(ct)
elif isinstance(rk.dtype, ExtensionDtype):
ct = find_common_type([lk.dtype, rk.dtype])
if is_extension_array_dtype(ct):
lk = ct.construct_array_type()._from_sequence(lk) # type: ignore[union-attr]
if isinstance(ct, ExtensionDtype):
com_cls = ct.construct_array_type()
lk = com_cls._from_sequence(lk, dtype=ct, copy=False)
else:
lk = lk.astype(ct) # type: ignore[arg-type]
lk = lk.astype(ct)

# check whether ints and floats
if is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype):
Expand Down Expand Up @@ -2500,15 +2501,15 @@ def _convert_arrays_and_get_rizer_klass(
if not isinstance(lk, ExtensionArray):
lk = cls._from_sequence(lk, dtype=dtype, copy=False)
else:
lk = lk.astype(dtype)
lk = lk.astype(dtype, copy=False)

if not isinstance(rk, ExtensionArray):
rk = cls._from_sequence(rk, dtype=dtype, copy=False)
else:
rk = rk.astype(dtype)
rk = rk.astype(dtype, copy=False)
else:
lk = lk.astype(dtype)
rk = rk.astype(dtype)
lk = lk.astype(dtype, copy=False)
rk = rk.astype(dtype, copy=False)
if isinstance(lk, BaseMaskedArray):
# Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]";
# expected type "Type[object]"
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/arrays/boolean/test_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,8 @@ def test_coerce_to_numpy_array():

def test_to_boolean_array_from_strings():
result = BooleanArray._from_sequence_of_strings(
np.array(["True", "False", "1", "1.0", "0", "0.0", np.nan], dtype=object)
np.array(["True", "False", "1", "1.0", "0", "0.0", np.nan], dtype=object),
dtype="boolean",
)
expected = BooleanArray(
np.array([True, False, True, True, False, False, False]),
Expand All @@ -254,7 +255,7 @@ def test_to_boolean_array_from_strings():

def test_to_boolean_array_from_strings_invalid_string():
with pytest.raises(ValueError, match="cannot be cast"):
BooleanArray._from_sequence_of_strings(["donkey"])
BooleanArray._from_sequence_of_strings(["donkey"], dtype="boolean")


@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/categorical/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -755,12 +755,12 @@ def test_categorical_extension_array_nullable(self, nulls_fixture):

def test_from_sequence_copy(self):
cat = Categorical(np.arange(5).repeat(2))
result = Categorical._from_sequence(cat, dtype=None, copy=False)
result = Categorical._from_sequence(cat, dtype=cat.dtype, copy=False)

# more generally, we'd be OK with a view
assert result._codes is cat._codes

result = Categorical._from_sequence(cat, dtype=None, copy=True)
result = Categorical._from_sequence(cat, dtype=cat.dtype, copy=True)

assert not tm.shares_memory(result, cat)

Expand Down
1 change: 1 addition & 0 deletions pandas/tests/arrays/datetimes/test_cumulative.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def test_accumulators_freq(self):
"2000-01-02",
"2000-01-03",
],
dtype="M8[ns]",
)
tm.assert_datetime_array_equal(result, expected)

Expand Down
14 changes: 8 additions & 6 deletions pandas/tests/arrays/integer/test_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,32 +175,34 @@ def test_to_integer_array_dtype_keyword(constructor):


def test_to_integer_array_float():
result = IntegerArray._from_sequence([1.0, 2.0])
result = IntegerArray._from_sequence([1.0, 2.0], dtype="Int64")
expected = pd.array([1, 2], dtype="Int64")
tm.assert_extension_array_equal(result, expected)

with pytest.raises(TypeError, match="cannot safely cast non-equivalent"):
IntegerArray._from_sequence([1.5, 2.0])
IntegerArray._from_sequence([1.5, 2.0], dtype="Int64")

# for float dtypes, the itemsize is not preserved
result = IntegerArray._from_sequence(np.array([1.0, 2.0], dtype="float32"))
result = IntegerArray._from_sequence(
np.array([1.0, 2.0], dtype="float32"), dtype="Int64"
)
assert result.dtype == Int64Dtype()


def test_to_integer_array_str():
result = IntegerArray._from_sequence(["1", "2", None])
result = IntegerArray._from_sequence(["1", "2", None], dtype="Int64")
expected = pd.array([1, 2, np.nan], dtype="Int64")
tm.assert_extension_array_equal(result, expected)

with pytest.raises(
ValueError, match=r"invalid literal for int\(\) with base 10: .*"
):
IntegerArray._from_sequence(["1", "2", ""])
IntegerArray._from_sequence(["1", "2", ""], dtype="Int64")

with pytest.raises(
ValueError, match=r"invalid literal for int\(\) with base 10: .*"
):
IntegerArray._from_sequence(["1.5", "2.0"])
IntegerArray._from_sequence(["1.5", "2.0"], dtype="Int64")


@pytest.mark.parametrize(
Expand Down
53 changes: 32 additions & 21 deletions pandas/tests/arrays/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,11 @@ def test_dt64_array(dtype_unit):
None,
NumpyExtensionArray(np.array([], dtype=object)),
),
(np.array([1, 2], dtype="int64"), None, IntegerArray._from_sequence([1, 2])),
(
np.array([1, 2], dtype="int64"),
None,
IntegerArray._from_sequence([1, 2], dtype="Int64"),
),
(
np.array([1.0, 2.0], dtype="float64"),
None,
Expand Down Expand Up @@ -284,7 +288,7 @@ def test_array_copy():
# datetime
(
[pd.Timestamp("2000"), pd.Timestamp("2001")],
DatetimeArray._from_sequence(["2000", "2001"]),
DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
),
(
[datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)],
Expand Down Expand Up @@ -319,7 +323,7 @@ def test_array_copy():
# timedelta
(
[pd.Timedelta("1h"), pd.Timedelta("2h")],
TimedeltaArray._from_sequence(["1h", "2h"]),
TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
),
(
np.array([1, 2], dtype="m8[ns]"),
Expand All @@ -330,35 +334,42 @@ def test_array_copy():
TimedeltaArray(np.array([1, 2], dtype="m8[us]")),
),
# integer
([1, 2], IntegerArray._from_sequence([1, 2])),
([1, None], IntegerArray._from_sequence([1, None])),
([1, pd.NA], IntegerArray._from_sequence([1, pd.NA])),
([1, np.nan], IntegerArray._from_sequence([1, np.nan])),
([1, 2], IntegerArray._from_sequence([1, 2], dtype="Int64")),
([1, None], IntegerArray._from_sequence([1, None], dtype="Int64")),
([1, pd.NA], IntegerArray._from_sequence([1, pd.NA], dtype="Int64")),
([1, np.nan], IntegerArray._from_sequence([1, np.nan], dtype="Int64")),
# float
([0.1, 0.2], FloatingArray._from_sequence([0.1, 0.2])),
([0.1, None], FloatingArray._from_sequence([0.1, pd.NA])),
([0.1, np.nan], FloatingArray._from_sequence([0.1, pd.NA])),
([0.1, pd.NA], FloatingArray._from_sequence([0.1, pd.NA])),
([0.1, 0.2], FloatingArray._from_sequence([0.1, 0.2], dtype="Float64")),
([0.1, None], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
([0.1, np.nan], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
([0.1, pd.NA], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
# integer-like float
([1.0, 2.0], FloatingArray._from_sequence([1.0, 2.0])),
([1.0, None], FloatingArray._from_sequence([1.0, pd.NA])),
([1.0, np.nan], FloatingArray._from_sequence([1.0, pd.NA])),
([1.0, pd.NA], FloatingArray._from_sequence([1.0, pd.NA])),
([1.0, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
([1.0, None], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
([1.0, np.nan], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
([1.0, pd.NA], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
# mixed-integer-float
([1, 2.0], FloatingArray._from_sequence([1.0, 2.0])),
([1, np.nan, 2.0], FloatingArray._from_sequence([1.0, None, 2.0])),
([1, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
(
[1, np.nan, 2.0],
FloatingArray._from_sequence([1.0, None, 2.0], dtype="Float64"),
),
# string
(
["a", "b"],
pd.StringDtype().construct_array_type()._from_sequence(["a", "b"]),
pd.StringDtype()
.construct_array_type()
._from_sequence(["a", "b"], dtype=pd.StringDtype()),
),
(
["a", None],
pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
pd.StringDtype()
.construct_array_type()
._from_sequence(["a", None], dtype=pd.StringDtype()),
),
# Boolean
([True, False], BooleanArray._from_sequence([True, False])),
([True, None], BooleanArray._from_sequence([True, None])),
([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")),
([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")),
],
)
def test_array_inference(data, expected):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/base/constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def test_from_sequence_from_cls(self, data):

def test_array_from_scalars(self, data):
scalars = [data[0], data[1], data[2]]
result = data._from_sequence(scalars)
result = data._from_sequence(scalars, dtype=data.dtype)
assert isinstance(result, type(data))

def test_series_constructor(self, data):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ def test_duplicated(self, data, keep):
@pytest.mark.parametrize("box", [pd.Series, lambda x: x])
@pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
def test_unique(self, data, box, method):
duplicated = box(data._from_sequence([data[0], data[0]]))
duplicated = box(data._from_sequence([data[0], data[0]], dtype=data.dtype))

result = method(duplicated)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/list/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def __init__(self, values, dtype=None, copy=False) -> None:
self.data = values

@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
def _from_sequence(cls, scalars, *, dtype=None, copy=False):
data = np.empty(len(scalars), dtype=object)
data[:] = scalars
return cls(data)
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,11 +294,13 @@ def test_from_dtype(self, data, request):
def test_from_sequence_pa_array(self, data):
# https://github.com/pandas-dev/pandas/pull/47034#discussion_r955500784
# data._pa_array = pa.ChunkedArray
result = type(data)._from_sequence(data._pa_array)
result = type(data)._from_sequence(data._pa_array, dtype=data.dtype)
tm.assert_extension_array_equal(result, data)
assert isinstance(result._pa_array, pa.ChunkedArray)

result = type(data)._from_sequence(data._pa_array.combine_chunks())
result = type(data)._from_sequence(
data._pa_array.combine_chunks(), dtype=data.dtype
)
tm.assert_extension_array_equal(result, data)
assert isinstance(result._pa_array, pa.ChunkedArray)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/timedeltas/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def test_array_of_dt64_nat_raises(self):
TimedeltaIndex(arr)

with pytest.raises(TypeError, match=msg):
TimedeltaArray._from_sequence(arr)
TimedeltaArray._from_sequence(arr, dtype="m8[ns]")

with pytest.raises(TypeError, match=msg):
to_timedelta(arr)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/tools/test_to_timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def test_to_timedelta_oob_non_nano(self):
TimedeltaIndex(arr)

with pytest.raises(OutOfBoundsTimedelta, match=msg):
TimedeltaArray._from_sequence(arr)
TimedeltaArray._from_sequence(arr, dtype="m8[s]")

@pytest.mark.parametrize(
"arg", [np.arange(10).reshape(2, 5), pd.DataFrame(np.arange(10).reshape(2, 5))]
Expand Down