Skip to content

REF: re-use sanitize_array in DataFrame._sanitize_column #41611

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,8 @@ def sanitize_array(
dtype: DtypeObj | None = None,
copy: bool = False,
raise_cast_failure: bool = True,
*,
allow_2d: bool = False,
) -> ArrayLike:
"""
Sanitize input data to an ndarray or ExtensionArray, copy if specified,
Expand All @@ -479,6 +481,8 @@ def sanitize_array(
dtype : np.dtype, ExtensionDtype, or None, default None
copy : bool, default False
raise_cast_failure : bool, default True
allow_2d : bool, default False
If False, raise if we have a 2D Arraylike.

Returns
-------
Expand Down Expand Up @@ -552,7 +556,7 @@ def sanitize_array(
# "ExtensionArray")
subarr = maybe_cast_to_datetime(subarr, dtype) # type: ignore[assignment]

subarr = _sanitize_ndim(subarr, data, dtype, index)
subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)

if not (
isinstance(subarr.dtype, ExtensionDtype) or isinstance(dtype, ExtensionDtype)
Expand All @@ -570,7 +574,12 @@ def sanitize_array(


def _sanitize_ndim(
result: ArrayLike, data, dtype: DtypeObj | None, index: Index | None
result: ArrayLike,
data,
dtype: DtypeObj | None,
index: Index | None,
*,
allow_2d: bool = False,
) -> ArrayLike:
"""
Ensure we have a 1-dimensional result array.
Expand All @@ -584,6 +593,8 @@ def _sanitize_ndim(

elif result.ndim > 1:
if isinstance(data, np.ndarray):
if allow_2d:
return result
raise ValueError("Data must be 1-dimensional")
if is_object_dtype(dtype) and isinstance(dtype, ExtensionDtype):
# i.e. PandasDtype("O")
Expand Down
31 changes: 3 additions & 28 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@
infer_dtype_from_scalar,
invalidate_string_dtypes,
maybe_box_native,
maybe_convert_platform,
maybe_downcast_to_dtype,
validate_numeric_casting,
)
Expand Down Expand Up @@ -4498,35 +4497,11 @@ def _sanitize_column(self, value) -> ArrayLike:

# We should never get here with DataFrame value
if isinstance(value, Series):
value = _reindex_for_setitem(value, self.index)
return _reindex_for_setitem(value, self.index)

elif isinstance(value, ExtensionArray):
# Explicitly copy here
value = value.copy()
if is_list_like(value):
com.require_length_match(value, self.index)

elif is_sequence(value):
com.require_length_match(value, self.index)

# turn me into an ndarray
if not isinstance(value, (np.ndarray, Index)):
if isinstance(value, list) and len(value) > 0:
value = maybe_convert_platform(value)
else:
value = com.asarray_tuplesafe(value)
elif isinstance(value, Index):
value = value.copy(deep=True)._values
else:
value = value.copy()

# possibly infer to datetimelike
if is_object_dtype(value.dtype):
value = sanitize_array(value, None)

else:
value = construct_1d_arraylike_from_scalar(value, len(self), dtype=None)

return value
return sanitize_array(value, self.index, copy=True, allow_2d=True)

@property
def _series(self):
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/extension/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,25 @@ def _can_hold_element_patched(obj, element) -> bool:
return can_hold_element(obj, element)


orig_assert_attr_equal = tm.assert_attr_equal


def _assert_attr_equal(attr: str, left, right, obj: str = "Attributes"):
    """
    Stand-in for tm.assert_attr_equal that treats PandasDtype("object") as
    close enough to np.dtype("object").

    When the compared attribute is "dtype" and exactly one of ``left`` /
    ``right`` carries a PandasDtype, that side is cast to the wrapped numpy
    dtype before delegating to the original assertion.
    """
    if attr == "dtype":
        left_dtype = getattr(left, "dtype", None)
        right_dtype = getattr(right, "dtype", None)
        left_wrapped = isinstance(left_dtype, PandasDtype)
        right_wrapped = isinstance(right_dtype, PandasDtype)

        # Only unwrap when the two sides disagree on being PandasDtype-backed.
        if left_wrapped and not right_wrapped:
            left = left.astype(left_dtype.numpy_dtype)
        elif right_wrapped and not left_wrapped:
            right = right.astype(right_dtype.numpy_dtype)

    orig_assert_attr_equal(attr, left, right, obj)


@pytest.fixture(params=["float", "object"])
def dtype(request):
    """Return a PandasDtype wrapping the numpy dtype named by the fixture param."""
    numpy_dtype = np.dtype(request.param)
    return PandasDtype(numpy_dtype)
Expand All @@ -81,6 +100,7 @@ def allow_in_pandas(monkeypatch):
m.setattr(PandasArray, "_typ", "extension")
m.setattr(managers, "_extract_array", _extract_array_patched)
m.setattr(blocks, "can_hold_element", _can_hold_element_patched)
m.setattr(tm.asserters, "assert_attr_equal", _assert_attr_equal)
yield


Expand Down
56 changes: 25 additions & 31 deletions pandas/tests/indexing/test_partial.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,58 +386,51 @@ def test_partial_set_empty_frame(self):
with pytest.raises(ValueError, match=msg):
df.loc[:, 1] = 1

def test_partial_set_empty_frame2(self):
# these work as they don't really change
# anything but the index
# GH5632
expected = DataFrame(columns=["foo"], index=Index([], dtype="object"))

def f():
df = DataFrame(index=Index([], dtype="object"))
df["foo"] = Series([], dtype="object")
return df
df = DataFrame(index=Index([], dtype="object"))
df["foo"] = Series([], dtype="object")

tm.assert_frame_equal(f(), expected)
tm.assert_frame_equal(df, expected)

def f():
df = DataFrame()
df["foo"] = Series(df.index)
return df
df = DataFrame()
df["foo"] = Series(df.index)

tm.assert_frame_equal(f(), expected)
tm.assert_frame_equal(df, expected)

def f():
df = DataFrame()
df["foo"] = df.index
return df
df = DataFrame()
df["foo"] = df.index

tm.assert_frame_equal(f(), expected)
tm.assert_frame_equal(df, expected)

def test_partial_set_empty_frame3(self):
expected = DataFrame(columns=["foo"], index=Index([], dtype="int64"))
expected["foo"] = expected["foo"].astype("float64")

def f():
df = DataFrame(index=Index([], dtype="int64"))
df["foo"] = []
return df
df = DataFrame(index=Index([], dtype="int64"))
df["foo"] = []

tm.assert_frame_equal(f(), expected)
tm.assert_frame_equal(df, expected)

def f():
df = DataFrame(index=Index([], dtype="int64"))
df["foo"] = Series(np.arange(len(df)), dtype="float64")
return df
df = DataFrame(index=Index([], dtype="int64"))
df["foo"] = Series(np.arange(len(df)), dtype="float64")

tm.assert_frame_equal(f(), expected)
tm.assert_frame_equal(df, expected)

def f():
df = DataFrame(index=Index([], dtype="int64"))
df["foo"] = range(len(df))
return df
def test_partial_set_empty_frame4(self):
df = DataFrame(index=Index([], dtype="int64"))
df["foo"] = range(len(df))

expected = DataFrame(columns=["foo"], index=Index([], dtype="int64"))
expected["foo"] = expected["foo"].astype("float64")
tm.assert_frame_equal(f(), expected)
# range is int-dtype-like, so we get int64 dtype
expected["foo"] = expected["foo"].astype("int64")
tm.assert_frame_equal(df, expected)

def test_partial_set_empty_frame5(self):
df = DataFrame()
tm.assert_index_equal(df.columns, Index([], dtype=object))
df2 = DataFrame()
Expand All @@ -446,6 +439,7 @@ def f():
tm.assert_frame_equal(df, DataFrame([[1]], index=["foo"], columns=[1]))
tm.assert_frame_equal(df, df2)

def test_partial_set_empty_frame_no_index(self):
# no index to start
expected = DataFrame({0: Series(1, index=range(4))}, columns=["A", "B", 0])

Expand Down