Skip to content

REF: simplify _sanitize_column #38459

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Dec 14, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 86 additions & 72 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3160,6 +3160,8 @@ def __setitem__(self, key, value):
self._setitem_frame(key, value)
elif isinstance(key, (Series, np.ndarray, list, Index)):
self._setitem_array(key, value)
elif isinstance(value, DataFrame):
self._set_item_frame_value(key, value)
else:
# set column
self._set_item(key, value)
Expand Down Expand Up @@ -3213,15 +3215,47 @@ def _setitem_frame(self, key, value):
self._check_setitem_copy()
self._where(-key, value, inplace=True)

def _set_item_frame_value(self, key, value: "DataFrame") -> None:
    """
    Assign a DataFrame ``value`` to ``self[key]``.

    The right-hand-side frame is first aligned to the target columns (only
    when ``self.columns`` is a MultiIndex and ``key`` selects a sub-frame),
    then aligned to ``self.index``, and finally handed to
    ``_set_item_mgr``.

    Parameters
    ----------
    key : object
        Column label being assigned to.
    value : DataFrame
        Frame whose values are written under ``key``.
    """
    self._ensure_valid_index(value)

    # align right-hand-side columns if self.columns
    # is multi-index and self[key] is a sub-frame
    if isinstance(self.columns, MultiIndex) and key in self.columns:
        loc = self.columns.get_loc(key)
        # a non-scalar location means ``key`` matches several columns
        if isinstance(loc, (slice, Series, np.ndarray, Index)):
            cols = maybe_droplevels(self.columns[loc], key)
            if len(cols) and not cols.equals(value.columns):
                value = value.reindex(cols, axis=1)

    # now align rows
    value = _reindex_for_setitem(value, self.index)
    # NOTE(review): presumably transposed so that each frame column becomes
    # one row of the 2D array passed to the manager -- confirm against
    # ``_set_item_mgr`` callers.
    value = value.T
    self._set_item_mgr(key, value)

def _iset_item_mgr(self, loc: int, value) -> None:
    """
    Set ``value`` at position ``loc`` on the manager, then clear the item cache.

    Parameters
    ----------
    loc : int
        Position passed through to ``self._mgr.iset``.
    value : object
        Values passed through to ``self._mgr.iset`` unchanged.
    """
    self._mgr.iset(loc, value)
    # the cache is cleared after every positional set
    self._clear_item_cache()

def _iset_item(self, loc: int, value, broadcast: bool = False):
def _set_item_mgr(self, key, value):
    """
    Write ``value`` under label ``key``, inserting a new item if needed.

    ``value`` is passed through ``_maybe_atleast_2d`` first. If ``key`` is
    found on ``self._info_axis`` the existing location is overwritten via
    ``_iset_item_mgr``; otherwise the value is inserted at the end.

    Parameters
    ----------
    key : object
        Label to look up on ``self._info_axis``.
    value : object
        Values to store.
    """
    value = _maybe_atleast_2d(value)

    try:
        loc = self._info_axis.get_loc(key)
    except KeyError:
        # This item wasn't present, just insert at end
        self._mgr.insert(len(self._info_axis), key, value)
    else:
        self._iset_item_mgr(loc, value)

    # check if we are modifying a copy
    # try to set first as we want an invalid
    # value exception to occur first
    if len(self):
        self._check_setitem_copy()

# technically _sanitize_column expects a label, not a position,
# but the behavior is the same as long as we pass broadcast=False
value = self._sanitize_column(loc, value, broadcast=broadcast)
def _iset_item(self, loc: int, value):
value = self._sanitize_column(value)
value = _maybe_atleast_2d(value)
self._iset_item_mgr(loc, value)

# check if we are modifying a copy
Expand All @@ -3240,21 +3274,20 @@ def _set_item(self, key, value):
Series/TimeSeries will be conformed to the DataFrames index to
ensure homogeneity.
"""
value = self._sanitize_column(key, value)
value = self._sanitize_column(value)

try:
loc = self._info_axis.get_loc(key)
except KeyError:
# This item wasn't present, just insert at end
self._mgr.insert(len(self._info_axis), key, value)
else:
self._iset_item_mgr(loc, value)
if (
key in self.columns
and value.ndim == 1
and not is_extension_array_dtype(value)
):
# broadcast across multiple columns if necessary
if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
existing_piece = self[key]
if isinstance(existing_piece, DataFrame):
value = np.tile(value, (len(existing_piece.columns), 1))

# check if we are modifying a copy
# try to set first as we want an invalid
# value exception to occur first
if len(self):
self._check_setitem_copy()
self._set_item_mgr(key, value)

def _set_value(self, index, col, value, takeable: bool = False):
"""
Expand Down Expand Up @@ -3788,7 +3821,8 @@ def insert(self, loc, column, value, allow_duplicates: bool = False) -> None:
"Cannot specify 'allow_duplicates=True' when "
"'self.flags.allows_duplicate_labels' is False."
)
value = self._sanitize_column(column, value, broadcast=False)
value = self._sanitize_column(value)
value = _maybe_atleast_2d(value)
self._mgr.insert(loc, column, value, allow_duplicates=allow_duplicates)

def assign(self, **kwargs) -> DataFrame:
Expand Down Expand Up @@ -3859,63 +3893,24 @@ def assign(self, **kwargs) -> DataFrame:
data[k] = com.apply_if_callable(v, data)
return data

def _sanitize_column(self, key, value, broadcast: bool = True):
def _sanitize_column(self, value):
"""
Ensures new columns (which go into the BlockManager as new blocks) are
always copied and converted into an array.

Parameters
----------
key : object
value : scalar, Series, or array-like
broadcast : bool, default True
If ``key`` matches multiple duplicate column names in the
DataFrame, this parameter indicates whether ``value`` should be
tiled so that the returned array contains a (duplicated) column for
each occurrence of the key. If False, ``value`` will not be tiled.

Returns
-------
numpy.ndarray
"""
self._ensure_valid_index(value)

def reindexer(value):
# reindex if necessary

if value.index.equals(self.index) or not len(self.index):
value = value._values.copy()
else:

# GH 4107
try:
value = value.reindex(self.index)._values
except ValueError as err:
# raised in MultiIndex.from_tuples, see test_insert_error_msmgs
if not value.index.is_unique:
# duplicate axis
raise err

# other
raise TypeError(
"incompatible index of inserted column with frame index"
) from err
return value

# We should never get here with DataFrame value
if isinstance(value, Series):
value = reindexer(value)

elif isinstance(value, DataFrame):
# align right-hand-side columns if self.columns
# is multi-index and self[key] is a sub-frame
if isinstance(self.columns, MultiIndex) and key in self.columns:
loc = self.columns.get_loc(key)
if isinstance(loc, (slice, Series, np.ndarray, Index)):
cols = maybe_droplevels(self.columns[loc], key)
if len(cols) and not cols.equals(value.columns):
value = value.reindex(cols, axis=1)
# now align rows
value = reindexer(value).T
value = _reindex_for_setitem(value, self.index)

elif isinstance(value, ExtensionArray):
# Explicitly copy here, instead of in sanitize_index,
Expand Down Expand Up @@ -3946,18 +3941,7 @@ def reindexer(value):
else:
value = construct_1d_arraylike_from_scalar(value, len(self), dtype=None)

# return internal types directly
if is_extension_array_dtype(value):
return value

# broadcast across multiple columns if necessary
if broadcast and key in self.columns and value.ndim == 1:
if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
existing_piece = self[key]
if isinstance(existing_piece, DataFrame):
value = np.tile(value, (len(existing_piece.columns), 1))

return np.atleast_2d(np.asarray(value))
return value

@property
def _series(self):
Expand Down Expand Up @@ -9555,3 +9539,33 @@ def _from_nested_dict(data) -> collections.defaultdict:
for col, v in s.items():
new_data[col][index] = v
return new_data


def _reindex_for_setitem(value, index: Index):
    """
    Return the values of ``value`` conformed to ``index`` for a setitem.

    Parameters
    ----------
    value : Series or DataFrame
        Object whose ``index`` is compared with, and if necessary reindexed
        to, ``index``.
    index : Index
        Target row index.

    Returns
    -------
    array-like
        A copy of ``value._values`` when the two indexes are equal or
        ``index`` is empty; otherwise ``value.reindex(index)._values``.

    Raises
    ------
    ValueError
        Propagated from ``reindex`` when ``value.index`` is not unique.
    TypeError
        If ``reindex`` raises ``ValueError`` although ``value.index`` is
        unique.
    """
    # reindex only if necessary; otherwise just copy the underlying values
    if value.index.equals(index) or not len(index):
        return value._values.copy()

    # GH#4107
    try:
        return value.reindex(index)._values
    except ValueError as err:
        # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
        if not value.index.is_unique:
            # duplicate axis: propagate the original error untouched
            raise

        raise TypeError(
            "incompatible index of inserted column with frame index"
        ) from err


def _maybe_atleast_2d(value):
    """
    Return ``value`` as an at-least-2D ndarray unless it has an extension dtype.

    Parameters
    ----------
    value : object
        Values to convert.

    Returns
    -------
    object
        ``value`` itself when ``is_extension_array_dtype(value)`` is true;
        otherwise ``np.atleast_2d(np.asarray(value))``.
    """
    # Review notes (GH#38459): this helper can likely be used elsewhere; it
    # could be put in core/dtypes/cast, and the most likely place it would be
    # used is in internals.
    # TODO(EA2D): not needed with 2D EAs

    if is_extension_array_dtype(value):
        return value

    return np.atleast_2d(np.asarray(value))