-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
Fix passing empty label to df drop #21515
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
35742fc
832a50b
bb80ded
a81c74a
394b384
93bbf05
1b832d0
8119f72
9d5a42f
01f6a9c
13b36c2
d5f67e1
1ffc0d4
454cb7e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3129,7 +3129,7 @@ def _drop_axis(self, labels, axis, level=None, errors='raise'): | |
""" | ||
axis = self._get_axis_number(axis) | ||
axis_name = self._get_axis_name(axis) | ||
axis, axis_ = self._get_axis(axis), axis | ||
axis = self._get_axis(axis) | ||
|
||
if axis.is_unique: | ||
if level is not None: | ||
|
@@ -3138,24 +3138,25 @@ def _drop_axis(self, labels, axis, level=None, errors='raise'): | |
new_axis = axis.drop(labels, level=level, errors=errors) | ||
else: | ||
new_axis = axis.drop(labels, errors=errors) | ||
dropped = self.reindex(**{axis_name: new_axis}) | ||
try: | ||
dropped.axes[axis_].set_names(axis.names, inplace=True) | ||
except AttributeError: | ||
pass | ||
result = dropped | ||
result = self.reindex(**{axis_name: new_axis}) | ||
|
||
# Case for non-unique axis | ||
else: | ||
labels = _ensure_object(com._index_labels_to_array(labels)) | ||
if level is not None: | ||
if not isinstance(axis, MultiIndex): | ||
raise AssertionError('axis must be a MultiIndex') | ||
indexer = ~axis.get_level_values(level).isin(labels) | ||
|
||
# GH 18561 MultiIndex.drop should raise if label is absent | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. blank line before the comment |
||
if errors == 'raise' and indexer.all(): | ||
raise KeyError('{} not found in axis'.format(labels)) | ||
else: | ||
indexer = ~axis.isin(labels) | ||
|
||
if errors == 'raise' and indexer.all(): | ||
raise KeyError('{} not found in axis'.format(labels)) | ||
# Check if label doesn't exist along axis | ||
labels_missing = (axis.get_indexer_for(labels) == -1).any() | ||
if errors == 'raise' and labels_missing: | ||
raise KeyError('{} not found in axis'.format(labels)) | ||
|
||
slicer = [slice(None)] * self.ndim | ||
slicer[self._get_axis_number(axis_name)] = indexer | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1151,3 +1151,24 @@ def test_raise_on_drop_duplicate_index(self, actual): | |
expected_no_err = actual.T.drop('c', axis=1, level=level, | ||
errors='ignore') | ||
assert_frame_equal(expected_no_err.T, actual) | ||
|
||
@pytest.mark.parametrize('index, drop_labels', [ | ||
([1, 2, 3], []), | ||
([1, 1, 2], []), | ||
([1, 2, 3], [2]), | ||
([1, 1, 3], [1]), | ||
]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I still think (see my previous comment) I would rather write this as: @pytest.mark.parametrize('index', [[1, 2, 3], [1, 1, 2]])
@pytest.mark.parametrize('drop_labels', [[], [1]]) Do you have any reason to explictly parametrize the combinations? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree with you, your approach seems more compact. Let me update |
||
def test_drop_empty_list(self, index, drop_labels): | ||
# GH 21494 | ||
expected_index = [i for i in index if i not in drop_labels] | ||
frame = pd.DataFrame(index=index).drop(drop_labels) | ||
tm.assert_frame_equal(frame, pd.DataFrame(index=expected_index)) | ||
|
||
@pytest.mark.parametrize('index, drop_labels', [ | ||
([1, 2, 3], [1, 4]), | ||
([1, 2, 2], [1, 4]), | ||
]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similarly as above, I don't think you need to parametrize @jreback , what do you think? Do we have a general rule concerning making parameter combinations more explicit/more compact? |
||
def test_drop_non_empty_list(self, index, drop_labels): | ||
# GH 21494 | ||
with tm.assert_raises_regex(KeyError, 'not found in axis'): | ||
pd.DataFrame(index=index).drop(drop_labels) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1565,6 +1565,79 @@ def test_drop_tuple(self, values, to_drop): | |
for drop_me in to_drop[1], [to_drop[1]]: | ||
pytest.raises(KeyError, removed.drop, drop_me) | ||
|
||
def test_drop_unique_and_non_unique_index(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. any way to parameterize some of this (e.g. you can break into 2 tests if that makes it easier) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I told you to move them here, but on 2nd thought, there they belong in test_alter_index (where the tests were originally). sorry about that. test_base is for generically testing Index/Series types and not doing that here so kind of superfluous to put the tests here. |
||
# unique | ||
s = Series([1, 2], index=['one', 'two']) | ||
expected = Series([1], index=['one']) | ||
result = s.drop(['two']) | ||
tm.assert_series_equal(result, expected) | ||
result = s.drop('two', axis='rows') | ||
tm.assert_series_equal(result, expected) | ||
|
||
# non-unique | ||
# GH 5248 | ||
s = Series([1, 1, 2], index=['one', 'two', 'one']) | ||
expected = Series([1, 2], index=['one', 'one']) | ||
result = s.drop(['two'], axis=0) | ||
tm.assert_series_equal(result, expected) | ||
result = s.drop('two') | ||
tm.assert_series_equal(result, expected) | ||
|
||
expected = Series([1], index=['two']) | ||
result = s.drop(['one']) | ||
tm.assert_series_equal(result, expected) | ||
result = s.drop('one') | ||
tm.assert_series_equal(result, expected) | ||
|
||
# single string/tuple-like | ||
s = Series(range(3), index=list('abc')) | ||
pytest.raises(KeyError, s.drop, 'bc') | ||
pytest.raises(KeyError, s.drop, ('a',)) | ||
|
||
# errors='ignore' | ||
s = Series(range(3), index=list('abc')) | ||
result = s.drop('bc', errors='ignore') | ||
tm.assert_series_equal(result, s) | ||
result = s.drop(['a', 'd'], errors='ignore') | ||
expected = s.iloc[1:] | ||
tm.assert_series_equal(result, expected) | ||
|
||
# bad axis | ||
pytest.raises(ValueError, s.drop, 'one', axis='columns') | ||
|
||
# GH 8522 | ||
s = Series([2, 3], index=[True, False]) | ||
assert s.index.is_object() | ||
result = s.drop(True) | ||
expected = Series([3], index=[False]) | ||
tm.assert_series_equal(result, expected) | ||
|
||
# GH 16877 | ||
s = Series([2, 3], index=[0, 1]) | ||
with tm.assert_raises_regex(KeyError, 'not found in axis'): | ||
s.drop([False, True]) | ||
|
||
@pytest.mark.parametrize('index, drop_labels', [ | ||
([1, 2, 3], []), | ||
([1, 1, 2], []), | ||
([1, 2, 3], [2]), | ||
([1, 1, 3], [1]), | ||
]) | ||
def test_drop_empty_list(self, index, drop_labels): | ||
# GH 21494 | ||
expected_index = [i for i in index if i not in drop_labels] | ||
series = pd.Series(index=index).drop(drop_labels) | ||
tm.assert_series_equal(series, pd.Series(index=expected_index)) | ||
|
||
@pytest.mark.parametrize('index, drop_labels', [ | ||
([1, 2, 3], [1, 4]), | ||
([1, 2, 2], [1, 4]), | ||
]) | ||
def test_drop_non_empty_list(self, index, drop_labels): | ||
# GH 21494 | ||
with tm.assert_raises_regex(KeyError, 'not found in axis'): | ||
pd.Series(index=index).drop(drop_labels) | ||
|
||
@pytest.mark.parametrize("method,expected", [ | ||
('intersection', np.array([(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B')], | ||
dtype=[('num', int), ('let', 'a1')])), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
comment to indicate this is the non-unique case