Skip to content

BUG: Fix KeyError in merge on CategoricalIndex #20777

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1327,6 +1327,7 @@ Sparse
Reshaping
^^^^^^^^^

- Bug in :func:`DataFrame.merge` where referencing a ``CategoricalIndex`` by name in the ``on`` kwarg would raise a ``KeyError`` (:issue:`20777`)
- Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`)
- Bug in :func:`DataFrame.unstack` which casts int to float if ``columns`` is a ``MultiIndex`` with unused levels (:issue:`17845`)
- Bug in :func:`DataFrame.unstack` which raises an error if ``index`` is a ``MultiIndex`` with unused labels on the unstacked level (:issue:`18562`)
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1585,6 +1585,8 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,

if is_sparse(arr):
arr = arr.get_values()
elif isinstance(arr, (ABCIndexClass, ABCSeries)):
arr = arr.values

arr = np.asarray(arr)

Expand Down
3 changes: 1 addition & 2 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -705,8 +705,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
take_right = self.right[name]._values

elif left_indexer is not None \
and isinstance(self.left_join_keys[i], np.ndarray):

and is_array_like(self.left_join_keys[i]):
take_left = self.left_join_keys[i]
take_right = self.right_join_keys[i]

Expand Down
77 changes: 54 additions & 23 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,27 @@
# pylint: disable=E1103

import pytest
from datetime import datetime, date
from numpy.random import randn
from numpy import nan
import numpy as np
import random
import re
from collections import OrderedDict
from datetime import date, datetime

import numpy as np
import pytest
from numpy import nan
from numpy.random import randn

import pandas as pd
import pandas.util.testing as tm
from pandas import (Categorical, CategoricalIndex, DataFrame, DatetimeIndex,
Float64Index, Index, Int64Index, MultiIndex, RangeIndex,
Series, UInt64Index)
from pandas.api.types import CategoricalDtype as CDT
from pandas.compat import lrange, lzip
from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import merge, MergeError
from pandas.core.reshape.merge import MergeError, merge
from pandas.util.testing import assert_frame_equal, assert_series_equal
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_object_dtype,
)
from pandas import DataFrame, Index, MultiIndex, Series, Categorical
import pandas.util.testing as tm
from pandas.api.types import CategoricalDtype as CDT

N = 50
NGROUPS = 8
Expand Down Expand Up @@ -813,7 +814,7 @@ def test_validation(self):

# Dups on right
right_w_dups = right.append(pd.DataFrame({'a': ['e'], 'c': ['moo']},
index=[4]))
index=[4]))
merge(left, right_w_dups, left_index=True, right_index=True,
validate='one_to_many')

Expand Down Expand Up @@ -1388,17 +1389,24 @@ def test_merge_datetime_index(self, klass):
if klass is not None:
on_vector = klass(on_vector)

expected = DataFrame({"a": [1, 2, 3]})

if klass == np.asarray:
# The join key is added for ndarray.
expected["key_1"] = [2016, 2017, 2018]
expected = DataFrame(
OrderedDict([
("a", [1, 2, 3]),
("key_1", [2016, 2017, 2018]),
])
)

result = df.merge(df, on=["a", on_vector], how="inner")
tm.assert_frame_equal(result, expected)

expected = DataFrame({"a_x": [1, 2, 3],
"a_y": [1, 2, 3]})
expected = DataFrame(
OrderedDict([
("key_0", [2016, 2017, 2018]),
("a_x", [1, 2, 3]),
("a_y", [1, 2, 3]),
])
)

result = df.merge(df, on=[df.index.year], how="inner")
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -1427,7 +1435,7 @@ def test_different(self, right_vals):
# We allow merging on object and categorical cols and cast
# categorical cols to object
if (is_categorical_dtype(right['A'].dtype) or
is_object_dtype(right['A'].dtype)):
is_object_dtype(right['A'].dtype)):
result = pd.merge(left, right, on='A')
assert is_object_dtype(result.A.dtype)

Expand Down Expand Up @@ -1826,3 +1834,26 @@ def test_merge_on_indexes(self, left_df, right_df, how, sort, expected):
how=how,
sort=sort)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    'index',
    [
        CategoricalIndex(['A', 'B'], categories=['A', 'B'],
                         name='index_col'),
        Float64Index([1.0, 2.0], name='index_col'),
        Int64Index([1, 2], name='index_col'),
        UInt64Index([1, 2], name='index_col'),
        RangeIndex(start=0, stop=2, name='index_col'),
        DatetimeIndex(["2018-01-01", "2018-01-02"], name='index_col'),
    ],
    ids=lambda x: type(x).__name__)
def test_merge_index_types(index):
    """Merge two frames on the name of their shared index.

    For each parametrized index type, both frames carry the same index
    named ``index_col``. Merging with ``on=['index_col']`` is expected to
    return the left and right data columns, in that order, with the
    original index as the result's index.
    """
    # gh-20777
    # key access by index name should be consistent across index types
    lhs = DataFrame({"left_data": [1, 2]}, index=index)
    rhs = DataFrame({"right_data": [1.0, 2.0]}, index=index)

    # Column order matters for the frame comparison, hence OrderedDict.
    wanted_columns = OrderedDict()
    wanted_columns['left_data'] = [1, 2]
    wanted_columns['right_data'] = [1.0, 2.0]
    wanted = DataFrame(wanted_columns, index=index)

    merged = lhs.merge(rhs, on=['index_col'])
    assert_frame_equal(merged, wanted)