diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index f5e3b1a35acb5..9cdda2d14669c 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1327,6 +1327,7 @@ Sparse Reshaping ^^^^^^^^^ +- Bug in :func:`DataFrame.merge` where referencing a ``CategoricalIndex`` by name in the ``on`` kwarg would raise a ``KeyError`` (:issue:`20777`) - Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`) - Bug in :func:`DataFrame.unstack` which casts int to float if ``columns`` is a ``MultiIndex`` with unused levels (:issue:`17845`) - Bug in :func:`DataFrame.unstack` which raises an error if ``index`` is a ``MultiIndex`` with unused labels on the unstacked level (:issue:`18562`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index a03d892432b51..e8f74cf58a262 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1585,6 +1585,8 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, if is_sparse(arr): arr = arr.get_values() + elif isinstance(arr, (ABCIndexClass, ABCSeries)): + arr = arr.values arr = np.asarray(arr) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 7b1a0875bba59..0204e655bfa2c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -705,8 +705,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): take_right = self.right[name]._values elif left_indexer is not None \ - and isinstance(self.left_join_keys[i], np.ndarray): - + and is_array_like(self.left_join_keys[i]): take_left = self.left_join_keys[i] take_right = self.right_join_keys[i] diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f3827ac251cf0..436fe8f9f5d7e 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1,26 +1,27 @@ # pylint: disable=E1103 -import pytest 
-from datetime import datetime, date -from numpy.random import randn -from numpy import nan -import numpy as np import random import re +from collections import OrderedDict +from datetime import date, datetime + +import numpy as np +import pytest +from numpy import nan +from numpy.random import randn import pandas as pd +import pandas.util.testing as tm +from pandas import (Categorical, CategoricalIndex, DataFrame, DatetimeIndex, + Float64Index, Index, Int64Index, MultiIndex, RangeIndex, + Series, UInt64Index) +from pandas.api.types import CategoricalDtype as CDT from pandas.compat import lrange, lzip +from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.reshape.concat import concat -from pandas.core.reshape.merge import merge, MergeError +from pandas.core.reshape.merge import MergeError, merge from pandas.util.testing import assert_frame_equal, assert_series_equal -from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.common import ( - is_categorical_dtype, - is_object_dtype, -) -from pandas import DataFrame, Index, MultiIndex, Series, Categorical -import pandas.util.testing as tm -from pandas.api.types import CategoricalDtype as CDT N = 50 NGROUPS = 8 @@ -813,7 +814,7 @@ def test_validation(self): # Dups on right right_w_dups = right.append(pd.DataFrame({'a': ['e'], 'c': ['moo']}, - index=[4])) + index=[4])) merge(left, right_w_dups, left_index=True, right_index=True, validate='one_to_many') @@ -1388,17 +1389,24 @@ def test_merge_datetime_index(self, klass): if klass is not None: on_vector = klass(on_vector) - expected = DataFrame({"a": [1, 2, 3]}) - - if klass == np.asarray: - # The join key is added for ndarray. 
- expected["key_1"] = [2016, 2017, 2018] + expected = DataFrame( + OrderedDict([ + ("a", [1, 2, 3]), + ("key_1", [2016, 2017, 2018]), + ]) + ) result = df.merge(df, on=["a", on_vector], how="inner") tm.assert_frame_equal(result, expected) - expected = DataFrame({"a_x": [1, 2, 3], - "a_y": [1, 2, 3]}) + expected = DataFrame( + OrderedDict([ + ("key_0", [2016, 2017, 2018]), + ("a_x", [1, 2, 3]), + ("a_y", [1, 2, 3]), + ]) + ) + result = df.merge(df, on=[df.index.year], how="inner") tm.assert_frame_equal(result, expected) @@ -1427,7 +1435,7 @@ def test_different(self, right_vals): # We allow merging on object and categorical cols and cast # categorical cols to object if (is_categorical_dtype(right['A'].dtype) or - is_object_dtype(right['A'].dtype)): + is_object_dtype(right['A'].dtype)): result = pd.merge(left, right, on='A') assert is_object_dtype(result.A.dtype) @@ -1826,3 +1834,26 @@ def test_merge_on_indexes(self, left_df, right_df, how, sort, expected): how=how, sort=sort) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + 'index', [ + CategoricalIndex(['A', 'B'], categories=['A', 'B'], name='index_col'), + Float64Index([1.0, 2.0], name='index_col'), + Int64Index([1, 2], name='index_col'), + UInt64Index([1, 2], name='index_col'), + RangeIndex(start=0, stop=2, name='index_col'), + DatetimeIndex(["2018-01-01", "2018-01-02"], name='index_col'), + ], ids=lambda x: type(x).__name__) +def test_merge_index_types(index): + # gh-20777 + # assert key access is consistent across index types + left = DataFrame({"left_data": [1, 2]}, index=index) + right = DataFrame({"right_data": [1.0, 2.0]}, index=index) + + result = left.merge(right, on=['index_col']) + + expected = DataFrame( + OrderedDict([('left_data', [1, 2]), ('right_data', [1.0, 2.0])]), + index=index) + assert_frame_equal(result, expected)