From 1e0539d7b155099930324b1fbe445ccb6dc7dc02 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 21 Oct 2018 22:20:27 +0800 Subject: [PATCH 01/14] fix bug #GH23020 --- doc/source/whatsnew/v0.24.0.rst | 6 ++++++ pandas/core/reshape/merge.py | 6 +++++- pandas/tests/reshape/merge/test_merge.py | 10 ++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index db1758199be0d..0e1010fa903af 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1635,6 +1635,12 @@ Sparse - Bug in :meth:`SparseArray.nonzero` and :meth:`SparseDataFrame.dropna` returning shifted/incorrect results (:issue:`21172`) - Bug in :meth:`DataFrame.apply` where dtypes would lose sparseness (:issue:`23744`) +IntegerArray +^^^^^^^^^^^^ + +- Bug in :func:`pandas.merge` when merging on an Integer extension array (:issue:`23020`) + + Build Changes ^^^^^^^^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e6e6c1c99b509..e65d330be6c5c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -19,7 +19,7 @@ is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, is_float_dtype, is_int64_dtype, is_integer, is_integer_dtype, is_list_like, is_number, - is_numeric_dtype, needs_i8_conversion) + is_numeric_dtype, needs_i8_conversion, is_extension_array_dtype) from pandas.core.dtypes.missing import isnull, na_value_for_dtype from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta @@ -1607,6 +1607,10 @@ def _factorize_keys(lk, rk, sort=True): lk = ensure_int64(lk.codes) rk = ensure_int64(rk) + elif is_extension_array_dtype(lk) and is_extension_array_dtype(rk): + klass = libhashtable.Factorizer + lk = ensure_object(lk) + rk = ensure_object(rk) elif is_integer_dtype(lk) and is_integer_dtype(rk): # GH#23917 TODO: needs tests for case where lk is integer-dtype # and rk is datetime-dtype diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 94e180f9328d6..15cbe78a08f71 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1326,6 +1326,16 @@ def test_merging_with_bool_or_int_cateorical_column(self, category_column, CDT(categories, ordered=ordered)) assert_frame_equal(expected, result) + def test_merge_on_int_array(self): + # GH 23020 + df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), + 'B': 1}) + result = pd.merge(df, df, on='A') + expected = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), + 'B_x': 1, + 'B_y': 1}) + assert_frame_equal(result, expected, check_dtype=True) + @pytest.fixture def left_df(): From 931036168a827fd7e4aa48fa343d6c55435d0d58 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 4 Nov 2018 20:26:01 +0800 Subject: [PATCH 02/14] change after review --- pandas/core/reshape/merge.py | 9 +++++---- pandas/tests/extension/base/reshaping.py | 8 ++++++++ pandas/tests/extension/conftest.py | 8 ++++++++ pandas/tests/reshape/merge/test_merge.py | 10 ---------- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e65d330be6c5c..9655029d7507d 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1607,7 +1607,9 @@ def _factorize_keys(lk, rk, sort=True): lk = ensure_int64(lk.codes) rk = ensure_int64(rk) - elif is_extension_array_dtype(lk) and is_extension_array_dtype(rk): + elif (is_extension_array_dtype(lk) and + is_extension_array_dtype(rk) and + lk.dtype == rk.dtype): klass = libhashtable.Factorizer lk = ensure_object(lk) rk = ensure_object(rk) @@ -1620,9 +1622,8 @@ def _factorize_keys(lk, rk, sort=True): elif (issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and issubclass(rk.dtype.type, (np.timedelta64, np.datetime64))): # GH#23917 TODO: Needs tests for non-matching dtypes - klass = libhashtable.Int64Factorizer - lk = ensure_int64(com.values_from_object(lk)) - rk = ensure_int64(com.values_from_object(rk)) + lk, _ = lk._values_for_factorize() + rk, _ = rk._values_for_factorize() else: klass = libhashtable.Factorizer lk = ensure_object(lk) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 42e481d974295..adcc34798bd41 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -237,3 +237,11 @@ def test_unstack(self, data, index, obj): result = result.astype(object) self.assert_frame_equal(result, expected) + + def test_merge_on_int_array(self, df_merge_on_int_array): + # GH 23020 + result = pd.merge(df_merge_on_int_array, df_merge_on_int_array, on='A') + expected = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), + 'B_x': 1, + 'B_y': 1}) + self.assert_frame_equal(result, expected, check_dtype=True) diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 5349dd919f2a2..24c13dfcc1424 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -1,3 +1,5 @@ +import pandas as pd +import numpy as np import operator import pytest @@ -108,3 +110,9 @@ def data_for_grouping(): def box_in_series(request): """Whether to box the data in a Series""" return request.param + + +@pytest.fixture +def df_merge_on_int_array(): + return pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), + 'B': 1}) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 15cbe78a08f71..94e180f9328d6 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1326,16 +1326,6 @@ def test_merging_with_bool_or_int_cateorical_column(self, category_column, CDT(categories, ordered=ordered)) assert_frame_equal(expected, result) - def test_merge_on_int_array(self): - # GH 23020 - df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), - 'B': 1}) - result = pd.merge(df, df, on='A') - expected = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), - 'B_x': 1, - 'B_y': 1}) - assert_frame_equal(result, expected, check_dtype=True) - @pytest.fixture def left_df(): From cb2d4d1e969cc8ba8c48c45f9f84e0211457e81f Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 11 Nov 2018 18:17:23 +0800 Subject: [PATCH 03/14] 2nd change after review --- doc/source/whatsnew/v0.24.0.rst | 7 +------ pandas/tests/extension/base/reshaping.py | 22 ++++++++++++++-------- pandas/tests/extension/conftest.py | 8 -------- 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 0e1010fa903af..fae6a84770b59 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1041,6 +1041,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`). - Bug when grouping :meth:`Dataframe.groupby()` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`). - A default repr for :class:`ExtensionArray` is now provided (:issue:`23601`). +- Bug in :func:`pandas.merge` when merging on an Integer extension array (:issue:`23020`) .. _whatsnew_0240.api.incompatibilities: @@ -1635,12 +1636,6 @@ Sparse - Bug in :meth:`SparseArray.nonzero` and :meth:`SparseDataFrame.dropna` returning shifted/incorrect results (:issue:`21172`) - Bug in :meth:`DataFrame.apply` where dtypes would lose sparseness (:issue:`23744`) -IntegerArray -^^^^^^^^^^^^ - -- Bug in :func:`pandas.merge` when merging on an Integer extension array (:issue:`23020`) - - Build Changes ^^^^^^^^^^^^^ diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index adcc34798bd41..c850ae162f6b3 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -173,6 +173,20 @@ def test_merge(self, data, na_value): dtype=data.dtype)}) self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) + # GH 23020 + df1 = pd.DataFrame({'ext': data[:3], + 'key': pd.Series([1, 2, np.nan], dtype='Int64')}) + + res = pd.merge(df1, df1, on='key') + + exp = pd.DataFrame( + {'key': pd.Series([1, 2, np.nan], dtype='Int64'), + 'ext_x': data._from_sequence(data[:3], dtype=data.dtype), + 'ext_y': data._from_sequence(data[:3], dtype=data.dtype)}) + + self.assert_frame_equal(res, exp[['ext_x', 'key', 'ext_y']], + check_dtype=True) + @pytest.mark.parametrize("columns", [ ["A", "B"], pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b')], @@ -237,11 +251,3 @@ def test_unstack(self, data, index, obj): result = result.astype(object) self.assert_frame_equal(result, expected) - - def test_merge_on_int_array(self, df_merge_on_int_array): - # GH 23020 - result = pd.merge(df_merge_on_int_array, df_merge_on_int_array, on='A') - expected = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), - 'B_x': 1, - 'B_y': 1}) - self.assert_frame_equal(result, expected, check_dtype=True) diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 24c13dfcc1424..5349dd919f2a2 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -1,5 +1,3 @@ -import pandas as pd -import numpy as np import operator import pytest @@ -110,9 +108,3 @@ def data_for_grouping(): def box_in_series(request): """Whether to box the data in a Series""" return request.param - - -@pytest.fixture -def df_merge_on_int_array(): - return pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), - 'B': 1}) From 42477c3a838dd74913ae55c5666b727a7efeaccd Mon Sep 17 00:00:00 2001 From: makbigc Date: Sat, 17 Nov 2018 20:02:47 +0800 Subject: [PATCH 04/14] 3rd change after review --- pandas/core/reshape/merge.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9655029d7507d..ef58ae4d6785f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -19,7 +19,8 @@ is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, is_float_dtype, is_int64_dtype, is_integer, is_integer_dtype, is_list_like, is_number, - is_numeric_dtype, needs_i8_conversion, is_extension_array_dtype) + is_numeric_dtype, needs_i8_conversion, is_extension_array_dtype, + is_period_dtype) from pandas.core.dtypes.missing import isnull, na_value_for_dtype from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta @@ -1607,6 +1608,12 @@ def _factorize_keys(lk, rk, sort=True): lk = ensure_int64(lk.codes) rk = ensure_int64(rk) + elif (is_period_dtype(lk) and + is_period_dtype(rk) and + lk.dtype == rk.dtype): + klass = libhashtable.Factorizer + lk = ensure_object(lk) + rk = ensure_object(rk) elif (is_extension_array_dtype(lk) and is_extension_array_dtype(rk) and lk.dtype == rk.dtype): From 6a9e36013c85c4ae69ea6ef8d1529162ad510c97 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 18 Nov 2018 22:21:21 +0800 Subject: [PATCH 05/14] Separate the test and add more cases --- pandas/tests/extension/base/reshaping.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index c850ae162f6b3..f21118a183cfd 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -173,14 +173,15 @@ def test_merge(self, data, na_value): dtype=data.dtype)}) self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) + @pytest.mark.parametrize("dtypes", ["Int64"]) + def test_merge_on_int_array(self, data, dtypes): # GH 23020 df1 = pd.DataFrame({'ext': data[:3], - 'key': pd.Series([1, 2, np.nan], dtype='Int64')}) - + 'key': pd.Series([1, 2, np.nan], dtype=dtypes)}) res = pd.merge(df1, df1, on='key') exp = pd.DataFrame( - {'key': pd.Series([1, 2, np.nan], dtype='Int64'), + {'key': pd.Series([1, 2, np.nan], dtype=dtypes), 'ext_x': data._from_sequence(data[:3], dtype=data.dtype), 'ext_y': data._from_sequence(data[:3], dtype=data.dtype)}) From 373582ec970145d9dce65ababed59adece86bba7 Mon Sep 17 00:00:00 2001 From: makbigc Date: Wed, 5 Dec 2018 23:18:13 +0800 Subject: [PATCH 06/14] add more types into the test_merge_on_int_array --- pandas/core/reshape/merge.py | 9 +++++---- pandas/tests/extension/base/reshaping.py | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index ef58ae4d6785f..894211bfceb94 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1618,8 +1618,8 @@ def _factorize_keys(lk, rk, sort=True): is_extension_array_dtype(rk) and lk.dtype == rk.dtype): klass = libhashtable.Factorizer - lk = ensure_object(lk) - rk = ensure_object(rk) + lk, _ = lk._values_for_factorize() + rk, _ = rk._values_for_factorize() elif is_integer_dtype(lk) and is_integer_dtype(rk): # GH#23917 TODO: needs tests for case where lk is integer-dtype # and rk is datetime-dtype @@ -1629,8 +1629,9 @@ def _factorize_keys(lk, rk, sort=True): elif (issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and issubclass(rk.dtype.type, (np.timedelta64, np.datetime64))): # GH#23917 TODO: Needs tests for non-matching dtypes - lk, _ = lk._values_for_factorize() - rk, _ = rk._values_for_factorize() + klass = libhashtable.Int64Factorizer + lk = ensure_int64(com.values_from_object(lk)) + rk = ensure_int64(com.values_from_object(rk)) else: klass = libhashtable.Factorizer lk = ensure_object(lk) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index f21118a183cfd..b106a95fc0964 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -173,7 +173,8 @@ def test_merge(self, data, na_value): dtype=data.dtype)}) self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) - @pytest.mark.parametrize("dtypes", ["Int64"]) + @pytest.mark.parametrize("dtypes", ["Int8", "Int16", "Int32", "Int64", + "UInt8", "UInt16", "UInt32", "UInt64"]) def test_merge_on_int_array(self, data, dtypes): # GH 23020 df1 = pd.DataFrame({'ext': data[:3], From f817cad2f617065614bd755bfae931c5d0eed5c7 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 9 Dec 2018 17:58:04 +0800 Subject: [PATCH 07/14] isort pandas/core/reshape/merge.py --- pandas/core/reshape/merge.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 894211bfceb94..d5c5b46bee08a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -17,10 +17,10 @@ from pandas.core.dtypes.common import ( ensure_float64, ensure_int64, ensure_object, is_array_like, is_bool, is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, is_float_dtype, - is_int64_dtype, is_integer, is_integer_dtype, is_list_like, is_number, - is_numeric_dtype, needs_i8_conversion, is_extension_array_dtype, - is_period_dtype) + is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, + is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer, + is_integer_dtype, is_list_like, is_number, is_numeric_dtype, + is_period_dtype, needs_i8_conversion) from pandas.core.dtypes.missing import isnull, na_value_for_dtype from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta From 8140ee14ea07eec96f94eab777adc15112537f47 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sat, 15 Dec 2018 15:25:53 +0800 Subject: [PATCH 08/14] Add _values_for_factorize in period.py --- pandas/core/arrays/period.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 60febc5f5636d..93a9899aacf19 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -814,6 +814,9 @@ def _check_timedeltalike_freq_compat(self, other): def _values_for_argsort(self): return self._data + def _values_for_factorize(self): + return np.asarray(self), np.nan + PeriodArray._add_comparison_ops() From c425b24b9fe19f5713f650afe76711d383a259d6 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sat, 15 Dec 2018 15:29:06 +0800 Subject: [PATCH 09/14] Remove is_period_dtype check --- pandas/core/reshape/merge.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d5c5b46bee08a..6f7570703f86f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -20,7 +20,7 @@ is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer, is_integer_dtype, is_list_like, is_number, is_numeric_dtype, - is_period_dtype, needs_i8_conversion) + needs_i8_conversion) from pandas.core.dtypes.missing import isnull, na_value_for_dtype from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta @@ -1608,12 +1608,6 @@ def _factorize_keys(lk, rk, sort=True): lk = ensure_int64(lk.codes) rk = ensure_int64(rk) - elif (is_period_dtype(lk) and - is_period_dtype(rk) and - lk.dtype == rk.dtype): - klass = libhashtable.Factorizer - lk = ensure_object(lk) - rk = ensure_object(rk) elif (is_extension_array_dtype(lk) and is_extension_array_dtype(rk) and lk.dtype == rk.dtype): From 29e820d7cd35591ae55ec1b6d70462e20a2e1b29 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sat, 15 Dec 2018 15:30:07 +0800 Subject: [PATCH 10/14] Modify _values_for_factorize in sparse.py --- pandas/core/arrays/sparse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 9e1d2efc21b81..b05c854900d83 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -942,7 +942,7 @@ def unique(self): def _values_for_factorize(self): # Still override this for hash_pandas_object - return np.asarray(self), self.fill_value + return np.asarray(self, object), self.fill_value def factorize(self, na_sentinel=-1): # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA] From 19621e628ecf3a3df080f6498d37bc4500f8d4fb Mon Sep 17 00:00:00 2001 From: makbigc Date: Sat, 15 Dec 2018 15:33:06 +0800 Subject: [PATCH 11/14] Test merge with EA as key and open a special case --- pandas/tests/extension/base/reshaping.py | 17 ++++++++--------- pandas/tests/reshape/merge/test_merge.py | 10 ++++++++++ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index b106a95fc0964..f8283367266c1 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -173,18 +173,17 @@ def test_merge(self, data, na_value): dtype=data.dtype)}) self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) - @pytest.mark.parametrize("dtypes", ["Int8", "Int16", "Int32", "Int64", - "UInt8", "UInt16", "UInt32", "UInt64"]) - def test_merge_on_int_array(self, data, dtypes): - # GH 23020 - df1 = pd.DataFrame({'ext': data[:3], - 'key': pd.Series([1, 2, np.nan], dtype=dtypes)}) + def test_merge_on_extension_array(self, data): + # GH 23020 + df1 = pd.DataFrame({'ext': [1, 2, 3], + 'key': data[:3]}) + res = pd.merge(df1, df1, on='key') exp = pd.DataFrame( - {'key': pd.Series([1, 2, np.nan], dtype=dtypes), - 'ext_x': data._from_sequence(data[:3], dtype=data.dtype), - 'ext_y': data._from_sequence(data[:3], dtype=data.dtype)}) + {'key': data[:3], + 'ext_x': [1, 2, 3], + 'ext_y': [1, 2, 3]}) self.assert_frame_equal(res, exp[['ext_x', 'key', 'ext_y']], check_dtype=True) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 94e180f9328d6..751b76c1e20ad 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1326,6 +1326,16 @@ def test_merging_with_bool_or_int_cateorical_column(self, category_column, CDT(categories, ordered=ordered)) assert_frame_equal(expected, result) + def test_merge_on_int_array(self): + # GH 23020 + df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), + 'B': 1}) + result = pd.merge(df, df, on='A') + expected = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), + 'B_x': 1, + 'B_y': 1}) + assert_frame_equal(result, expected, check_dtype=True) + @pytest.fixture def left_df(): From a69f7b7a886e2585f9f59c59631745975685185a Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 16 Dec 2018 22:46:20 +0800 Subject: [PATCH 12/14] Add factorize method in period.py --- pandas/core/arrays/period.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 93a9899aacf19..3ccd5304a8cc8 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -817,6 +817,16 @@ def _values_for_argsort(self): def _values_for_factorize(self): return np.asarray(self), np.nan + def factorize(self, na_sentinel=-1): + from pandas.core.algorithms import factorize + + arr, na_value = self._values_for_factorize() + + labels, uniques = factorize(arr, na_sentinel=na_sentinel) + + uniques = period_array(uniques) + return labels, uniques + PeriodArray._add_comparison_ops() From 61f5e1592ab9412982dad415e3508eb5a8bb3978 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 18 Dec 2018 07:00:03 -0600 Subject: [PATCH 13/14] Updates * generic release note * removed _values_for_factorize changes * made tests depend on just the first two * do the factorization first --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/core/arrays/period.py | 13 --------- pandas/core/arrays/sparse.py | 2 +- pandas/core/reshape/merge.py | 15 ++++++---- pandas/tests/extension/base/reshaping.py | 35 ++++++++++++++++++------ 5 files changed, 37 insertions(+), 30 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 0284198b49c3b..7a6d7fed6fae8 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1051,7 +1051,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - :meth:`DataFrame.stack` no longer converts to object dtype for DataFrames where each column has the same extension dtype. The output Series will have the same dtype as the columns (:issue:`23077`). - :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`). - Bug when grouping :meth:`Dataframe.groupby()` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`). -- Bug in :func:`pandas.merge` when merging on an Integer extension array (:issue:`23020`) +- Bug in :func:`pandas.merge` when merging on an extension array-backed column (:issue:`23020`). - A default repr for :class:`pandas.api.extensions.ExtensionArray` is now provided (:issue:`23601`). .. _whatsnew_0240.api.incompatibilities: diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 3ccd5304a8cc8..60febc5f5636d 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -814,19 +814,6 @@ def _check_timedeltalike_freq_compat(self, other): def _values_for_argsort(self): return self._data - def _values_for_factorize(self): - return np.asarray(self), np.nan - - def factorize(self, na_sentinel=-1): - from pandas.core.algorithms import factorize - - arr, na_value = self._values_for_factorize() - - labels, uniques = factorize(arr, na_sentinel=na_sentinel) - - uniques = period_array(uniques) - return labels, uniques - PeriodArray._add_comparison_ops() diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index b05c854900d83..9e1d2efc21b81 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -942,7 +942,7 @@ def unique(self): def _values_for_factorize(self): # Still override this for hash_pandas_object - return np.asarray(self, object), self.fill_value + return np.asarray(self), self.fill_value def factorize(self, na_sentinel=-1): # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA] diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6f7570703f86f..f642a5b64fb66 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1594,6 +1594,15 @@ def _factorize_keys(lk, rk, sort=True): lk = lk.values rk = rk.values + elif (is_extension_array_dtype(lk.dtype) and + is_extension_array_dtype(rk.dtype) and + lk.dtype == rk.dtype and + not is_categorical_dtype(lk)): + # Categorical handled separately, since unordered categories can + # may need to be recoded. + lk, _ = lk._values_for_factorize() + rk, _ = rk._values_for_factorize() + # if we exactly match in categories, allow us to factorize on codes if (is_categorical_dtype(lk) and is_categorical_dtype(rk) and @@ -1608,12 +1617,6 @@ def _factorize_keys(lk, rk, sort=True): lk = ensure_int64(lk.codes) rk = ensure_int64(rk) - elif (is_extension_array_dtype(lk) and - is_extension_array_dtype(rk) and - lk.dtype == rk.dtype): - klass = libhashtable.Factorizer - lk, _ = lk._values_for_factorize() - rk, _ = rk._values_for_factorize() elif is_integer_dtype(lk) and is_integer_dtype(rk): # GH#23917 TODO: needs tests for case where lk is integer-dtype # and rk is datetime-dtype diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index f8283367266c1..ee22ffb3ccf97 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -175,18 +175,35 @@ def test_merge(self, data, na_value): def test_merge_on_extension_array(self, data): # GH 23020 - df1 = pd.DataFrame({'ext': [1, 2, 3], - 'key': data[:3]}) + a, b = data[:2] + key = type(data)._from_sequence([a, b], dtype=data.dtype) + + df = pd.DataFrame({"key": key, "val": [1, 2]}) + result = pd.merge(df, df, on='key') + expected = pd.DataFrame({"key": key, + "val_x": [1, 2], + "val_y": [1, 2]}) + self.assert_frame_equal(result, expected) - res = pd.merge(df1, df1, on='key') + # order + result = pd.merge(df.iloc[[1, 0]], df, on='key') + expected = expected.iloc[[1, 0]].reset_index(drop=True) + self.assert_frame_equal(result, expected) - exp = pd.DataFrame( - {'key': data[:3], - 'ext_x': [1, 2, 3], - 'ext_y': [1, 2, 3]}) + def test_merge_on_extension_array_duplicates(self, data): + # GH 23020 + a, b = data[:2] + key = type(data)._from_sequence([a, b, a], dtype=data.dtype) + df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]}) + df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]}) - self.assert_frame_equal(res, exp[['ext_x', 'key', 'ext_y']], - check_dtype=True) + result = pd.merge(df1, df2, on='key') + expected = pd.DataFrame({ + "key": key.take([0, 0, 0, 0, 1]), + "val_x": [1, 1, 3, 3, 2], + "val_y": [1, 3, 1, 3, 2], + }) + self.assert_frame_equal(result, expected) @pytest.mark.parametrize("columns", [ ["A", "B"], From 6bae6cacef83991adfff4961abb72bbb097216c6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 18 Dec 2018 08:38:43 -0600 Subject: [PATCH 14/14] Reorganize the if / elifs --- pandas/core/reshape/merge.py | 25 +++++++++++------------- pandas/tests/reshape/merge/test_merge.py | 2 +- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f642a5b64fb66..0adeb7997a888 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1590,26 +1590,16 @@ def _right_outer_join(x, y, max_groups): def _factorize_keys(lk, rk, sort=True): + # Some pre-processing for non-ndarray lk / rk if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): lk = lk.values rk = rk.values - elif (is_extension_array_dtype(lk.dtype) and - is_extension_array_dtype(rk.dtype) and - lk.dtype == rk.dtype and - not is_categorical_dtype(lk)): - # Categorical handled separately, since unordered categories can - # may need to be recoded. - lk, _ = lk._values_for_factorize() - rk, _ = rk._values_for_factorize() - - # if we exactly match in categories, allow us to factorize on codes - if (is_categorical_dtype(lk) and + elif (is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk)): - klass = libhashtable.Int64Factorizer - if lk.categories.equals(rk.categories): + # if we exactly match in categories, allow us to factorize on codes rk = rk.codes else: # Same categories in different orders -> recode @@ -1617,7 +1607,14 @@ def _factorize_keys(lk, rk, sort=True): lk = ensure_int64(lk.codes) rk = ensure_int64(rk) - elif is_integer_dtype(lk) and is_integer_dtype(rk): + + elif (is_extension_array_dtype(lk.dtype) and + is_extension_array_dtype(rk.dtype) and + lk.dtype == rk.dtype): + lk, _ = lk._values_for_factorize() + rk, _ = rk._values_for_factorize() + + if is_integer_dtype(lk) and is_integer_dtype(rk): # GH#23917 TODO: needs tests for case where lk is integer-dtype # and rk is datetime-dtype klass = libhashtable.Int64Factorizer diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 751b76c1e20ad..970802e94662a 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1334,7 +1334,7 @@ def test_merge_on_int_array(self): expected = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), 'B_x': 1, 'B_y': 1}) - assert_frame_equal(result, expected, check_dtype=True) + assert_frame_equal(result, expected) @pytest.fixture