|
30 | 30 | is_int64_dtype,
|
31 | 31 | is_integer,
|
32 | 32 | is_integer_dtype,
|
| 33 | + is_interval_dtype, |
33 | 34 | is_list_like,
|
34 | 35 | is_number,
|
35 | 36 | is_numeric_dtype,
|
36 | 37 | is_object_dtype,
|
| 38 | + is_period_dtype, |
37 | 39 | needs_i8_conversion,
|
38 | 40 | )
|
39 | 41 | from pandas.core.dtypes.missing import isnull, na_value_for_dtype
|
@@ -842,65 +844,187 @@ def _get_join_info(self):
|
842 | 844 | else:
|
843 | 845 | (left_indexer, right_indexer) = self._get_join_indexers()
|
844 | 846 |
|
| 847 | + if self.left_index: |
| 848 | + left = self.left.index |
| 849 | + else: |
| 850 | + left = self.left_join_keys |
| 851 | + |
845 | 852 | if self.right_index:
|
846 |
| - if len(self.left) > 0: |
847 |
| - join_index = self._create_join_index( |
848 |
| - self.left.index, |
849 |
| - self.right.index, |
850 |
| - left_indexer, |
851 |
| - right_indexer, |
852 |
| - how="right", |
853 |
| - ) |
854 |
| - else: |
855 |
| - join_index = self.right.index.take(right_indexer) |
856 |
| - left_indexer = np.array([-1] * len(join_index)) |
857 |
| - elif self.left_index: |
858 |
| - if len(self.right) > 0: |
859 |
| - join_index = self._create_join_index( |
860 |
| - self.right.index, |
861 |
| - self.left.index, |
862 |
| - right_indexer, |
863 |
| - left_indexer, |
864 |
| - how="left", |
865 |
| - ) |
866 |
| - else: |
867 |
| - join_index = self.left.index.take(left_indexer) |
868 |
| - right_indexer = np.array([-1] * len(join_index)) |
| 853 | + right = self.right.index |
| 854 | + else: |
| 855 | + right = self.right_join_keys |
| 856 | + |
| 857 | + if self.how != "right": |
| 858 | + join_index = self._create_join_index( |
| 859 | + left, |
| 860 | + right, |
| 861 | + left_indexer, |
| 862 | + right_indexer, |
| 863 | + self.join_names, |
| 864 | + self.join_names, |
| 865 | + ) |
869 | 866 | else:
|
870 |
| - join_index = Index(np.arange(len(left_indexer))) |
| 867 | + join_index = self._create_join_index( |
| 868 | + right, |
| 869 | + left, |
| 870 | + right_indexer, |
| 871 | + left_indexer, |
| 872 | + self.join_names, |
| 873 | + self.join_names, |
| 874 | + ) |
871 | 875 |
|
872 | 876 | if len(join_index) == 0:
|
873 | 877 | join_index = join_index.astype(object)
|
874 | 878 | return join_index, left_indexer, right_indexer
|
875 | 879 |
|
876 | 880 | def _create_join_index(
|
877 |
| - self, index, other_index, indexer, other_indexer, how="left" |
| 881 | + self, index, other_index, indexer, other_indexer, names, other_names |
878 | 882 | ):
|
879 | 883 | """
|
880 | 884 | Create a join index by rearranging one index to match another
|
881 | 885 |
|
882 | 886 | Parameters
|
883 | 887 | ----------
|
884 |
| - index: Index being rearranged |
885 |
| - other_index: Index used to supply values not found in index |
886 |
| - indexer: how to rearrange index |
887 |
| - how: replacement is only necessary if indexer based on other_index |
| 888 | + index: [Multi]Index being rearranged |
| 889 | + other_index: [Multi]Index used to supply values not found in index |
| 890 | + indexer: indices for fancy indexing in index |
| 891 | + other_indexer: indices for fancy indexing in other_index |
| 892 | + names: index name(s) |
| 893 | + other_names: other_index name(s) |
888 | 894 |
|
889 | 895 | Returns
|
890 | 896 | -------
|
891 | 897 | join_index
|
892 | 898 | """
|
893 |
| - if self.how in (how, "outer") and not isinstance(other_index, MultiIndex): |
894 |
| - # if final index requires values in other_index but not target |
895 |
| - # index, indexer may hold missing (-1) values, causing Index.take |
896 |
| - # to take the final value in target index. So, we set the last |
897 |
| - # element to be the desired fill value. We do not use allow_fill |
898 |
| - # and fill_value because it throws a ValueError on integer indices |
899 |
| - mask = indexer == -1 |
900 |
| - if np.any(mask): |
901 |
| - fill_value = na_value_for_dtype(index.dtype, compat=False) |
902 |
| - index = index.append(Index([fill_value])) |
903 |
| - return index.take(indexer) |
| 899 | + |
| 900 | + if not issubclass(type(index), Index) and is_list_like(index): |
| 901 | + if len(index) > 1: |
| 902 | + index = MultiIndex.from_arrays(index, names=names) |
| 903 | + else: |
| 904 | + index = Index(index[0], name=names[0]) |
| 905 | + |
| 906 | + infer_index_type = False |
| 907 | + if not issubclass(type(other_index), Index) and is_list_like(other_index): |
| 908 | + # must infer dtype from index if possible |
| 909 | + infer_index_type = True |
| 910 | + |
| 911 | + if len(other_index) > 1: |
| 912 | + other_index = MultiIndex.from_arrays(other_index, names=other_names) |
| 913 | + else: |
| 914 | + other_index = Index(other_index[0], name=other_names[0]) |
| 915 | + |
| 916 | + # if final index requires values in other_index but not target |
| 917 | + # index, indexer may hold missing (-1) values, causing Index.take |
| 918 | + # to take the final value in target index |
| 919 | + mask = indexer == -1 |
| 920 | + |
| 921 | + idx = None |
| 922 | + convert_type = None |
| 923 | + if self.how == "outer": |
| 924 | + |
| 925 | + # we may need to reshape the index to include missing keys |
| 926 | + outer_mask = mask & (other_indexer != -1) |
| 927 | + |
| 928 | + if outer_mask.any(): |
| 929 | + |
| 930 | + if is_categorical_dtype(other_index.dtype): |
| 931 | + np_type = other_index.categories.dtype |
| 932 | + convert_type = other_index.categories.dtype |
| 933 | + elif ( |
| 934 | + is_period_dtype(other_index.dtype) |
| 935 | + or is_datetime64tz_dtype(other_index.dtype) |
| 936 | + or is_interval_dtype(other_index.dtype) |
| 937 | + ): |
| 938 | + # TODO: better way to get numpy dtype without try/except? |
| 939 | + np_type = other_index.to_numpy().dtype |
| 940 | + convert_type = other_index.dtype |
| 941 | + else: |
| 942 | + np_type = other_index.dtype |
| 943 | + |
| 944 | + outer_index = np.empty(len(outer_mask), dtype=np_type) |
| 945 | + outer_index[mask] = other_index.take(other_indexer[outer_mask]) |
| 946 | + |
| 947 | + if not index.empty: |
| 948 | + if is_datetime64tz_dtype(other_index.dtype): |
| 949 | + idx = np.where( |
| 950 | + outer_mask, outer_index, index.take(indexer).to_numpy() |
| 951 | + ) |
| 952 | + else: |
| 953 | + idx = np.where(outer_mask, outer_index, index.take(indexer)) |
| 954 | + else: |
| 955 | + idx = outer_index |
| 956 | + |
| 957 | + elif self.how == "inner": |
| 958 | + if not self.left_index and not self.right_index: |
| 959 | + # TODO: this behaviour below is assumed in a lot of test cases |
| 960 | + return np.arange(np.sum(~mask)) |
| 961 | + else: |
| 962 | + idx = index.take(indexer[~mask]) |
| 963 | + |
| 964 | + # left/right/outer (if outer is a subset of left/right) join |
| 965 | + reset_dtype = index.empty |
| 966 | + if idx is None: |
| 967 | + |
| 968 | + # try to preserve index, worst case we reset it |
| 969 | + if index.shape == mask.shape: |
| 970 | + idx = np.where(mask, index, index.take(indexer)) |
| 971 | + else: |
| 972 | + if mask.any(): |
| 973 | + return np.arange(len(mask)) |
| 974 | + else: |
| 975 | + idx = index.take(indexer) |
| 976 | + |
| 977 | + if isinstance(index, MultiIndex): |
| 978 | + if len(idx) == 0: |
| 979 | + return MultiIndex.from_arrays(np.empty((index.nlevels, 0)), names=names) |
| 980 | + else: |
| 981 | + return MultiIndex.from_tuples(idx, names=names) |
| 982 | + else: |
| 983 | + |
| 984 | + # a bit of spaghetti soup |
| 985 | + create_categorial = False |
| 986 | + if infer_index_type and issubclass(type(index), Index): |
| 987 | + |
| 988 | + # TODO: Int64Index doesn't like Int64Index(np.ndarray) nor Categorical |
| 989 | + if is_integer_dtype(index): |
| 990 | + index_type = Index |
| 991 | + elif is_categorical_dtype(index): |
| 992 | + create_categorial = True |
| 993 | + index_type = type(index) |
| 994 | + if is_categorical_dtype(other_index): |
| 995 | + union_cat = index.categories.union(other_index.categories) |
| 996 | + else: |
| 997 | + union_cat = index.categories.union(other_index) |
| 998 | + else: |
| 999 | + index_type = type(index) |
| 1000 | + |
| 1001 | + else: |
| 1002 | + if is_categorical_dtype(other_index): |
| 1003 | + create_categorial = True |
| 1004 | + index_type = type(other_index) |
| 1005 | + if is_categorical_dtype(index): |
| 1006 | + union_cat = other_index.categories.union(index.categories) |
| 1007 | + else: |
| 1008 | + union_cat = other_index.categories.union(index) |
| 1009 | + else: |
| 1010 | + index_type = Index |
| 1011 | + |
| 1012 | + if create_categorial: |
| 1013 | + return index_type(idx, categories=union_cat, name=names[0]) |
| 1014 | + elif reset_dtype: |
| 1015 | + return index_type(idx, name=names[0]) |
| 1016 | + elif convert_type is None: |
| 1017 | + if is_period_dtype(index.dtype) or is_datetime64tz_dtype(index.dtype): |
| 1018 | + return index_type(idx, name=names[0]) |
| 1019 | + else: |
| 1020 | + return index_type(idx, dtype=index.dtype, name=names[0]) |
| 1021 | + else: |
| 1022 | + if is_period_dtype(other_index.dtype) or is_datetime64tz_dtype( |
| 1023 | + other_index.dtype |
| 1024 | + ): |
| 1025 | + return index_type(idx, name=names[0]) |
| 1026 | + else: |
| 1027 | + return index_type(idx, dtype=convert_type, name=names[0]) |
904 | 1028 |
|
905 | 1029 | def _get_merge_keys(self):
|
906 | 1030 | """
|
|
0 commit comments