Merge pull request #11410 from jreback/tz_merge

jreback · jreback · commit 1b2e45ad53c7 · 2015-10-23T12:43:22.000-04:00
Bug in merging datetime64[ns, tz] dtypes #11405
diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt
@@ -74,7 +74,7 @@ Bug Fixes
 
 - Bug in ``.to_latex()`` output broken when the index has a name (:issue:`10660`)
 - Bug in ``HDFStore.append`` with strings whose encoded length exceeded the max unencoded length (:issue:`11234`)
-
+- Bug in merging ``datetime64[ns, tz]`` dtypes (:issue:`11405`)
 - Bug in ``HDFStore.select`` when comparing with a numpy scalar in a where clause (:issue:`11283`)
 
 
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -1081,6 +1081,9 @@ def _maybe_promote(dtype, fill_value=np.nan):
                     fill_value = tslib.iNaT
             else:
                 fill_value = tslib.iNaT
+    elif is_datetimetz(dtype):
+        if isnull(fill_value):
+            fill_value = tslib.iNaT
     elif is_float(fill_value):
         if issubclass(dtype.type, np.bool_):
             dtype = np.object_
@@ -1107,7 +1110,9 @@ def _maybe_promote(dtype, fill_value=np.nan):
 
     # in case we have a string that looked like a number
     if is_categorical_dtype(dtype):
-        dtype = dtype
+        pass
+    elif is_datetimetz(dtype):
+        pass
     elif issubclass(np.dtype(dtype).type, compat.string_types):
         dtype = np.object_
 
@@ -2497,7 +2502,6 @@ def is_int64_dtype(arr_or_dtype):
     tipo = _get_dtype_type(arr_or_dtype)
     return issubclass(tipo, np.int64)
 
-
 def is_int_or_datetime_dtype(arr_or_dtype):
     tipo = _get_dtype_type(arr_or_dtype)
     return (issubclass(tipo, np.integer) or
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -4114,7 +4114,7 @@ def _interleaved_dtype(blocks):
     if not len(blocks):
         return None
 
-    counts = defaultdict(lambda: [])
+    counts = defaultdict(list)
     for x in blocks:
         counts[type(x)].append(x)
 
@@ -4482,9 +4482,8 @@ def get_empty_dtype_and_na(join_units):
         else:
             dtypes[i] = unit.dtype
 
-    # dtypes = set()
-    upcast_classes = set()
-    null_upcast_classes = set()
+    upcast_classes = defaultdict(list)
+    null_upcast_classes = defaultdict(list)
     for dtype, unit in zip(dtypes, join_units):
         if dtype is None:
             continue
@@ -4508,9 +4507,9 @@ def get_empty_dtype_and_na(join_units):
         # are only null blocks, when same upcasting rules must be applied to
         # null upcast classes.
         if unit.is_null:
-            null_upcast_classes.add(upcast_cls)
+            null_upcast_classes[upcast_cls].append(dtype)
         else:
-            upcast_classes.add(upcast_cls)
+            upcast_classes[upcast_cls].append(dtype)
 
     if not upcast_classes:
         upcast_classes = null_upcast_classes
@@ -4528,7 +4527,8 @@ def get_empty_dtype_and_na(join_units):
     elif 'float' in upcast_classes:
         return np.dtype(np.float64), np.nan
     elif 'datetimetz' in upcast_classes:
-        return np.dtype('M8[ns]'), tslib.iNaT
+        dtype = upcast_classes['datetimetz']
+        return dtype[0], tslib.iNaT
     elif 'datetime' in upcast_classes:
         return np.dtype('M8[ns]'), tslib.iNaT
     elif 'timedelta' in upcast_classes:
@@ -4788,6 +4788,7 @@ def is_null(self):
         return True
 
     def get_reindexed_values(self, empty_dtype, upcasted_na):
+
         if upcasted_na is None:
             # No upcasting is necessary
             fill_value = self.block.fill_value
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
@@ -220,8 +220,8 @@ def get_result(self):
         return result
 
     def _indicator_pre_merge(self, left, right):
-                
-        columns = left.columns.union(right.columns)  
+
+        columns = left.columns.union(right.columns)
 
         for i in ['_left_indicator', '_right_indicator']:
             if i in columns:
@@ -232,12 +232,12 @@ def _indicator_pre_merge(self, left, right):
         left = left.copy()
         right = right.copy()
 
-        left['_left_indicator'] = 1  
-        left['_left_indicator'] = left['_left_indicator'].astype('int8')  
-        
-        right['_right_indicator'] = 2     
-        right['_right_indicator'] = right['_right_indicator'].astype('int8') 
-        
+        left['_left_indicator'] = 1
+        left['_left_indicator'] = left['_left_indicator'].astype('int8')
+
+        right['_right_indicator'] = 2
+        right['_right_indicator'] = right['_right_indicator'].astype('int8')
+
         return left, right
 
     def _indicator_post_merge(self, result):
@@ -246,8 +246,8 @@ def _indicator_post_merge(self, result):
         result['_right_indicator'] = result['_right_indicator'].fillna(0)
 
         result[self.indicator_name] = Categorical((result['_left_indicator'] + result['_right_indicator']), categories=[1,2,3])
-        result[self.indicator_name] = result[self.indicator_name].cat.rename_categories(['left_only', 'right_only', 'both'])        
- 
+        result[self.indicator_name] = result[self.indicator_name].cat.rename_categories(['left_only', 'right_only', 'both'])
+
         result = result.drop(labels=['_left_indicator', '_right_indicator'], axis=1)
 
         return result
@@ -261,7 +261,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
                 continue
 
             if name in result:
-                key_col = result[name]
+                key_indexer = result.columns.get_loc(name)
 
                 if left_indexer is not None and right_indexer is not None:
 
@@ -274,9 +274,8 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
                             continue
 
                         right_na_indexer = right_indexer.take(na_indexer)
-                        key_col.put(
-                            na_indexer, com.take_1d(self.right_join_keys[i],
-                                                    right_na_indexer))
+                        result.iloc[na_indexer,key_indexer] = com.take_1d(self.right_join_keys[i],
+                                                                          right_na_indexer)
                     elif name in self.right:
                         if len(self.right) == 0:
                             continue
@@ -286,9 +285,8 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
                             continue
 
                         left_na_indexer = left_indexer.take(na_indexer)
-                        key_col.put(na_indexer, com.take_1d(self.left_join_keys[i],
-                                                            left_na_indexer))
-
+                        result.iloc[na_indexer,key_indexer] = com.take_1d(self.left_join_keys[i],
+                                                                          left_na_indexer)
             elif left_indexer is not None \
                     and isinstance(self.left_join_keys[i], np.ndarray):
 
@@ -664,10 +662,13 @@ def _right_outer_join(x, y, max_groups):
 
 
 def _factorize_keys(lk, rk, sort=True):
+    if com.is_datetime64tz_dtype(lk) and com.is_datetime64tz_dtype(rk):
+        lk = lk.values
+        rk = rk.values
     if com.is_int_or_datetime_dtype(lk) and com.is_int_or_datetime_dtype(rk):
         klass = _hash.Int64Factorizer
-        lk = com._ensure_int64(lk)
-        rk = com._ensure_int64(rk)
+        lk = com._ensure_int64(com._values_from_object(lk))
+        rk = com._ensure_int64(com._values_from_object(rk))
     else:
         klass = _hash.Factorizer
         lk = com._ensure_object(lk)
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py