move check for datetime tz to hashing function

jorisvandenbossche · jorisvandenbossche · commit 287817a8b9c4 · 2017-05-16T22:49:46.000+02:00
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -333,7 +333,7 @@ def maybe_promote(dtype, fill_value=np.nan):
     return dtype, fill_value
 
 
-def infer_dtype_from_scalar(val, pandas_dtype=False, use_datetimetz=True):
+def infer_dtype_from_scalar(val, pandas_dtype=False):
     """
     interpret the dtype from a scalar
 
@@ -368,7 +368,7 @@ def infer_dtype_from_scalar(val, pandas_dtype=False, use_datetimetz=True):
 
     elif isinstance(val, (np.datetime64, datetime)):
         val = tslib.Timestamp(val)
-        if val is tslib.NaT or val.tz is None or not use_datetimetz:
+        if val is tslib.NaT or val.tz is None:
             dtype = np.dtype('M8[ns]')
         else:
             if pandas_dtype:
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
@@ -4,7 +4,7 @@
 import itertools
 
 import numpy as np
-from pandas._libs import hashing
+from pandas._libs import hashing, tslib
 from pandas.core.dtypes.generic import (
     ABCMultiIndex,
     ABCIndexClass,
@@ -317,7 +317,15 @@ def _hash_scalar(val, encoding='utf8', hash_key=None):
         # this is to be consistent with the _hash_categorical implementation
         return np.array([np.iinfo(np.uint64).max], dtype='u8')
 
-    dtype, val = infer_dtype_from_scalar(val, use_datetimetz=False)
+    if getattr(val, 'tzinfo', None) is not None:
+        # for tz-aware datetimes, we need the underlying naive UTC value,
+        # not the tz-aware object or pandas extension type that
+        # infer_dtype_from_scalar would otherwise return
+        if not isinstance(val, tslib.Timestamp):
+            val = tslib.Timestamp(val)
+        val = val.tz_convert(None)
+
+    dtype, val = infer_dtype_from_scalar(val)
     vals = np.array([val], dtype=dtype)
 
     return hash_array(vals, hash_key=hash_key, encoding=encoding,
diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py
@@ -1,4 +1,5 @@
 import pytest
+import datetime
 
 from warnings import catch_warnings
 import numpy as np
@@ -81,16 +82,20 @@ def test_hash_tuples(self):
 
     def test_hash_tuple(self):
         # test equivalence between hash_tuples and hash_tuple
-        for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A')]:
+        for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'),
+                    ('A', pd.Timestamp("2012-01-01"))]:
             result = hash_tuple(tup)
             expected = hash_tuples([tup])[0]
             assert result == expected
 
     def test_hash_scalar(self):
         for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
                     pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
-                    pd.Period('2012-01-01', freq='D'), pd.Timedelta('1 days'),
-                    pd.Interval(0, 1), np.nan, pd.NaT, None]:
+                    datetime.datetime(2012, 1, 1),
+                    pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
+                    pd.Timedelta('1 days'), datetime.timedelta(1),
+                    pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1),
+                    np.nan, pd.NaT, None]:
             result = _hash_scalar(val)
             expected = hash_array(np.array([val], dtype=object),
                                   categorize=True)