CLN: assorted cleanups, remove unicode checks in cython (pandas-dev#28879)

jbrockmendel · proost · commit 397cad2cc446 · 2019-12-20T01:09:07.000+09:00
diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx
@@ -60,7 +60,7 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'):
         val = arr[i]
         if isinstance(val, bytes):
             data = <bytes>val
-        elif isinstance(val, unicode):
+        elif isinstance(val, str):
             data = <bytes>val.encode(encoding)
         elif val is None or is_nan(val):
             # null, stringify and encode
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -667,7 +667,7 @@ cdef class StringHashTable(HashTable):
         for i in range(n):
             val = values[i]
 
-            if isinstance(val, (str, unicode)):
+            if isinstance(val, str):
                 v = get_c_string(val)
             else:
                 v = get_c_string(self.na_string_sentinel)
@@ -700,7 +700,7 @@ cdef class StringHashTable(HashTable):
         for i in range(n):
             val = values[i]
 
-            if isinstance(val, (str, unicode)):
+            if isinstance(val, str):
                 v = get_c_string(val)
             else:
                 v = get_c_string(self.na_string_sentinel)
@@ -774,7 +774,7 @@ cdef class StringHashTable(HashTable):
             val = values[i]
 
             if (ignore_na
-                and (not isinstance(val, (str, unicode))
+                and (not isinstance(val, str)
                      or (use_na_value and val == na_value))):
                 # if missing values do not count as unique values (i.e. if
                 # ignore_na is True), we can skip the actual value, and
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -2249,7 +2249,7 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col,
 def _maybe_encode(values):
     if values is None:
         return []
-    return [x.encode('utf-8') if isinstance(x, unicode) else x for x in values]
+    return [x.encode('utf-8') if isinstance(x, str) else x for x in values]
 
 
 def sanitize_objects(ndarray[object] values, set na_values,
diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx
@@ -22,7 +22,7 @@ from pandas._libs.tslibs.np_datetime cimport (
 from pandas._libs.tslibs.nattype cimport NPY_NAT
 
 
-def get_time_micros(ndarray[int64_t] dtindex):
+def get_time_micros(const int64_t[:] dtindex):
     """
     Return the number of microseconds in the time component of a
     nanosecond timestamp.
@@ -537,7 +537,7 @@ def get_date_field(const int64_t[:] dtindex, object field):
     elif field == 'is_leap_year':
         return isleapyear_arr(get_date_field(dtindex, 'Y'))
 
-    raise ValueError("Field %s not supported" % field)
+    raise ValueError("Field {field} not supported".format(field=field))
 
 
 @cython.wraparound(False)
diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
@@ -252,9 +252,7 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None):
     -------
     datetime, datetime/dateutil.parser._result, str
     """
-    if not isinstance(arg, (str, unicode)):
-        # Note: cython recognizes `unicode` in both py2/py3, optimizes
-        # this check into a C call.
+    if not isinstance(arg, str):
         return arg
 
     if getattr(freq, "_typ", None) == "dateoffset":
@@ -370,7 +368,7 @@ cdef inline object _parse_dateabbr_string(object date_string, object default,
         int year, quarter = -1, month, mnum, date_len
 
     # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1
-    assert isinstance(date_string, (str, unicode))
+    assert isinstance(date_string, str)
 
     # len(date_string) == 0
     # should be NaT???
@@ -517,7 +515,7 @@ cdef dateutil_parse(object timestr, object default, ignoretz=False,
                 tzdata = tzinfos.get(res.tzname)
             if isinstance(tzdata, datetime.tzinfo):
                 tzinfo = tzdata
-            elif isinstance(tzdata, (str, unicode)):
+            elif isinstance(tzdata, str):
                 tzinfo = _dateutil_tzstr(tzdata)
             elif isinstance(tzdata, int):
                 tzinfo = tzoffset(res.tzname, tzdata)
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
@@ -2448,7 +2448,10 @@ class Period(_Period):
                 converted = other.asfreq(freq)
                 ordinal = converted.ordinal
 
-        elif is_null_datetimelike(value) or value in nat_strings:
+        elif is_null_datetimelike(value) or (isinstance(value, str) and
+                                             value in nat_strings):
+            # explicit str check is necessary to avoid raising incorrectly
+            #  if we have a non-hashable value.
             ordinal = NPY_NAT
 
         elif isinstance(value, str) or util.is_integer_object(value):
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -1148,7 +1148,7 @@ def _addsub_offset_array(self, other, op):
         )
 
         # For EA self.astype('O') returns a numpy array, not an Index
-        left = lib.values_from_object(self.astype("O"))
+        left = self.astype("O")
 
         res_values = op(left, np.array(other))
         kwargs = {}
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
@@ -70,7 +70,7 @@ def _period_array_cmp(cls, op):
     nat_result = opname == "__ne__"
 
     def wrapper(self, other):
-        op = getattr(self.asi8, opname)
+        ordinal_op = getattr(self.asi8, opname)
 
         other = lib.item_from_zerodim(other)
         if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
@@ -82,11 +82,11 @@ def wrapper(self, other):
         if isinstance(other, Period):
             self._check_compatible_with(other)
 
-            result = op(other.ordinal)
+            result = ordinal_op(other.ordinal)
         elif isinstance(other, cls):
             self._check_compatible_with(other)
 
-            result = op(other.asi8)
+            result = ordinal_op(other.asi8)
 
             mask = self._isnan | other._isnan
             if mask.any():
@@ -98,7 +98,7 @@ def wrapper(self, other):
             result.fill(nat_result)
         else:
             other = Period(other, freq=self.freq)
-            result = op(other.ordinal)
+            result = ordinal_op(other.ordinal)
 
         if self._hasnans:
             result[self._isnan] = nat_result
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
@@ -553,7 +553,7 @@ def __mul__(self, other):
             #  for that instead of ValueError
             raise ValueError("Cannot multiply with unequal lengths")
 
-        if is_object_dtype(other):
+        if is_object_dtype(other.dtype):
             # this multiplication will succeed only if all elements of other
             #  are int or float scalars, so we will end up with
             #  timedelta64[ns]-dtyped result
@@ -601,11 +601,11 @@ def __truediv__(self, other):
         if len(other) != len(self):
             raise ValueError("Cannot divide vectors with unequal lengths")
 
-        elif is_timedelta64_dtype(other):
+        elif is_timedelta64_dtype(other.dtype):
             # let numpy handle it
             return self._data / other
 
-        elif is_object_dtype(other):
+        elif is_object_dtype(other.dtype):
             # Note: we do not do type inference on the result, so either
             #  an object array or numeric-dtyped (if numpy does inference)
             #  will be returned.  GH#23829
@@ -649,12 +649,12 @@ def __rtruediv__(self, other):
         if len(other) != len(self):
             raise ValueError("Cannot divide vectors with unequal lengths")
 
-        elif is_timedelta64_dtype(other):
+        elif is_timedelta64_dtype(other.dtype):
             # let numpy handle it
             return other / self._data
 
-        elif is_object_dtype(other):
-            # Note: unlike in __truediv__, we do not _need_ to do type#
+        elif is_object_dtype(other.dtype):
+            # Note: unlike in __truediv__, we do not _need_ to do type
             #  inference on the result.  It does not raise, a numeric array
             #  is returned.  GH#23829
             result = [other[n] / self[n] for n in range(len(self))]
@@ -701,7 +701,7 @@ def __floordiv__(self, other):
         if len(other) != len(self):
             raise ValueError("Cannot divide with unequal lengths")
 
-        elif is_timedelta64_dtype(other):
+        elif is_timedelta64_dtype(other.dtype):
             other = type(self)(other)
 
             # numpy timedelta64 does not natively support floordiv, so operate
@@ -713,15 +713,15 @@ def __floordiv__(self, other):
                 result[mask] = np.nan
             return result
 
-        elif is_object_dtype(other):
+        elif is_object_dtype(other.dtype):
             result = [self[n] // other[n] for n in range(len(self))]
             result = np.array(result)
             if lib.infer_dtype(result, skipna=False) == "timedelta":
                 result, _ = sequence_to_td64ns(result)
                 return type(self)(result)
             return result
 
-        elif is_integer_dtype(other) or is_float_dtype(other):
+        elif is_integer_dtype(other.dtype) or is_float_dtype(other.dtype):
             result = self._data // other
             return type(self)(result)
 
@@ -763,7 +763,7 @@ def __rfloordiv__(self, other):
         if len(other) != len(self):
             raise ValueError("Cannot divide with unequal lengths")
 
-        elif is_timedelta64_dtype(other):
+        elif is_timedelta64_dtype(other.dtype):
             other = type(self)(other)
 
             # numpy timedelta64 does not natively support floordiv, so operate
@@ -775,7 +775,7 @@ def __rfloordiv__(self, other):
                 result[mask] = np.nan
             return result
 
-        elif is_object_dtype(other):
+        elif is_object_dtype(other.dtype):
             result = [other[n] // self[n] for n in range(len(self))]
             result = np.array(result)
             return result
diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py
@@ -161,7 +161,7 @@ def arithmetic_op(
     right: Any,
     op,
     str_rep: str,
-    eval_kwargs: Dict[str, str],
+    eval_kwargs: Dict[str, bool],
 ):
     """
     Evaluate an arithmetic operation `+`, `-`, `*`, `/`, `//`, `%`, `**`, ...
diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py
@@ -400,7 +400,7 @@ def test_combineFrame(self, float_frame, mixed_float_frame, mixed_int_frame):
         added = float_frame + mixed_int_frame
         _check_mixed_float(added, dtype="float64")
 
-    def test_combineSeries(
+    def test_combine_series(
         self, float_frame, mixed_float_frame, mixed_int_frame, datetime_frame
     ):
 
@@ -432,6 +432,7 @@ def test_combineSeries(
         added = mixed_float_frame + series.astype("float16")
         _check_mixed_float(added, dtype=dict(C=None))
 
+        # FIXME: don't leave commented-out
         # these raise with numexpr.....as we are adding an int64 to an
         # uint64....weird vs int
 
diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py
@@ -122,7 +122,8 @@ def test_ops(self):
                     result = getattr(df, rop)(m)
                     assert_frame_equal(result, expected)
 
-        # GH7192
+        # GH7192: Note we need a large number of rows to ensure this
+        #  goes through the numexpr path
         df = DataFrame(dict(A=np.random.randn(25000)))
         df.iloc[0:5] = np.nan
         expected = 1 - np.isnan(df.iloc[0:25])

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1148,7 +1148,7 @@ def _addsub_offset_array(self, other, op):` |
| `1148` | `1148` | `)` |
| `1149` | `1149` |  |
| `1150` | `1150` | `# For EA self.astype('O') returns a numpy array, not an Index` |
| `1151` |  | `- left = lib.values_from_object(self.astype("O"))` |
|  | `1151` | `+ left = self.astype("O")` |
| `1152` | `1152` |  |
| `1153` | `1153` | `res_values = op(left, np.array(other))` |
| `1154` | `1154` | `kwargs = {}` |