CLN: assorted cleanups, remove unicode checks in cython #28879

Merged (4 commits) on Oct 10, 2019
2 changes: 1 addition & 1 deletion pandas/_libs/hashing.pyx
@@ -60,7 +60,7 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'):
val = arr[i]
if isinstance(val, bytes):
data = <bytes>val
elif isinstance(val, unicode):
elif isinstance(val, str):
data = <bytes>val.encode(encoding)
elif val is None or is_nan(val):
# null, stringify and encode
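For context, a rough pure-Python sketch of the bytes/str normalization this hunk performs (the helper name is illustrative, not the actual Cython routine):

    def to_bytes(val, encoding="utf8"):
        # bytes pass through; str is encoded; nulls are stringified and then
        # encoded so they still hash deterministically (mirrors the hunk above).
        if isinstance(val, bytes):
            return val
        elif isinstance(val, str):
            return val.encode(encoding)
        elif val is None or val != val:     # val != val is a crude NaN check
            return str(val).encode(encoding)
        raise TypeError("value must be a string or null")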
6 changes: 3 additions & 3 deletions pandas/_libs/hashtable_class_helper.pxi.in
@@ -667,7 +667,7 @@ cdef class StringHashTable(HashTable):
for i in range(n):
val = values[i]

if isinstance(val, (str, unicode)):
if isinstance(val, str):
v = get_c_string(val)
else:
v = get_c_string(self.na_string_sentinel)
@@ -700,7 +700,7 @@ cdef class StringHashTable(HashTable):
for i in range(n):
val = values[i]

if isinstance(val, (str, unicode)):
if isinstance(val, str):
v = get_c_string(val)
else:
v = get_c_string(self.na_string_sentinel)
@@ -774,7 +774,7 @@ cdef class StringHashTable(HashTable):
val = values[i]

if (ignore_na
and (not isinstance(val, (str, unicode))
and (not isinstance(val, str)
or (use_na_value and val == na_value))):
# if missing values do not count as unique values (i.e. if
# ignore_na is True), we can skip the actual value, and
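A minimal sketch of the sentinel substitution these hunks keep doing, in plain Python with assumed names: non-string values are replaced by a sentinel string before being handed to the C string routine, so the table only ever sees string keys.

    NA_SENTINEL = "__nan__"   # assumed placeholder; the real sentinel is configurable

    def as_string_key(val):
        # Only genuine str values are used directly; anything else (None, NaN,
        # ints, ...) falls back to the sentinel.
        return val if isinstance(val, str) else NA_SENTINEL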
2 changes: 1 addition & 1 deletion pandas/_libs/parsers.pyx
@@ -2249,7 +2249,7 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col,
def _maybe_encode(values):
if values is None:
return []
return [x.encode('utf-8') if isinstance(x, unicode) else x for x in values]
return [x.encode('utf-8') if isinstance(x, str) else x for x in values]


def sanitize_objects(ndarray[object] values, set na_values,
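_maybe_encode is small enough that a plain-Python equivalent captures it almost verbatim; roughly:

    def maybe_encode(values):
        # None means "no values"; str entries become UTF-8 bytes,
        # everything else (already-bytes, None, numbers) passes through.
        if values is None:
            return []
        return [x.encode("utf-8") if isinstance(x, str) else x for x in values]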
4 changes: 2 additions & 2 deletions pandas/_libs/tslibs/fields.pyx
@@ -22,7 +22,7 @@ from pandas._libs.tslibs.np_datetime cimport (
from pandas._libs.tslibs.nattype cimport NPY_NAT


def get_time_micros(ndarray[int64_t] dtindex):
def get_time_micros(const int64_t[:] dtindex):
"""
Return the number of microseconds in the time component of a
nanosecond timestamp.
@@ -537,7 +537,7 @@ def get_date_field(const int64_t[:] dtindex, object field):
elif field == 'is_leap_year':
return isleapyear_arr(get_date_field(dtindex, 'Y'))

raise ValueError("Field %s not supported" % field)
raise ValueError("Field {field} not supported".format(field=field))


@cython.wraparound(False)
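A hedged NumPy sketch of what a helper like get_time_micros computes, assuming int64 nanosecond timestamps since the epoch (the constant name is illustrative):

    import numpy as np

    NS_PER_DAY = 24 * 3600 * 1_000_000_000

    def time_micros(stamps):
        # Keep only the time-of-day component of each int64 nanosecond
        # timestamp, then truncate to microsecond resolution.
        return (np.asarray(stamps, dtype="int64") % NS_PER_DAY) // 1000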
8 changes: 3 additions & 5 deletions pandas/_libs/tslibs/parsing.pyx
@@ -252,9 +252,7 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None):
-------
datetime, datetime/dateutil.parser._result, str
"""
if not isinstance(arg, (str, unicode)):
# Note: cython recognizes `unicode` in both py2/py3, optimizes
# this check into a C call.
if not isinstance(arg, str):
return arg

if getattr(freq, "_typ", None) == "dateoffset":
@@ -370,7 +368,7 @@ cdef inline object _parse_dateabbr_string(object date_string, object default,
int year, quarter = -1, month, mnum, date_len

# special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1
assert isinstance(date_string, (str, unicode))
assert isinstance(date_string, str)

# len(date_string) == 0
# should be NaT???
@@ -517,7 +515,7 @@ cdef dateutil_parse(object timestr, object default, ignoretz=False,
tzdata = tzinfos.get(res.tzname)
if isinstance(tzdata, datetime.tzinfo):
tzinfo = tzdata
elif isinstance(tzdata, (str, unicode)):
elif isinstance(tzdata, str):
tzinfo = _dateutil_tzstr(tzdata)
elif isinstance(tzdata, int):
tzinfo = tzoffset(res.tzname, tzdata)
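The tzinfos handling above dispatches on the type of the mapped value; a simplified sketch of that dispatch (function name is made up, and gettz stands in for the internal dateutil tzstr helper):

    import datetime
    from dateutil.tz import gettz, tzoffset

    def resolve_tzinfo(tzname, tzdata):
        # tzdata may already be a tzinfo instance, a zone string, or a fixed
        # offset in seconds; anything else is rejected.
        if isinstance(tzdata, datetime.tzinfo):
            return tzdata
        elif isinstance(tzdata, str):
            return gettz(tzdata)           # stand-in for dateutil's tzstr helper
        elif isinstance(tzdata, int):
            return tzoffset(tzname, tzdata)
        raise ValueError("tzdata must be tzinfo, str, or int, got %r" % (tzdata,))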
5 changes: 4 additions & 1 deletion pandas/_libs/tslibs/period.pyx
@@ -2448,7 +2448,10 @@ class Period(_Period):
converted = other.asfreq(freq)
ordinal = converted.ordinal

elif is_null_datetimelike(value) or value in nat_strings:
elif is_null_datetimelike(value) or (isinstance(value, str) and
value in nat_strings):
# explicit str check is necessary to avoid raising incorrectly
# if we have a non-hashable value.
ordinal = NPY_NAT

elif isinstance(value, str) or util.is_integer_object(value):
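The explicit isinstance(value, str) guard matters because `value in some_set` hashes value and therefore raises TypeError for unhashable inputs; a minimal illustration (the set contents are illustrative, the real one lives in tslibs):

    nat_strings = {"nat", "nan", ""}     # illustrative placeholder

    value = ["not", "hashable"]          # e.g. a list handed in by a caller
    # value in nat_strings              # would raise TypeError: unhashable type: 'list'
    is_nat_string = isinstance(value, str) and value in nat_strings   # safely False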
2 changes: 1 addition & 1 deletion pandas/core/arrays/datetimelike.py
@@ -1148,7 +1148,7 @@ def _addsub_offset_array(self, other, op):
)

# For EA self.astype('O') returns a numpy array, not an Index
left = lib.values_from_object(self.astype("O"))
left = self.astype("O")

res_values = op(left, np.array(other))
kwargs = {}
8 changes: 4 additions & 4 deletions pandas/core/arrays/period.py
@@ -70,7 +70,7 @@ def _period_array_cmp(cls, op):
nat_result = opname == "__ne__"

def wrapper(self, other):
op = getattr(self.asi8, opname)
ordinal_op = getattr(self.asi8, opname)

other = lib.item_from_zerodim(other)
if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
@@ -82,11 +82,11 @@ def wrapper(self, other):
if isinstance(other, Period):
self._check_compatible_with(other)

result = op(other.ordinal)
result = ordinal_op(other.ordinal)
elif isinstance(other, cls):
self._check_compatible_with(other)

result = op(other.asi8)
result = ordinal_op(other.asi8)

mask = self._isnan | other._isnan
if mask.any():
@@ -98,7 +98,7 @@ def wrapper(self, other):
result.fill(nat_result)
else:
other = Period(other, freq=self.freq)
result = op(other.ordinal)
result = ordinal_op(other.ordinal)

if self._hasnans:
result[self._isnan] = nat_result
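The rename from op to ordinal_op avoids rebinding the op argument of the enclosing _period_array_cmp inside the wrapper closure. A toy, runnable sketch of the pattern (names and structure are hypothetical, not the pandas code):

    import operator

    def make_comparator(op):
        # 'op' is the generic comparison (e.g. operator.eq) captured by the closure.
        def wrapper(a, b):
            # Rebinding the name 'op' here would shadow the captured argument
            # for the rest of wrapper; a distinct name keeps both available.
            ordinal_op = getattr(a, "__{}__".format(op.__name__))
            return ordinal_op(b)
        return wrapper

    eq = make_comparator(operator.eq)
    eq(3, 3)   # True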
22 changes: 11 additions & 11 deletions pandas/core/arrays/timedeltas.py
@@ -553,7 +553,7 @@ def __mul__(self, other):
# for that instead of ValueError
raise ValueError("Cannot multiply with unequal lengths")

if is_object_dtype(other):
if is_object_dtype(other.dtype):
# this multiplication will succeed only if all elements of other
# are int or float scalars, so we will end up with
# timedelta64[ns]-dtyped result
@@ -601,11 +601,11 @@ def __truediv__(self, other):
if len(other) != len(self):
raise ValueError("Cannot divide vectors with unequal lengths")

elif is_timedelta64_dtype(other):
elif is_timedelta64_dtype(other.dtype):
# let numpy handle it
return self._data / other

elif is_object_dtype(other):
elif is_object_dtype(other.dtype):
# Note: we do not do type inference on the result, so either
# an object array or numeric-dtyped (if numpy does inference)
# will be returned. GH#23829
@@ -649,12 +649,12 @@ def __rtruediv__(self, other):
if len(other) != len(self):
raise ValueError("Cannot divide vectors with unequal lengths")

elif is_timedelta64_dtype(other):
elif is_timedelta64_dtype(other.dtype):
# let numpy handle it
return other / self._data

elif is_object_dtype(other):
# Note: unlike in __truediv__, we do not _need_ to do type#
elif is_object_dtype(other.dtype):
# Note: unlike in __truediv__, we do not _need_ to do type
# inference on the result. It does not raise, a numeric array
# is returned. GH#23829
result = [other[n] / self[n] for n in range(len(self))]
@@ -701,7 +701,7 @@ def __floordiv__(self, other):
if len(other) != len(self):
raise ValueError("Cannot divide with unequal lengths")

elif is_timedelta64_dtype(other):
elif is_timedelta64_dtype(other.dtype):
other = type(self)(other)

# numpy timedelta64 does not natively support floordiv, so operate
@@ -713,15 +713,15 @@ def __floordiv__(self, other):
result[mask] = np.nan
return result

elif is_object_dtype(other):
elif is_object_dtype(other.dtype):
result = [self[n] // other[n] for n in range(len(self))]
result = np.array(result)
if lib.infer_dtype(result, skipna=False) == "timedelta":
result, _ = sequence_to_td64ns(result)
return type(self)(result)
return result

elif is_integer_dtype(other) or is_float_dtype(other):
elif is_integer_dtype(other.dtype) or is_float_dtype(other.dtype):
result = self._data // other
return type(self)(result)

@@ -763,7 +763,7 @@ def __rfloordiv__(self, other):
if len(other) != len(self):
raise ValueError("Cannot divide with unequal lengths")

elif is_timedelta64_dtype(other):
elif is_timedelta64_dtype(other.dtype):
other = type(self)(other)

# numpy timedelta64 does not natively support floordiv, so operate
@@ -775,7 +775,7 @@ def __rfloordiv__(self, other):
result[mask] = np.nan
return result

elif is_object_dtype(other):
elif is_object_dtype(other.dtype):
result = [other[n] // self[n] for n in range(len(self))]
result = np.array(result)
return result
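These hunks consistently pass other.dtype rather than the array itself to the dtype predicates, and fall back to an element-wise Python loop for object-dtype operands. A rough, self-contained NumPy sketch of that fallback (illustrative, not the pandas implementation):

    import numpy as np

    left = np.array([1, 2, 4], dtype="timedelta64[s]")
    right = np.array([np.timedelta64(1, "s"), np.timedelta64(2, "s"),
                      np.timedelta64(2, "s")], dtype=object)

    if right.dtype == object:            # test the dtype, not the array itself
        # Element-wise fallback: no type inference is forced on the result;
        # NumPy decides (here it becomes a float64 array of ratios).
        result = np.array([left[n] / right[n] for n in range(len(left))])
    print(result)                        # [1. 1. 2.]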
2 changes: 1 addition & 1 deletion pandas/core/ops/array_ops.py
@@ -161,7 +161,7 @@ def arithmetic_op(
right: Any,
op,
str_rep: str,
eval_kwargs: Dict[str, str],
eval_kwargs: Dict[str, bool],
Reviewer comment (Member) on the eval_kwargs annotation: Nice, this makes sense. Would certainly love a follow-up to carry that annotation all the way through.

):
"""
Evaluate an arithmetic operation `+`, `-`, `*`, `/`, `//`, `%`, `**`, ...
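For illustration only (assumed, not taken from this diff), the kwargs forwarded to the numexpr-backed evaluator are boolean flags, which is why Dict[str, bool] is the accurate annotation; something along these lines:

    from typing import Dict

    # Hypothetical example of such a mapping; the real keys and values come
    # from the op definitions elsewhere in pandas.core.ops.
    eval_kwargs: Dict[str, bool] = {"truediv": True}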
3 changes: 2 additions & 1 deletion pandas/tests/frame/test_operators.py
@@ -400,7 +400,7 @@ def test_combineFrame(self, float_frame, mixed_float_frame, mixed_int_frame):
added = float_frame + mixed_int_frame
_check_mixed_float(added, dtype="float64")

def test_combineSeries(
def test_combine_series(
self, float_frame, mixed_float_frame, mixed_int_frame, datetime_frame
):

@@ -432,6 +432,7 @@ def test_combineSeries(
added = mixed_float_frame + series.astype("float16")
_check_mixed_float(added, dtype=dict(C=None))

# FIXME: don't leave commented-out
# these raise with numexpr.....as we are adding an int64 to an
# uint64....weird vs int

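The commented-out cases flagged by the new FIXME revolve around mixed int64/uint64 addition, where NumPy itself already promotes to float64 because no integer dtype can hold both ranges; a minimal illustration of that quirk:

    import numpy as np

    a = np.array([1, 2, 3], dtype="int64")
    b = np.array([1, 2, 3], dtype="uint64")

    # No integer dtype can represent both int64 and uint64,
    # so NumPy promotes the result to float64.
    (a + b).dtype   # float64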
3 changes: 2 additions & 1 deletion pandas/tests/frame/test_query_eval.py
@@ -122,7 +122,8 @@ def test_ops(self):
result = getattr(df, rop)(m)
assert_frame_equal(result, expected)

# GH7192
# GH7192: Note we need a large number of rows to ensure this
# goes through the numexpr path
df = DataFrame(dict(A=np.random.randn(25000)))
df.iloc[0:5] = np.nan
expected = 1 - np.isnan(df.iloc[0:25])
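The note about needing many rows refers to pandas only dispatching whole-frame arithmetic to numexpr above a minimum element count (an internal threshold, assumed here to be around 10,000 elements at the time). A sketch of how one might inspect which path a given size takes; the _MIN_ELEMENTS name is an assumption about pandas internals, hence the defensive getattr:

    import numpy as np
    import pandas as pd
    from pandas.core.computation import expressions as expr

    small = pd.DataFrame(dict(A=np.random.randn(100)))
    large = pd.DataFrame(dict(A=np.random.randn(25000)))

    # Only frames with more elements than the internal threshold are
    # evaluated through numexpr; smaller ones use plain NumPy.
    print(getattr(expr, "_MIN_ELEMENTS", None), small.size, large.size)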