Skip to content

CLN refactor core dtypes #37584

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jan 3, 2021
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
7863781
refactor core dtypes
MarcoGorelli Nov 2, 2020
a67b324
Merge remote-tracking branch 'upstream/master' into refactor-core-dtypes
MarcoGorelli Nov 2, 2020
4e21831
revert check
MarcoGorelli Nov 4, 2020
68479a1
revert
MarcoGorelli Nov 4, 2020
9696dae
Merge remote-tracking branch 'upstream/master' into refactor-core-dtypes
MarcoGorelli Nov 6, 2020
8aaebe2
pass list of dtypeobj
MarcoGorelli Nov 6, 2020
d2b5bef
Merge remote-tracking branch 'upstream/master' into refactor-core-dtypes
MarcoGorelli Nov 18, 2020
5e53457
Merge remote-tracking branch 'upstream/master' into refactor-core-dtypes
MarcoGorelli Nov 18, 2020
1374cf8
Merge remote-tracking branch 'upstream/master' into refactor-core-dtypes
MarcoGorelli Nov 23, 2020
1d92e4f
Merge remote-tracking branch 'upstream/master' into refactor-core-dtypes
MarcoGorelli Nov 24, 2020
a4bb6e8
Merge remote-tracking branch 'upstream/master' into refactor-core-dtypes
MarcoGorelli Nov 29, 2020
5241029
Merge remote-tracking branch 'upstream/master' into refactor-core-dtypes
MarcoGorelli Dec 6, 2020
9e1366d
coverage
MarcoGorelli Dec 9, 2020
b6ab2be
Merge branch 'refactor-core-dtypes' of github.com:MarcoGorelli/pandas…
MarcoGorelli Dec 9, 2020
f8ed13e
Merge remote-tracking branch 'upstream/master' into refactor-core-dtypes
MarcoGorelli Dec 9, 2020
65c5179
Merge remote-tracking branch 'upstream/master' into refactor-core-dtypes
MarcoGorelli Jan 3, 2021
f4a1ff1
Merge branch 'refactor-core-dtypes' of github.com:MarcoGorelli/pandas…
MarcoGorelli Jan 3, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 31 additions & 23 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
Routines for casting.
"""

from __future__ import annotations

from contextlib import suppress
from datetime import date, datetime, timedelta
from typing import (
Expand Down Expand Up @@ -126,12 +128,11 @@ def is_nested_object(obj) -> bool:
This may not necessarily be performant.

"""
if isinstance(obj, ABCSeries) and is_object_dtype(obj.dtype):

if any(isinstance(v, ABCSeries) for v in obj._values):
return True

return False
return bool(
isinstance(obj, ABCSeries)
and is_object_dtype(obj.dtype)
and any(isinstance(v, ABCSeries) for v in obj._values)
)


def maybe_downcast_to_dtype(result, dtype: Union[str, np.dtype]):
Expand Down Expand Up @@ -272,7 +273,7 @@ def trans(x):
return result


def maybe_cast_result(result, obj: "Series", numeric_only: bool = False, how: str = ""):
def maybe_cast_result(result, obj: Series, numeric_only: bool = False, how: str = ""):
"""
Try casting result to a different type if appropriate

Expand Down Expand Up @@ -335,10 +336,11 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj:
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.integer import Int64Dtype

if how in ["add", "cumsum", "sum"] and (dtype == np.dtype(bool)):
return np.dtype(np.int64)
elif how in ["add", "cumsum", "sum"] and isinstance(dtype, BooleanDtype):
return Int64Dtype()
if how in ["add", "cumsum", "sum"]:
if dtype == np.dtype(bool):
return np.dtype(np.int64)
elif isinstance(dtype, BooleanDtype):
return Int64Dtype()
return dtype


Expand Down Expand Up @@ -481,9 +483,11 @@ def maybe_casted_values(
"""

values = index._values
if not isinstance(index, (ABCPeriodIndex, ABCDatetimeIndex)):
if values.dtype == np.object_:
values = lib.maybe_convert_objects(values)
if (
not isinstance(index, (ABCPeriodIndex, ABCDatetimeIndex))
and values.dtype == np.object_
):
values = lib.maybe_convert_objects(values)

# if we have the codes, extract the values with a mask
if codes is not None:
Expand Down Expand Up @@ -725,8 +729,8 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj,

# a 1-element ndarray
if isinstance(val, np.ndarray):
msg = "invalid ndarray passed to infer_dtype_from_scalar"
if val.ndim != 0:
msg = "invalid ndarray passed to infer_dtype_from_scalar"
raise ValueError(msg)

dtype = val.dtype
Expand Down Expand Up @@ -1535,7 +1539,7 @@ def maybe_cast_to_datetime(value, dtype: DtypeObj, errors: str = "raise"):
return value


def find_common_type(types: List[DtypeObj]) -> DtypeObj:
def find_common_type(types: Union[List[DtypeObj], Series]) -> DtypeObj:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where are we passing a Series? we shouldn't do this

Copy link
Member Author

@MarcoGorelli MarcoGorelli Nov 4, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in pandas/tests/arrays/sparse/test_accessor.py::TestFrameAccessor::test_to_coo

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll dig deeper at the weekend

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback the problem is that when we do

        df = pd.DataFrame({"A": [0, 1, 0], "B": [1, 0, 0]}, dtype="Sparse[int64, 0]")
        result = df.sparse.to_coo()

then, in to_coo from pandas/core/arrays/sparse/accessor.py, we have

        dtype = find_common_type(self._parent.dtypes)

and so this is how we pass a Series (self._parent.dtypes is a Series).

Is there a better way to fix this than

        dtype = find_common_type(self._parent.dtypes.to_list())

?

"""
Find a common data type among the given dtypes.

Expand Down Expand Up @@ -1841,12 +1845,16 @@ def validate_numeric_casting(dtype: np.dtype, value: Scalar) -> None:
------
ValueError
"""
if issubclass(dtype.type, (np.integer, np.bool_)):
if is_float(value) and np.isnan(value):
raise ValueError("Cannot assign nan to integer series")
if (
issubclass(dtype.type, (np.integer, np.bool_))
and is_float(value)
and np.isnan(value)
):
raise ValueError("Cannot assign nan to integer series")

if issubclass(dtype.type, (np.integer, np.floating, complex)) and not issubclass(
dtype.type, np.bool_
if (
issubclass(dtype.type, (np.integer, np.floating, complex))
and not issubclass(dtype.type, np.bool_)
and is_bool(value)
):
if is_bool(value):
raise ValueError("Cannot assign bool to float/integer series")
raise ValueError("Cannot assign bool to float/integer series")
4 changes: 2 additions & 2 deletions pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1698,7 +1698,7 @@ def infer_dtype_from_object(dtype):
elif dtype in ["period"]:
raise NotImplementedError

if dtype == "datetime" or dtype == "timedelta":
if dtype in ["datetime", "timedelta"]:
dtype += "64"
try:
return infer_dtype_from_object(getattr(np, dtype))
Expand Down Expand Up @@ -1733,7 +1733,7 @@ def _validate_date_like_dtype(dtype) -> None:
typ = np.datetime_data(dtype)[0]
except ValueError as e:
raise TypeError(e) from e
if typ != "generic" and typ != "ns":
if typ not in ["generic", "ns"]:
raise ValueError(
f"{repr(dtype.name)} is too specific of a frequency, "
f"try passing {repr(dtype.type.__name__)}"
Expand Down
7 changes: 3 additions & 4 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,9 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
# are not coming from Index/Series._values), eg in BlockManager.quantile
arr = array(arr)

if is_extension_array_dtype(dtype):
if isinstance(arr, np.ndarray):
# numpy's astype cannot handle ExtensionDtypes
return array(arr, dtype=dtype, copy=False)
if is_extension_array_dtype(dtype) and isinstance(arr, np.ndarray):
# numpy's astype cannot handle ExtensionDtypes
return array(arr, dtype=dtype, copy=False)
return arr.astype(dtype, copy=False)


Expand Down
17 changes: 8 additions & 9 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,14 +419,13 @@ def _hash_categories(categories, ordered: Ordered = True) -> int:
categories = list(categories) # breaks if a np.array of categories
cat_array = hash_tuples(categories)
else:
if categories.dtype == "O":
if len({type(x) for x in categories}) != 1:
# TODO: hash_array doesn't handle mixed types. It casts
# everything to a str first, which means we treat
# {'1', '2'} the same as {'1', 2}
# find a better solution
hashed = hash((tuple(categories), ordered))
return hashed
if categories.dtype == "O" and len({type(x) for x in categories}) != 1:
# TODO: hash_array doesn't handle mixed types. It casts
# everything to a str first, which means we treat
# {'1', '2'} the same as {'1', 2}
# find a better solution
hashed = hash((tuple(categories), ordered))
return hashed

if DatetimeTZDtype.is_dtype(categories.dtype):
# Avoid future warning.
Expand Down Expand Up @@ -903,7 +902,7 @@ def __hash__(self) -> int:

def __eq__(self, other: Any) -> bool:
if isinstance(other, str):
return other == self.name or other == self.name.title()
return other in [self.name, self.name.title()]

return isinstance(other, PeriodDtype) and self.freq == other.freq

Expand Down
5 changes: 1 addition & 4 deletions pandas/core/dtypes/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,10 +125,7 @@ def is_file_like(obj) -> bool:
if not (hasattr(obj, "read") or hasattr(obj, "write")):
return False

if not hasattr(obj, "__iter__"):
return False

return True
return bool(hasattr(obj, "__iter__"))


def is_re(obj) -> bool:
Expand Down
23 changes: 11 additions & 12 deletions pandas/core/dtypes/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,8 +358,8 @@ def isna_compat(arr, fill_value=np.nan) -> bool:
-------
True if we can fill using this fill_value
"""
dtype = arr.dtype
if isna(fill_value):
dtype = arr.dtype
return not (is_bool_dtype(dtype) or is_integer_dtype(dtype))
return True

Expand Down Expand Up @@ -447,9 +447,10 @@ def array_equivalent(
right = right.view("i8")

# if we have structured dtypes, compare first
if left.dtype.type is np.void or right.dtype.type is np.void:
if left.dtype != right.dtype:
return False
if (
left.dtype.type is np.void or right.dtype.type is np.void
) and left.dtype != right.dtype:
return False

return np.array_equal(left, right)

Expand Down Expand Up @@ -484,11 +485,11 @@ def _array_equivalent_object(left, right, strict_nan):
if np.any(np.asarray(left_value != right_value)):
return False
except TypeError as err:
if "Cannot compare tz-naive" in str(err):
if "Cannot compare tz-naive" in str(
err
) or "boolean value of NA is ambiguous" in str(err):
# tzawareness compat failure, see GH#28507
return False
elif "boolean value of NA is ambiguous" in str(err):
return False
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think this is for coverage

raise
return True

Expand Down Expand Up @@ -637,8 +638,6 @@ def isna_all(arr: ArrayLike) -> bool:
else:
checker = lambda x: _isna_ndarraylike(x, inf_as_na=INF_AS_NA)

for i in range(0, total_len, chunk_len):
if not checker(arr[i : i + chunk_len]).all():
return False

return True
return all(
checker(arr[i : i + chunk_len]).all() for i in range(0, total_len, chunk_len)
)