Skip to content

REF: collected dtypes.cast and construction simplifications #38629

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Dec 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 6 additions & 13 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
is_extension_array_dtype,
is_float_dtype,
is_integer_dtype,
is_iterator,
is_list_like,
is_object_dtype,
is_sparse,
Expand Down Expand Up @@ -462,10 +461,7 @@ def sanitize_array(
try:
subarr = _try_cast(data, dtype, copy, True)
except ValueError:
if copy:
subarr = data.copy()
else:
subarr = np.array(data, copy=False)
subarr = np.array(data, copy=copy)
else:
# we will try to copy by-definition here
subarr = _try_cast(data, dtype, copy, raise_cast_failure)
Expand Down Expand Up @@ -600,6 +596,10 @@ def _try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bo
subarr = array_type(arr, dtype=dtype, copy=copy)
return subarr

if is_object_dtype(dtype) and not isinstance(arr, np.ndarray):
subarr = construct_1d_object_array_from_listlike(arr)
return subarr

try:
# GH#15832: Check if we are requesting a numeric dtype and
# that we can convert the data to the requested dtype.
Expand All @@ -610,14 +610,7 @@ def _try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bo
else:
subarr = maybe_cast_to_datetime(arr, dtype)

# Take care in creating object arrays (but iterators are not
# supported):
if is_object_dtype(dtype) and (
is_list_like(subarr)
and not (is_iterator(subarr) or isinstance(subarr, np.ndarray))
):
subarr = construct_1d_object_array_from_listlike(subarr)
elif not is_extension_array_dtype(subarr):
if not isinstance(subarr, (ABCExtensionArray, ABCIndex)):
subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy)
except OutOfBoundsDatetime:
# in case of out of bound datetime64 -> always raise
Expand Down
61 changes: 27 additions & 34 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@
is_datetime64_dtype,
is_datetime64_ns_dtype,
is_datetime64tz_dtype,
is_datetime_or_timedelta_dtype,
is_dtype_equal,
is_extension_array_dtype,
is_float,
Expand Down Expand Up @@ -219,14 +218,11 @@ def maybe_downcast_to_dtype(result, dtype: Union[str, np.dtype]):
# a datetimelike
# GH12821, iNaT is cast to float
if dtype.kind in ["M", "m"] and result.dtype.kind in ["i", "f"]:
if hasattr(dtype, "tz"):
# not a numpy dtype
if dtype.tz:
# convert to datetime and change timezone
from pandas import to_datetime

result = to_datetime(result).tz_localize("utc")
result = result.tz_convert(dtype.tz)
if isinstance(dtype, DatetimeTZDtype):
# convert to datetime and change timezone
i8values = result.astype("i8", copy=False)
cls = dtype.construct_array_type()
result = cls._simple_new(i8values, dtype=dtype)
else:
result = result.astype(dtype)

Expand Down Expand Up @@ -609,13 +605,12 @@ def maybe_promote(dtype, fill_value=np.nan):
dtype = mst

elif fill_value is None or fill_value is libmissing.NA:
# Note: we already excluded dt64/td64 dtypes above
if is_float_dtype(dtype) or is_complex_dtype(dtype):
fill_value = np.nan
elif is_integer_dtype(dtype):
dtype = np.float64
fill_value = np.nan
elif is_datetime_or_timedelta_dtype(dtype):
fill_value = dtype.type("NaT", "ns")
else:
dtype = np.dtype(np.object_)
if fill_value is not libmissing.NA:
Expand Down Expand Up @@ -951,7 +946,7 @@ def astype_td64_unit_conversion(


def astype_nansafe(
arr, dtype: DtypeObj, copy: bool = True, skipna: bool = False
arr: np.ndarray, dtype: DtypeObj, copy: bool = True, skipna: bool = False
) -> ArrayLike:
"""
Cast the elements of an array to a given dtype in a nan-safe manner.
Expand Down Expand Up @@ -979,6 +974,9 @@ def astype_nansafe(
order = "F" if flags.f_contiguous else "C"
return result.reshape(arr.shape, order=order)

# We get here with 0-dim from sparse
arr = np.atleast_1d(arr)

# dispatch on extension dtype if needed
if isinstance(dtype, ExtensionDtype):
return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)
Expand All @@ -995,9 +993,7 @@ def astype_nansafe(
return arr.astype(dtype, copy=copy)

if issubclass(dtype.type, str):
return lib.ensure_string_array(
arr.ravel(), skipna=skipna, convert_na_value=False
).reshape(arr.shape)
return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False)

elif is_datetime64_dtype(arr):
if dtype == np.int64:
Expand Down Expand Up @@ -1031,7 +1027,7 @@ def astype_nansafe(

# work around NumPy brokenness, #1987
if np.issubdtype(dtype.type, np.integer):
return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
return lib.astype_intsafe(arr, dtype)

# if we have a datetime/timedelta array of objects
# then coerce to a proper dtype and recall astype_nansafe
Expand Down Expand Up @@ -1092,27 +1088,22 @@ def soft_convert_objects(
raise ValueError("At least one of datetime, numeric or timedelta must be True.")

# Soft conversions
if datetime:
if datetime or timedelta:
# GH 20380, when datetime is beyond year 2262, hence outside
# bound of nanosecond-resolution 64-bit integers.
try:
values = lib.maybe_convert_objects(values, convert_datetime=True)
values = lib.maybe_convert_objects(
values, convert_datetime=datetime, convert_timedelta=timedelta
)
except OutOfBoundsDatetime:
pass

if timedelta and is_object_dtype(values.dtype):
# Object check to ensure only run if previous did not convert
values = lib.maybe_convert_objects(values, convert_timedelta=True)
return values

if numeric and is_object_dtype(values.dtype):
try:
converted = lib.maybe_convert_numeric(values, set(), coerce_numeric=True)
except (ValueError, TypeError):
pass
else:
# If all NaNs, then do not alter
values = converted if not isna(converted).all() else values
values = values.copy() if copy else values
converted = lib.maybe_convert_numeric(values, set(), coerce_numeric=True)

# If all NaNs, then do not alter
values = converted if not isna(converted).all() else values
values = values.copy() if copy else values

return values

Expand Down Expand Up @@ -1274,6 +1265,7 @@ def try_datetime(v):
# safe coerce to datetime64
try:
# GH19671
# tznaive only
v = tslib.array_to_datetime(v, require_iso8601=True, errors="raise")[0]
except ValueError:

Expand All @@ -1285,11 +1277,12 @@ def try_datetime(v):
try:

values, tz = conversion.datetime_to_datetime64(v)
return DatetimeIndex(values).tz_localize("UTC").tz_convert(tz=tz)
except (ValueError, TypeError):
pass

except Exception:
else:
return DatetimeIndex(values).tz_localize("UTC").tz_convert(tz=tz)
except TypeError:
# e.g. <class 'numpy.timedelta64'> is not convertible to datetime
pass

return v.reshape(shape)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/dtypes/cast/test_downcast.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,9 @@ def test_datetime_likes_nan(klass):
def test_datetime_with_timezone(as_asi):
# see gh-15426
ts = Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
exp = DatetimeIndex([ts, ts])
exp = DatetimeIndex([ts, ts])._data

obj = exp.asi8 if as_asi else exp
res = maybe_downcast_to_dtype(obj, exp.dtype)

tm.assert_index_equal(res, exp)
tm.assert_datetime_array_equal(res, exp)
14 changes: 9 additions & 5 deletions pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,15 @@ def test_buffer_rd_bytes(c_parser_only):
)
parser = c_parser_only

for _ in range(100):
try:
parser.read_csv(StringIO(data), compression="gzip", delim_whitespace=True)
except Exception:
pass
with tm.assert_produces_warning(RuntimeWarning):
# compression has no effect when passing a non-binary object as input
for _ in range(100):
try:
parser.read_csv(
StringIO(data), compression="gzip", delim_whitespace=True
)
except Exception:
pass


def test_delim_whitespace_custom_terminator(c_parser_only):
Expand Down