Skip to content

Commit b0727cd

Browse files
committed
API: This fixes a number of inconsistencies and API issues
w.r.t. dtype conversions. This is a reprise of pandas-dev#14145 & pandas-dev#16408. This removes some code from the core structures & pushes it to internals, where the primitives are made more consistent. This should allow us to be a bit more consistent for pandas2 type things. closes pandas-dev#16402 supersedes pandas-dev#14145 closes pandas-dev#14001
1 parent 80e7869 commit b0727cd

File tree

16 files changed

+428
-386
lines changed

16 files changed

+428
-386
lines changed

doc/source/whatsnew/v0.21.0.txt

+12
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,18 @@ Backwards incompatible API changes
5454
- :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`).
5555
- Compression defaults in HDF stores now follow PyTables standards. The default is no compression, and if ``complib`` is missing and ``complevel`` > 0, ``zlib`` is used (:issue:`15943`)
5656

57+
.. _whatsnew_0210.dtype_conversions:
58+
59+
Dtype Conversions
60+
^^^^^^^^^^^^^^^^^
61+
62+
Example about setitem / where with bools.
63+
64+
65+
66+
- Inconsistent behavior in ``.where()`` with datetimelikes which would raise rather than coerce to ``object`` (:issue:`16402`)
67+
- Bug in assignment against datetime-like data with an ``int``, which may incorrectly be converted to datetime-like (:issue:`14145`)
68+
- Bug in assignment against ``int64`` data with an ``np.ndarray`` of ``float64`` dtype, which may keep the ``int64`` dtype (:issue:`14001`)
5769

5870
.. _whatsnew_0210.api:
5971

pandas/_libs/index.pyx

+20-6
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ cimport tslib
1919
from hashtable cimport *
2020
from pandas._libs import tslib, algos, hashtable as _hash
2121
from pandas._libs.tslib import Timestamp, Timedelta
22+
from datetime import datetime, timedelta
2223

2324
from datetime cimport (get_datetime64_value, _pydatetime_to_dts,
2425
pandas_datetimestruct)
@@ -507,24 +508,37 @@ cdef class TimedeltaEngine(DatetimeEngine):
507508
return 'm8[ns]'
508509

509510
cpdef convert_scalar(ndarray arr, object value):
511+
# we don't turn integers
512+
# into datetimes/timedeltas
513+
514+
# we don't turn bools into int/float/complex
515+
510516
if arr.descr.type_num == NPY_DATETIME:
511517
if isinstance(value, np.ndarray):
512518
pass
513-
elif isinstance(value, Timestamp):
514-
return value.value
519+
elif isinstance(value, datetime):
520+
return Timestamp(value).value
515521
elif value is None or value != value:
516522
return iNaT
517-
else:
523+
elif util.is_string_object(value):
518524
return Timestamp(value).value
525+
raise ValueError("cannot set a Timestamp with a non-timestamp")
526+
519527
elif arr.descr.type_num == NPY_TIMEDELTA:
520528
if isinstance(value, np.ndarray):
521529
pass
522-
elif isinstance(value, Timedelta):
523-
return value.value
530+
elif isinstance(value, timedelta):
531+
return Timedelta(value).value
524532
elif value is None or value != value:
525533
return iNaT
526-
else:
534+
elif util.is_string_object(value):
527535
return Timedelta(value).value
536+
raise ValueError("cannot set a Timedelta with a non-timedelta")
537+
538+
if (issubclass(arr.dtype.type, (np.integer, np.floating, np.complex)) and not
539+
issubclass(arr.dtype.type, np.bool_)):
540+
if util.is_bool_object(value):
541+
raise ValueError('Cannot assign bool to float/integer series')
528542

529543
if issubclass(arr.dtype.type, (np.integer, np.bool_)):
530544
if util.is_float_object(value) and value != value:

pandas/core/algorithms.py

+6
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,12 @@ def _reconstruct_data(values, dtype, original):
149149
pass
150150
elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
151151
values = Index(original)._shallow_copy(values, name=None)
152+
elif is_bool_dtype(dtype):
153+
values = values.astype(dtype)
154+
155+
# we only support object dtypes bool Index
156+
if isinstance(original, Index):
157+
values = values.astype(object)
152158
elif dtype is not None:
153159
values = values.astype(dtype)
154160

pandas/core/dtypes/cast.py

+51-9
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ def maybe_promote(dtype, fill_value=np.nan):
272272
else:
273273
if issubclass(dtype.type, np.datetime64):
274274
try:
275-
fill_value = Timestamp(fill_value).value
275+
fill_value = tslib.Timestamp(fill_value).value
276276
except:
277277
# the proper thing to do here would probably be to upcast
278278
# to object (but numpy 1.6.1 doesn't do this properly)
@@ -333,6 +333,23 @@ def maybe_promote(dtype, fill_value=np.nan):
333333
return dtype, fill_value
334334

335335

336+
def infer_dtype_from(val, pandas_dtype=False):
337+
"""
338+
interpret the dtype from a scalar or array. This is a convenience
339+
routine to infer dtype from a scalar or an array
340+
341+
Parameters
342+
----------
343+
pandas_dtype : bool, default False
344+
whether to infer dtype including pandas extension types.
345+
If False, a scalar/array belonging to pandas extension types is inferred as
346+
object
347+
"""
348+
if is_scalar(val):
349+
return infer_dtype_from_scalar(val, pandas_dtype=pandas_dtype)
350+
return infer_dtype_from_array(val, pandas_dtype=pandas_dtype)
351+
352+
336353
def infer_dtype_from_scalar(val, pandas_dtype=False):
337354
"""
338355
interpret the dtype from a scalar
@@ -408,23 +425,29 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
408425
return dtype, val
409426

410427

411-
def infer_dtype_from_array(arr):
428+
def infer_dtype_from_array(arr, pandas_dtype=False):
412429
"""
413430
infer the dtype from a scalar or array
414431
415432
Parameters
416433
----------
417434
arr : scalar or array
435+
pandas_dtype : bool, default False
436+
whether to infer dtype including pandas extension types.
437+
If False, an array belonging to pandas extension types
438+
is inferred as object
418439
419440
Returns
420441
-------
421-
tuple (numpy-compat dtype, array)
442+
tuple (numpy-compat/pandas-compat dtype, array)
422443
423444
Notes
424445
-----
425-
These infer to numpy dtypes exactly
426-
with the exception that mixed / object dtypes
427-
are not coerced by stringifying or conversion
446+
if pandas_dtype=False. these infer to numpy dtypes
447+
exactly with the exception that mixed / object dtypes are not coerced by stringifying or conversion
448+
449+
if pandas_dtype=True. datetime64tz-aware/categorical
450+
types will retain their character.
428451
429452
Examples
430453
--------
@@ -442,6 +465,13 @@ def infer_dtype_from_array(arr):
442465
if not is_list_like(arr):
443466
arr = [arr]
444467

468+
if pandas_dtype and (is_categorical_dtype(arr) or
469+
is_datetime64tz_dtype(arr)):
470+
return arr.dtype, arr
471+
472+
elif isinstance(arr, ABCSeries):
473+
return arr.dtype, np.asarray(arr)
474+
445475
# don't force numpy coerce with nan's
446476
inferred = lib.infer_dtype(arr)
447477
if inferred in ['string', 'bytes', 'unicode',
@@ -552,7 +582,7 @@ def conv(r, dtype):
552582
if isnull(r):
553583
pass
554584
elif dtype == _NS_DTYPE:
555-
r = Timestamp(r)
585+
r = tslib.Timestamp(r)
556586
elif dtype == _TD_DTYPE:
557587
r = _coerce_scalar_to_timedelta_type(r)
558588
elif dtype == np.bool_:
@@ -1028,13 +1058,25 @@ def find_common_type(types):
10281058
return np.find_common_type(types, [])
10291059

10301060

1031-
def _cast_scalar_to_array(shape, value, dtype=None):
1061+
def cast_scalar_to_array(shape, value, dtype=None):
10321062
"""
10331063
create np.ndarray of specified shape and dtype, filled with values
1064+
1065+
Parameters
1066+
----------
1067+
shape : tuple
1068+
value : scalar value
1069+
dtype : np.dtype, optional
1070+
dtype to coerce
1071+
1072+
Returns
1073+
-------
1074+
ndarray of shape, filled with value, of specified / inferred dtype
1075+
10341076
"""
10351077

10361078
if dtype is None:
1037-
dtype, fill_value = _infer_dtype_from_scalar(value)
1079+
dtype, fill_value = infer_dtype_from_scalar(value)
10381080
else:
10391081
fill_value = value
10401082

pandas/core/dtypes/common.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
ExtensionDtype)
1212
from .generic import (ABCCategorical, ABCPeriodIndex,
1313
ABCDatetimeIndex, ABCSeries,
14-
ABCSparseArray, ABCSparseSeries)
14+
ABCSparseArray, ABCSparseSeries,
15+
ABCIndexClass)
1516
from .inference import is_string_like
1617
from .inference import * # noqa
1718

@@ -1540,6 +1541,16 @@ def is_bool_dtype(arr_or_dtype):
15401541
except ValueError:
15411542
# this isn't even a dtype
15421543
return False
1544+
1545+
if isinstance(arr_or_dtype, ABCIndexClass):
1546+
1547+
# TODO(jreback)
1548+
# we don't have a boolean Index class
1549+
# so it's object, we need to infer to
1550+
# guess this
1551+
return (arr_or_dtype.is_object and
1552+
arr_or_dtype.inferred_type == 'boolean')
1553+
15431554
return issubclass(tipo, np.bool_)
15441555

15451556

pandas/core/frame.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@
2525
import numpy.ma as ma
2626

2727
from pandas.core.dtypes.cast import (
28-
maybe_upcast, infer_dtype_from_scalar,
28+
maybe_upcast,
29+
cast_scalar_to_array,
2930
maybe_cast_to_datetime,
3031
maybe_infer_to_datetimelike,
3132
maybe_convert_platform,
@@ -386,8 +387,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
386387
raise_with_traceback(exc)
387388

388389
if arr.ndim == 0 and index is not None and columns is not None:
389-
values = _cast_scalar_to_array((len(index), len(columns)),
390-
data, dtype=dtype)
390+
values = cast_scalar_to_array((len(index), len(columns)),
391+
data, dtype=dtype)
391392
mgr = self._init_ndarray(values, index, columns,
392393
dtype=values.dtype, copy=False)
393394
else:
@@ -2679,8 +2680,8 @@ def reindexer(value):
26792680

26802681
else:
26812682
# upcast the scalar
2682-
value = _cast_scalar_to_array(len(self.index), value)
2683-
value = _possibly_cast_to_datetime(value, value.dtype)
2683+
value = cast_scalar_to_array(len(self.index), value)
2684+
value = maybe_cast_to_datetime(value, value.dtype)
26842685

26852686
# return internal types directly
26862687
if is_extension_type(value):

pandas/core/generic.py

+2-44
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
from pandas.core.dtypes.common import (
1414
_ensure_int64,
1515
_ensure_object,
16-
needs_i8_conversion,
1716
is_scalar,
1817
is_number,
1918
is_integer, is_bool,
@@ -26,7 +25,8 @@
2625
is_dict_like,
2726
is_re_compilable,
2827
pandas_dtype)
29-
from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask
28+
from pandas.core.dtypes.cast import (
29+
maybe_promote, maybe_upcast_putmask)
3030
from pandas.core.dtypes.missing import isnull, notnull
3131
from pandas.core.dtypes.generic import ABCSeries, ABCPanel
3232

@@ -5336,48 +5336,6 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
53365336
raise NotImplementedError("cannot align with a higher "
53375337
"dimensional NDFrame")
53385338

5339-
elif is_list_like(other):
5340-
5341-
if self.ndim == 1:
5342-
5343-
# try to set the same dtype as ourselves
5344-
try:
5345-
new_other = np.array(other, dtype=self.dtype)
5346-
except ValueError:
5347-
new_other = np.array(other)
5348-
except TypeError:
5349-
new_other = other
5350-
5351-
# we can end up comparing integers and m8[ns]
5352-
# which is a numpy no no
5353-
is_i8 = needs_i8_conversion(self.dtype)
5354-
if is_i8:
5355-
matches = False
5356-
else:
5357-
matches = (new_other == np.array(other))
5358-
5359-
if matches is False or not matches.all():
5360-
5361-
# coerce other to a common dtype if we can
5362-
if needs_i8_conversion(self.dtype):
5363-
try:
5364-
other = np.array(other, dtype=self.dtype)
5365-
except:
5366-
other = np.array(other)
5367-
else:
5368-
other = np.asarray(other)
5369-
other = np.asarray(other,
5370-
dtype=np.common_type(other,
5371-
new_other))
5372-
5373-
# we need to use the new dtype
5374-
try_quick = False
5375-
else:
5376-
other = new_other
5377-
else:
5378-
5379-
other = np.array(other)
5380-
53815339
if isinstance(other, np.ndarray):
53825340

53835341
if other.shape != self.shape:

pandas/core/indexes/base.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
is_object_dtype,
2727
is_categorical_dtype,
2828
is_interval_dtype,
29+
is_bool,
2930
is_bool_dtype,
3031
is_signed_integer_dtype,
3132
is_unsigned_integer_dtype,
@@ -610,9 +611,18 @@ def repeat(self, repeats, *args, **kwargs):
610611
def where(self, cond, other=None):
611612
if other is None:
612613
other = self._na_value
613-
values = np.where(cond, self.values, other)
614614

615615
dtype = self.dtype
616+
values = self.values
617+
618+
if is_bool(other) or is_bool_dtype(other):
619+
620+
# bools force casting
621+
values = values.astype(object)
622+
dtype = None
623+
624+
values = np.where(cond, values, other)
625+
616626
if self._is_numeric_dtype and np.any(isnull(values)):
617627
# We can't coerce to the numeric dtype of "self" (unless
618628
# it's float) if there are NaN values in our output.

pandas/core/indexes/numeric.py

+18-3
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,14 @@
22
from pandas._libs import (index as libindex,
33
algos as libalgos, join as libjoin)
44
from pandas.core.dtypes.common import (
5-
is_dtype_equal, pandas_dtype,
6-
is_float_dtype, is_object_dtype,
7-
is_integer_dtype, is_scalar)
5+
is_dtype_equal,
6+
pandas_dtype,
7+
is_float_dtype,
8+
is_object_dtype,
9+
is_integer_dtype,
10+
is_bool,
11+
is_bool_dtype,
12+
is_scalar)
813
from pandas.core.common import _asarray_tuplesafe, _values_from_object
914

1015
from pandas import compat
@@ -56,6 +61,16 @@ def _maybe_cast_slice_bound(self, label, side, kind):
5661
# we will try to coerce to integers
5762
return self._maybe_cast_indexer(label)
5863

64+
def _convert_for_op(self, value):
65+
""" Convert value to be insertable to ndarray """
66+
67+
if is_bool(value) or is_bool_dtype(value):
68+
# force conversion to object
69+
# so we don't lose the bools
70+
raise TypeError
71+
72+
return value
73+
5974
def _convert_tolerance(self, tolerance):
6075
try:
6176
return float(tolerance)

0 commit comments

Comments
 (0)