Skip to content

Commit 7b1fd80

Browse files
committed
API: This fixes a number of inconsistencies and API issues
w.r.t. dtype conversions. This is a reprise of #14145 & #16408. This removes some code from the core structures & pushes it to internals, where the primitives are made more consistent. This should allow us to be a bit more consistent for pandas2 type things. closes #16402 supersedes #14145 closes #14001
1 parent 37c1ec8 commit 7b1fd80

23 files changed

+796
-541
lines changed

doc/source/whatsnew/v0.21.0.txt

+57
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,63 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in
102102
- Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`)
103103
- ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`)
104104

105+
.. _whatsnew_0210.dtype_conversions:
106+
107+
Dtype Conversions
108+
^^^^^^^^^^^^^^^^^
109+
110+
- Previously assignments, ``.where()`` and ``.fillna()`` with a ``bool`` assignment, would coerce to
111+
same type (e.g. int / float), or raise for datetimelikes. These will now preserve the bools with ``object`` dtypes. (:issue:`16821`).
112+
113+
.. ipython:: python
114+
115+
s = Series([1, 2, 3])
116+
117+
.. code-block:: python
118+
119+
In [5]: s[1] = True
120+
121+
In [6]: s
122+
Out[6]:
123+
0 1
124+
1 1
125+
2 3
126+
dtype: int64
127+
128+
New Behavior
129+
130+
.. ipython:: python
131+
132+
s[1] = True
133+
s
134+
135+
- Previously an assignment to a datetimelike with a non-datetimelike would coerce (:issue:`14145`).
136+
137+
.. ipython:: python
138+
139+
s = pd.Series([pd.Timestamp('2011-01-01'), pd.Timestamp('2012-01-01')])
140+
141+
.. code-block:: python
142+
143+
In [1]: s[1] = 1
144+
145+
In [2]: s
146+
Out[2]:
147+
0 2011-01-01 00:00:00.000000000
148+
1 1970-01-01 00:00:00.000000001
149+
dtype: datetime64[ns]
150+
151+
These now coerce to ``object`` dtype.
152+
153+
.. ipython:: python
154+
155+
s[1] = 1
156+
s
157+
158+
- Additional bug fixes w.r.t. dtype conversions.
159+
160+
- Inconsistent behavior in ``.where()`` with datetimelikes which would raise rather than coerce to ``object`` (:issue:`16402`)
161+
- Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`)
105162

106163
.. _whatsnew_0210.api:
107164

pandas/_libs/index.pyx

+20-6
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ cimport tslib
1919
from hashtable cimport *
2020
from pandas._libs import tslib, algos, hashtable as _hash
2121
from pandas._libs.tslib import Timestamp, Timedelta
22+
from datetime import datetime, timedelta
2223

2324
from datetime cimport (get_datetime64_value, _pydatetime_to_dts,
2425
pandas_datetimestruct)
@@ -507,24 +508,37 @@ cdef class TimedeltaEngine(DatetimeEngine):
507508
return 'm8[ns]'
508509

509510
cpdef convert_scalar(ndarray arr, object value):
511+
# we don't turn integers
512+
# into datetimes/timedeltas
513+
514+
# we don't turn bools into int/float/complex
515+
510516
if arr.descr.type_num == NPY_DATETIME:
511517
if isinstance(value, np.ndarray):
512518
pass
513-
elif isinstance(value, Timestamp):
514-
return value.value
519+
elif isinstance(value, datetime):
520+
return Timestamp(value).value
515521
elif value is None or value != value:
516522
return iNaT
517-
else:
523+
elif util.is_string_object(value):
518524
return Timestamp(value).value
525+
raise ValueError("cannot set a Timestamp with a non-timestamp")
526+
519527
elif arr.descr.type_num == NPY_TIMEDELTA:
520528
if isinstance(value, np.ndarray):
521529
pass
522-
elif isinstance(value, Timedelta):
523-
return value.value
530+
elif isinstance(value, timedelta):
531+
return Timedelta(value).value
524532
elif value is None or value != value:
525533
return iNaT
526-
else:
534+
elif util.is_string_object(value):
527535
return Timedelta(value).value
536+
raise ValueError("cannot set a Timedelta with a non-timedelta")
537+
538+
if (issubclass(arr.dtype.type, (np.integer, np.floating, np.complex)) and
539+
not issubclass(arr.dtype.type, np.bool_)):
540+
if util.is_bool_object(value):
541+
raise ValueError('Cannot assign bool to float/integer series')
528542

529543
if issubclass(arr.dtype.type, (np.integer, np.bool_)):
530544
if util.is_float_object(value) and value != value:

pandas/_libs/tslib.pyx

+2-1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ cdef bint PY3 = (sys.version_info[0] >= 3)
1414
from cpython cimport (
1515
PyTypeObject,
1616
PyFloat_Check,
17+
PyComplex_Check,
1718
PyLong_Check,
1819
PyObject_RichCompareBool,
1920
PyObject_RichCompare,
@@ -902,7 +903,7 @@ cdef inline bint _checknull_with_nat(object val):
902903
cdef inline bint _check_all_nulls(object val):
903904
""" utility to check if a value is any type of null """
904905
cdef bint res
905-
if PyFloat_Check(val):
906+
if PyFloat_Check(val) or PyComplex_Check(val):
906907
res = val != val
907908
elif val is NaT:
908909
res = 1

pandas/core/algorithms.py

+6
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,12 @@ def _reconstruct_data(values, dtype, original):
149149
pass
150150
elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
151151
values = Index(original)._shallow_copy(values, name=None)
152+
elif is_bool_dtype(dtype):
153+
values = values.astype(dtype)
154+
155+
# we only support object dtypes bool Index
156+
if isinstance(original, Index):
157+
values = values.astype(object)
152158
elif dtype is not None:
153159
values = values.astype(dtype)
154160

pandas/core/dtypes/cast.py

+51-9
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ def maybe_promote(dtype, fill_value=np.nan):
272272
else:
273273
if issubclass(dtype.type, np.datetime64):
274274
try:
275-
fill_value = Timestamp(fill_value).value
275+
fill_value = tslib.Timestamp(fill_value).value
276276
except:
277277
# the proper thing to do here would probably be to upcast
278278
# to object (but numpy 1.6.1 doesn't do this properly)
@@ -333,6 +333,23 @@ def maybe_promote(dtype, fill_value=np.nan):
333333
return dtype, fill_value
334334

335335

336+
def infer_dtype_from(val, pandas_dtype=False):
337+
"""
338+
interpret the dtype from a scalar or array. This is a convenience
339+
routine to infer dtype from a scalar or an array
340+
341+
Parameters
342+
----------
343+
pandas_dtype : bool, default False
344+
whether to infer dtype including pandas extension types.
345+
If False, scalar/array belongs to pandas extension types is inferred as
346+
object
347+
"""
348+
if is_scalar(val):
349+
return infer_dtype_from_scalar(val, pandas_dtype=pandas_dtype)
350+
return infer_dtype_from_array(val, pandas_dtype=pandas_dtype)
351+
352+
336353
def infer_dtype_from_scalar(val, pandas_dtype=False):
337354
"""
338355
interpret the dtype from a scalar
@@ -408,23 +425,29 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
408425
return dtype, val
409426

410427

411-
def infer_dtype_from_array(arr):
428+
def infer_dtype_from_array(arr, pandas_dtype=False):
412429
"""
413430
infer the dtype from a scalar or array
414431
415432
Parameters
416433
----------
417434
arr : scalar or array
435+
pandas_dtype : bool, default False
436+
whether to infer dtype including pandas extension types.
437+
If False, array belongs to pandas extension types
438+
is inferred as object
418439
419440
Returns
420441
-------
421-
tuple (numpy-compat dtype, array)
442+
tuple (numpy-compat/pandas-compat dtype, array)
422443
423444
Notes
424445
-----
425-
These infer to numpy dtypes exactly
426-
with the exception that mixed / object dtypes
427-
are not coerced by stringifying or conversion
446+
if pandas_dtype=False. these infer to numpy dtypes
447+
exactly with the exception that mixed / object dtypes
448+
449+
if pandas_dtype=True. datetime64tz-aware/categorical
450+
types will retain their character.
428451
429452
Examples
430453
--------
@@ -442,6 +465,13 @@ def infer_dtype_from_array(arr):
442465
if not is_list_like(arr):
443466
arr = [arr]
444467

468+
if pandas_dtype and (is_categorical_dtype(arr) or
469+
is_datetime64tz_dtype(arr)):
470+
return arr.dtype, arr
471+
472+
elif isinstance(arr, ABCSeries):
473+
return arr.dtype, np.asarray(arr)
474+
445475
# don't force numpy coerce with nan's
446476
inferred = lib.infer_dtype(arr)
447477
if inferred in ['string', 'bytes', 'unicode',
@@ -552,7 +582,7 @@ def conv(r, dtype):
552582
if isnull(r):
553583
pass
554584
elif dtype == _NS_DTYPE:
555-
r = Timestamp(r)
585+
r = tslib.Timestamp(r)
556586
elif dtype == _TD_DTYPE:
557587
r = _coerce_scalar_to_timedelta_type(r)
558588
elif dtype == np.bool_:
@@ -1028,13 +1058,25 @@ def find_common_type(types):
10281058
return np.find_common_type(types, [])
10291059

10301060

1031-
def _cast_scalar_to_array(shape, value, dtype=None):
1061+
def cast_scalar_to_array(shape, value, dtype=None):
10321062
"""
10331063
create np.ndarray of specified shape and dtype, filled with values
1064+
1065+
Parameters
1066+
----------
1067+
shape : tuple
1068+
value : scalar value
1069+
dtype : np.dtype, optional
1070+
dtype to coerce
1071+
1072+
Returns
1073+
-------
1074+
ndarray of shape, filled with value, of specified / inferred dtype
1075+
10341076
"""
10351077

10361078
if dtype is None:
1037-
dtype, fill_value = _infer_dtype_from_scalar(value)
1079+
dtype, fill_value = infer_dtype_from_scalar(value)
10381080
else:
10391081
fill_value = value
10401082

pandas/core/dtypes/common.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
ExtensionDtype)
1212
from .generic import (ABCCategorical, ABCPeriodIndex,
1313
ABCDatetimeIndex, ABCSeries,
14-
ABCSparseArray, ABCSparseSeries)
14+
ABCSparseArray, ABCSparseSeries,
15+
ABCIndexClass)
1516
from .inference import is_string_like
1617
from .inference import * # noqa
1718

@@ -1543,6 +1544,16 @@ def is_bool_dtype(arr_or_dtype):
15431544
except ValueError:
15441545
# this isn't even a dtype
15451546
return False
1547+
1548+
if isinstance(arr_or_dtype, ABCIndexClass):
1549+
1550+
# TODO(jreback)
1551+
# we don't have a boolean Index class
1552+
# so its object, we need to infer to
1553+
# guess this
1554+
return (arr_or_dtype.is_object and
1555+
arr_or_dtype.inferred_type == 'boolean')
1556+
15461557
return issubclass(tipo, np.bool_)
15471558

15481559

0 commit comments

Comments
 (0)