Skip to content

Commit 9ad1e00

Browse files
h-vetinari authored and jreback committed
REF: shift ravel in infer_dtype (#24560)
1 parent 43b35fc commit 9ad1e00

23 files changed

+147
-138
lines changed

pandas/_libs/lib.pyx

+5-4
Original file line numberDiff line numberDiff line change
@@ -623,7 +623,7 @@ def clean_index_list(obj: list):
623623
return obj, all_arrays
624624

625625
# don't force numpy coerce with nan's
626-
inferred = infer_dtype(obj)
626+
inferred = infer_dtype(obj, skipna=False)
627627
if inferred in ['string', 'bytes', 'unicode', 'mixed', 'mixed-integer']:
628628
return np.asarray(obj, dtype=object), 0
629629
elif inferred in ['integer']:
@@ -1210,6 +1210,10 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
12101210
values = construct_1d_object_array_from_listlike(value)
12111211

12121212
values = getattr(values, 'values', values)
1213+
1214+
# make contiguous
1215+
values = values.ravel()
1216+
12131217
if skipna:
12141218
values = values[~isnaobj(values)]
12151219

@@ -1220,9 +1224,6 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
12201224
if values.dtype != np.object_:
12211225
values = values.astype('O')
12221226

1223-
# make contiguous
1224-
values = values.ravel()
1225-
12261227
n = len(values)
12271228
if n == 0:
12281229
return 'empty'

pandas/core/algorithms.py

+9-5
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ def _ensure_arraylike(values):
165165
ensure that we are arraylike if not already
166166
"""
167167
if not is_array_like(values):
168-
inferred = lib.infer_dtype(values)
168+
inferred = lib.infer_dtype(values, skipna=False)
169169
if inferred in ['mixed', 'string', 'unicode']:
170170
if isinstance(values, tuple):
171171
values = list(values)
@@ -202,8 +202,10 @@ def _get_hashtable_algo(values):
202202

203203
if ndtype == 'object':
204204

205-
# its cheaper to use a String Hash Table than Object
206-
if lib.infer_dtype(values) in ['string']:
205+
# it's cheaper to use a String Hash Table than Object; we infer
206+
# including nulls because that is the only difference between
207+
# StringHashTable and ObjectHashtable
208+
if lib.infer_dtype(values, skipna=False) in ['string']:
207209
ndtype = 'string'
208210
else:
209211
ndtype = 'object'
@@ -220,8 +222,10 @@ def _get_data_algo(values, func_map):
220222
values, dtype, ndtype = _ensure_data(values)
221223
if ndtype == 'object':
222224

223-
# its cheaper to use a String Hash Table than Object
224-
if lib.infer_dtype(values) in ['string']:
225+
# it's cheaper to use a String Hash Table than Object; we infer
226+
# including nulls because that is the only difference between
227+
# StringHashTable and ObjectHashtable
228+
if lib.infer_dtype(values, skipna=False) in ['string']:
225229
ndtype = 'string'
226230

227231
f = func_map.get(ndtype, func_map['object'])

pandas/core/arrays/datetimes.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1652,7 +1652,7 @@ def sequence_to_dt64ns(data, dtype=None, copy=False,
16521652
# TODO: We do not have tests specific to string-dtypes,
16531653
# also complex or categorical or other extension
16541654
copy = False
1655-
if lib.infer_dtype(data) == 'integer':
1655+
if lib.infer_dtype(data, skipna=False) == 'integer':
16561656
data = data.astype(np.int64)
16571657
else:
16581658
# data comes back here as either i8 to denote UTC timestamps

pandas/core/arrays/integer.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -171,8 +171,8 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
171171

172172
values = np.array(values, copy=copy)
173173
if is_object_dtype(values):
174-
inferred_type = lib.infer_dtype(values)
175-
if inferred_type is 'mixed' and isna(values).all():
174+
inferred_type = lib.infer_dtype(values, skipna=True)
175+
if inferred_type == 'empty':
176176
values = np.empty(len(values))
177177
values.fill(np.nan)
178178
elif inferred_type not in ['floating', 'integer',

pandas/core/arrays/timedeltas.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,7 @@ def __floordiv__(self, other):
594594
elif is_object_dtype(other):
595595
result = [self[n] // other[n] for n in range(len(self))]
596596
result = np.array(result)
597-
if lib.infer_dtype(result) == 'timedelta':
597+
if lib.infer_dtype(result, skipna=False) == 'timedelta':
598598
result, _ = sequence_to_td64ns(result)
599599
return type(self)(result)
600600
return result

pandas/core/dtypes/cast.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,8 @@ def trans(x):
7575

7676
if isinstance(dtype, string_types):
7777
if dtype == 'infer':
78-
inferred_type = lib.infer_dtype(ensure_object(result.ravel()))
78+
inferred_type = lib.infer_dtype(ensure_object(result.ravel()),
79+
skipna=False)
7980
if inferred_type == 'boolean':
8081
dtype = 'bool'
8182
elif inferred_type == 'integer':
@@ -460,7 +461,7 @@ def infer_dtype_from_array(arr, pandas_dtype=False):
460461
return arr.dtype, np.asarray(arr)
461462

462463
# don't force numpy coerce with nan's
463-
inferred = lib.infer_dtype(arr)
464+
inferred = lib.infer_dtype(arr, skipna=False)
464465
if inferred in ['string', 'bytes', 'unicode',
465466
'mixed', 'mixed-integer']:
466467
return (np.object_, arr)
@@ -941,10 +942,11 @@ def try_timedelta(v):
941942

942943
# We have at least a NaT and a string
943944
# try timedelta first to avoid spurious datetime conversions
944-
# e.g. '00:00:01' is a timedelta but
945-
# technically is also a datetime
945+
# e.g. '00:00:01' is a timedelta but technically is also a datetime
946946
value = try_timedelta(v)
947-
if lib.infer_dtype(value) in ['mixed']:
947+
if lib.infer_dtype(value, skipna=False) in ['mixed']:
948+
# cannot skip missing values, as NaT implies that the string
949+
# is actually a datetime
948950
value = try_datetime(v)
949951

950952
return value

pandas/core/dtypes/common.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -703,7 +703,8 @@ def is_datetime_arraylike(arr):
703703
if isinstance(arr, ABCDatetimeIndex):
704704
return True
705705
elif isinstance(arr, (np.ndarray, ABCSeries)):
706-
return arr.dtype == object and lib.infer_dtype(arr) == 'datetime'
706+
return (is_object_dtype(arr.dtype)
707+
and lib.infer_dtype(arr, skipna=False) == 'datetime')
707708
return getattr(arr, 'inferred_type', None) == 'datetime'
708709

709710

pandas/core/dtypes/missing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -474,7 +474,7 @@ def _infer_fill_value(val):
474474
if is_datetimelike(val):
475475
return np.array('NaT', dtype=val.dtype)
476476
elif is_object_dtype(val.dtype):
477-
dtype = lib.infer_dtype(ensure_object(val))
477+
dtype = lib.infer_dtype(ensure_object(val), skipna=False)
478478
if dtype in ['datetime', 'datetime64']:
479479
return np.array('NaT', dtype=_NS_DTYPE)
480480
elif dtype in ['timedelta', 'timedelta64']:

pandas/core/indexes/base.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
346346
# should not be coerced
347347
# GH 11836
348348
if is_integer_dtype(dtype):
349-
inferred = lib.infer_dtype(data)
349+
inferred = lib.infer_dtype(data, skipna=False)
350350
if inferred == 'integer':
351351
data = maybe_cast_to_integer_array(data, dtype,
352352
copy=copy)
@@ -376,7 +376,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
376376
else:
377377
data = data.astype(dtype)
378378
elif is_float_dtype(dtype):
379-
inferred = lib.infer_dtype(data)
379+
inferred = lib.infer_dtype(data, skipna=False)
380380
if inferred == 'string':
381381
pass
382382
else:
@@ -414,7 +414,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
414414
subarr = subarr.copy()
415415

416416
if dtype is None:
417-
inferred = lib.infer_dtype(subarr)
417+
inferred = lib.infer_dtype(subarr, skipna=False)
418418
if inferred == 'integer':
419419
try:
420420
return cls._try_convert_to_int_index(
@@ -1718,7 +1718,7 @@ def inferred_type(self):
17181718
"""
17191719
Return a string of the type inferred from the values.
17201720
"""
1721-
return lib.infer_dtype(self)
1721+
return lib.infer_dtype(self, skipna=False)
17221722

17231723
@cache_readonly
17241724
def is_all_dates(self):

pandas/core/indexes/multi.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -2318,7 +2318,8 @@ def _partial_tup_index(self, tup, side='left'):
23182318
section = labs[start:end]
23192319

23202320
if lab not in lev:
2321-
if not lev.is_type_compatible(lib.infer_dtype([lab])):
2321+
if not lev.is_type_compatible(lib.infer_dtype([lab],
2322+
skipna=False)):
23222323
raise TypeError('Level type mismatch: %s' % lab)
23232324

23242325
# short circuit

pandas/core/internals/construction.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -667,7 +667,7 @@ def sanitize_array(data, index, dtype=None, copy=False,
667667
subarr = np.array(data, dtype=object, copy=copy)
668668

669669
if is_object_dtype(subarr.dtype) and dtype != 'object':
670-
inferred = lib.infer_dtype(subarr)
670+
inferred = lib.infer_dtype(subarr, skipna=False)
671671
if inferred == 'period':
672672
try:
673673
subarr = period_array(subarr)

pandas/core/reshape/merge.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -947,7 +947,8 @@ def _maybe_coerce_merge_keys(self):
947947
continue
948948

949949
# let's infer and see if we are ok
950-
elif lib.infer_dtype(lk) == lib.infer_dtype(rk):
950+
elif (lib.infer_dtype(lk, skipna=False)
951+
== lib.infer_dtype(rk, skipna=False)):
951952
continue
952953

953954
# Check if we are trying to merge on obviously

pandas/core/reshape/tile.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -416,7 +416,7 @@ def _convert_bin_to_numeric_type(bins, dtype):
416416
------
417417
ValueError if bins are not of a compat dtype to dtype
418418
"""
419-
bins_dtype = infer_dtype(bins)
419+
bins_dtype = infer_dtype(bins, skipna=False)
420420
if is_timedelta64_dtype(dtype):
421421
if bins_dtype in ['timedelta', 'timedelta64']:
422422
bins = to_timedelta(bins).view(np.int64)

pandas/core/series.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -875,7 +875,7 @@ def _get_with(self, key):
875875
if isinstance(key, Index):
876876
key_type = key.inferred_type
877877
else:
878-
key_type = lib.infer_dtype(key)
878+
key_type = lib.infer_dtype(key, skipna=False)
879879

880880
if key_type == 'integer':
881881
if self.index.is_integer() or self.index.is_floating():
@@ -1012,7 +1012,7 @@ def _set_with(self, key, value):
10121012
if isinstance(key, Index):
10131013
key_type = key.inferred_type
10141014
else:
1015-
key_type = lib.infer_dtype(key)
1015+
key_type = lib.infer_dtype(key, skipna=False)
10161016

10171017
if key_type == 'integer':
10181018
if self.index.inferred_type == 'integer':

pandas/core/sorting.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -454,7 +454,7 @@ def sort_mixed(values):
454454
return np.concatenate([nums, np.asarray(strs, dtype=object)])
455455

456456
sorter = None
457-
if PY3 and lib.infer_dtype(values) == 'mixed-integer':
457+
if PY3 and lib.infer_dtype(values, skipna=False) == 'mixed-integer':
458458
# unorderable in py3 if mixed str/int
459459
ordered = sort_mixed(values)
460460
else:

pandas/io/parsers.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1300,7 +1300,7 @@ def _validate_usecols_arg(usecols):
13001300
elif not is_list_like(usecols):
13011301
raise ValueError(msg)
13021302
else:
1303-
usecols_dtype = lib.infer_dtype(usecols)
1303+
usecols_dtype = lib.infer_dtype(usecols, skipna=False)
13041304
if usecols_dtype not in ('empty', 'integer',
13051305
'string', 'unicode'):
13061306
raise ValueError(msg)

pandas/io/pytables.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -1952,7 +1952,7 @@ def set_atom(self, block, block_items, existing_col, min_itemsize,
19521952
return self.set_atom_complex(block)
19531953

19541954
dtype = block.dtype.name
1955-
inferred_type = lib.infer_dtype(block.values)
1955+
inferred_type = lib.infer_dtype(block.values, skipna=False)
19561956

19571957
if inferred_type == 'date':
19581958
raise TypeError(
@@ -1998,15 +1998,15 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize,
19981998
data = block.values
19991999

20002000
# see if we have a valid string type
2001-
inferred_type = lib.infer_dtype(data.ravel())
2001+
inferred_type = lib.infer_dtype(data.ravel(), skipna=False)
20022002
if inferred_type != 'string':
20032003

20042004
# we cannot serialize this data, so report an exception on a column
20052005
# by column basis
20062006
for i, item in enumerate(block_items):
20072007

20082008
col = block.iget(i)
2009-
inferred_type = lib.infer_dtype(col.ravel())
2009+
inferred_type = lib.infer_dtype(col.ravel(), skipna=False)
20102010
if inferred_type != 'string':
20112011
raise TypeError(
20122012
"Cannot serialize the column [%s] because\n"
@@ -2745,7 +2745,7 @@ def write_array(self, key, value, items=None):
27452745

27462746
# infer the type, warn if we have a non-string type here (for
27472747
# performance)
2748-
inferred_type = lib.infer_dtype(value.ravel())
2748+
inferred_type = lib.infer_dtype(value.ravel(), skipna=False)
27492749
if empty_array:
27502750
pass
27512751
elif inferred_type == 'string':
@@ -4512,7 +4512,7 @@ def _convert_index(index, encoding=None, errors='strict', format_type=None):
45124512
if isinstance(index, MultiIndex):
45134513
raise TypeError('MultiIndex not supported here!')
45144514

4515-
inferred_type = lib.infer_dtype(index)
4515+
inferred_type = lib.infer_dtype(index, skipna=False)
45164516

45174517
values = np.asarray(index)
45184518

@@ -4745,7 +4745,7 @@ def __init__(self, table, where=None, start=None, stop=None):
47454745

47464746
# see if we have a passed coordinate like
47474747
try:
4748-
inferred = lib.infer_dtype(where)
4748+
inferred = lib.infer_dtype(where, skipna=False)
47494749
if inferred == 'integer' or inferred == 'boolean':
47504750
where = np.asarray(where)
47514751
if where.dtype == np.bool_:

pandas/io/sql.py

+7-16
Original file line numberDiff line numberDiff line change
@@ -857,27 +857,15 @@ def _harmonize_columns(self, parse_dates=None):
857857
except KeyError:
858858
pass # this column not in results
859859

860-
def _get_notna_col_dtype(self, col):
861-
"""
862-
Infer datatype of the Series col. In case the dtype of col is 'object'
863-
and it contains NA values, this infers the datatype of the not-NA
864-
values. Needed for inserting typed data containing NULLs, GH8778.
865-
"""
866-
col_for_inference = col
867-
if col.dtype == 'object':
868-
notnadata = col[~isna(col)]
869-
if len(notnadata):
870-
col_for_inference = notnadata
871-
872-
return lib.infer_dtype(col_for_inference)
873-
874860
def _sqlalchemy_type(self, col):
875861

876862
dtype = self.dtype or {}
877863
if col.name in dtype:
878864
return self.dtype[col.name]
879865

880-
col_type = self._get_notna_col_dtype(col)
866+
# Infer type of column, while ignoring missing values.
867+
# Needed for inserting typed data containing NULLs, GH 8778.
868+
col_type = lib.infer_dtype(col, skipna=True)
881869

882870
from sqlalchemy.types import (BigInteger, Integer, Float,
883871
Text, Boolean,
@@ -1374,7 +1362,10 @@ def _sql_type_name(self, col):
13741362
if col.name in dtype:
13751363
return dtype[col.name]
13761364

1377-
col_type = self._get_notna_col_dtype(col)
1365+
# Infer type of column, while ignoring missing values.
1366+
# Needed for inserting typed data containing NULLs, GH 8778.
1367+
col_type = lib.infer_dtype(col, skipna=True)
1368+
13781369
if col_type == 'timedelta64':
13791370
warnings.warn("the 'timedelta' type is not supported, and will be "
13801371
"written as integer values (ns frequency) to the "

pandas/io/stata.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False):
396396
to_datetime(d['year'], format='%Y').astype(np.int64))
397397
d['days'] = days // NS_PER_DAY
398398

399-
elif infer_dtype(dates) == 'datetime':
399+
elif infer_dtype(dates, skipna=False) == 'datetime':
400400
if delta:
401401
delta = dates.values - stata_epoch
402402
f = lambda x: \
@@ -1867,7 +1867,7 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114,
18671867
if force_strl:
18681868
return '%9s'
18691869
if dtype.type == np.object_:
1870-
inferred_dtype = infer_dtype(column.dropna())
1870+
inferred_dtype = infer_dtype(column, skipna=True)
18711871
if not (inferred_dtype in ('string', 'unicode') or
18721872
len(column) == 0):
18731873
raise ValueError('Column `{col}` cannot be exported.\n\nOnly '

pandas/plotting/_converter.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ def _convert_1d(values, units, axis):
246246
return values.asfreq(axis.freq)._ndarray_values
247247
elif isinstance(values, Index):
248248
return values.map(lambda x: get_datevalue(x, axis.freq))
249-
elif lib.infer_dtype(values) == 'period':
249+
elif lib.infer_dtype(values, skipna=False) == 'period':
250250
# https://github.com/pandas-dev/pandas/issues/24304
251251
# convert ndarray[period] -> PeriodIndex
252252
return PeriodIndex(values, freq=axis.freq)._ndarray_values

0 commit comments

Comments
 (0)