REF: shift ravel in infer_dtype (#24560)

h-vetinari · jreback · commit 9ad1e00c5c70 · 2019-01-02T20:46:54.000-05:00
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -623,7 +623,7 @@ def clean_index_list(obj: list):
         return obj, all_arrays
 
     # don't force numpy coerce with nan's
-    inferred = infer_dtype(obj)
+    inferred = infer_dtype(obj, skipna=False)
     if inferred in ['string', 'bytes', 'unicode', 'mixed', 'mixed-integer']:
         return np.asarray(obj, dtype=object), 0
     elif inferred in ['integer']:
@@ -1210,6 +1210,10 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
         values = construct_1d_object_array_from_listlike(value)
 
     values = getattr(values, 'values', values)
+
+    # flatten to a contiguous 1-D array
+    values = values.ravel()
+
     if skipna:
         values = values[~isnaobj(values)]
 
@@ -1220,9 +1224,6 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
     if values.dtype != np.object_:
         values = values.astype('O')
 
-    # make contiguous
-    values = values.ravel()
-
     n = len(values)
     if n == 0:
         return 'empty'
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -165,7 +165,7 @@ def _ensure_arraylike(values):
     ensure that we are arraylike if not already
     """
     if not is_array_like(values):
-        inferred = lib.infer_dtype(values)
+        inferred = lib.infer_dtype(values, skipna=False)
         if inferred in ['mixed', 'string', 'unicode']:
             if isinstance(values, tuple):
                 values = list(values)
@@ -202,8 +202,10 @@ def _get_hashtable_algo(values):
 
     if ndtype == 'object':
 
-        # its cheaper to use a String Hash Table than Object
-        if lib.infer_dtype(values) in ['string']:
+        # it's cheaper to use a String Hash Table than Object; we infer
+        # including nulls because that is the only difference between
+        # StringHashTable and ObjectHashTable
+        if lib.infer_dtype(values, skipna=False) in ['string']:
             ndtype = 'string'
         else:
             ndtype = 'object'
@@ -220,8 +222,10 @@ def _get_data_algo(values, func_map):
     values, dtype, ndtype = _ensure_data(values)
     if ndtype == 'object':
 
-        # its cheaper to use a String Hash Table than Object
-        if lib.infer_dtype(values) in ['string']:
+        # it's cheaper to use a String Hash Table than Object; we infer
+        # including nulls because that is the only difference between
+        # StringHashTable and ObjectHashTable
+        if lib.infer_dtype(values, skipna=False) in ['string']:
             ndtype = 'string'
 
     f = func_map.get(ndtype, func_map['object'])
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -1652,7 +1652,7 @@ def sequence_to_dt64ns(data, dtype=None, copy=False,
         # TODO: We do not have tests specific to string-dtypes,
         #  also complex or categorical or other extension
         copy = False
-        if lib.infer_dtype(data) == 'integer':
+        if lib.infer_dtype(data, skipna=False) == 'integer':
             data = data.astype(np.int64)
         else:
             # data comes back here as either i8 to denote UTC timestamps
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -171,8 +171,8 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
 
     values = np.array(values, copy=copy)
     if is_object_dtype(values):
-        inferred_type = lib.infer_dtype(values)
-        if inferred_type is 'mixed' and isna(values).all():
+        inferred_type = lib.infer_dtype(values, skipna=True)
+        if inferred_type == 'empty':
             values = np.empty(len(values))
             values.fill(np.nan)
         elif inferred_type not in ['floating', 'integer',
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
@@ -594,7 +594,7 @@ def __floordiv__(self, other):
         elif is_object_dtype(other):
             result = [self[n] // other[n] for n in range(len(self))]
             result = np.array(result)
-            if lib.infer_dtype(result) == 'timedelta':
+            if lib.infer_dtype(result, skipna=False) == 'timedelta':
                 result, _ = sequence_to_td64ns(result)
                 return type(self)(result)
             return result
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -75,7 +75,8 @@ def trans(x):
 
     if isinstance(dtype, string_types):
         if dtype == 'infer':
-            inferred_type = lib.infer_dtype(ensure_object(result.ravel()))
+            inferred_type = lib.infer_dtype(ensure_object(result.ravel()),
+                                            skipna=False)
             if inferred_type == 'boolean':
                 dtype = 'bool'
             elif inferred_type == 'integer':
@@ -460,7 +461,7 @@ def infer_dtype_from_array(arr, pandas_dtype=False):
         return arr.dtype, np.asarray(arr)
 
     # don't force numpy coerce with nan's
-    inferred = lib.infer_dtype(arr)
+    inferred = lib.infer_dtype(arr, skipna=False)
     if inferred in ['string', 'bytes', 'unicode',
                     'mixed', 'mixed-integer']:
         return (np.object_, arr)
@@ -941,10 +942,11 @@ def try_timedelta(v):
 
             # We have at least a NaT and a string
             # try timedelta first to avoid spurious datetime conversions
-            # e.g. '00:00:01' is a timedelta but
-            # technically is also a datetime
+            # e.g. '00:00:01' is a timedelta but technically is also a datetime
             value = try_timedelta(v)
-            if lib.infer_dtype(value) in ['mixed']:
+            if lib.infer_dtype(value, skipna=False) in ['mixed']:
+                # cannot skip missing values, as NaT implies that the string
+                # is actually a datetime
                 value = try_datetime(v)
 
     return value
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -703,7 +703,8 @@ def is_datetime_arraylike(arr):
     if isinstance(arr, ABCDatetimeIndex):
         return True
     elif isinstance(arr, (np.ndarray, ABCSeries)):
-        return arr.dtype == object and lib.infer_dtype(arr) == 'datetime'
+        return (is_object_dtype(arr.dtype)
+                and lib.infer_dtype(arr, skipna=False) == 'datetime')
     return getattr(arr, 'inferred_type', None) == 'datetime'
 
 
diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
@@ -474,7 +474,7 @@ def _infer_fill_value(val):
     if is_datetimelike(val):
         return np.array('NaT', dtype=val.dtype)
     elif is_object_dtype(val.dtype):
-        dtype = lib.infer_dtype(ensure_object(val))
+        dtype = lib.infer_dtype(ensure_object(val), skipna=False)
         if dtype in ['datetime', 'datetime64']:
             return np.array('NaT', dtype=_NS_DTYPE)
         elif dtype in ['timedelta', 'timedelta64']:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -346,7 +346,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
                     # should not be coerced
                     # GH 11836
                     if is_integer_dtype(dtype):
-                        inferred = lib.infer_dtype(data)
+                        inferred = lib.infer_dtype(data, skipna=False)
                         if inferred == 'integer':
                             data = maybe_cast_to_integer_array(data, dtype,
                                                                copy=copy)
@@ -376,7 +376,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
                         else:
                             data = data.astype(dtype)
                     elif is_float_dtype(dtype):
-                        inferred = lib.infer_dtype(data)
+                        inferred = lib.infer_dtype(data, skipna=False)
                         if inferred == 'string':
                             pass
                         else:
@@ -414,7 +414,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
                 subarr = subarr.copy()
 
             if dtype is None:
-                inferred = lib.infer_dtype(subarr)
+                inferred = lib.infer_dtype(subarr, skipna=False)
                 if inferred == 'integer':
                     try:
                         return cls._try_convert_to_int_index(
@@ -1718,7 +1718,7 @@ def inferred_type(self):
         """
         Return a string of the type inferred from the values.
         """
-        return lib.infer_dtype(self)
+        return lib.infer_dtype(self, skipna=False)
 
     @cache_readonly
     def is_all_dates(self):
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -2318,7 +2318,8 @@ def _partial_tup_index(self, tup, side='left'):
             section = labs[start:end]
 
             if lab not in lev:
-                if not lev.is_type_compatible(lib.infer_dtype([lab])):
+                if not lev.is_type_compatible(lib.infer_dtype([lab],
+                                                              skipna=False)):
                     raise TypeError('Level type mismatch: %s' % lab)
 
                 # short circuit
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -667,7 +667,7 @@ def sanitize_array(data, index, dtype=None, copy=False,
             subarr = np.array(data, dtype=object, copy=copy)
 
     if is_object_dtype(subarr.dtype) and dtype != 'object':
-        inferred = lib.infer_dtype(subarr)
+        inferred = lib.infer_dtype(subarr, skipna=False)
         if inferred == 'period':
             try:
                 subarr = period_array(subarr)
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -947,7 +947,8 @@ def _maybe_coerce_merge_keys(self):
                     continue
 
                 # let's infer and see if we are ok
-                elif lib.infer_dtype(lk) == lib.infer_dtype(rk):
+                elif (lib.infer_dtype(lk, skipna=False)
+                      == lib.infer_dtype(rk, skipna=False)):
                     continue
 
             # Check if we are trying to merge on obviously
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
@@ -416,7 +416,7 @@ def _convert_bin_to_numeric_type(bins, dtype):
     ------
     ValueError if bins are not of a compat dtype to dtype
     """
-    bins_dtype = infer_dtype(bins)
+    bins_dtype = infer_dtype(bins, skipna=False)
     if is_timedelta64_dtype(dtype):
         if bins_dtype in ['timedelta', 'timedelta64']:
             bins = to_timedelta(bins).view(np.int64)
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -875,7 +875,7 @@ def _get_with(self, key):
         if isinstance(key, Index):
             key_type = key.inferred_type
         else:
-            key_type = lib.infer_dtype(key)
+            key_type = lib.infer_dtype(key, skipna=False)
 
         if key_type == 'integer':
             if self.index.is_integer() or self.index.is_floating():
@@ -1012,7 +1012,7 @@ def _set_with(self, key, value):
             if isinstance(key, Index):
                 key_type = key.inferred_type
             else:
-                key_type = lib.infer_dtype(key)
+                key_type = lib.infer_dtype(key, skipna=False)
 
             if key_type == 'integer':
                 if self.index.inferred_type == 'integer':
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
@@ -454,7 +454,7 @@ def sort_mixed(values):
         return np.concatenate([nums, np.asarray(strs, dtype=object)])
 
     sorter = None
-    if PY3 and lib.infer_dtype(values) == 'mixed-integer':
+    if PY3 and lib.infer_dtype(values, skipna=False) == 'mixed-integer':
         # unorderable in py3 if mixed str/int
         ordered = sort_mixed(values)
     else:
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1300,7 +1300,7 @@ def _validate_usecols_arg(usecols):
         elif not is_list_like(usecols):
             raise ValueError(msg)
         else:
-            usecols_dtype = lib.infer_dtype(usecols)
+            usecols_dtype = lib.infer_dtype(usecols, skipna=False)
             if usecols_dtype not in ('empty', 'integer',
                                      'string', 'unicode'):
                 raise ValueError(msg)
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -1952,7 +1952,7 @@ def set_atom(self, block, block_items, existing_col, min_itemsize,
             return self.set_atom_complex(block)
 
         dtype = block.dtype.name
-        inferred_type = lib.infer_dtype(block.values)
+        inferred_type = lib.infer_dtype(block.values, skipna=False)
 
         if inferred_type == 'date':
             raise TypeError(
@@ -1998,15 +1998,15 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize,
         data = block.values
 
         # see if we have a valid string type
-        inferred_type = lib.infer_dtype(data.ravel())
+        inferred_type = lib.infer_dtype(data.ravel(), skipna=False)
         if inferred_type != 'string':
 
             # we cannot serialize this data, so report an exception on a column
             # by column basis
             for i, item in enumerate(block_items):
 
                 col = block.iget(i)
-                inferred_type = lib.infer_dtype(col.ravel())
+                inferred_type = lib.infer_dtype(col.ravel(), skipna=False)
                 if inferred_type != 'string':
                     raise TypeError(
                         "Cannot serialize the column [%s] because\n"
@@ -2745,7 +2745,7 @@ def write_array(self, key, value, items=None):
 
             # infer the type, warn if we have a non-string type here (for
             # performance)
-            inferred_type = lib.infer_dtype(value.ravel())
+            inferred_type = lib.infer_dtype(value.ravel(), skipna=False)
             if empty_array:
                 pass
             elif inferred_type == 'string':
@@ -4512,7 +4512,7 @@ def _convert_index(index, encoding=None, errors='strict', format_type=None):
     if isinstance(index, MultiIndex):
         raise TypeError('MultiIndex not supported here!')
 
-    inferred_type = lib.infer_dtype(index)
+    inferred_type = lib.infer_dtype(index, skipna=False)
 
     values = np.asarray(index)
 
@@ -4745,7 +4745,7 @@ def __init__(self, table, where=None, start=None, stop=None):
 
             # see if we have a passed coordinate like
             try:
-                inferred = lib.infer_dtype(where)
+                inferred = lib.infer_dtype(where, skipna=False)
                 if inferred == 'integer' or inferred == 'boolean':
                     where = np.asarray(where)
                     if where.dtype == np.bool_:
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
@@ -857,27 +857,15 @@ def _harmonize_columns(self, parse_dates=None):
             except KeyError:
                 pass  # this column not in results
 
-    def _get_notna_col_dtype(self, col):
-        """
-        Infer datatype of the Series col.  In case the dtype of col is 'object'
-        and it contains NA values, this infers the datatype of the not-NA
-        values.  Needed for inserting typed data containing NULLs, GH8778.
-        """
-        col_for_inference = col
-        if col.dtype == 'object':
-            notnadata = col[~isna(col)]
-            if len(notnadata):
-                col_for_inference = notnadata
-
-        return lib.infer_dtype(col_for_inference)
-
     def _sqlalchemy_type(self, col):
 
         dtype = self.dtype or {}
         if col.name in dtype:
             return self.dtype[col.name]
 
-        col_type = self._get_notna_col_dtype(col)
+        # Infer type of column, while ignoring missing values.
+        # Needed for inserting typed data containing NULLs, GH 8778.
+        col_type = lib.infer_dtype(col, skipna=True)
 
         from sqlalchemy.types import (BigInteger, Integer, Float,
                                       Text, Boolean,
@@ -1374,7 +1362,10 @@ def _sql_type_name(self, col):
         if col.name in dtype:
             return dtype[col.name]
 
-        col_type = self._get_notna_col_dtype(col)
+        # Infer type of column, while ignoring missing values.
+        # Needed for inserting typed data containing NULLs, GH 8778.
+        col_type = lib.infer_dtype(col, skipna=True)
+
         if col_type == 'timedelta64':
             warnings.warn("the 'timedelta' type is not supported, and will be "
                           "written as integer values (ns frequency) to the "
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -396,7 +396,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False):
                         to_datetime(d['year'], format='%Y').astype(np.int64))
                 d['days'] = days // NS_PER_DAY
 
-        elif infer_dtype(dates) == 'datetime':
+        elif infer_dtype(dates, skipna=False) == 'datetime':
             if delta:
                 delta = dates.values - stata_epoch
                 f = lambda x: \
@@ -1867,7 +1867,7 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114,
         if force_strl:
             return '%9s'
     if dtype.type == np.object_:
-        inferred_dtype = infer_dtype(column.dropna())
+        inferred_dtype = infer_dtype(column, skipna=True)
         if not (inferred_dtype in ('string', 'unicode') or
                 len(column) == 0):
             raise ValueError('Column `{col}` cannot be exported.\n\nOnly '
diff --git a/pandas/plotting/_converter.py b/pandas/plotting/_converter.py
@@ -246,7 +246,7 @@ def _convert_1d(values, units, axis):
             return values.asfreq(axis.freq)._ndarray_values
         elif isinstance(values, Index):
             return values.map(lambda x: get_datevalue(x, axis.freq))
-        elif lib.infer_dtype(values) == 'period':
+        elif lib.infer_dtype(values, skipna=False) == 'period':
             # https://github.com/pandas-dev/pandas/issues/24304
             # convert ndarray[period] -> PeriodIndex
             return PeriodIndex(values, freq=axis.freq)._ndarray_values
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py