diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index cdd0717849e96..8a405d6ee0a00 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -330,7 +330,7 @@ def __init__( values = _convert_to_list_like(values) # By convention, empty lists result in object dtype: - sanitize_dtype = "object" if len(values) == 0 else None + sanitize_dtype = np.dtype("O") if len(values) == 0 else None null_mask = isna(values) if null_mask.any(): values = [values[idx] for idx in np.where(~null_mask)[0]] diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 2d60ad9ba50bf..d1b07585943ea 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -13,7 +13,7 @@ from pandas._libs import lib from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime -from pandas._typing import ArrayLike, Dtype +from pandas._typing import ArrayLike, Dtype, DtypeObj from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -36,7 +36,6 @@ is_list_like, is_object_dtype, is_timedelta64_ns_dtype, - pandas_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype, ExtensionDtype, registry from pandas.core.dtypes.generic import ( @@ -52,13 +51,12 @@ if TYPE_CHECKING: from pandas.core.series import Series # noqa: F401 from pandas.core.indexes.api import Index # noqa: F401 + from pandas.core.arrays import ExtensionArray # noqa: F401 def array( - data: Sequence[object], - dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None, - copy: bool = True, -) -> ABCExtensionArray: + data: Sequence[object], dtype: Optional[Dtype] = None, copy: bool = True, +) -> "ExtensionArray": """ Create an array. @@ -388,14 +386,16 @@ def extract_array(obj, extract_numpy: bool = False): def sanitize_array( - data, index, dtype=None, copy: bool = False, raise_cast_failure: bool = False -): + data, + index: Optional["Index"], + dtype: Optional[DtypeObj] = None, + copy: bool = False, + raise_cast_failure: bool = False, +) -> ArrayLike: """ - Sanitize input data to an ndarray, copy if specified, coerce to the - dtype if specified. + Sanitize input data to an ndarray or ExtensionArray, copy if specified, + coerce to the dtype if specified. """ - if dtype is not None: - dtype = pandas_dtype(dtype) if isinstance(data, ma.MaskedArray): mask = ma.getmaskarray(data) @@ -508,10 +508,7 @@ def sanitize_array( def _try_cast( - arr, - dtype: Optional[Union[np.dtype, "ExtensionDtype"]], - copy: bool, - raise_cast_failure: bool, + arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bool, ): """ Convert input to numpy ndarray and optionally cast to a given dtype. diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c9419fded5de9..e50d635a1ba6c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -3,7 +3,7 @@ """ from datetime import date, datetime, timedelta -from typing import TYPE_CHECKING, Type +from typing import TYPE_CHECKING, Any, Optional, Tuple, Type import numpy as np @@ -17,7 +17,7 @@ iNaT, ) from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import Dtype, DtypeObj +from pandas._typing import ArrayLike, Dtype, DtypeObj from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( @@ -613,7 +613,7 @@ def _ensure_dtype_type(value, dtype): return dtype.type(value) -def infer_dtype_from(val, pandas_dtype: bool = False): +def infer_dtype_from(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, Any]: """ Interpret the dtype from a scalar or array. @@ -630,7 +630,7 @@ def infer_dtype_from(val, pandas_dtype: bool = False): return infer_dtype_from_array(val, pandas_dtype=pandas_dtype) -def infer_dtype_from_scalar(val, pandas_dtype: bool = False): +def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, Any]: """ Interpret the dtype from a scalar. @@ -641,7 +641,7 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False): If False, scalar belongs to pandas extension types is inferred as object """ - dtype = np.object_ + dtype = np.dtype(object) # a 1-element ndarray if isinstance(val, np.ndarray): @@ -660,7 +660,7 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False): # instead of np.empty (but then you still don't want things # coming out as np.str_! - dtype = np.object_ + dtype = np.dtype(object) elif isinstance(val, (np.datetime64, datetime)): val = tslibs.Timestamp(val) @@ -671,7 +671,7 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False): dtype = DatetimeTZDtype(unit="ns", tz=val.tz) else: # return datetimetz as object - return np.object_, val + return np.dtype(object), val val = val.value elif isinstance(val, (np.timedelta64, timedelta)): @@ -679,22 +679,22 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False): dtype = np.dtype("m8[ns]") elif is_bool(val): - dtype = np.bool_ + dtype = np.dtype(np.bool_) elif is_integer(val): if isinstance(val, np.integer): - dtype = type(val) + dtype = np.dtype(type(val)) else: - dtype = np.int64 + dtype = np.dtype(np.int64) elif is_float(val): if isinstance(val, np.floating): - dtype = type(val) + dtype = np.dtype(type(val)) else: - dtype = np.float64 + dtype = np.dtype(np.float64) elif is_complex(val): - dtype = np.complex_ + dtype = np.dtype(np.complex_) elif pandas_dtype: if lib.is_period(val): @@ -707,7 +707,8 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False): return dtype, val -def infer_dtype_from_array(arr, pandas_dtype: bool = False): +# TODO: try to make the Any in the return annotation more specific +def infer_dtype_from_array(arr, pandas_dtype: bool = False) -> Tuple[DtypeObj, Any]: """ Infer the dtype from an array. @@ -738,7 +739,7 @@ def infer_dtype_from_array(arr, pandas_dtype: bool = False): array(['1', '1'], dtype='>> infer_dtype_from_array([1, '1']) - (, [1, '1']) + (dtype('O'), [1, '1']) """ if isinstance(arr, np.ndarray): return arr.dtype, arr @@ -755,7 +756,7 @@ def infer_dtype_from_array(arr, pandas_dtype: bool = False): # don't force numpy coerce with nan's inferred = lib.infer_dtype(arr, skipna=False) if inferred in ["string", "bytes", "mixed", "mixed-integer"]: - return (np.object_, arr) + return (np.dtype(np.object_), arr) arr = np.asarray(arr) return arr.dtype, arr @@ -1469,7 +1470,7 @@ def find_common_type(types): return np.find_common_type(types, []) -def cast_scalar_to_array(shape, value, dtype=None): +def cast_scalar_to_array(shape, value, dtype: Optional[DtypeObj] = None) -> np.ndarray: """ Create np.ndarray of specified shape and dtype, filled with values. @@ -1496,7 +1497,9 @@ def cast_scalar_to_array(shape, value, dtype=None): return values -def construct_1d_arraylike_from_scalar(value, length: int, dtype): +def construct_1d_arraylike_from_scalar( + value, length: int, dtype: DtypeObj +) -> ArrayLike: """ create a np.ndarray / pandas type of specified shape and dtype filled with values @@ -1505,7 +1508,7 @@ def construct_1d_arraylike_from_scalar(value, length: int, dtype): ---------- value : scalar value length : int - dtype : pandas_dtype / np.dtype + dtype : pandas_dtype or np.dtype Returns ------- @@ -1517,8 +1520,6 @@ def construct_1d_arraylike_from_scalar(value, length: int, dtype): subarr = cls._from_sequence([value] * length, dtype=dtype) else: - if not isinstance(dtype, (np.dtype, type(np.dtype))): - dtype = dtype.dtype if length and is_integer_dtype(dtype) and isna(value): # coerce if we have nan for an integer dtype @@ -1536,7 +1537,7 @@ def construct_1d_arraylike_from_scalar(value, length: int, dtype): return subarr -def construct_1d_object_array_from_listlike(values): +def construct_1d_object_array_from_listlike(values) -> np.ndarray: """ Transform any list-like object in a 1-dimensional numpy array of object dtype. @@ -1561,7 +1562,9 @@ def construct_1d_object_array_from_listlike(values): return result -def construct_1d_ndarray_preserving_na(values, dtype=None, copy: bool = False): +def construct_1d_ndarray_preserving_na( + values, dtype: Optional[DtypeObj] = None, copy: bool = False +) -> np.ndarray: """ Construct a new ndarray, coercing `values` to `dtype`, preserving NA. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8228edb12b29a..6ff72691ca757 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -104,6 +104,7 @@ is_scalar, is_sequence, needs_i8_conversion, + pandas_dtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -1917,7 +1918,12 @@ def to_records( @classmethod def _from_arrays( - cls, arrays, columns, index, dtype=None, verify_integrity=True + cls, + arrays, + columns, + index, + dtype: Optional[Dtype] = None, + verify_integrity: bool = True, ) -> "DataFrame": """ Create DataFrame from a list of arrays corresponding to the columns. @@ -1943,6 +1949,9 @@ def _from_arrays( ------- DataFrame """ + if dtype is not None: + dtype = pandas_dtype(dtype) + mgr = arrays_to_mgr( arrays, columns, diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 5c9e4b96047ee..ce3f07d06d6a2 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -3,13 +3,13 @@ constructors before passing them to a BlockManager. """ from collections import abc -from typing import Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple, Union import numpy as np import numpy.ma as ma from pandas._libs import lib -from pandas._typing import Axis, Dtype, Scalar +from pandas._typing import Axis, DtypeObj, Scalar from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -50,12 +50,20 @@ create_block_manager_from_blocks, ) +if TYPE_CHECKING: + from pandas import Series # noqa:F401 + # --------------------------------------------------------------------- # BlockManager Interface def arrays_to_mgr( - arrays, arr_names, index, columns, dtype=None, verify_integrity: bool = True + arrays, + arr_names, + index, + columns, + dtype: Optional[DtypeObj] = None, + verify_integrity: bool = True, ): """ Segregate Series based on type and coerce into matrices. @@ -85,7 +93,9 @@ def arrays_to_mgr( return create_block_manager_from_arrays(arrays, arr_names, axes) -def masked_rec_array_to_mgr(data, index, columns, dtype, copy: bool): +def masked_rec_array_to_mgr( + data, index, columns, dtype: Optional[DtypeObj], copy: bool +): """ Extract from a masked rec array and create the manager. """ @@ -130,7 +140,7 @@ def masked_rec_array_to_mgr(data, index, columns, dtype, copy: bool): # DataFrame Constructor Interface -def init_ndarray(values, index, columns, dtype=None, copy=False): +def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): # input must be a ndarray, list, Series, index if isinstance(values, ABCSeries): @@ -189,7 +199,10 @@ def init_ndarray(values, index, columns, dtype=None, copy=False): f"failed to cast to '{dtype}' (Exception was: {orig})" ) from orig - index, columns = _get_axes(*values.shape, index=index, columns=columns) + # _prep_ndarray ensures that values.ndim == 2 at this point + index, columns = _get_axes( + values.shape[0], values.shape[1], index=index, columns=columns + ) values = values.T # if we don't have a dtype specified, then try to convert objects @@ -221,13 +234,15 @@ def init_ndarray(values, index, columns, dtype=None, copy=False): return create_block_manager_from_blocks(block_values, [columns, index]) -def init_dict(data, index, columns, dtype=None): +def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. """ + arrays: Union[Sequence[Any], "Series"] + if columns is not None: - from pandas.core.series import Series + from pandas.core.series import Series # noqa:F811 arrays = Series(data, index=columns, dtype=object) data_names = arrays.index @@ -244,7 +259,7 @@ def init_dict(data, index, columns, dtype=None): if missing.any() and not is_integer_dtype(dtype): if dtype is None or np.issubdtype(dtype, np.flexible): # GH#1783 - nan_dtype = object + nan_dtype = np.dtype(object) else: nan_dtype = dtype val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype) @@ -253,7 +268,7 @@ def init_dict(data, index, columns, dtype=None): else: keys = list(data.keys()) columns = data_names = Index(keys) - arrays = (com.maybe_iterable_to_list(data[k]) for k in keys) + arrays = [com.maybe_iterable_to_list(data[k]) for k in keys] # GH#24096 need copy to be deep for datetime64tz case # TODO: See if we can avoid these copies arrays = [ @@ -308,7 +323,7 @@ def convert(v): return values -def _homogenize(data, index, dtype=None): +def _homogenize(data, index, dtype: Optional[DtypeObj]): oindex = None homogenized = [] @@ -339,7 +354,10 @@ def _homogenize(data, index, dtype=None): return homogenized -def extract_index(data): +def extract_index(data) -> Index: + """ + Try to infer an Index from the passed data, raise ValueError on failure. + """ index = None if len(data) == 0: index = Index([]) @@ -381,6 +399,7 @@ def extract_index(data): ) if have_series: + assert index is not None # for mypy if lengths[0] != len(index): msg = ( f"array length {lengths[0]} does not match index " @@ -442,7 +461,8 @@ def _get_axes(N, K, index, columns) -> Tuple[Index, Index]: def dataclasses_to_dicts(data): - """ Converts a list of dataclass instances to a list of dictionaries + """ + Converts a list of dataclass instances to a list of dictionaries. Parameters ---------- @@ -472,7 +492,9 @@ def dataclasses_to_dicts(data): # Conversion of Inputs to Arrays -def to_arrays(data, columns, coerce_float=False, dtype=None): +def to_arrays( + data, columns, coerce_float: bool = False, dtype: Optional[DtypeObj] = None +): """ Return list of arrays, columns. """ @@ -527,7 +549,7 @@ def _list_to_arrays( data: List[Scalar], columns: Union[Index, List], coerce_float: bool = False, - dtype: Optional[Dtype] = None, + dtype: Optional[DtypeObj] = None, ) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: if len(data) > 0 and isinstance(data[0], tuple): content = list(lib.to_object_array_tuples(data).T) @@ -547,7 +569,7 @@ def _list_of_series_to_arrays( data: List, columns: Union[Index, List], coerce_float: bool = False, - dtype: Optional[Dtype] = None, + dtype: Optional[DtypeObj] = None, ) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: if columns is None: # We know pass_data is non-empty because data[0] is a Series @@ -585,7 +607,7 @@ def _list_of_dict_to_arrays( data: List, columns: Union[Index, List], coerce_float: bool = False, - dtype: Optional[Dtype] = None, + dtype: Optional[DtypeObj] = None, ) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: """ Convert list of dicts to numpy arrays @@ -624,7 +646,7 @@ def _list_of_dict_to_arrays( def _validate_or_indexify_columns( - content: List, columns: Union[Index, List, None] + content: List, columns: Optional[Union[Index, List]] ) -> Union[Index, List[Axis]]: """ If columns is None, make numbers as column names; Otherwise, validate that @@ -682,7 +704,7 @@ def _validate_or_indexify_columns( def _convert_object_array( - content: List[Scalar], coerce_float: bool = False, dtype: Optional[Dtype] = None + content: List[Scalar], coerce_float: bool = False, dtype: Optional[DtypeObj] = None ) -> List[Scalar]: """ Internal function ot convert object array. @@ -699,7 +721,7 @@ def _convert_object_array( """ # provide soft conversion of object dtypes def convert(arr): - if dtype != object and dtype != np.object: + if dtype != np.dtype("O"): arr = lib.maybe_convert_objects(arr, try_float=coerce_float) arr = maybe_cast_to_datetime(arr, dtype) return arr