diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ac1c34edba914..938ca53b5fdce 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -380,12 +380,12 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, dtype = CategoricalDtype(values.categories, dtype.ordered) elif not isinstance(values, (ABCIndexClass, ABCSeries)): - # _sanitize_array coerces np.nan to a string under certain versions + # sanitize_array coerces np.nan to a string under certain versions # of numpy values = maybe_infer_to_datetimelike(values, convert_dates=True) if not isinstance(values, np.ndarray): values = _convert_to_list_like(values) - from pandas.core.series import _sanitize_array + from pandas.core.internals.construction import sanitize_array # By convention, empty lists result in object dtype: if len(values) == 0: sanitize_dtype = 'object' @@ -394,7 +394,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, null_mask = isna(values) if null_mask.any(): values = [values[idx] for idx in np.where(~null_mask)[0]] - values = _sanitize_array(values, None, dtype=sanitize_dtype) + values = sanitize_array(values, None, dtype=sanitize_dtype) if dtype.categories is None: try: @@ -2442,12 +2442,12 @@ def isin(self, values): >>> s.isin(['lama']) array([ True, False, True, False, True, False]) """ - from pandas.core.series import _sanitize_array + from pandas.core.internals.construction import sanitize_array if not is_list_like(values): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a [{values_type}]" .format(values_type=type(values).__name__)) - values = _sanitize_array(values, None, None) + values = sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) code_values = self.categories.get_indexer(values) code_values = code_values[null_mask | (code_values >= 0)] diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 96724b6c4b362..134466d769ada 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -631,8 +631,8 @@ def __init__(self, data, sparse_index=None, index=None, fill_value=None, if not is_array_like(data): try: # probably shared code in sanitize_series - from pandas.core.series import _sanitize_array - data = _sanitize_array(data, index=None) + from pandas.core.internals.construction import sanitize_array + data = sanitize_array(data, index=None) except ValueError: # NumPy may raise a ValueError on data like [1, []] # we retry with object dtype here. 
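For downstream callers these hunks amount to an import-path change only: `sanitize_array` keeps the signature and behavior of the old `pandas.core.series._sanitize_array`. A minimal sketch of the relocated entry point (sample values are illustrative, not taken from the diff):

```python
import numpy as np

# New home of the helper formerly private to pandas.core.series.
from pandas.core.internals.construction import sanitize_array

# Coerce a plain list-like to a 1-D ndarray, the same call the
# Categorical and SparseArray constructors now make internally.
arr = sanitize_array([1, 2, np.nan], index=None)
```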
diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b9f32042924b9..2c1fa5ef4439e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -31,7 +31,7 @@ validate_axis_style_args) from pandas import compat -from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u, +from pandas.compat import (range, map, zip, lmap, lzip, StringIO, u, OrderedDict, PY36, raise_with_traceback, string_and_binary_types) from pandas.compat.numpy import function as nv @@ -39,7 +39,6 @@ from pandas.core.dtypes.cast import ( maybe_upcast, cast_scalar_to_array, - construct_1d_arraylike_from_scalar, infer_dtype_from_scalar, maybe_cast_to_datetime, maybe_infer_to_datetimelike, @@ -50,11 +49,9 @@ maybe_upcast_putmask, find_common_type) from pandas.core.dtypes.common import ( - is_categorical_dtype, is_object_dtype, is_extension_type, is_extension_array_dtype, - is_datetime64tz_dtype, is_datetime64_any_dtype, is_bool_dtype, is_integer_dtype, @@ -88,12 +85,13 @@ from pandas.core.indexes import base as ibase from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import PeriodIndex -from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable, check_bool_indexer) -from pandas.core.internals import (BlockManager, - create_block_manager_from_arrays, - create_block_manager_from_blocks) +from pandas.core.internals import BlockManager +from pandas.core.internals.construction import ( + masked_rec_array_to_mgr, get_names_from_index, to_arrays, + reorder_arrays, init_ndarray, init_dict, + arrays_to_mgr, sanitize_index) from pandas.core.series import Series from pandas.io.formats import console @@ -386,13 +384,13 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, mgr = self._init_mgr(data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy) elif isinstance(data, dict): - mgr = self._init_dict(data, index, columns, dtype=dtype) + mgr = init_dict(data, index, columns, dtype=dtype) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords # masked recarray if isinstance(data, mrecords.MaskedRecords): - mgr = _masked_rec_array_to_mgr(data, index, columns, dtype, - copy) + mgr = masked_rec_array_to_mgr(data, index, columns, dtype, + copy) # a masked array else: @@ -402,8 +400,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, data[mask] = fill_value else: data = data.copy() - mgr = self._init_ndarray(data, index, columns, dtype=dtype, - copy=copy) + mgr = init_ndarray(data, index, columns, dtype=dtype, + copy=copy) elif isinstance(data, (np.ndarray, Series, Index)): if data.dtype.names: @@ -411,13 +409,13 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, data = {k: data[k] for k in data_columns} if columns is None: columns = data_columns - mgr = self._init_dict(data, index, columns, dtype=dtype) + mgr = init_dict(data, index, columns, dtype=dtype) elif getattr(data, 'name', None) is not None: - mgr = self._init_dict({data.name: data}, index, columns, - dtype=dtype) + mgr = init_dict({data.name: data}, index, columns, + dtype=dtype) else: - mgr = self._init_ndarray(data, index, columns, dtype=dtype, - copy=copy) + mgr = init_ndarray(data, index, columns, dtype=dtype, + copy=copy) # For data is list-like, or Iterable (will consume into list) elif (isinstance(data, compat.Iterable) @@ -428,25 +426,25 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, if is_list_like(data[0]) and 
getattr(data[0], 'ndim', 1) == 1: if is_named_tuple(data[0]) and columns is None: columns = data[0]._fields - arrays, columns = _to_arrays(data, columns, dtype=dtype) + arrays, columns = to_arrays(data, columns, dtype=dtype) columns = ensure_index(columns) # set the index if index is None: if isinstance(data[0], Series): - index = _get_names_from_index(data) + index = get_names_from_index(data) elif isinstance(data[0], Categorical): index = ibase.default_index(len(data[0])) else: index = ibase.default_index(len(data)) - mgr = _arrays_to_mgr(arrays, columns, index, columns, - dtype=dtype) + mgr = arrays_to_mgr(arrays, columns, index, columns, + dtype=dtype) else: - mgr = self._init_ndarray(data, index, columns, dtype=dtype, - copy=copy) + mgr = init_ndarray(data, index, columns, dtype=dtype, + copy=copy) else: - mgr = self._init_dict({}, index, columns, dtype=dtype) + mgr = init_dict({}, index, columns, dtype=dtype) else: try: arr = np.array(data, dtype=dtype, copy=copy) @@ -458,8 +456,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, if arr.ndim == 0 and index is not None and columns is not None: values = cast_scalar_to_array((len(index), len(columns)), data, dtype=dtype) - mgr = self._init_ndarray(values, index, columns, - dtype=values.dtype, copy=False) + mgr = init_ndarray(values, index, columns, + dtype=values.dtype, copy=False) else: raise ValueError('DataFrame constructor not properly called!') @@ -470,113 +468,13 @@ def _init_dict(self, data, index, columns, dtype=None): Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. """ - if columns is not None: - arrays = Series(data, index=columns, dtype=object) - data_names = arrays.index - - missing = arrays.isnull() - if index is None: - # GH10856 - # raise ValueError if only scalars in dict - index = extract_index(arrays[~missing]) - else: - index = ensure_index(index) - - # no obvious "empty" int column - if missing.any() and not is_integer_dtype(dtype): - if dtype is None or np.issubdtype(dtype, np.flexible): - # 1783 - nan_dtype = object - else: - nan_dtype = dtype - v = construct_1d_arraylike_from_scalar(np.nan, len(index), - nan_dtype) - arrays.loc[missing] = [v] * missing.sum() - - else: - keys = com.dict_keys_to_ordered_list(data) - columns = data_names = Index(keys) - arrays = [data[k] for k in keys] - - return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) + return init_dict(data, index, columns, dtype=dtype) + # TODO: Can we get rid of this as a method? 
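As context for the delegation above: `init_dict` is hit by any dict-based construction, so its behavior can be spot-checked from the public API. An illustrative example (not part of the diff) relying on the missing-column handling the moved code preserves:

```python
import pandas as pd

# Routed through pandas.core.internals.construction.init_dict;
# the column absent from the dict is still backfilled with NaN.
df = pd.DataFrame({'a': [1, 2]}, columns=['a', 'b'])
```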
def _init_ndarray(self, values, index, columns, dtype=None, copy=False): # input must be a ndarray, list, Series, index - - if isinstance(values, Series): - if columns is None: - if values.name is not None: - columns = [values.name] - if index is None: - index = values.index - else: - values = values.reindex(index) - - # zero len case (GH #2234) - if not len(values) and columns is not None and len(columns): - values = np.empty((0, 1), dtype=object) - - # helper to create the axes as indexes - def _get_axes(N, K, index=index, columns=columns): - # return axes or defaults - - if index is None: - index = ibase.default_index(N) - else: - index = ensure_index(index) - - if columns is None: - columns = ibase.default_index(K) - else: - columns = ensure_index(columns) - return index, columns - - # we could have a categorical type passed or coerced to 'category' - # recast this to an _arrays_to_mgr - if (is_categorical_dtype(getattr(values, 'dtype', None)) or - is_categorical_dtype(dtype)): - - if not hasattr(values, 'dtype'): - values = _prep_ndarray(values, copy=copy) - values = values.ravel() - elif copy: - values = values.copy() - - index, columns = _get_axes(len(values), 1) - return _arrays_to_mgr([values], columns, index, columns, - dtype=dtype) - elif (is_datetime64tz_dtype(values) or - is_extension_array_dtype(values)): - # GH19157 - if columns is None: - columns = [0] - return _arrays_to_mgr([values], columns, index, columns, - dtype=dtype) - - # by definition an array here - # the dtypes will be coerced to a single dtype - values = _prep_ndarray(values, copy=copy) - - if dtype is not None: - if not is_dtype_equal(values.dtype, dtype): - try: - values = values.astype(dtype) - except Exception as orig: - e = ValueError("failed to cast to '{dtype}' (Exception " - "was: {orig})".format(dtype=dtype, - orig=orig)) - raise_with_traceback(e) - - index, columns = _get_axes(*values.shape) - values = values.T - - # if we don't have a dtype specified, then try to convert objects - # on the entire block; this is to convert if we have datetimelike's - # embedded in an object type - if dtype is None and is_object_dtype(values): - values = maybe_infer_to_datetimelike(values) - - return create_block_manager_from_blocks([values], [columns, index]) + return init_ndarray(values, index, columns, dtype=dtype, copy=copy) + # TODO: can we just get rid of this as a method? 
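The ndarray path is likewise unchanged apart from its location; the `_get_axes` closure becomes a module-level helper that takes `index` and `columns` explicitly (see the new file below). Another illustrative spot-check:

```python
import numpy as np
import pandas as pd

# Routed through pandas.core.internals.construction.init_ndarray;
# default RangeIndex axes are created when none are passed.
df = pd.DataFrame(np.arange(6).reshape(3, 2))
```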
# ---------------------------------------------------------------------- @@ -1535,17 +1433,17 @@ def from_records(cls, data, index=None, exclude=None, columns=None, arr_columns.append(k) arrays.append(v) - arrays, arr_columns = _reorder_arrays(arrays, arr_columns, - columns) + arrays, arr_columns = reorder_arrays(arrays, arr_columns, + columns) elif isinstance(data, (np.ndarray, DataFrame)): - arrays, columns = _to_arrays(data, columns) + arrays, columns = to_arrays(data, columns) if columns is not None: columns = ensure_index(columns) arr_columns = columns else: - arrays, arr_columns = _to_arrays(data, columns, - coerce_float=coerce_float) + arrays, arr_columns = to_arrays(data, columns, + coerce_float=coerce_float) arr_columns = ensure_index(arr_columns) if columns is not None: @@ -1587,7 +1485,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, arr_columns = arr_columns.drop(arr_exclude) columns = columns.drop(exclude) - mgr = _arrays_to_mgr(arrays, arr_columns, result_index, columns) + mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns) return cls(mgr) @@ -1780,7 +1678,7 @@ def from_items(cls, items, columns=None, orient='columns'): @classmethod def _from_arrays(cls, arrays, columns, index, dtype=None): - mgr = _arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) + mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) return cls(mgr) @classmethod @@ -3559,17 +3457,15 @@ def reindexer(value): value = reindexer(value).T elif isinstance(value, ExtensionArray): - from pandas.core.series import _sanitize_index - # Explicitly copy here, instead of in _sanitize_index, + # Explicitly copy here, instead of in sanitize_index, # as sanitize_index won't copy an EA, even with copy=True value = value.copy() - value = _sanitize_index(value, self.index, copy=False) + value = sanitize_index(value, self.index, copy=False) elif isinstance(value, Index) or is_sequence(value): - from pandas.core.series import _sanitize_index # turn me into an ndarray - value = _sanitize_index(value, self.index, copy=False) + value = sanitize_index(value, self.index, copy=False) if not isinstance(value, (np.ndarray, Index)): if isinstance(value, list) and len(value) > 0: value = maybe_convert_platform(value) @@ -7841,341 +7737,6 @@ def isin(self, values): ops.add_special_arithmetic_methods(DataFrame) -def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): - """ - Segregate Series based on type and coerce into matrices. - - Needs to handle a lot of exceptional cases. 
- """ - # figure out the index, if necessary - if index is None: - index = extract_index(arrays) - else: - index = ensure_index(index) - - # don't force copy because getting jammed in an ndarray anyway - arrays = _homogenize(arrays, index, dtype) - - # from BlockManager perspective - axes = [ensure_index(columns), index] - - return create_block_manager_from_arrays(arrays, arr_names, axes) - - -def extract_index(data): - from pandas.core.index import _union_indexes - - index = None - if len(data) == 0: - index = Index([]) - elif len(data) > 0: - raw_lengths = [] - indexes = [] - - have_raw_arrays = False - have_series = False - have_dicts = False - - for v in data: - if isinstance(v, Series): - have_series = True - indexes.append(v.index) - elif isinstance(v, dict): - have_dicts = True - indexes.append(list(v.keys())) - elif is_list_like(v) and getattr(v, 'ndim', 1) == 1: - have_raw_arrays = True - raw_lengths.append(len(v)) - - if not indexes and not raw_lengths: - raise ValueError('If using all scalar values, you must pass' - ' an index') - - if have_series or have_dicts: - index = _union_indexes(indexes) - - if have_raw_arrays: - lengths = list(set(raw_lengths)) - if len(lengths) > 1: - raise ValueError('arrays must all be same length') - - if have_dicts: - raise ValueError('Mixing dicts with non-Series may lead to ' - 'ambiguous ordering.') - - if have_series: - if lengths[0] != len(index): - msg = ('array length %d does not match index length %d' % - (lengths[0], len(index))) - raise ValueError(msg) - else: - index = ibase.default_index(lengths[0]) - - return ensure_index(index) - - -def _prep_ndarray(values, copy=True): - if not isinstance(values, (np.ndarray, Series, Index)): - if len(values) == 0: - return np.empty((0, 0), dtype=object) - - def convert(v): - return maybe_convert_platform(v) - - # we could have a 1-dim or 2-dim list here - # this is equiv of np.asarray, but does object conversion - # and platform dtype preservation - try: - if is_list_like(values[0]) or hasattr(values[0], 'len'): - values = np.array([convert(v) for v in values]) - elif isinstance(values[0], np.ndarray) and values[0].ndim == 0: - # GH#21861 - values = np.array([convert(v) for v in values]) - else: - values = convert(values) - except (ValueError, TypeError): - values = convert(values) - - else: - - # drop subclass info, do not copy data - values = np.asarray(values) - if copy: - values = values.copy() - - if values.ndim == 1: - values = values.reshape((values.shape[0], 1)) - elif values.ndim != 2: - raise ValueError('Must pass 2-d input') - - return values - - -def _to_arrays(data, columns, coerce_float=False, dtype=None): - """ - Return list of arrays, columns. 
- """ - if isinstance(data, DataFrame): - if columns is not None: - arrays = [data._ixs(i, axis=1).values - for i, col in enumerate(data.columns) if col in columns] - else: - columns = data.columns - arrays = [data._ixs(i, axis=1).values for i in range(len(columns))] - - return arrays, columns - - if not len(data): - if isinstance(data, np.ndarray): - columns = data.dtype.names - if columns is not None: - return [[]] * len(columns), columns - return [], [] # columns if columns is not None else [] - if isinstance(data[0], (list, tuple)): - return _list_to_arrays(data, columns, coerce_float=coerce_float, - dtype=dtype) - elif isinstance(data[0], compat.Mapping): - return _list_of_dict_to_arrays(data, columns, - coerce_float=coerce_float, dtype=dtype) - elif isinstance(data[0], Series): - return _list_of_series_to_arrays(data, columns, - coerce_float=coerce_float, - dtype=dtype) - elif isinstance(data[0], Categorical): - if columns is None: - columns = ibase.default_index(len(data)) - return data, columns - elif (isinstance(data, (np.ndarray, Series, Index)) and - data.dtype.names is not None): - - columns = list(data.dtype.names) - arrays = [data[k] for k in columns] - return arrays, columns - else: - # last ditch effort - data = lmap(tuple, data) - return _list_to_arrays(data, columns, coerce_float=coerce_float, - dtype=dtype) - - -def _masked_rec_array_to_mgr(data, index, columns, dtype, copy): - """ - Extract from a masked rec array and create the manager. - """ - - # essentially process a record array then fill it - fill_value = data.fill_value - fdata = ma.getdata(data) - if index is None: - index = _get_names_from_index(fdata) - if index is None: - index = ibase.default_index(len(data)) - index = ensure_index(index) - - if columns is not None: - columns = ensure_index(columns) - arrays, arr_columns = _to_arrays(fdata, columns) - - # fill if needed - new_arrays = [] - for fv, arr, col in zip(fill_value, arrays, arr_columns): - mask = ma.getmaskarray(data[col]) - if mask.any(): - arr, fv = maybe_upcast(arr, fill_value=fv, copy=True) - arr[mask] = fv - new_arrays.append(arr) - - # create the manager - arrays, arr_columns = _reorder_arrays(new_arrays, arr_columns, columns) - if columns is None: - columns = arr_columns - - mgr = _arrays_to_mgr(arrays, arr_columns, index, columns) - - if copy: - mgr = mgr.copy() - return mgr - - -def _reorder_arrays(arrays, arr_columns, columns): - # reorder according to the columns - if (columns is not None and len(columns) and arr_columns is not None and - len(arr_columns)): - indexer = ensure_index(arr_columns).get_indexer(columns) - arr_columns = ensure_index([arr_columns[i] for i in indexer]) - arrays = [arrays[i] for i in indexer] - return arrays, arr_columns - - -def _list_to_arrays(data, columns, coerce_float=False, dtype=None): - if len(data) > 0 and isinstance(data[0], tuple): - content = list(lib.to_object_array_tuples(data).T) - else: - # list of lists - content = list(lib.to_object_array(data).T) - return _convert_object_array(content, columns, dtype=dtype, - coerce_float=coerce_float) - - -def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): - from pandas.core.index import _get_objs_combined_axis - - if columns is None: - columns = _get_objs_combined_axis(data, sort=False) - - indexer_cache = {} - - aligned_values = [] - for s in data: - index = getattr(s, 'index', None) - if index is None: - index = ibase.default_index(len(s)) - - if id(index) in indexer_cache: - indexer = indexer_cache[id(index)] - else: - indexer = 
indexer_cache[id(index)] = index.get_indexer(columns) - - values = com.values_from_object(s) - aligned_values.append(algorithms.take_1d(values, indexer)) - - values = np.vstack(aligned_values) - - if values.dtype == np.object_: - content = list(values.T) - return _convert_object_array(content, columns, dtype=dtype, - coerce_float=coerce_float) - else: - return values.T, columns - - -def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): - if columns is None: - gen = (list(x.keys()) for x in data) - sort = not any(isinstance(d, OrderedDict) for d in data) - columns = lib.fast_unique_multiple_list_gen(gen, sort=sort) - - # assure that they are of the base dict class and not of derived - # classes - data = [(type(d) is dict) and d or dict(d) for d in data] - - content = list(lib.dicts_to_array(data, list(columns)).T) - return _convert_object_array(content, columns, dtype=dtype, - coerce_float=coerce_float) - - -def _convert_object_array(content, columns, coerce_float=False, dtype=None): - if columns is None: - columns = ibase.default_index(len(content)) - else: - if len(columns) != len(content): # pragma: no cover - # caller's responsibility to check for this... - raise AssertionError('{col:d} columns passed, passed data had ' - '{con} columns'.format(col=len(columns), - con=len(content))) - - # provide soft conversion of object dtypes - def convert(arr): - if dtype != object and dtype != np.object: - arr = lib.maybe_convert_objects(arr, try_float=coerce_float) - arr = maybe_cast_to_datetime(arr, dtype) - return arr - - arrays = [convert(arr) for arr in content] - - return arrays, columns - - -def _get_names_from_index(data): - has_some_name = any(getattr(s, 'name', None) is not None for s in data) - if not has_some_name: - return ibase.default_index(len(data)) - - index = lrange(len(data)) - count = 0 - for i, s in enumerate(data): - n = getattr(s, 'name', None) - if n is not None: - index[i] = n - else: - index[i] = 'Unnamed %d' % count - count += 1 - - return index - - -def _homogenize(data, index, dtype=None): - from pandas.core.series import _sanitize_array - - oindex = None - homogenized = [] - - for v in data: - if isinstance(v, Series): - if dtype is not None: - v = v.astype(dtype) - if v.index is not index: - # Forces alignment. No need to copy data since we - # are putting it into an ndarray later - v = v.reindex(index, copy=False) - else: - if isinstance(v, dict): - if oindex is None: - oindex = index.astype('O') - - if isinstance(index, (DatetimeIndex, TimedeltaIndex)): - v = com.dict_compat(v) - else: - v = dict(v) - v = lib.fast_multiget(v, oindex.values, default=np.nan) - v = _sanitize_array(v, index, dtype=dtype, copy=False, - raise_cast_failure=False) - - homogenized.append(v) - - return homogenized - - def _from_nested_dict(data): # TODO: this should be seriously cythonized new_data = OrderedDict() diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py new file mode 100644 index 0000000000000..5859dc9e858b7 --- /dev/null +++ b/pandas/core/internals/construction.py @@ -0,0 +1,699 @@ +""" +Functions for preparing various inputs passed to the DataFrame or Series +constructors before passing them to a BlockManager.
+""" +from collections import OrderedDict + +import numpy as np +import numpy.ma as ma + +from pandas._libs import lib +from pandas._libs.tslibs import IncompatibleFrequency +import pandas.compat as compat +from pandas.compat import ( + get_range_parameters, lmap, lrange, raise_with_traceback, range) + +from pandas.core.dtypes.cast import ( + construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na, + construct_1d_object_array_from_listlike, infer_dtype_from_scalar, + maybe_cast_to_datetime, maybe_cast_to_integer_array, maybe_castable, + maybe_convert_platform, maybe_infer_to_datetimelike, maybe_upcast) +from pandas.core.dtypes.common import ( + is_categorical_dtype, is_datetime64tz_dtype, is_dtype_equal, + is_extension_array_dtype, is_extension_type, is_float_dtype, + is_integer_dtype, is_iterator, is_list_like, is_object_dtype, pandas_dtype) +from pandas.core.dtypes.generic import ( + ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCPeriodIndex, ABCSeries, + ABCTimedeltaIndex) +from pandas.core.dtypes.missing import isna + +from pandas.core import algorithms, common as com +from pandas.core.arrays import Categorical, ExtensionArray, period_array +from pandas.core.index import ( + Index, _get_objs_combined_axis, _union_indexes, ensure_index) +from pandas.core.indexes import base as ibase +from pandas.core.internals import ( + create_block_manager_from_arrays, create_block_manager_from_blocks) + +# --------------------------------------------------------------------- +# BlockManager Interface + + +def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): + """ + Segregate Series based on type and coerce into matrices. + + Needs to handle a lot of exceptional cases. + """ + # figure out the index, if necessary + if index is None: + index = extract_index(arrays) + else: + index = ensure_index(index) + + # don't force copy because getting jammed in an ndarray anyway + arrays = _homogenize(arrays, index, dtype) + + # from BlockManager perspective + axes = [ensure_index(columns), index] + + return create_block_manager_from_arrays(arrays, arr_names, axes) + + +def masked_rec_array_to_mgr(data, index, columns, dtype, copy): + """ + Extract from a masked rec array and create the manager. 
+ """ + + # essentially process a record array then fill it + fill_value = data.fill_value + fdata = ma.getdata(data) + if index is None: + index = get_names_from_index(fdata) + if index is None: + index = ibase.default_index(len(data)) + index = ensure_index(index) + + if columns is not None: + columns = ensure_index(columns) + arrays, arr_columns = to_arrays(fdata, columns) + + # fill if needed + new_arrays = [] + for fv, arr, col in zip(fill_value, arrays, arr_columns): + mask = ma.getmaskarray(data[col]) + if mask.any(): + arr, fv = maybe_upcast(arr, fill_value=fv, copy=True) + arr[mask] = fv + new_arrays.append(arr) + + # create the manager + arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns) + if columns is None: + columns = arr_columns + + mgr = arrays_to_mgr(arrays, arr_columns, index, columns) + + if copy: + mgr = mgr.copy() + return mgr + + +# --------------------------------------------------------------------- +# DataFrame Constructor Interface + +def init_ndarray(values, index, columns, dtype=None, copy=False): + # input must be a ndarray, list, Series, index + + if isinstance(values, ABCSeries): + if columns is None: + if values.name is not None: + columns = [values.name] + if index is None: + index = values.index + else: + values = values.reindex(index) + + # zero len case (GH #2234) + if not len(values) and columns is not None and len(columns): + values = np.empty((0, 1), dtype=object) + + # we could have a categorical type passed or coerced to 'category' + # recast this to an arrays_to_mgr + if (is_categorical_dtype(getattr(values, 'dtype', None)) or + is_categorical_dtype(dtype)): + + if not hasattr(values, 'dtype'): + values = prep_ndarray(values, copy=copy) + values = values.ravel() + elif copy: + values = values.copy() + + index, columns = _get_axes(len(values), 1, index, columns) + return arrays_to_mgr([values], columns, index, columns, + dtype=dtype) + elif (is_datetime64tz_dtype(values) or + is_extension_array_dtype(values)): + # GH#19157 + if columns is None: + columns = [0] + return arrays_to_mgr([values], columns, index, columns, + dtype=dtype) + + # by definition an array here + # the dtypes will be coerced to a single dtype + values = prep_ndarray(values, copy=copy) + + if dtype is not None: + if not is_dtype_equal(values.dtype, dtype): + try: + values = values.astype(dtype) + except Exception as orig: + e = ValueError("failed to cast to '{dtype}' (Exception " + "was: {orig})".format(dtype=dtype, + orig=orig)) + raise_with_traceback(e) + + index, columns = _get_axes(*values.shape, index=index, columns=columns) + values = values.T + + # if we don't have a dtype specified, then try to convert objects + # on the entire block; this is to convert if we have datetimelike's + # embedded in an object type + if dtype is None and is_object_dtype(values): + values = maybe_infer_to_datetimelike(values) + + return create_block_manager_from_blocks([values], [columns, index]) + + +def init_dict(data, index, columns, dtype=None): + """ + Segregate Series based on type and coerce into matrices. + Needs to handle a lot of exceptional cases. 
+ """ + if columns is not None: + from pandas.core.series import Series + arrays = Series(data, index=columns, dtype=object) + data_names = arrays.index + + missing = arrays.isnull() + if index is None: + # GH10856 + # raise ValueError if only scalars in dict + index = extract_index(arrays[~missing]) + else: + index = ensure_index(index) + + # no obvious "empty" int column + if missing.any() and not is_integer_dtype(dtype): + if dtype is None or np.issubdtype(dtype, np.flexible): + # GH#1783 + nan_dtype = object + else: + nan_dtype = dtype + v = construct_1d_arraylike_from_scalar(np.nan, len(index), + nan_dtype) + arrays.loc[missing] = [v] * missing.sum() + + else: + keys = com.dict_keys_to_ordered_list(data) + columns = data_names = Index(keys) + arrays = [data[k] for k in keys] + + return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) + + +# --------------------------------------------------------------------- + +def prep_ndarray(values, copy=True): + if not isinstance(values, (np.ndarray, ABCSeries, Index)): + if len(values) == 0: + return np.empty((0, 0), dtype=object) + + def convert(v): + return maybe_convert_platform(v) + + # we could have a 1-dim or 2-dim list here + # this is equiv of np.asarray, but does object conversion + # and platform dtype preservation + try: + if is_list_like(values[0]) or hasattr(values[0], 'len'): + values = np.array([convert(v) for v in values]) + elif isinstance(values[0], np.ndarray) and values[0].ndim == 0: + # GH#21861 + values = np.array([convert(v) for v in values]) + else: + values = convert(values) + except (ValueError, TypeError): + values = convert(values) + + else: + + # drop subclass info, do not copy data + values = np.asarray(values) + if copy: + values = values.copy() + + if values.ndim == 1: + values = values.reshape((values.shape[0], 1)) + elif values.ndim != 2: + raise ValueError('Must pass 2-d input') + + return values + + +def _homogenize(data, index, dtype=None): + oindex = None + homogenized = [] + + for v in data: + if isinstance(v, ABCSeries): + if dtype is not None: + v = v.astype(dtype) + if v.index is not index: + # Forces alignment. 
No need to copy data since we + # are putting it into an ndarray later + v = v.reindex(index, copy=False) + else: + if isinstance(v, dict): + if oindex is None: + oindex = index.astype('O') + + if isinstance(index, (ABCDatetimeIndex, ABCTimedeltaIndex)): + v = com.dict_compat(v) + else: + v = dict(v) + v = lib.fast_multiget(v, oindex.values, default=np.nan) + v = sanitize_array(v, index, dtype=dtype, copy=False, + raise_cast_failure=False) + + homogenized.append(v) + + return homogenized + + +def extract_index(data): + index = None + if len(data) == 0: + index = Index([]) + elif len(data) > 0: + raw_lengths = [] + indexes = [] + + have_raw_arrays = False + have_series = False + have_dicts = False + + for v in data: + if isinstance(v, ABCSeries): + have_series = True + indexes.append(v.index) + elif isinstance(v, dict): + have_dicts = True + indexes.append(list(v.keys())) + elif is_list_like(v) and getattr(v, 'ndim', 1) == 1: + have_raw_arrays = True + raw_lengths.append(len(v)) + + if not indexes and not raw_lengths: + raise ValueError('If using all scalar values, you must pass' + ' an index') + + if have_series or have_dicts: + index = _union_indexes(indexes) + + if have_raw_arrays: + lengths = list(set(raw_lengths)) + if len(lengths) > 1: + raise ValueError('arrays must all be same length') + + if have_dicts: + raise ValueError('Mixing dicts with non-Series may lead to ' + 'ambiguous ordering.') + + if have_series: + if lengths[0] != len(index): + msg = ('array length %d does not match index length %d' % + (lengths[0], len(index))) + raise ValueError(msg) + else: + index = ibase.default_index(lengths[0]) + + return ensure_index(index) + + +def reorder_arrays(arrays, arr_columns, columns): + # reorder according to the columns + if (columns is not None and len(columns) and arr_columns is not None and + len(arr_columns)): + indexer = ensure_index(arr_columns).get_indexer(columns) + arr_columns = ensure_index([arr_columns[i] for i in indexer]) + arrays = [arrays[i] for i in indexer] + return arrays, arr_columns + + +def get_names_from_index(data): + has_some_name = any(getattr(s, 'name', None) is not None for s in data) + if not has_some_name: + return ibase.default_index(len(data)) + + index = lrange(len(data)) + count = 0 + for i, s in enumerate(data): + n = getattr(s, 'name', None) + if n is not None: + index[i] = n + else: + index[i] = 'Unnamed %d' % count + count += 1 + + return index + + +def _get_axes(N, K, index, columns): + # helper to create the axes as indexes + # return axes or defaults + + if index is None: + index = ibase.default_index(N) + else: + index = ensure_index(index) + + if columns is None: + columns = ibase.default_index(K) + else: + columns = ensure_index(columns) + return index, columns + + +# --------------------------------------------------------------------- +# Conversion of Inputs to Arrays + +def to_arrays(data, columns, coerce_float=False, dtype=None): + """ + Return list of arrays, columns. 
+ """ + if isinstance(data, ABCDataFrame): + if columns is not None: + arrays = [data._ixs(i, axis=1).values + for i, col in enumerate(data.columns) if col in columns] + else: + columns = data.columns + arrays = [data._ixs(i, axis=1).values for i in range(len(columns))] + + return arrays, columns + + if not len(data): + if isinstance(data, np.ndarray): + columns = data.dtype.names + if columns is not None: + return [[]] * len(columns), columns + return [], [] # columns if columns is not None else [] + if isinstance(data[0], (list, tuple)): + return _list_to_arrays(data, columns, coerce_float=coerce_float, + dtype=dtype) + elif isinstance(data[0], compat.Mapping): + return _list_of_dict_to_arrays(data, columns, + coerce_float=coerce_float, dtype=dtype) + elif isinstance(data[0], ABCSeries): + return _list_of_series_to_arrays(data, columns, + coerce_float=coerce_float, + dtype=dtype) + elif isinstance(data[0], Categorical): + if columns is None: + columns = ibase.default_index(len(data)) + return data, columns + elif (isinstance(data, (np.ndarray, ABCSeries, Index)) and + data.dtype.names is not None): + + columns = list(data.dtype.names) + arrays = [data[k] for k in columns] + return arrays, columns + else: + # last ditch effort + data = lmap(tuple, data) + return _list_to_arrays(data, columns, coerce_float=coerce_float, + dtype=dtype) + + +def _list_to_arrays(data, columns, coerce_float=False, dtype=None): + if len(data) > 0 and isinstance(data[0], tuple): + content = list(lib.to_object_array_tuples(data).T) + else: + # list of lists + content = list(lib.to_object_array(data).T) + return _convert_object_array(content, columns, dtype=dtype, + coerce_float=coerce_float) + + +def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): + if columns is None: + columns = _get_objs_combined_axis(data, sort=False) + + indexer_cache = {} + + aligned_values = [] + for s in data: + index = getattr(s, 'index', None) + if index is None: + index = ibase.default_index(len(s)) + + if id(index) in indexer_cache: + indexer = indexer_cache[id(index)] + else: + indexer = indexer_cache[id(index)] = index.get_indexer(columns) + + values = com.values_from_object(s) + aligned_values.append(algorithms.take_1d(values, indexer)) + + values = np.vstack(aligned_values) + + if values.dtype == np.object_: + content = list(values.T) + return _convert_object_array(content, columns, dtype=dtype, + coerce_float=coerce_float) + else: + return values.T, columns + + +def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): + if columns is None: + gen = (list(x.keys()) for x in data) + sort = not any(isinstance(d, OrderedDict) for d in data) + columns = lib.fast_unique_multiple_list_gen(gen, sort=sort) + + # assure that they are of the base dict class and not of derived + # classes + data = [(type(d) is dict) and d or dict(d) for d in data] + + content = list(lib.dicts_to_array(data, list(columns)).T) + return _convert_object_array(content, columns, dtype=dtype, + coerce_float=coerce_float) + + +def _convert_object_array(content, columns, coerce_float=False, dtype=None): + if columns is None: + columns = ibase.default_index(len(content)) + else: + if len(columns) != len(content): # pragma: no cover + # caller's responsibility to check for this... 
+ raise AssertionError('{col:d} columns passed, passed data had ' + '{con} columns'.format(col=len(columns), + con=len(content))) + + # provide soft conversion of object dtypes + def convert(arr): + if dtype != object and dtype != np.object: + arr = lib.maybe_convert_objects(arr, try_float=coerce_float) + arr = maybe_cast_to_datetime(arr, dtype) + return arr + + arrays = [convert(arr) for arr in content] + + return arrays, columns + + +# --------------------------------------------------------------------- +# Series-Based + +def sanitize_index(data, index, copy=False): + """ + Sanitize an index type to return an ndarray of the underlying, pass + through a non-Index. + """ + + if index is None: + return data + + if len(data) != len(index): + raise ValueError('Length of values does not match length of ' 'index') + + if isinstance(data, ABCIndexClass) and not copy: + pass + elif isinstance(data, (ABCPeriodIndex, ABCDatetimeIndex)): + data = data._values + if copy: + data = data.copy() + + elif isinstance(data, np.ndarray): + + # coerce datetimelike types + if data.dtype.kind in ['M', 'm']: + data = sanitize_array(data, index, copy=copy) + + return data + + +def sanitize_array(data, index, dtype=None, copy=False, + raise_cast_failure=False): + """ + Sanitize input data to an ndarray, copy if specified, coerce to the + dtype if specified. + """ + + if dtype is not None: + dtype = pandas_dtype(dtype) + + if isinstance(data, ma.MaskedArray): + mask = ma.getmaskarray(data) + if mask.any(): + data, fill_value = maybe_upcast(data, copy=True) + data[mask] = fill_value + else: + data = data.copy() + + # GH#846 + if isinstance(data, (np.ndarray, Index, ABCSeries)): + + if dtype is not None: + subarr = np.array(data, copy=False) + + # possibility of nan -> garbage + if is_float_dtype(data.dtype) and is_integer_dtype(dtype): + if not isna(data).any(): + subarr = _try_cast(data, True, dtype, copy, + raise_cast_failure) + elif copy: + subarr = data.copy() + else: + subarr = _try_cast(data, True, dtype, copy, raise_cast_failure) + elif isinstance(data, Index): + # don't coerce Index types + # e.g. 
indexes can have different conversions (so don't fast path + # them) + # GH#6140 + subarr = sanitize_index(data, index, copy=copy) + else: + + # we will try to copy by definition here + subarr = _try_cast(data, True, dtype, copy, raise_cast_failure) + + elif isinstance(data, ExtensionArray): + subarr = data + + if dtype is not None and not data.dtype.is_dtype(dtype): + subarr = data.astype(dtype) + + if copy: + subarr = data.copy() + return subarr + + elif isinstance(data, (list, tuple)) and len(data) > 0: + if dtype is not None: + try: + subarr = _try_cast(data, False, dtype, copy, + raise_cast_failure) + except Exception: + if raise_cast_failure: # pragma: no cover + raise + subarr = np.array(data, dtype=object, copy=copy) + subarr = lib.maybe_convert_objects(subarr) + + else: + subarr = maybe_convert_platform(data) + + subarr = maybe_cast_to_datetime(subarr, dtype) + + elif isinstance(data, range): + # GH#16804 + start, stop, step = get_range_parameters(data) + arr = np.arange(start, stop, step, dtype='int64') + subarr = _try_cast(arr, False, dtype, copy, raise_cast_failure) + else: + subarr = _try_cast(data, False, dtype, copy, raise_cast_failure) + + # scalar like, GH + if getattr(subarr, 'ndim', 0) == 0: + if isinstance(data, list): # pragma: no cover + subarr = np.array(data, dtype=object) + elif index is not None: + value = data + + # figure out the dtype from the value (upcast if necessary) + if dtype is None: + dtype, value = infer_dtype_from_scalar(value) + else: + # need to possibly convert the value here + value = maybe_cast_to_datetime(value, dtype) + + subarr = construct_1d_arraylike_from_scalar( + value, len(index), dtype) + + else: + return subarr.item() + + # the result that we want + elif subarr.ndim == 1: + if index is not None: + + # a 1-element ndarray + if len(subarr) != len(index) and len(subarr) == 1: + subarr = construct_1d_arraylike_from_scalar( + subarr[0], len(index), subarr.dtype) + + elif subarr.ndim > 1: + if isinstance(data, np.ndarray): + raise Exception('Data must be 1-dimensional') + else: + subarr = com.asarray_tuplesafe(data, dtype=dtype) + + # This is to prevent mixed-type Series getting all casted to + # NumPy string type, e.g. NaN --> '-1#IND'. + if issubclass(subarr.dtype.type, compat.string_types): + # GH#16605 + # If not empty convert the data to dtype + # GH#19853: If data is a scalar, subarr has already the result + if not lib.is_scalar(data): + if not np.all(isna(data)): + data = np.array(data, dtype=dtype, copy=False) + subarr = np.array(data, dtype=object, copy=copy) + + if is_object_dtype(subarr.dtype) and dtype != 'object': + inferred = lib.infer_dtype(subarr) + if inferred == 'period': + try: + subarr = period_array(subarr) + except IncompatibleFrequency: + pass + + return subarr + + +def _try_cast(arr, take_fast_path, dtype, copy, raise_cast_failure): + + # perf shortcut as this is the most common case + if take_fast_path: + if maybe_castable(arr) and not copy and dtype is None: + return arr + + try: + # GH#15832: Check if we are requesting a numeric dtype and + # that we can convert the data to the requested dtype.
+ if is_integer_dtype(dtype): + subarr = maybe_cast_to_integer_array(arr, dtype) + + subarr = maybe_cast_to_datetime(arr, dtype) + # Take care in creating object arrays (but iterators are not + # supported): + if is_object_dtype(dtype) and (is_list_like(subarr) and + not (is_iterator(subarr) or + isinstance(subarr, np.ndarray))): + subarr = construct_1d_object_array_from_listlike(subarr) + elif not is_extension_type(subarr): + subarr = construct_1d_ndarray_preserving_na(subarr, dtype, + copy=copy) + except (ValueError, TypeError): + if is_categorical_dtype(dtype): + # We *do* allow casting to categorical, since we know + # that Categorical is the only array type for 'category'. + subarr = Categorical(arr, dtype.categories, + ordered=dtype.ordered) + elif is_extension_array_dtype(dtype): + # create an extension array from its dtype + array_type = dtype.construct_array_type()._from_sequence + subarr = array_type(arr, dtype=dtype, copy=copy) + elif dtype is not None and raise_cast_failure: + raise + else: + subarr = np.array(arr, dtype=object, copy=copy) + return subarr diff --git a/pandas/core/series.py b/pandas/core/series.py index 8d4d7677cca44..6b4c9927ef0f1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -7,35 +7,27 @@ import warnings import numpy as np -import numpy.ma as ma from pandas._libs import iNaT, index as libindex, lib, tslibs import pandas.compat as compat -from pandas.compat import ( - PY36, OrderedDict, StringIO, get_range_parameters, range, u, zip) +from pandas.compat import PY36, OrderedDict, StringIO, u, zip from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, deprecate from pandas.util._validators import validate_bool_kwarg -from pandas.core.dtypes.cast import ( - construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na, - construct_1d_object_array_from_listlike, infer_dtype_from_scalar, - maybe_cast_to_datetime, maybe_cast_to_integer_array, maybe_castable, - maybe_convert_platform, maybe_upcast) from pandas.core.dtypes.common import ( _is_unorderable_exception, ensure_platform_int, is_bool, is_categorical_dtype, is_datetime64tz_dtype, is_datetimelike, is_dict_like, - is_extension_array_dtype, is_extension_type, is_float_dtype, is_hashable, - is_integer, is_integer_dtype, is_iterator, is_list_like, is_object_dtype, - is_scalar, is_string_like, is_timedelta64_dtype, pandas_dtype) + is_extension_array_dtype, is_extension_type, is_hashable, is_integer, + is_iterator, is_list_like, is_scalar, is_string_like, is_timedelta64_dtype) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndexClass, ABCSeries, ABCSparseArray, ABCSparseSeries) + ABCDataFrame, ABCSeries, ABCSparseArray, ABCSparseSeries) from pandas.core.dtypes.missing import ( isna, na_value_for_dtype, notna, remove_na_arraylike) from pandas.core import algorithms, base, generic, nanops, ops from pandas.core.accessor import CachedAccessor -from pandas.core.arrays import ExtensionArray, SparseArray, period_array +from pandas.core.arrays import ExtensionArray, SparseArray from pandas.core.arrays.categorical import Categorical, CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor import pandas.core.common as com @@ -49,6 +41,7 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import check_bool_indexer, maybe_convert_indices from pandas.core.internals import SingleBlockManager +from pandas.core.internals.construction import sanitize_array from pandas.core.strings import 
StringMethods from pandas.core.tools.datetimes import to_datetime @@ -256,8 +249,8 @@ def __init__(self, data=None, index=None, dtype=None, name=None, elif copy: data = data.copy() else: - data = _sanitize_array(data, index, dtype, copy, - raise_cast_failure=True) + data = sanitize_array(data, index, dtype, copy, + raise_cast_failure=True) data = SingleBlockManager(data, index, fastpath=True) @@ -4262,207 +4255,3 @@ def to_period(self, freq=None, copy=True): # Add arithmetic! ops.add_flex_arithmetic_methods(Series) ops.add_special_arithmetic_methods(Series) - - -# ----------------------------------------------------------------------------- -# Supplementary functions - - -def _sanitize_index(data, index, copy=False): - """ - Sanitize an index type to return an ndarray of the underlying, pass - through a non-Index. - """ - - if index is None: - return data - - if len(data) != len(index): - raise ValueError('Length of values does not match length of ' 'index') - - if isinstance(data, ABCIndexClass) and not copy: - pass - elif isinstance(data, (PeriodIndex, DatetimeIndex)): - data = data._values - if copy: - data = data.copy() - - elif isinstance(data, np.ndarray): - - # coerce datetimelike types - if data.dtype.kind in ['M', 'm']: - data = _sanitize_array(data, index, copy=copy) - - return data - - -def _sanitize_array(data, index, dtype=None, copy=False, - raise_cast_failure=False): - """ - Sanitize input data to an ndarray, copy if specified, coerce to the - dtype if specified. - """ - - if dtype is not None: - dtype = pandas_dtype(dtype) - - if isinstance(data, ma.MaskedArray): - mask = ma.getmaskarray(data) - if mask.any(): - data, fill_value = maybe_upcast(data, copy=True) - data[mask] = fill_value - else: - data = data.copy() - - def _try_cast(arr, take_fast_path): - - # perf shortcut as this is the most common case - if take_fast_path: - if maybe_castable(arr) and not copy and dtype is None: - return arr - - try: - # gh-15832: Check if we are requesting a numeric dype and - # that we can convert the data to the requested dtype. - if is_integer_dtype(dtype): - subarr = maybe_cast_to_integer_array(arr, dtype) - - subarr = maybe_cast_to_datetime(arr, dtype) - # Take care in creating object arrays (but iterators are not - # supported): - if is_object_dtype(dtype) and (is_list_like(subarr) and - not (is_iterator(subarr) or - isinstance(subarr, np.ndarray))): - subarr = construct_1d_object_array_from_listlike(subarr) - elif not is_extension_type(subarr): - subarr = construct_1d_ndarray_preserving_na(subarr, dtype, - copy=copy) - except (ValueError, TypeError): - if is_categorical_dtype(dtype): - # We *do* allow casting to categorical, since we know - # that Categorical is the only array type for 'category'. 
- subarr = Categorical(arr, dtype.categories, - ordered=dtype.ordered) - elif is_extension_array_dtype(dtype): - # create an extension array from its dtype - array_type = dtype.construct_array_type()._from_sequence - subarr = array_type(arr, dtype=dtype, copy=copy) - elif dtype is not None and raise_cast_failure: - raise - else: - subarr = np.array(arr, dtype=object, copy=copy) - return subarr - - # GH #846 - if isinstance(data, (np.ndarray, Index, Series)): - - if dtype is not None: - subarr = np.array(data, copy=False) - - # possibility of nan -> garbage - if is_float_dtype(data.dtype) and is_integer_dtype(dtype): - if not isna(data).any(): - subarr = _try_cast(data, True) - elif copy: - subarr = data.copy() - else: - subarr = _try_cast(data, True) - elif isinstance(data, Index): - # don't coerce Index types - # e.g. indexes can have different conversions (so don't fast path - # them) - # GH 6140 - subarr = _sanitize_index(data, index, copy=copy) - else: - - # we will try to copy be-definition here - subarr = _try_cast(data, True) - - elif isinstance(data, ExtensionArray): - subarr = data - - if dtype is not None and not data.dtype.is_dtype(dtype): - subarr = data.astype(dtype) - - if copy: - subarr = data.copy() - return subarr - - elif isinstance(data, (list, tuple)) and len(data) > 0: - if dtype is not None: - try: - subarr = _try_cast(data, False) - except Exception: - if raise_cast_failure: # pragma: no cover - raise - subarr = np.array(data, dtype=object, copy=copy) - subarr = lib.maybe_convert_objects(subarr) - - else: - subarr = maybe_convert_platform(data) - - subarr = maybe_cast_to_datetime(subarr, dtype) - - elif isinstance(data, range): - # GH 16804 - start, stop, step = get_range_parameters(data) - arr = np.arange(start, stop, step, dtype='int64') - subarr = _try_cast(arr, False) - else: - subarr = _try_cast(data, False) - - # scalar like, GH - if getattr(subarr, 'ndim', 0) == 0: - if isinstance(data, list): # pragma: no cover - subarr = np.array(data, dtype=object) - elif index is not None: - value = data - - # figure out the dtype from the value (upcast if necessary) - if dtype is None: - dtype, value = infer_dtype_from_scalar(value) - else: - # need to possibly convert the value here - value = maybe_cast_to_datetime(value, dtype) - - subarr = construct_1d_arraylike_from_scalar( - value, len(index), dtype) - - else: - return subarr.item() - - # the result that we want - elif subarr.ndim == 1: - if index is not None: - - # a 1-element ndarray - if len(subarr) != len(index) and len(subarr) == 1: - subarr = construct_1d_arraylike_from_scalar( - subarr[0], len(index), subarr.dtype) - - elif subarr.ndim > 1: - if isinstance(data, np.ndarray): - raise Exception('Data must be 1-dimensional') - else: - subarr = com.asarray_tuplesafe(data, dtype=dtype) - - # This is to prevent mixed-type Series getting all casted to - # NumPy string type, e.g. NaN --> '-1#IND'. 
- if issubclass(subarr.dtype.type, compat.string_types): - # GH 16605 - # If not empty convert the data to dtype - # GH 19853: If data is a scalar, subarr has already the result - if not is_scalar(data): - if not np.all(isna(data)): - data = np.array(data, dtype=dtype, copy=False) - subarr = np.array(data, dtype=object, copy=copy) - - if is_object_dtype(subarr.dtype) and dtype != 'object': - inferred = lib.infer_dtype(subarr) - if inferred == 'period': - try: - subarr = period_array(subarr) - except tslibs.period.IncompatibleFrequency: - pass - - return subarr diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index f1c46abfab0b2..8fc6a8d8e923f 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -21,12 +21,13 @@ import pandas.core.algorithms as algos from pandas.core.arrays.sparse import SparseArray, SparseDtype import pandas.core.common as com -from pandas.core.frame import DataFrame, _prep_ndarray, extract_index +from pandas.core.frame import DataFrame import pandas.core.generic as generic from pandas.core.index import Index, MultiIndex, ensure_index import pandas.core.indexes.base as ibase from pandas.core.internals import ( BlockManager, create_block_manager_from_arrays) +from pandas.core.internals.construction import extract_index, prep_ndarray import pandas.core.ops as ops from pandas.core.series import Series from pandas.core.sparse.series import SparseSeries @@ -194,7 +195,7 @@ def sp_maker(x): def _init_matrix(self, data, index, columns, dtype=None): """ Init self from ndarray or list of lists """ - data = _prep_ndarray(data, copy=False) + data = prep_ndarray(data, copy=False) index, columns = self._prep_index(data, index, columns) data = {idx: data[:, i] for i, idx in enumerate(columns)} return self._init_dict(data, index, columns, dtype)
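SparseDataFrame similarly swaps only the import location: `prep_ndarray` (formerly `_prep_ndarray`) and `extract_index` behave as before. A final illustrative check of the relocated helpers:

```python
import pandas as pd
from pandas.core.internals.construction import extract_index, prep_ndarray

# prep_ndarray coerces a list of lists to a 2-D ndarray, the shape
# SparseDataFrame._init_matrix expects.
arr = prep_ndarray([[1, 2], [3, 4]], copy=False)
assert arr.shape == (2, 2)

# extract_index unions the indexes of the Series it is given.
idx = extract_index([pd.Series([1, 2], index=['a', 'b'])])
```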