From a12aca18ab3affae2081686c0d6f34f9edb64d46 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Fri, 13 Jul 2018 15:00:00 -0700 Subject: [PATCH 1/2] implement internals as dir --- pandas/core/internals/__init__.py | 20 + .../{internals.py => internals/blocks.py} | 3191 +++-------------- pandas/core/internals/concat.py | 519 +++ pandas/core/internals/managers.py | 1892 ++++++++++ pandas/tests/internals/test_internals.py | 3 +- 5 files changed, 2866 insertions(+), 2759 deletions(-) create mode 100644 pandas/core/internals/__init__.py rename pandas/core/{internals.py => internals/blocks.py} (60%) create mode 100644 pandas/core/internals/concat.py create mode 100644 pandas/core/internals/managers.py diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py new file mode 100644 index 0000000000000..f8c5bc2721ccc --- /dev/null +++ b/pandas/core/internals/__init__.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# flake8: noqa +from .blocks import ( + Block, + ObjectBlock, + DatetimeTZBlock, DatetimeBlock, TimeDeltaBlock, + CategoricalBlock, ExtensionBlock, + SparseBlock, + FloatBlock, IntBlock, + NonConsolidatableMixIn, # used by test_external_block + make_block, # generic, io.packers, io.pytables, test_internals + _block2d_to_blocknd, _factor_indexer, _block_shape, # io.pytables + _safe_reshape) # io.packers + +from .managers import ( + BlockManager, SingleBlockManager, + items_overlap_with_suffix, # reshape.merge + create_block_manager_from_arrays, # core.panel, core.frame, sparse.frame + create_block_manager_from_blocks) # core.frame, core.panel +from .concat import concatenate_block_managers diff --git a/pandas/core/internals.py b/pandas/core/internals/blocks.py similarity index 60% rename from pandas/core/internals.py rename to pandas/core/internals/blocks.py index 208d7b8bcf8a7..19c412b5e92c5 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals/blocks.py @@ -1,27 +1,27 @@ -import warnings -import copy -from warnings import catch_warnings +# -*- coding: utf-8 -*- +from collections import defaultdict +from datetime import date, datetime, timedelta import inspect import itertools import re -import operator -from datetime import datetime, timedelta, date -from collections import defaultdict -from functools import partial +import warnings import numpy as np -from pandas._libs import internals as libinternals +from pandas._libs import ( + lib, tslibs, tslib, Timedelta, + internals as libinternals) +from pandas._libs.tslibs import conversion -from pandas.core.base import PandasObject +from pandas import compat +from pandas.compat import range, zip -from pandas.core.dtypes.dtypes import ( - ExtensionDtype, DatetimeTZDtype, - PandasExtensionDtype, - CategoricalDtype) +from pandas.util._validators import validate_bool_kwarg + +import pandas.core.dtypes.generic as gt from pandas.core.dtypes.common import ( _TD_DTYPE, _NS_DTYPE, - _ensure_int64, _ensure_platform_int, + _ensure_platform_int, is_integer, is_dtype_equal, is_timedelta64_dtype, @@ -31,15 +31,17 @@ is_datetime64tz_dtype, is_bool_dtype, is_object_dtype, - is_datetimelike_v_numeric, - is_float_dtype, is_numeric_dtype, + is_float_dtype, is_numeric_v_string_like, is_extension_type, is_extension_array_dtype, is_list_like, is_re, is_re_compilable, - is_scalar, - _get_dtype) + is_scalar) +from pandas.core.dtypes.dtypes import ( + ExtensionDtype, DatetimeTZDtype, + PandasExtensionDtype, + CategoricalDtype) from pandas.core.dtypes.cast import ( maybe_downcast_to_dtype, maybe_upcast, @@ -57,31 +59,20 @@ 
is_null_datelike_scalar) import pandas.core.dtypes.concat as _concat -from pandas.core.dtypes.generic import ( - ABCSeries, - ABCDatetimeIndex, - ABCExtensionArray, - ABCIndexClass) import pandas.core.common as com import pandas.core.algorithms as algos +from pandas.core.base import PandasObject +from pandas.core import missing -from pandas.core.index import Index, MultiIndex, _ensure_index -from pandas.core.indexing import maybe_convert_indices, check_setitem_lengths from pandas.core.arrays import Categorical +from pandas.core.sparse.array import SparseArray, _maybe_to_sparse + +from pandas.core.index import _ensure_index from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas.io.formats.printing import pprint_thing - -import pandas.core.missing as missing -from pandas.core.sparse.array import _maybe_to_sparse, SparseArray -from pandas._libs import lib, tslib, tslibs -from pandas._libs.tslibs import conversion, Timedelta -from pandas._libs.internals import BlockPlacement +from pandas.core.indexing import check_setitem_lengths -from pandas.util._decorators import cache_readonly -from pandas.util._validators import validate_bool_kwarg -from pandas import compat -from pandas.compat import range, map, zip, u +from pandas.io.formats.printing import pprint_thing class Block(PandasObject): @@ -235,8 +226,8 @@ def mgr_locs(self): @mgr_locs.setter def mgr_locs(self, new_mgr_locs): - if not isinstance(new_mgr_locs, BlockPlacement): - new_mgr_locs = BlockPlacement(new_mgr_locs) + if not isinstance(new_mgr_locs, libinternals.BlockPlacement): + new_mgr_locs = libinternals.BlockPlacement(new_mgr_locs) self._mgr_locs = new_mgr_locs @@ -302,7 +293,7 @@ def __getstate__(self): return self.mgr_locs.indexer, self.values def __setstate__(self, state): - self.mgr_locs = BlockPlacement(state[0]) + self.mgr_locs = libinternals.BlockPlacement(state[0]) self.values = state[1] self.ndim = self.values.ndim @@ -310,6 +301,7 @@ def _slice(self, slicer): """ return a slice of my values """ return self.values[slicer] + # TODO: remove after Panel is gone def reshape_nd(self, labels, shape, ref_items, mgr=None): """ Parameters @@ -1738,8 +1730,8 @@ def __init__(self, values, placement, ndim=None): """ # Placement must be converted to BlockPlacement so that we can check # its length - if not isinstance(placement, BlockPlacement): - placement = BlockPlacement(placement) + if not isinstance(placement, libinternals.BlockPlacement): + placement = libinternals.BlockPlacement(placement) # Maybe infer ndim from placement if ndim is None: @@ -1887,7 +1879,7 @@ def _maybe_coerce_values(self, values): ------- ExtensionArray """ - if isinstance(values, (ABCIndexClass, ABCSeries)): + if isinstance(values, (gt.ABCIndexClass, gt.ABCSeries)): values = values._values return values @@ -2130,148 +2122,6 @@ def should_store(self, value): return is_integer_dtype(value) and value.dtype == self.dtype -class DatetimeLikeBlockMixin(object): - """Mixin class for DatetimeBlock and DatetimeTZBlock.""" - - @property - def _holder(self): - return DatetimeIndex - - @property - def _na_value(self): - return tslibs.NaT - - @property - def fill_value(self): - return tslibs.iNaT - - def get_values(self, dtype=None): - """ - return object dtype as boxed values, such as Timestamps/Timedelta - """ - if is_object_dtype(dtype): - return lib.map_infer(self.values.ravel(), - self._box_func).reshape(self.values.shape) - return self.values - - -class TimeDeltaBlock(DatetimeLikeBlockMixin, 
IntBlock): - __slots__ = () - is_timedelta = True - _can_hold_na = True - is_numeric = False - - def __init__(self, values, placement, ndim=None): - if values.dtype != _TD_DTYPE: - values = conversion.ensure_timedelta64ns(values) - - super(TimeDeltaBlock, self).__init__(values, - placement=placement, ndim=ndim) - - @property - def _holder(self): - return TimedeltaIndex - - @property - def _box_func(self): - return lambda x: Timedelta(x, unit='ns') - - def _can_hold_element(self, element): - tipo = maybe_infer_dtype_type(element) - if tipo is not None: - return issubclass(tipo.type, np.timedelta64) - return is_integer(element) or isinstance( - element, (timedelta, np.timedelta64)) - - def fillna(self, value, **kwargs): - - # allow filling with integers to be - # interpreted as seconds - if is_integer(value) and not isinstance(value, np.timedelta64): - value = Timedelta(value, unit='s') - return super(TimeDeltaBlock, self).fillna(value, **kwargs) - - def _try_coerce_args(self, values, other): - """ - Coerce values and other to int64, with null values converted to - iNaT. values is always ndarray-like, other may not be - - Parameters - ---------- - values : ndarray-like - other : ndarray-like or scalar - - Returns - ------- - base-type values, values mask, base-type other, other mask - """ - - values_mask = isna(values) - values = values.view('i8') - other_mask = False - - if isinstance(other, bool): - raise TypeError - elif is_null_datelike_scalar(other): - other = tslibs.iNaT - other_mask = True - elif isinstance(other, Timedelta): - other_mask = isna(other) - other = other.value - elif isinstance(other, timedelta): - other = Timedelta(other).value - elif isinstance(other, np.timedelta64): - other_mask = isna(other) - other = Timedelta(other).value - elif hasattr(other, 'dtype') and is_timedelta64_dtype(other): - other_mask = isna(other) - other = other.astype('i8', copy=False).view('i8') - else: - # coercion issues - # let higher levels handle - raise TypeError - - return values, values_mask, other, other_mask - - def _try_coerce_result(self, result): - """ reverse of try_coerce_args / try_operate """ - if isinstance(result, np.ndarray): - mask = isna(result) - if result.dtype.kind in ['i', 'f', 'O']: - result = result.astype('m8[ns]') - result[mask] = tslibs.iNaT - elif isinstance(result, (np.integer, np.float)): - result = self._box_func(result) - return result - - def should_store(self, value): - return issubclass(value.dtype.type, np.timedelta64) - - def to_native_types(self, slicer=None, na_rep=None, quoting=None, - **kwargs): - """ convert to our native types format, slicing if desired """ - - values = self.values - if slicer is not None: - values = values[:, slicer] - mask = isna(values) - - rvalues = np.empty(values.shape, dtype=object) - if na_rep is None: - na_rep = 'NaT' - rvalues[mask] = na_rep - imask = (~mask).ravel() - - # FIXME: - # should use the formats.format.Timedelta64Formatter here - # to figure what format to pass to the Timedelta - # e.g. 
to not show the decimals say - rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all') - for val in values.ravel()[imask]], - dtype=object) - return rvalues - - class BoolBlock(NumericBlock): __slots__ = () is_bool = True @@ -2412,7 +2262,7 @@ def _can_hold_element(self, element): def _try_coerce_args(self, values, other): """ provide coercion to our input arguments """ - if isinstance(other, ABCDatetimeIndex): + if isinstance(other, gt.ABCDatetimeIndex): # to store DatetimeTZBlock as object other = other.astype(object).values @@ -2632,66 +2482,208 @@ def concat_same_type(self, to_concat, placement=None): ndim=self.ndim) -class DatetimeBlock(DatetimeLikeBlockMixin, Block): - __slots__ = () - is_datetime = True - _can_hold_na = True - - def __init__(self, values, placement, ndim=None): - values = self._maybe_coerce_values(values) - super(DatetimeBlock, self).__init__(values, - placement=placement, ndim=ndim) - - def _maybe_coerce_values(self, values): - """Input validation for values passed to __init__. Ensure that - we have datetime64ns, coercing if necessary. +class DatetimeLikeBlockMixin(object): + """Mixin class for DatetimeBlock and DatetimeTZBlock.""" - Parameters - ---------- - values : array-like - Must be convertible to datetime64 + @property + def _holder(self): + return DatetimeIndex - Returns - ------- - values : ndarray[datetime64ns] + @property + def _na_value(self): + return tslibs.NaT - Overridden by DatetimeTZBlock. - """ - if values.dtype != _NS_DTYPE: - values = conversion.ensure_datetime64ns(values) - return values + @property + def fill_value(self): + return tslibs.iNaT - def _astype(self, dtype, mgr=None, **kwargs): + def get_values(self, dtype=None): """ - these automatically copy, so copy=True has no effect - raise on an except if raise == True + return object dtype as boxed values, such as Timestamps/Timedelta """ + if is_object_dtype(dtype): + return lib.map_infer(self.values.ravel(), + self._box_func).reshape(self.values.shape) + return self.values - # if we are passed a datetime64[ns, tz] - if is_datetime64tz_dtype(dtype): - dtype = DatetimeTZDtype(dtype) - values = self.values - if getattr(values, 'tz', None) is None: - values = DatetimeIndex(values).tz_localize('UTC') - values = values.tz_convert(dtype.tz) - return self.make_block(values) +class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): + __slots__ = () + is_timedelta = True + _can_hold_na = True + is_numeric = False - # delegate - return super(DatetimeBlock, self)._astype(dtype=dtype, **kwargs) + def __init__(self, values, placement, ndim=None): + if values.dtype != _TD_DTYPE: + values = conversion.ensure_timedelta64ns(values) + + super(TimeDeltaBlock, self).__init__(values, + placement=placement, ndim=ndim) + + @property + def _holder(self): + return TimedeltaIndex + + @property + def _box_func(self): + return lambda x: Timedelta(x, unit='ns') def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: - # TODO: this still uses asarray, instead of dtype.type - element = np.array(element) - return element.dtype == _NS_DTYPE or element.dtype == np.int64 - return (is_integer(element) or isinstance(element, datetime) or - isna(element)) + return issubclass(tipo.type, np.timedelta64) + return is_integer(element) or isinstance( + element, (timedelta, np.timedelta64)) + + def fillna(self, value, **kwargs): + + # allow filling with integers to be + # interpreted as seconds + if is_integer(value) and not isinstance(value, np.timedelta64): + value = 
Timedelta(value, unit='s') + return super(TimeDeltaBlock, self).fillna(value, **kwargs) def _try_coerce_args(self, values, other): """ - Coerce values and other to dtype 'i8'. NaN and NaT convert to + Coerce values and other to int64, with null values converted to + iNaT. values is always ndarray-like, other may not be + + Parameters + ---------- + values : ndarray-like + other : ndarray-like or scalar + + Returns + ------- + base-type values, values mask, base-type other, other mask + """ + + values_mask = isna(values) + values = values.view('i8') + other_mask = False + + if isinstance(other, bool): + raise TypeError + elif is_null_datelike_scalar(other): + other = tslibs.iNaT + other_mask = True + elif isinstance(other, Timedelta): + other_mask = isna(other) + other = other.value + elif isinstance(other, timedelta): + other = Timedelta(other).value + elif isinstance(other, np.timedelta64): + other_mask = isna(other) + other = Timedelta(other).value + elif hasattr(other, 'dtype') and is_timedelta64_dtype(other): + other_mask = isna(other) + other = other.astype('i8', copy=False).view('i8') + else: + # coercion issues + # let higher levels handle + raise TypeError + + return values, values_mask, other, other_mask + + def _try_coerce_result(self, result): + """ reverse of try_coerce_args / try_operate """ + if isinstance(result, np.ndarray): + mask = isna(result) + if result.dtype.kind in ['i', 'f', 'O']: + result = result.astype('m8[ns]') + result[mask] = tslibs.iNaT + elif isinstance(result, (np.integer, np.float)): + result = self._box_func(result) + return result + + def should_store(self, value): + return issubclass(value.dtype.type, np.timedelta64) + + def to_native_types(self, slicer=None, na_rep=None, quoting=None, + **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + values = values[:, slicer] + mask = isna(values) + + rvalues = np.empty(values.shape, dtype=object) + if na_rep is None: + na_rep = 'NaT' + rvalues[mask] = na_rep + imask = (~mask).ravel() + + # FIXME: + # should use the formats.format.Timedelta64Formatter here + # to figure what format to pass to the Timedelta + # e.g. to not show the decimals say + rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all') + for val in values.ravel()[imask]], + dtype=object) + return rvalues + + +class DatetimeBlock(DatetimeLikeBlockMixin, Block): + __slots__ = () + is_datetime = True + _can_hold_na = True + + def __init__(self, values, placement, ndim=None): + values = self._maybe_coerce_values(values) + super(DatetimeBlock, self).__init__(values, + placement=placement, ndim=ndim) + + def _maybe_coerce_values(self, values): + """Input validation for values passed to __init__. Ensure that + we have datetime64ns, coercing if necessary. + + Parameters + ---------- + values : array-like + Must be convertible to datetime64 + + Returns + ------- + values : ndarray[datetime64ns] + + Overridden by DatetimeTZBlock. 
+ """ + if values.dtype != _NS_DTYPE: + values = conversion.ensure_datetime64ns(values) + return values + + def _astype(self, dtype, mgr=None, **kwargs): + """ + these automatically copy, so copy=True has no effect + raise on an except if raise == True + """ + + # if we are passed a datetime64[ns, tz] + if is_datetime64tz_dtype(dtype): + dtype = DatetimeTZDtype(dtype) + + values = self.values + if getattr(values, 'tz', None) is None: + values = DatetimeIndex(values).tz_localize('UTC') + values = values.tz_convert(dtype.tz) + return self.make_block(values) + + # delegate + return super(DatetimeBlock, self)._astype(dtype=dtype, **kwargs) + + def _can_hold_element(self, element): + tipo = maybe_infer_dtype_type(element) + if tipo is not None: + # TODO: this still uses asarray, instead of dtype.type + element = np.array(element) + return element.dtype == _NS_DTYPE or element.dtype == np.int64 + return (is_integer(element) or isinstance(element, datetime) or + isna(element)) + + def _try_coerce_args(self, values, other): + """ + Coerce values and other to dtype 'i8'. NaN and NaT convert to the smallest i8, and will correctly round-trip to NaT if converted back in _try_coerce_result. values is always ndarray-like, other may not be @@ -2883,7 +2875,7 @@ def _try_coerce_args(self, values, other): values = _block_shape(values.asi8, ndim=self.ndim) other_mask = False - if isinstance(other, ABCSeries): + if isinstance(other, gt.ABCSeries): other = self._holder(other) other_mask = isna(other) @@ -3007,7 +2999,7 @@ class SparseBlock(NonConsolidatableMixIn, Block): def __init__(self, values, placement, ndim=None): # Ensure that we have the underlying SparseArray here... - if isinstance(values, ABCSeries): + if isinstance(values, gt.ABCSeries): values = values.values assert isinstance(values, SparseArray) super(SparseBlock, self).__init__(values, placement, ndim=ndim) @@ -3153,6 +3145,9 @@ def sparse_reindex(self, new_index): placement=self.mgr_locs) +# ------------------------------------------------------------------- +# Constructor Helpers + def get_block_type(values, dtype=None): """ Find the appropriate Block subclass to use for the given values and dtype. @@ -3212,2099 +3207,180 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, return klass(values, ndim=ndim, placement=placement) -# TODO: flexible with index=None and/or items=None +def form_blocks(arrays, names, axes): + # put "leftover" items in float bucket, where else? + # generalize? + items_dict = defaultdict(list) + extra_locs = [] -class BlockManager(PandasObject): - """ - Core internal data structure to implement DataFrame, Series, Panel, etc. + names_idx = _ensure_index(names) + if names_idx.equals(axes[0]): + names_indexer = np.arange(len(names_idx)) + else: + assert names_idx.intersection(axes[0]).is_unique + names_indexer = names_idx.get_indexer_for(axes[0]) - Manage a bunch of labeled 2D mixed-type ndarrays. 
Essentially it's a - lightweight blocked set of labeled data to be manipulated by the DataFrame - public API class + for i, name_idx in enumerate(names_indexer): + if name_idx == -1: + extra_locs.append(i) + continue - Attributes - ---------- - shape - ndim - axes - values - items + k = names[name_idx] + v = arrays[name_idx] - Methods - ------- - set_axis(axis, new_labels) - copy(deep=True) + block_type = get_block_type(v) + items_dict[block_type.__name__].append((i, k, v)) - get_dtype_counts - get_ftype_counts - get_dtypes - get_ftypes + blocks = [] + if len(items_dict['FloatBlock']): + float_blocks = _multi_blockify(items_dict['FloatBlock']) + blocks.extend(float_blocks) - apply(func, axes, block_filter_fn) + if len(items_dict['ComplexBlock']): + complex_blocks = _multi_blockify(items_dict['ComplexBlock']) + blocks.extend(complex_blocks) - get_bool_data - get_numeric_data + if len(items_dict['TimeDeltaBlock']): + timedelta_blocks = _multi_blockify(items_dict['TimeDeltaBlock']) + blocks.extend(timedelta_blocks) - get_slice(slice_like, axis) - get(label) - iget(loc) - get_scalar(label_tup) + if len(items_dict['IntBlock']): + int_blocks = _multi_blockify(items_dict['IntBlock']) + blocks.extend(int_blocks) - take(indexer, axis) - reindex_axis(new_labels, axis) - reindex_indexer(new_labels, indexer, axis) + if len(items_dict['DatetimeBlock']): + datetime_blocks = _simple_blockify(items_dict['DatetimeBlock'], + _NS_DTYPE) + blocks.extend(datetime_blocks) - delete(label) - insert(loc, label, value) - set(label, value) + if len(items_dict['DatetimeTZBlock']): + dttz_blocks = [make_block(array, + klass=DatetimeTZBlock, + placement=[i]) + for i, _, array in items_dict['DatetimeTZBlock']] + blocks.extend(dttz_blocks) - Parameters - ---------- + if len(items_dict['BoolBlock']): + bool_blocks = _simple_blockify(items_dict['BoolBlock'], np.bool_) + blocks.extend(bool_blocks) + if len(items_dict['ObjectBlock']) > 0: + object_blocks = _simple_blockify(items_dict['ObjectBlock'], np.object_) + blocks.extend(object_blocks) - Notes - ----- - This is *not* a public API class - """ - __slots__ = ['axes', 'blocks', '_ndim', '_shape', '_known_consolidated', - '_is_consolidated', '_blknos', '_blklocs'] - - def __init__(self, blocks, axes, do_integrity_check=True): - self.axes = [_ensure_index(ax) for ax in axes] - self.blocks = tuple(blocks) - - for block in blocks: - if block.is_sparse: - if len(block.mgr_locs) != 1: - raise AssertionError("Sparse block refers to multiple " - "items") - else: - if self.ndim != block.ndim: - raise AssertionError( - 'Number of Block dimensions ({block}) must equal ' - 'number of axes ({self})'.format(block=block.ndim, - self=self.ndim)) + if len(items_dict['SparseBlock']) > 0: + sparse_blocks = _sparse_blockify(items_dict['SparseBlock']) + blocks.extend(sparse_blocks) - if do_integrity_check: - self._verify_integrity() + if len(items_dict['CategoricalBlock']) > 0: + cat_blocks = [make_block(array, klass=CategoricalBlock, placement=[i]) + for i, _, array in items_dict['CategoricalBlock']] + blocks.extend(cat_blocks) - self._consolidate_check() + if len(items_dict['ExtensionBlock']): - self._rebuild_blknos_and_blklocs() + external_blocks = [ + make_block(array, klass=ExtensionBlock, placement=[i]) + for i, _, array in items_dict['ExtensionBlock'] + ] - def make_empty(self, axes=None): - """ return an empty BlockManager with the items axis of len 0 """ - if axes is None: - axes = [_ensure_index([])] + [_ensure_index(a) - for a in self.axes[1:]] + blocks.extend(external_blocks) - # 
preserve dtype if possible - if self.ndim == 1: - blocks = np.array([], dtype=self.array_dtype) - else: - blocks = [] - return self.__class__(blocks, axes) + if len(extra_locs): + shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:]) - def __nonzero__(self): - return True + # empty items -> dtype object + block_values = np.empty(shape, dtype=object) + block_values.fill(np.nan) - # Python3 compat - __bool__ = __nonzero__ + na_block = make_block(block_values, placement=extra_locs) + blocks.append(na_block) - @property - def shape(self): - return tuple(len(ax) for ax in self.axes) + return blocks - @property - def ndim(self): - return len(self.axes) - def set_axis(self, axis, new_labels): - new_labels = _ensure_index(new_labels) - old_len = len(self.axes[axis]) - new_len = len(new_labels) +def _simple_blockify(tuples, dtype): + """ return a single array of a block that has a single dtype; if dtype is + not None, coerce to this dtype + """ + values, placement = _stack_arrays(tuples, dtype) - if new_len != old_len: - raise ValueError( - 'Length mismatch: Expected axis has {old} elements, new ' - 'values have {new} elements'.format(old=old_len, new=new_len)) + # CHECK DTYPE? + if dtype is not None and values.dtype != dtype: # pragma: no cover + values = values.astype(dtype) - self.axes[axis] = new_labels + block = make_block(values, placement=placement) + return [block] - def rename_axis(self, mapper, axis, copy=True, level=None): - """ - Rename one of axes. - Parameters - ---------- - mapper : unary callable - axis : int - copy : boolean, default True - level : int, default None +def _multi_blockify(tuples, dtype=None): + """ return an array of blocks that potentially have different dtypes """ - """ - obj = self.copy(deep=copy) - obj.set_axis(axis, _transform_index(self.axes[axis], mapper, level)) - return obj + # group by dtype + grouper = itertools.groupby(tuples, lambda x: x[2].dtype) - def add_prefix(self, prefix): - f = partial('{prefix}{}'.format, prefix=prefix) - return self.rename_axis(f, axis=0) + new_blocks = [] + for dtype, tup_block in grouper: - def add_suffix(self, suffix): - f = partial('{}{suffix}'.format, suffix=suffix) - return self.rename_axis(f, axis=0) + values, placement = _stack_arrays(list(tup_block), dtype) - @property - def _is_single_block(self): - if self.ndim == 1: - return True + block = make_block(values, placement=placement) + new_blocks.append(block) - if len(self.blocks) != 1: - return False + return new_blocks - blk = self.blocks[0] - return (blk.mgr_locs.is_slice_like and - blk.mgr_locs.as_slice == slice(0, len(self), 1)) - def _rebuild_blknos_and_blklocs(self): - """ - Update mgr._blknos / mgr._blklocs. 
- """ - new_blknos = np.empty(self.shape[0], dtype=np.int64) - new_blklocs = np.empty(self.shape[0], dtype=np.int64) - new_blknos.fill(-1) - new_blklocs.fill(-1) - - for blkno, blk in enumerate(self.blocks): - rl = blk.mgr_locs - new_blknos[rl.indexer] = blkno - new_blklocs[rl.indexer] = np.arange(len(rl)) - - if (new_blknos == -1).any(): - raise AssertionError("Gaps in blk ref_locs") - - self._blknos = new_blknos - self._blklocs = new_blklocs - - # make items read only for now - def _get_items(self): - return self.axes[0] - - items = property(fget=_get_items) - - def _get_counts(self, f): - """ return a dict of the counts of the function in BlockManager """ - self._consolidate_inplace() - counts = dict() - for b in self.blocks: - v = f(b) - counts[v] = counts.get(v, 0) + b.shape[0] - return counts - - def get_dtype_counts(self): - return self._get_counts(lambda b: b.dtype.name) - - def get_ftype_counts(self): - return self._get_counts(lambda b: b.ftype) - - def get_dtypes(self): - dtypes = np.array([blk.dtype for blk in self.blocks]) - return algos.take_1d(dtypes, self._blknos, allow_fill=False) - - def get_ftypes(self): - ftypes = np.array([blk.ftype for blk in self.blocks]) - return algos.take_1d(ftypes, self._blknos, allow_fill=False) - - def __getstate__(self): - block_values = [b.values for b in self.blocks] - block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] - axes_array = [ax for ax in self.axes] - - extra_state = { - '0.14.1': { - 'axes': axes_array, - 'blocks': [dict(values=b.values, mgr_locs=b.mgr_locs.indexer) - for b in self.blocks] - } - } - - # First three elements of the state are to maintain forward - # compatibility with 0.13.1. - return axes_array, block_values, block_items, extra_state - - def __setstate__(self, state): - def unpickle_block(values, mgr_locs): - # numpy < 1.7 pickle compat - if values.dtype == 'M8[us]': - values = values.astype('M8[ns]') - return make_block(values, placement=mgr_locs) - - if (isinstance(state, tuple) and len(state) >= 4 and - '0.14.1' in state[3]): - state = state[3]['0.14.1'] - self.axes = [_ensure_index(ax) for ax in state['axes']] - self.blocks = tuple(unpickle_block(b['values'], b['mgr_locs']) - for b in state['blocks']) - else: - # discard anything after 3rd, support beta pickling format for a - # little while longer - ax_arrays, bvalues, bitems = state[:3] - - self.axes = [_ensure_index(ax) for ax in ax_arrays] - - if len(bitems) == 1 and self.axes[0].equals(bitems[0]): - # This is a workaround for pre-0.14.1 pickles that didn't - # support unpickling multi-block frames/panels with non-unique - # columns/items, because given a manager with items ["a", "b", - # "a"] there's no way of knowing which block's "a" is where. - # - # Single-block case can be supported under the assumption that - # block items corresponded to manager items 1-to-1. 
- all_mgr_locs = [slice(0, len(bitems[0]))] - else: - all_mgr_locs = [self.axes[0].get_indexer(blk_items) - for blk_items in bitems] - - self.blocks = tuple( - unpickle_block(values, mgr_locs) - for values, mgr_locs in zip(bvalues, all_mgr_locs)) - - self._post_setstate() - - def _post_setstate(self): - self._is_consolidated = False - self._known_consolidated = False - self._rebuild_blknos_and_blklocs() - - def __len__(self): - return len(self.items) - - def __unicode__(self): - output = pprint_thing(self.__class__.__name__) - for i, ax in enumerate(self.axes): - if i == 0: - output += u('\nItems: {ax}'.format(ax=ax)) - else: - output += u('\nAxis {i}: {ax}'.format(i=i, ax=ax)) - - for block in self.blocks: - output += u('\n{block}'.format(block=pprint_thing(block))) - return output - - def _verify_integrity(self): - mgr_shape = self.shape - tot_items = sum(len(x.mgr_locs) for x in self.blocks) - for block in self.blocks: - if block._verify_integrity and block.shape[1:] != mgr_shape[1:]: - construction_error(tot_items, block.shape[1:], self.axes) - if len(self.items) != tot_items: - raise AssertionError('Number of manager items must equal union of ' - 'block items\n# manager items: {0}, # ' - 'tot_items: {1}'.format( - len(self.items), tot_items)) - - def apply(self, f, axes=None, filter=None, do_integrity_check=False, - consolidate=True, **kwargs): - """ - iterate over the blocks, collect and create a new block manager - - Parameters - ---------- - f : the callable or function name to operate on at the block level - axes : optional (if not supplied, use self.axes) - filter : list, if supplied, only call the block if the filter is in - the block - do_integrity_check : boolean, default False. Do the block manager - integrity check - consolidate: boolean, default True. Join together blocks having same - dtype - - Returns - ------- - Block Manager (new object) - - """ - - result_blocks = [] - - # filter kwarg is used in replace-* family of methods - if filter is not None: - filter_locs = set(self.items.get_indexer_for(filter)) - if len(filter_locs) == len(self.items): - # All items are included, as if there were no filtering - filter = None - else: - kwargs['filter'] = filter_locs - - if consolidate: - self._consolidate_inplace() - - if f == 'where': - align_copy = True - if kwargs.get('align', True): - align_keys = ['other', 'cond'] - else: - align_keys = ['cond'] - elif f == 'putmask': - align_copy = False - if kwargs.get('align', True): - align_keys = ['new', 'mask'] - else: - align_keys = ['mask'] - elif f == 'eval': - align_copy = False - align_keys = ['other'] - elif f == 'fillna': - # fillna internally does putmask, maybe it's better to do this - # at mgr, not block level? - align_copy = False - align_keys = ['value'] - else: - align_keys = [] - - # TODO(EA): may interfere with ExtensionBlock.setitem for blocks - # with a .values attribute. 
- aligned_args = dict((k, kwargs[k]) - for k in align_keys - if hasattr(kwargs[k], 'values') and - not isinstance(kwargs[k], ABCExtensionArray)) - - for b in self.blocks: - if filter is not None: - if not b.mgr_locs.isin(filter_locs).any(): - result_blocks.append(b) - continue - - if aligned_args: - b_items = self.items[b.mgr_locs.indexer] - - for k, obj in aligned_args.items(): - axis = getattr(obj, '_info_axis_number', 0) - kwargs[k] = obj.reindex(b_items, axis=axis, - copy=align_copy) - - kwargs['mgr'] = self - applied = getattr(b, f)(**kwargs) - result_blocks = _extend_blocks(applied, result_blocks) - - if len(result_blocks) == 0: - return self.make_empty(axes or self.axes) - bm = self.__class__(result_blocks, axes or self.axes, - do_integrity_check=do_integrity_check) - bm._consolidate_inplace() - return bm - - def reduction(self, f, axis=0, consolidate=True, transposed=False, - **kwargs): - """ - iterate over the blocks, collect and create a new block manager. - This routine is intended for reduction type operations and - will do inference on the generated blocks. - - Parameters - ---------- - f: the callable or function name to operate on at the block level - axis: reduction axis, default 0 - consolidate: boolean, default True. Join together blocks having same - dtype - transposed: boolean, default False - we are holding transposed data - - Returns - ------- - Block Manager (new object) - - """ - - if consolidate: - self._consolidate_inplace() - - axes, blocks = [], [] - for b in self.blocks: - kwargs['mgr'] = self - axe, block = getattr(b, f)(axis=axis, **kwargs) - - axes.append(axe) - blocks.append(block) - - # note that some DatetimeTZ, Categorical are always ndim==1 - ndim = {b.ndim for b in blocks} - - if 2 in ndim: - - new_axes = list(self.axes) - - # multiple blocks that are reduced - if len(blocks) > 1: - new_axes[1] = axes[0] - - # reset the placement to the original - for b, sb in zip(blocks, self.blocks): - b.mgr_locs = sb.mgr_locs - - else: - new_axes[axis] = Index(np.concatenate( - [ax.values for ax in axes])) - - if transposed: - new_axes = new_axes[::-1] - blocks = [b.make_block(b.values.T, - placement=np.arange(b.shape[1]) - ) for b in blocks] - - return self.__class__(blocks, new_axes) - - # 0 ndim - if 0 in ndim and 1 not in ndim: - values = np.array([b.values for b in blocks]) - if len(values) == 1: - return values.item() - blocks = [make_block(values, ndim=1)] - axes = Index([ax[0] for ax in axes]) - - # single block - values = _concat._concat_compat([b.values for b in blocks]) - - # compute the orderings of our original data - if len(self.blocks) > 1: - - indexer = np.empty(len(self.axes[0]), dtype=np.intp) - i = 0 - for b in self.blocks: - for j in b.mgr_locs: - indexer[j] = i - i = i + 1 - - values = values.take(indexer) - - return SingleBlockManager( - [make_block(values, - ndim=1, - placement=np.arange(len(values)))], - axes[0]) - - def isna(self, func, **kwargs): - return self.apply('apply', func=func, **kwargs) - - def where(self, **kwargs): - return self.apply('where', **kwargs) - - def eval(self, **kwargs): - return self.apply('eval', **kwargs) - - def quantile(self, **kwargs): - return self.reduction('quantile', **kwargs) - - def setitem(self, **kwargs): - return self.apply('setitem', **kwargs) - - def putmask(self, **kwargs): - return self.apply('putmask', **kwargs) - - def diff(self, **kwargs): - return self.apply('diff', **kwargs) - - def interpolate(self, **kwargs): - return self.apply('interpolate', **kwargs) - - def shift(self, **kwargs): - return 
self.apply('shift', **kwargs) - - def fillna(self, **kwargs): - return self.apply('fillna', **kwargs) - - def downcast(self, **kwargs): - return self.apply('downcast', **kwargs) - - def astype(self, dtype, **kwargs): - return self.apply('astype', dtype=dtype, **kwargs) - - def convert(self, **kwargs): - return self.apply('convert', **kwargs) - - def replace(self, **kwargs): - return self.apply('replace', **kwargs) - - def replace_list(self, src_list, dest_list, inplace=False, regex=False, - mgr=None): - """ do a list replace """ - - inplace = validate_bool_kwarg(inplace, 'inplace') - - if mgr is None: - mgr = self - - # figure out our mask a-priori to avoid repeated replacements - values = self.as_array() - - def comp(s): - if isna(s): - return isna(values) - return _maybe_compare(values, getattr(s, 'asm8', s), operator.eq) - - masks = [comp(s) for i, s in enumerate(src_list)] - - result_blocks = [] - src_len = len(src_list) - 1 - for blk in self.blocks: - - # its possible to get multiple result blocks here - # replace ALWAYS will return a list - rb = [blk if inplace else blk.copy()] - for i, (s, d) in enumerate(zip(src_list, dest_list)): - new_rb = [] - for b in rb: - if b.dtype == np.object_: - convert = i == src_len - result = b.replace(s, d, inplace=inplace, regex=regex, - mgr=mgr, convert=convert) - new_rb = _extend_blocks(result, new_rb) - else: - # get our mask for this element, sized to this - # particular block - m = masks[i][b.mgr_locs.indexer] - if m.any(): - b = b.coerce_to_target_dtype(d) - new_rb.extend(b.putmask(m, d, inplace=True)) - else: - new_rb.append(b) - rb = new_rb - result_blocks.extend(rb) - - bm = self.__class__(result_blocks, self.axes) - bm._consolidate_inplace() - return bm - - def reshape_nd(self, axes, **kwargs): - """ a 2d-nd reshape operation on a BlockManager """ - return self.apply('reshape_nd', axes=axes, **kwargs) - - def is_consolidated(self): - """ - Return True if more than one block with the same dtype - """ - if not self._known_consolidated: - self._consolidate_check() - return self._is_consolidated - - def _consolidate_check(self): - ftypes = [blk.ftype for blk in self.blocks] - self._is_consolidated = len(ftypes) == len(set(ftypes)) - self._known_consolidated = True - - @property - def is_mixed_type(self): - # Warning, consolidation needs to get checked upstairs - self._consolidate_inplace() - return len(self.blocks) > 1 - - @property - def is_numeric_mixed_type(self): - # Warning, consolidation needs to get checked upstairs - self._consolidate_inplace() - return all(block.is_numeric for block in self.blocks) - - @property - def is_datelike_mixed_type(self): - # Warning, consolidation needs to get checked upstairs - self._consolidate_inplace() - return any(block.is_datelike for block in self.blocks) - - @property - def any_extension_types(self): - """Whether any of the blocks in this manager are extension blocks""" - return any(block.is_extension for block in self.blocks) - - @property - def is_view(self): - """ return a boolean if we are a single block and are a view """ - if len(self.blocks) == 1: - return self.blocks[0].is_view - - # It is technically possible to figure out which blocks are views - # e.g. [ b.values.base is not None for b in self.blocks ] - # but then we have the case of possibly some blocks being a view - # and some blocks not. setting in theory is possible on the non-view - # blocks w/o causing a SettingWithCopy raise/warn. 
But this is a bit - # complicated - - return False - - def get_bool_data(self, copy=False): - """ - Parameters - ---------- - copy : boolean, default False - Whether to copy the blocks - """ - self._consolidate_inplace() - return self.combine([b for b in self.blocks if b.is_bool], copy) - - def get_numeric_data(self, copy=False): - """ - Parameters - ---------- - copy : boolean, default False - Whether to copy the blocks - """ - self._consolidate_inplace() - return self.combine([b for b in self.blocks if b.is_numeric], copy) - - def combine(self, blocks, copy=True): - """ return a new manager with the blocks """ - if len(blocks) == 0: - return self.make_empty() - - # FIXME: optimization potential - indexer = np.sort(np.concatenate([b.mgr_locs.as_array - for b in blocks])) - inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) - - new_blocks = [] - for b in blocks: - b = b.copy(deep=copy) - b.mgr_locs = algos.take_1d(inv_indexer, b.mgr_locs.as_array, - axis=0, allow_fill=False) - new_blocks.append(b) - - axes = list(self.axes) - axes[0] = self.items.take(indexer) - - return self.__class__(new_blocks, axes, do_integrity_check=False) - - def get_slice(self, slobj, axis=0): - if axis >= self.ndim: - raise IndexError("Requested axis not found in manager") - - if axis == 0: - new_blocks = self._slice_take_blocks_ax0(slobj) - else: - slicer = [slice(None)] * (axis + 1) - slicer[axis] = slobj - slicer = tuple(slicer) - new_blocks = [blk.getitem_block(slicer) for blk in self.blocks] - - new_axes = list(self.axes) - new_axes[axis] = new_axes[axis][slobj] - - bm = self.__class__(new_blocks, new_axes, do_integrity_check=False) - bm._consolidate_inplace() - return bm - - def __contains__(self, item): - return item in self.items - - @property - def nblocks(self): - return len(self.blocks) - - def copy(self, deep=True, mgr=None): - """ - Make deep or shallow copy of BlockManager - - Parameters - ---------- - deep : boolean o rstring, default True - If False, return shallow copy (do not copy data) - If 'all', copy data and a deep copy of the index - - Returns - ------- - copy : BlockManager - """ - - # this preserves the notion of view copying of axes - if deep: - if deep == 'all': - copy = lambda ax: ax.copy(deep=True) - else: - copy = lambda ax: ax.view() - new_axes = [copy(ax) for ax in self.axes] - else: - new_axes = list(self.axes) - return self.apply('copy', axes=new_axes, deep=deep, - do_integrity_check=False) - - def as_array(self, transpose=False, items=None): - """Convert the blockmanager data into an numpy array. - - Parameters - ---------- - transpose : boolean, default False - If True, transpose the return array - items : list of strings or None - Names of block items that will be included in the returned - array. 
``None`` means that all block items will be used - - Returns - ------- - arr : ndarray - """ - if len(self.blocks) == 0: - arr = np.empty(self.shape, dtype=float) - return arr.transpose() if transpose else arr - - if items is not None: - mgr = self.reindex_axis(items, axis=0) - else: - mgr = self - - if self._is_single_block or not self.is_mixed_type: - arr = mgr.blocks[0].get_values() - else: - arr = mgr._interleave() - - return arr.transpose() if transpose else arr - - def _interleave(self): - """ - Return ndarray from blocks with specified item order - Items must be contained in the blocks - """ - dtype = _interleaved_dtype(self.blocks) - - result = np.empty(self.shape, dtype=dtype) - - if result.shape[0] == 0: - # Workaround for numpy 1.7 bug: - # - # >>> a = np.empty((0,10)) - # >>> a[slice(0,0)] - # array([], shape=(0, 10), dtype=float64) - # >>> a[[]] - # Traceback (most recent call last): - # File "", line 1, in - # IndexError: index 0 is out of bounds for axis 0 with size 0 - return result - - itemmask = np.zeros(self.shape[0]) - - for blk in self.blocks: - rl = blk.mgr_locs - result[rl.indexer] = blk.get_values(dtype) - itemmask[rl.indexer] = 1 - - if not itemmask.all(): - raise AssertionError('Some items were not contained in blocks') - - return result - - def to_dict(self, copy=True): - """ - Return a dict of str(dtype) -> BlockManager - - Parameters - ---------- - copy : boolean, default True - - Returns - ------- - values : a dict of dtype -> BlockManager - - Notes - ----- - This consolidates based on str(dtype) - """ - self._consolidate_inplace() - - bd = {} - for b in self.blocks: - bd.setdefault(str(b.dtype), []).append(b) - - return {dtype: self.combine(blocks, copy=copy) - for dtype, blocks in bd.items()} - - def xs(self, key, axis=1, copy=True, takeable=False): - if axis < 1: - raise AssertionError( - 'Can only take xs across axis >= 1, got {ax}'.format(ax=axis)) - - # take by position - if takeable: - loc = key - else: - loc = self.axes[axis].get_loc(key) - - slicer = [slice(None, None) for _ in range(self.ndim)] - slicer[axis] = loc - slicer = tuple(slicer) - - new_axes = list(self.axes) - - # could be an array indexer! 
- if isinstance(loc, (slice, np.ndarray)): - new_axes[axis] = new_axes[axis][loc] - else: - new_axes.pop(axis) - - new_blocks = [] - if len(self.blocks) > 1: - # we must copy here as we are mixed type - for blk in self.blocks: - newb = make_block(values=blk.values[slicer], - klass=blk.__class__, - placement=blk.mgr_locs) - new_blocks.append(newb) - elif len(self.blocks) == 1: - block = self.blocks[0] - vals = block.values[slicer] - if copy: - vals = vals.copy() - new_blocks = [make_block(values=vals, - placement=block.mgr_locs, - klass=block.__class__)] - - return self.__class__(new_blocks, new_axes) - - def fast_xs(self, loc): - """ - get a cross sectional for a given location in the - items ; handle dups - - return the result, is *could* be a view in the case of a - single block - """ - if len(self.blocks) == 1: - return self.blocks[0].iget((slice(None), loc)) - - items = self.items - - # non-unique (GH4726) - if not items.is_unique: - result = self._interleave() - if self.ndim == 2: - result = result.T - return result[loc] - - # unique - dtype = _interleaved_dtype(self.blocks) - n = len(items) - result = np.empty(n, dtype=dtype) - for blk in self.blocks: - # Such assignment may incorrectly coerce NaT to None - # result[blk.mgr_locs] = blk._slice((slice(None), loc)) - for i, rl in enumerate(blk.mgr_locs): - result[rl] = blk._try_coerce_result(blk.iget((i, loc))) - - return result - - def consolidate(self): - """ - Join together blocks having same dtype - - Returns - ------- - y : BlockManager - """ - if self.is_consolidated(): - return self - - bm = self.__class__(self.blocks, self.axes) - bm._is_consolidated = False - bm._consolidate_inplace() - return bm - - def _consolidate_inplace(self): - if not self.is_consolidated(): - self.blocks = tuple(_consolidate(self.blocks)) - self._is_consolidated = True - self._known_consolidated = True - self._rebuild_blknos_and_blklocs() - - def get(self, item, fastpath=True): - """ - Return values for selected item (ndarray or BlockManager). - """ - if self.items.is_unique: - - if not isna(item): - loc = self.items.get_loc(item) - else: - indexer = np.arange(len(self.items))[isna(self.items)] - - # allow a single nan location indexer - if not is_scalar(indexer): - if len(indexer) == 1: - loc = indexer.item() - else: - raise ValueError("cannot label index with a null key") - - return self.iget(loc, fastpath=fastpath) - else: - - if isna(item): - raise TypeError("cannot label index with a null key") - - indexer = self.items.get_indexer_for([item]) - return self.reindex_indexer(new_axis=self.items[indexer], - indexer=indexer, axis=0, - allow_dups=True) - - def iget(self, i, fastpath=True): - """ - Return the data as a SingleBlockManager if fastpath=True and possible - - Otherwise return as a ndarray - """ - block = self.blocks[self._blknos[i]] - values = block.iget(self._blklocs[i]) - if not fastpath or not block._box_to_block_values or values.ndim != 1: - return values - - # fastpath shortcut for select a single-dim from a 2-dim BM - return SingleBlockManager( - [block.make_block_same_class(values, - placement=slice(0, len(values)), - ndim=1)], - self.axes[1]) - - def get_scalar(self, tup): - """ - Retrieve single item - """ - full_loc = [ax.get_loc(x) for ax, x in zip(self.axes, tup)] - blk = self.blocks[self._blknos[full_loc[0]]] - values = blk.values - - # FIXME: this may return non-upcasted types? 
- if values.ndim == 1: - return values[full_loc[1]] - - full_loc[0] = self._blklocs[full_loc[0]] - return values[tuple(full_loc)] - - def delete(self, item): - """ - Delete selected item (items if non-unique) in-place. - """ - indexer = self.items.get_loc(item) - - is_deleted = np.zeros(self.shape[0], dtype=np.bool_) - is_deleted[indexer] = True - ref_loc_offset = -is_deleted.cumsum() - - is_blk_deleted = [False] * len(self.blocks) - - if isinstance(indexer, int): - affected_start = indexer - else: - affected_start = is_deleted.nonzero()[0][0] - - for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]): - blk = self.blocks[blkno] - bml = blk.mgr_locs - blk_del = is_deleted[bml.indexer].nonzero()[0] - - if len(blk_del) == len(bml): - is_blk_deleted[blkno] = True - continue - elif len(blk_del) != 0: - blk.delete(blk_del) - bml = blk.mgr_locs - - blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer]) - - # FIXME: use Index.delete as soon as it uses fastpath=True - self.axes[0] = self.items[~is_deleted] - self.blocks = tuple(b for blkno, b in enumerate(self.blocks) - if not is_blk_deleted[blkno]) - self._shape = None - self._rebuild_blknos_and_blklocs() - - def set(self, item, value, check=False): - """ - Set new item in-place. Does not consolidate. Adds new Block if not - contained in the current set of items - if check, then validate that we are not setting the same data in-place - """ - # FIXME: refactor, clearly separate broadcasting & zip-like assignment - # can prob also fix the various if tests for sparse/categorical - - # TODO(EA): Remove an is_extension_ when all extension types satisfy - # the interface - value_is_extension_type = (is_extension_type(value) or - is_extension_array_dtype(value)) - - # categorical/spares/datetimetz - if value_is_extension_type: - - def value_getitem(placement): - return value - else: - if value.ndim == self.ndim - 1: - value = _safe_reshape(value, (1,) + value.shape) - - def value_getitem(placement): - return value - else: - - def value_getitem(placement): - return value[placement.indexer] - - if value.shape[1:] != self.shape[1:]: - raise AssertionError('Shape of new values must be compatible ' - 'with manager shape') - - try: - loc = self.items.get_loc(item) - except KeyError: - # This item wasn't present, just insert at end - self.insert(len(self.items), item, value) - return - - if isinstance(loc, int): - loc = [loc] - - blknos = self._blknos[loc] - blklocs = self._blklocs[loc].copy() - - unfit_mgr_locs = [] - unfit_val_locs = [] - removed_blknos = [] - for blkno, val_locs in _get_blkno_placements(blknos, len(self.blocks), - group=True): - blk = self.blocks[blkno] - blk_locs = blklocs[val_locs.indexer] - if blk.should_store(value): - blk.set(blk_locs, value_getitem(val_locs), check=check) - else: - unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) - unfit_val_locs.append(val_locs) - - # If all block items are unfit, schedule the block for removal. 
- if len(val_locs) == len(blk.mgr_locs): - removed_blknos.append(blkno) - else: - self._blklocs[blk.mgr_locs.indexer] = -1 - blk.delete(blk_locs) - self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk)) - - if len(removed_blknos): - # Remove blocks & update blknos accordingly - is_deleted = np.zeros(self.nblocks, dtype=np.bool_) - is_deleted[removed_blknos] = True - - new_blknos = np.empty(self.nblocks, dtype=np.int64) - new_blknos.fill(-1) - new_blknos[~is_deleted] = np.arange(self.nblocks - - len(removed_blknos)) - self._blknos = algos.take_1d(new_blknos, self._blknos, axis=0, - allow_fill=False) - self.blocks = tuple(blk for i, blk in enumerate(self.blocks) - if i not in set(removed_blknos)) - - if unfit_val_locs: - unfit_mgr_locs = np.concatenate(unfit_mgr_locs) - unfit_count = len(unfit_mgr_locs) - - new_blocks = [] - if value_is_extension_type: - # This code (ab-)uses the fact that sparse blocks contain only - # one item. - new_blocks.extend( - make_block(values=value.copy(), ndim=self.ndim, - placement=slice(mgr_loc, mgr_loc + 1)) - for mgr_loc in unfit_mgr_locs) - - self._blknos[unfit_mgr_locs] = (np.arange(unfit_count) + - len(self.blocks)) - self._blklocs[unfit_mgr_locs] = 0 - - else: - # unfit_val_locs contains BlockPlacement objects - unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:]) - - new_blocks.append( - make_block(values=value_getitem(unfit_val_items), - ndim=self.ndim, placement=unfit_mgr_locs)) - - self._blknos[unfit_mgr_locs] = len(self.blocks) - self._blklocs[unfit_mgr_locs] = np.arange(unfit_count) - - self.blocks += tuple(new_blocks) - - # Newly created block's dtype may already be present. - self._known_consolidated = False - - def insert(self, loc, item, value, allow_duplicates=False): - """ - Insert item at selected position. - - Parameters - ---------- - loc : int - item : hashable - value : array_like - allow_duplicates: bool - If False, trying to insert non-unique item will raise - - """ - if not allow_duplicates and item in self.items: - # Should this be a different kind of error?? - raise ValueError('cannot insert {}, already exists'.format(item)) - - if not isinstance(loc, int): - raise TypeError("loc must be int") - - # insert to the axis; this could possibly raise a TypeError - new_axis = self.items.insert(loc, item) - - block = make_block(values=value, ndim=self.ndim, - placement=slice(loc, loc + 1)) - - for blkno, count in _fast_count_smallints(self._blknos[loc:]): - blk = self.blocks[blkno] - if count == len(blk.mgr_locs): - blk.mgr_locs = blk.mgr_locs.add(1) - else: - new_mgr_locs = blk.mgr_locs.as_array.copy() - new_mgr_locs[new_mgr_locs >= loc] += 1 - blk.mgr_locs = new_mgr_locs - - if loc == self._blklocs.shape[0]: - # np.append is a lot faster (at least in numpy 1.7.1), let's use it - # if we can. - self._blklocs = np.append(self._blklocs, 0) - self._blknos = np.append(self._blknos, len(self.blocks)) - else: - self._blklocs = np.insert(self._blklocs, loc, 0) - self._blknos = np.insert(self._blknos, loc, len(self.blocks)) - - self.axes[0] = new_axis - self.blocks += (block,) - self._shape = None - - self._known_consolidated = False - - if len(self.blocks) > 100: - self._consolidate_inplace() - - def reindex_axis(self, new_index, axis, method=None, limit=None, - fill_value=None, copy=True): - """ - Conform block manager to new index. 
- """ - new_index = _ensure_index(new_index) - new_index, indexer = self.axes[axis].reindex(new_index, method=method, - limit=limit) - - return self.reindex_indexer(new_index, indexer, axis=axis, - fill_value=fill_value, copy=copy) - - def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, - allow_dups=False, copy=True): - """ - Parameters - ---------- - new_axis : Index - indexer : ndarray of int64 or None - axis : int - fill_value : object - allow_dups : bool - - pandas-indexer with -1's only. - """ - if indexer is None: - if new_axis is self.axes[axis] and not copy: - return self - - result = self.copy(deep=copy) - result.axes = list(self.axes) - result.axes[axis] = new_axis - return result - - self._consolidate_inplace() - - # some axes don't allow reindexing with dups - if not allow_dups: - self.axes[axis]._can_reindex(indexer) - - if axis >= self.ndim: - raise IndexError("Requested axis not found in manager") - - if axis == 0: - new_blocks = self._slice_take_blocks_ax0(indexer, - fill_tuple=(fill_value,)) - else: - new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=( - fill_value if fill_value is not None else blk.fill_value,)) - for blk in self.blocks] - - new_axes = list(self.axes) - new_axes[axis] = new_axis - return self.__class__(new_blocks, new_axes) - - def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): - """ - Slice/take blocks along axis=0. - - Overloaded for SingleBlock - - Returns - ------- - new_blocks : list of Block - - """ - - allow_fill = fill_tuple is not None - - sl_type, slobj, sllen = _preprocess_slice_or_indexer( - slice_or_indexer, self.shape[0], allow_fill=allow_fill) - - if self._is_single_block: - blk = self.blocks[0] - - if sl_type in ('slice', 'mask'): - return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] - elif not allow_fill or self.ndim == 1: - if allow_fill and fill_tuple[0] is None: - _, fill_value = maybe_promote(blk.dtype) - fill_tuple = (fill_value, ) - - return [blk.take_nd(slobj, axis=0, - new_mgr_locs=slice(0, sllen), - fill_tuple=fill_tuple)] - - if sl_type in ('slice', 'mask'): - blknos = self._blknos[slobj] - blklocs = self._blklocs[slobj] - else: - blknos = algos.take_1d(self._blknos, slobj, fill_value=-1, - allow_fill=allow_fill) - blklocs = algos.take_1d(self._blklocs, slobj, fill_value=-1, - allow_fill=allow_fill) - - # When filling blknos, make sure blknos is updated before appending to - # blocks list, that way new blkno is exactly len(blocks). - # - # FIXME: mgr_groupby_blknos must return mgr_locs in ascending order, - # pytables serialization will break otherwise. - blocks = [] - for blkno, mgr_locs in _get_blkno_placements(blknos, len(self.blocks), - group=True): - if blkno == -1: - # If we've got here, fill_tuple was not None. - fill_value = fill_tuple[0] - - blocks.append(self._make_na_block(placement=mgr_locs, - fill_value=fill_value)) - else: - blk = self.blocks[blkno] - - # Otherwise, slicing along items axis is necessary. - if not blk._can_consolidate: - # A non-consolidatable block, it's easy, because there's - # only one item and each mgr loc is a copy of that single - # item. 
- for mgr_loc in mgr_locs: - newblk = blk.copy(deep=True) - newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1) - blocks.append(newblk) - - else: - blocks.append(blk.take_nd(blklocs[mgr_locs.indexer], - axis=0, new_mgr_locs=mgr_locs, - fill_tuple=None)) - - return blocks - - def _make_na_block(self, placement, fill_value=None): - # TODO: infer dtypes other than float64 from fill_value - - if fill_value is None: - fill_value = np.nan - block_shape = list(self.shape) - block_shape[0] = len(placement) - - dtype, fill_value = infer_dtype_from_scalar(fill_value) - block_values = np.empty(block_shape, dtype=dtype) - block_values.fill(fill_value) - return make_block(block_values, placement=placement) - - def take(self, indexer, axis=1, verify=True, convert=True): - """ - Take items along any axis. - """ - self._consolidate_inplace() - indexer = (np.arange(indexer.start, indexer.stop, indexer.step, - dtype='int64') - if isinstance(indexer, slice) - else np.asanyarray(indexer, dtype='int64')) - - n = self.shape[axis] - if convert: - indexer = maybe_convert_indices(indexer, n) - - if verify: - if ((indexer == -1) | (indexer >= n)).any(): - raise Exception('Indices must be nonzero and less than ' - 'the axis length') - - new_labels = self.axes[axis].take(indexer) - return self.reindex_indexer(new_axis=new_labels, indexer=indexer, - axis=axis, allow_dups=True) - - def merge(self, other, lsuffix='', rsuffix=''): - if not self._is_indexed_like(other): - raise AssertionError('Must have same axes to merge managers') - - l, r = items_overlap_with_suffix(left=self.items, lsuffix=lsuffix, - right=other.items, rsuffix=rsuffix) - new_items = _concat_indexes([l, r]) - - new_blocks = [blk.copy(deep=False) for blk in self.blocks] - - offset = self.shape[0] - for blk in other.blocks: - blk = blk.copy(deep=False) - blk.mgr_locs = blk.mgr_locs.add(offset) - new_blocks.append(blk) - - new_axes = list(self.axes) - new_axes[0] = new_items - - return self.__class__(_consolidate(new_blocks), new_axes) - - def _is_indexed_like(self, other): - """ - Check all axes except items - """ - if self.ndim != other.ndim: - raise AssertionError( - 'Number of dimensions must agree got {ndim} and ' - '{oth_ndim}'.format(ndim=self.ndim, oth_ndim=other.ndim)) - for ax, oax in zip(self.axes[1:], other.axes[1:]): - if not ax.equals(oax): - return False - return True - - def equals(self, other): - self_axes, other_axes = self.axes, other.axes - if len(self_axes) != len(other_axes): - return False - if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): - return False - self._consolidate_inplace() - other._consolidate_inplace() - if len(self.blocks) != len(other.blocks): - return False - - # canonicalize block order, using a tuple combining the type - # name and then mgr_locs because there might be unconsolidated - # blocks (say, Categorical) which can only be distinguished by - # the iteration order - def canonicalize(block): - return (block.dtype.name, block.mgr_locs.as_array.tolist()) - - self_blocks = sorted(self.blocks, key=canonicalize) - other_blocks = sorted(other.blocks, key=canonicalize) - return all(block.equals(oblock) - for block, oblock in zip(self_blocks, other_blocks)) - - def unstack(self, unstacker_func): - """Return a blockmanager with all blocks unstacked. - - Parameters - ---------- - unstacker_func : callable - A (partially-applied) ``pd.core.reshape._Unstacker`` class. 
- - Returns - ------- - unstacked : BlockManager - """ - dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items) - new_columns = dummy.get_new_columns() - new_index = dummy.get_new_index() - new_blocks = [] - columns_mask = [] - - for blk in self.blocks: - blocks, mask = blk._unstack( - partial(unstacker_func, - value_columns=self.items[blk.mgr_locs.indexer]), - new_columns) - - new_blocks.extend(blocks) - columns_mask.extend(mask) - - new_columns = new_columns[columns_mask] - - bm = BlockManager(new_blocks, [new_columns, new_index]) - return bm - - -class SingleBlockManager(BlockManager): - """ manage a single block with """ - - ndim = 1 - _is_consolidated = True - _known_consolidated = True - __slots__ = () - - def __init__(self, block, axis, do_integrity_check=False, fastpath=False): - - if isinstance(axis, list): - if len(axis) != 1: - raise ValueError("cannot create SingleBlockManager with more " - "than 1 axis") - axis = axis[0] - - # passed from constructor, single block, single axis - if fastpath: - self.axes = [axis] - if isinstance(block, list): - - # empty block - if len(block) == 0: - block = [np.array([])] - elif len(block) != 1: - raise ValueError('Cannot create SingleBlockManager with ' - 'more than 1 block') - block = block[0] - else: - self.axes = [_ensure_index(axis)] - - # create the block here - if isinstance(block, list): - - # provide consolidation to the interleaved_dtype - if len(block) > 1: - dtype = _interleaved_dtype(block) - block = [b.astype(dtype) for b in block] - block = _consolidate(block) - - if len(block) != 1: - raise ValueError('Cannot create SingleBlockManager with ' - 'more than 1 block') - block = block[0] - - if not isinstance(block, Block): - block = make_block(block, placement=slice(0, len(axis)), ndim=1) - - self.blocks = [block] - - def _post_setstate(self): - pass - - @property - def _block(self): - return self.blocks[0] - - @property - def _values(self): - return self._block.values - - @property - def _blknos(self): - """ compat with BlockManager """ - return None - - @property - def _blklocs(self): - """ compat with BlockManager """ - return None - - def get_slice(self, slobj, axis=0): - if axis >= self.ndim: - raise IndexError("Requested axis not found in manager") - - return self.__class__(self._block._slice(slobj), - self.index[slobj], fastpath=True) - - @property - def index(self): - return self.axes[0] - - def convert(self, **kwargs): - """ convert the whole block as one """ - kwargs['by_item'] = False - return self.apply('convert', **kwargs) - - @property - def dtype(self): - return self._block.dtype - - @property - def array_dtype(self): - return self._block.array_dtype - - @property - def ftype(self): - return self._block.ftype - - def get_dtype_counts(self): - return {self.dtype.name: 1} - - def get_ftype_counts(self): - return {self.ftype: 1} - - def get_dtypes(self): - return np.array([self._block.dtype]) - - def get_ftypes(self): - return np.array([self._block.ftype]) - - def external_values(self): - return self._block.external_values() - - def internal_values(self): - return self._block.internal_values() - - def formatting_values(self): - """Return the internal values used by the DataFrame/SeriesFormatter""" - return self._block.formatting_values() - - def get_values(self): - """ return a dense type view """ - return np.array(self._block.to_dense(), copy=False) - - @property - def asobject(self): - """ - return a object dtype array. datetime/timedelta like values are boxed - to Timestamp/Timedelta instances. 
- """ - return self._block.get_values(dtype=object) - - @property - def _can_hold_na(self): - return self._block._can_hold_na - - def is_consolidated(self): - return True - - def _consolidate_check(self): - pass - - def _consolidate_inplace(self): - pass - - def delete(self, item): - """ - Delete single item from SingleBlockManager. - - Ensures that self.blocks doesn't become empty. - """ - loc = self.items.get_loc(item) - self._block.delete(loc) - self.axes[0] = self.axes[0].delete(loc) - - def fast_xs(self, loc): - """ - fast path for getting a cross-section - return a view of the data - """ - return self._block.values[loc] - - def concat(self, to_concat, new_axis): - """ - Concatenate a list of SingleBlockManagers into a single - SingleBlockManager. - - Used for pd.concat of Series objects with axis=0. - - Parameters - ---------- - to_concat : list of SingleBlockManagers - new_axis : Index of the result - - Returns - ------- - SingleBlockManager - - """ - non_empties = [x for x in to_concat if len(x) > 0] - - # check if all series are of the same block type: - if len(non_empties) > 0: - blocks = [obj.blocks[0] for obj in non_empties] - - if all(type(b) is type(blocks[0]) for b in blocks[1:]): # noqa - new_block = blocks[0].concat_same_type(blocks) - else: - values = [x.values for x in blocks] - values = _concat._concat_compat(values) - new_block = make_block( - values, placement=slice(0, len(values), 1)) - else: - values = [x._block.values for x in to_concat] - values = _concat._concat_compat(values) - new_block = make_block( - values, placement=slice(0, len(values), 1)) - - mgr = SingleBlockManager(new_block, new_axis) - return mgr - - -def construction_error(tot_items, block_shape, axes, e=None): - """ raise a helpful message about our construction """ - passed = tuple(map(int, [tot_items] + list(block_shape))) - implied = tuple(map(int, [len(ax) for ax in axes])) - if passed == implied and e is not None: - raise e - if block_shape[0] == 0: - raise ValueError("Empty data passed with indices specified.") - raise ValueError("Shape of passed values is {0}, indices imply {1}".format( - passed, implied)) - - -def create_block_manager_from_blocks(blocks, axes): - try: - if len(blocks) == 1 and not isinstance(blocks[0], Block): - # if blocks[0] is of length 0, return empty blocks - if not len(blocks[0]): - blocks = [] - else: - # It's OK if a single block is passed as values, its placement - # is basically "all items", but if there're many, don't bother - # converting, it's an error anyway. - blocks = [make_block(values=blocks[0], - placement=slice(0, len(axes[0])))] - - mgr = BlockManager(blocks, axes) - mgr._consolidate_inplace() - return mgr - - except (ValueError) as e: - blocks = [getattr(b, 'values', b) for b in blocks] - tot_items = sum(b.shape[0] for b in blocks) - construction_error(tot_items, blocks[0].shape[1:], axes, e) - - -def create_block_manager_from_arrays(arrays, names, axes): - - try: - blocks = form_blocks(arrays, names, axes) - mgr = BlockManager(blocks, axes) - mgr._consolidate_inplace() - return mgr - except ValueError as e: - construction_error(len(arrays), arrays[0].shape, axes, e) - - -def form_blocks(arrays, names, axes): - # put "leftover" items in float bucket, where else? - # generalize? 
- items_dict = defaultdict(list) - extra_locs = [] - - names_idx = _ensure_index(names) - if names_idx.equals(axes[0]): - names_indexer = np.arange(len(names_idx)) - else: - assert names_idx.intersection(axes[0]).is_unique - names_indexer = names_idx.get_indexer_for(axes[0]) - - for i, name_idx in enumerate(names_indexer): - if name_idx == -1: - extra_locs.append(i) - continue - - k = names[name_idx] - v = arrays[name_idx] - - block_type = get_block_type(v) - items_dict[block_type.__name__].append((i, k, v)) - - blocks = [] - if len(items_dict['FloatBlock']): - float_blocks = _multi_blockify(items_dict['FloatBlock']) - blocks.extend(float_blocks) - - if len(items_dict['ComplexBlock']): - complex_blocks = _multi_blockify(items_dict['ComplexBlock']) - blocks.extend(complex_blocks) - - if len(items_dict['TimeDeltaBlock']): - timedelta_blocks = _multi_blockify(items_dict['TimeDeltaBlock']) - blocks.extend(timedelta_blocks) - - if len(items_dict['IntBlock']): - int_blocks = _multi_blockify(items_dict['IntBlock']) - blocks.extend(int_blocks) - - if len(items_dict['DatetimeBlock']): - datetime_blocks = _simple_blockify(items_dict['DatetimeBlock'], - _NS_DTYPE) - blocks.extend(datetime_blocks) - - if len(items_dict['DatetimeTZBlock']): - dttz_blocks = [make_block(array, - klass=DatetimeTZBlock, - placement=[i]) - for i, _, array in items_dict['DatetimeTZBlock']] - blocks.extend(dttz_blocks) - - if len(items_dict['BoolBlock']): - bool_blocks = _simple_blockify(items_dict['BoolBlock'], np.bool_) - blocks.extend(bool_blocks) - - if len(items_dict['ObjectBlock']) > 0: - object_blocks = _simple_blockify(items_dict['ObjectBlock'], np.object_) - blocks.extend(object_blocks) - - if len(items_dict['SparseBlock']) > 0: - sparse_blocks = _sparse_blockify(items_dict['SparseBlock']) - blocks.extend(sparse_blocks) - - if len(items_dict['CategoricalBlock']) > 0: - cat_blocks = [make_block(array, klass=CategoricalBlock, placement=[i]) - for i, _, array in items_dict['CategoricalBlock']] - blocks.extend(cat_blocks) - - if len(items_dict['ExtensionBlock']): - - external_blocks = [ - make_block(array, klass=ExtensionBlock, placement=[i]) - for i, _, array in items_dict['ExtensionBlock'] - ] - - blocks.extend(external_blocks) - - if len(extra_locs): - shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:]) - - # empty items -> dtype object - block_values = np.empty(shape, dtype=object) - block_values.fill(np.nan) - - na_block = make_block(block_values, placement=extra_locs) - blocks.append(na_block) - - return blocks - - -def _simple_blockify(tuples, dtype): - """ return a single array of a block that has a single dtype; if dtype is - not None, coerce to this dtype - """ - values, placement = _stack_arrays(tuples, dtype) - - # CHECK DTYPE? 
- if dtype is not None and values.dtype != dtype: # pragma: no cover - values = values.astype(dtype) - - block = make_block(values, placement=placement) - return [block] - - -def _multi_blockify(tuples, dtype=None): - """ return an array of blocks that potentially have different dtypes """ - - # group by dtype - grouper = itertools.groupby(tuples, lambda x: x[2].dtype) - - new_blocks = [] - for dtype, tup_block in grouper: - - values, placement = _stack_arrays(list(tup_block), dtype) - - block = make_block(values, placement=placement) - new_blocks.append(block) - - return new_blocks - - -def _sparse_blockify(tuples, dtype=None): - """ return an array of blocks that potentially have different dtypes (and - are sparse) - """ - - new_blocks = [] - for i, names, array in tuples: - array = _maybe_to_sparse(array) - block = make_block(array, klass=SparseBlock, placement=[i]) - new_blocks.append(block) - - return new_blocks - - -def _stack_arrays(tuples, dtype): - - # fml - def _asarray_compat(x): - if isinstance(x, ABCSeries): - return x._values - else: - return np.asarray(x) - - def _shape_compat(x): - if isinstance(x, ABCSeries): - return len(x), - else: - return x.shape - - placement, names, arrays = zip(*tuples) - - first = arrays[0] - shape = (len(arrays),) + _shape_compat(first) - - stacked = np.empty(shape, dtype=dtype) - for i, arr in enumerate(arrays): - stacked[i] = _asarray_compat(arr) - - return stacked, placement - - -def _interleaved_dtype(blocks): - if not len(blocks): - return None - - dtype = find_common_type([b.dtype for b in blocks]) - - # only numpy compat - if isinstance(dtype, (PandasExtensionDtype, ExtensionDtype)): - dtype = np.object - - return dtype - - -def _consolidate(blocks): - """ - Merge blocks having same dtype, exclude non-consolidating blocks - """ - - # sort by _can_consolidate, dtype - gkey = lambda x: x._consolidate_key - grouper = itertools.groupby(sorted(blocks, key=gkey), gkey) - - new_blocks = [] - for (_can_consolidate, dtype), group_blocks in grouper: - merged_blocks = _merge_blocks(list(group_blocks), dtype=dtype, - _can_consolidate=_can_consolidate) - new_blocks = _extend_blocks(merged_blocks, new_blocks) - return new_blocks - - -def _merge_blocks(blocks, dtype=None, _can_consolidate=True): - - if len(blocks) == 1: - return blocks[0] - - if _can_consolidate: - - if dtype is None: - if len({b.dtype for b in blocks}) != 1: - raise AssertionError("_merge_blocks are invalid!") - dtype = blocks[0].dtype - - # FIXME: optimization potential in case all mgrs contain slices and - # combination of those slices is a slice, too. 
- new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks]) - new_values = _vstack([b.values for b in blocks], dtype) - - argsort = np.argsort(new_mgr_locs) - new_values = new_values[argsort] - new_mgr_locs = new_mgr_locs[argsort] - - return make_block(new_values, placement=new_mgr_locs) - - # no merge - return blocks - - -def _extend_blocks(result, blocks=None): - """ return a new extended blocks, givin the result """ - if blocks is None: - blocks = [] - if isinstance(result, list): - for r in result: - if isinstance(r, list): - blocks.extend(r) - else: - blocks.append(r) - elif isinstance(result, BlockManager): - blocks.extend(result.blocks) - else: - blocks.append(result) - return blocks - - -def _block_shape(values, ndim=1, shape=None): - """ guarantee the shape of the values to be at least 1 d """ - if values.ndim < ndim: - if shape is None: - shape = values.shape - values = values.reshape(tuple((1, ) + shape)) - return values - - -def _vstack(to_stack, dtype): - - # work around NumPy 1.6 bug - if dtype == _NS_DTYPE or dtype == _TD_DTYPE: - new_values = np.vstack([x.view('i8') for x in to_stack]) - return new_values.view(dtype) - - else: - return np.vstack(to_stack) - - -def _maybe_compare(a, b, op): - - is_a_array = isinstance(a, np.ndarray) - is_b_array = isinstance(b, np.ndarray) - - # numpy deprecation warning to have i8 vs integer comparisons - if is_datetimelike_v_numeric(a, b): - result = False - - # numpy deprecation warning if comparing numeric vs string-like - elif is_numeric_v_string_like(a, b): - result = False - - else: - result = op(a, b) - - if is_scalar(result) and (is_a_array or is_b_array): - type_names = [type(a).__name__, type(b).__name__] - - if is_a_array: - type_names[0] = 'ndarray(dtype={dtype})'.format(dtype=a.dtype) - - if is_b_array: - type_names[1] = 'ndarray(dtype={dtype})'.format(dtype=b.dtype) - - raise TypeError( - "Cannot compare types {a!r} and {b!r}".format(a=type_names[0], - b=type_names[1])) - return result - - -def _concat_indexes(indexes): - return indexes[0].append(indexes[1:]) - - -def _block2d_to_blocknd(values, placement, shape, labels, ref_items): - """ pivot to the labels shape """ - panel_shape = (len(placement),) + shape - - # TODO: lexsort depth needs to be 2!! - - # Create observation selection vector using major and minor - # labels, for converting to panel format. 
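+    # Editorial sketch (not part of the original patch): with
+    # shape[1:] == (3,) the multiplier in _factor_indexer becomes [3, 1],
+    # so a (major, minor) label pair like (1, 2) maps to 1 * 3 + 2 == 5 --
+    # a row-major linearization of the label codes, which is then used to
+    # set the observation mask below.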
- selector = _factor_indexer(shape[1:], labels) - mask = np.zeros(np.prod(shape), dtype=bool) - mask.put(selector, True) - - if mask.all(): - pvalues = np.empty(panel_shape, dtype=values.dtype) - else: - dtype, fill_value = maybe_promote(values.dtype) - pvalues = np.empty(panel_shape, dtype=dtype) - pvalues.fill(fill_value) - - for i in range(len(placement)): - pvalues[i].flat[mask] = values[:, i] - - return make_block(pvalues, placement=placement) - - -def _factor_indexer(shape, labels): - """ - given a tuple of shape and a list of Categorical labels, return the - expanded label indexer - """ - mult = np.array(shape)[::-1].cumprod()[::-1] - return _ensure_platform_int( - np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T) - - -def _get_blkno_placements(blknos, blk_count, group=True): - """ - - Parameters - ---------- - blknos : array of int64 - blk_count : int - group : bool - - Returns - ------- - iterator - yield (BlockPlacement, blkno) - - """ - - blknos = _ensure_int64(blknos) - - # FIXME: blk_count is unused, but it may avoid the use of dicts in cython - for blkno, indexer in libinternals.get_blkno_indexers(blknos, group): - yield blkno, BlockPlacement(indexer) - - -def items_overlap_with_suffix(left, lsuffix, right, rsuffix): +def _sparse_blockify(tuples, dtype=None): + """ return an array of blocks that potentially have different dtypes (and + are sparse) """ - If two indices overlap, add suffixes to overlapping entries. - If corresponding suffix is empty, the entry is simply converted to string. + new_blocks = [] + for i, names, array in tuples: + array = _maybe_to_sparse(array) + block = make_block(array, klass=SparseBlock, placement=[i]) + new_blocks.append(block) - """ - to_rename = left.intersection(right) - if len(to_rename) == 0: - return left, right - else: - if not lsuffix and not rsuffix: - raise ValueError('columns overlap but no suffix specified: ' - '{rename}'.format(rename=to_rename)) + return new_blocks - def lrenamer(x): - if x in to_rename: - return '{x}{lsuffix}'.format(x=x, lsuffix=lsuffix) - return x - def rrenamer(x): - if x in to_rename: - return '{x}{rsuffix}'.format(x=x, rsuffix=rsuffix) - return x +def _stack_arrays(tuples, dtype): - return (_transform_index(left, lrenamer), - _transform_index(right, rrenamer)) + # fml + def _asarray_compat(x): + if isinstance(x, gt.ABCSeries): + return x._values + else: + return np.asarray(x) + def _shape_compat(x): + if isinstance(x, gt.ABCSeries): + return len(x), + else: + return x.shape -def _safe_reshape(arr, new_shape): - """ - If possible, reshape `arr` to have shape `new_shape`, - with a couple of exceptions (see gh-13012): + placement, names, arrays = zip(*tuples) - 1) If `arr` is a ExtensionArray or Index, `arr` will be - returned as is. - 2) If `arr` is a Series, the `_values` attribute will - be reshaped and returned. + first = arrays[0] + shape = (len(arrays),) + _shape_compat(first) - Parameters - ---------- - arr : array-like, object to be reshaped - new_shape : int or tuple of ints, the new shape - """ - if isinstance(arr, ABCSeries): - arr = arr._values - if not isinstance(arr, ABCExtensionArray): - arr = arr.reshape(new_shape) - return arr + stacked = np.empty(shape, dtype=dtype) + for i, arr in enumerate(arrays): + stacked[i] = _asarray_compat(arr) + return stacked, placement -def _transform_index(index, func, level=None): - """ - Apply function to all values found in index. - This includes transforming multiindex entries separately. 
- Only apply function to one level of the MultiIndex if level is specified. +# ------------------------------------------------------------------- - """ - if isinstance(index, MultiIndex): - if level is not None: - items = [tuple(func(y) if i == level else y - for i, y in enumerate(x)) for x in index] - else: - items = [tuple(func(y) for y in x) for x in index] - return MultiIndex.from_tuples(items, names=index.names) - else: - items = [func(x) for x in index] - return Index(items, name=index.name, tupleize_cols=False) +def _block_shape(values, ndim=1, shape=None): + """ guarantee the shape of the values to be at least 1 d """ + if values.ndim < ndim: + if shape is None: + shape = values.shape + values = values.reshape(tuple((1, ) + shape)) + return values def _putmask_smart(v, m, n): @@ -5346,8 +3422,8 @@ def _putmask_smart(v, m, n): if not _isna_compat(v, nn[0]): raise ValueError - # we ignore ComplexWarning here - with catch_warnings(record=True): + with warnings.catch_warnings(record=True): + # we ignore ComplexWarning here nn_at = nn.astype(v.dtype) # avoid invalid dtype comparisons @@ -5393,518 +3469,117 @@ def _putmask_preserve(nv, n): return _putmask_preserve(v, n) -def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): - """ - Concatenate block managers into one. - - Parameters - ---------- - mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples - axes : list of Index - concat_axis : int - copy : bool - - """ - concat_plan = combine_concat_plans( - [get_mgr_concatenation_plan(mgr, indexers) - for mgr, indexers in mgrs_indexers], concat_axis) - - blocks = [] - - for placement, join_units in concat_plan: - - if len(join_units) == 1 and not join_units[0].indexers: - b = join_units[0].block - values = b.values - if copy: - values = values.copy() - elif not copy: - values = values.view() - b = b.make_block_same_class(values, placement=placement) - elif is_uniform_join_units(join_units): - b = join_units[0].block.concat_same_type( - [ju.block for ju in join_units], placement=placement) - else: - b = make_block( - concatenate_join_units(join_units, concat_axis, copy=copy), - placement=placement) - blocks.append(b) - - return BlockManager(blocks, axes) - - -def is_uniform_join_units(join_units): - """ - Check if the join units consist of blocks of uniform type that can - be concatenated using Block.concat_same_type instead of the generic - concatenate_join_units (which uses `_concat._concat_compat`). - - """ - return ( - # all blocks need to have the same type - all(type(ju.block) is type(join_units[0].block) for ju in join_units) and # noqa - # no blocks that would get missing values (can lead to type upcasts) - # unless we're an extension dtype. - all(not ju.is_na or ju.block.is_extension for ju in join_units) and - # no blocks with indexers (as then the dimensions do not fit) - all(not ju.indexers for ju in join_units) and - # disregard Panels - all(ju.block.ndim <= 2 for ju in join_units) and - # only use this path when there is something to concatenate - len(join_units) > 1) - - -def is_uniform_reindex(join_units): - return ( - # TODO: should this be ju.block._can_hold_na? - all(ju.block and ju.block.is_extension for ju in join_units) and - len(set(ju.block.dtype.name for ju in join_units)) == 1 - ) - - -def get_empty_dtype_and_na(join_units): - """ - Return dtype and N/A values to use when concatenating specified units. - - Returned N/A value may be None which means there was no casting involved. 
- - Returns - ------- - dtype - na - """ - - if len(join_units) == 1: - blk = join_units[0].block - if blk is None: - return np.float64, np.nan - - if is_uniform_reindex(join_units): - # XXX: integrate property - empty_dtype = join_units[0].block.dtype - upcasted_na = join_units[0].block.fill_value - return empty_dtype, upcasted_na - - has_none_blocks = False - dtypes = [None] * len(join_units) - for i, unit in enumerate(join_units): - if unit.block is None: - has_none_blocks = True - else: - dtypes[i] = unit.dtype - - upcast_classes = defaultdict(list) - null_upcast_classes = defaultdict(list) - for dtype, unit in zip(dtypes, join_units): - if dtype is None: - continue - - if is_categorical_dtype(dtype): - upcast_cls = 'category' - elif is_datetimetz(dtype): - upcast_cls = 'datetimetz' - elif issubclass(dtype.type, np.bool_): - upcast_cls = 'bool' - elif issubclass(dtype.type, np.object_): - upcast_cls = 'object' - elif is_datetime64_dtype(dtype): - upcast_cls = 'datetime' - elif is_timedelta64_dtype(dtype): - upcast_cls = 'timedelta' - elif is_float_dtype(dtype) or is_numeric_dtype(dtype): - upcast_cls = dtype.name - else: - upcast_cls = 'float' - - # Null blocks should not influence upcast class selection, unless there - # are only null blocks, when same upcasting rules must be applied to - # null upcast classes. - if unit.is_na: - null_upcast_classes[upcast_cls].append(dtype) - else: - upcast_classes[upcast_cls].append(dtype) - - if not upcast_classes: - upcast_classes = null_upcast_classes - - # create the result - if 'object' in upcast_classes: - return np.dtype(np.object_), np.nan - elif 'bool' in upcast_classes: - if has_none_blocks: - return np.dtype(np.object_), np.nan - else: - return np.dtype(np.bool_), None - elif 'category' in upcast_classes: - return np.dtype(np.object_), np.nan - elif 'datetimetz' in upcast_classes: - dtype = upcast_classes['datetimetz'] - return dtype[0], tslibs.iNaT - elif 'datetime' in upcast_classes: - return np.dtype('M8[ns]'), tslibs.iNaT - elif 'timedelta' in upcast_classes: - return np.dtype('m8[ns]'), tslibs.iNaT - else: # pragma - g = np.find_common_type(upcast_classes, []) - if is_float_dtype(g): - return g, g.type(np.nan) - elif is_numeric_dtype(g): - if has_none_blocks: - return np.float64, np.nan - else: - return g, None - - msg = "invalid dtype determination in get_concat_dtype" - raise AssertionError(msg) - - -def concatenate_join_units(join_units, concat_axis, copy): - """ - Concatenate values from several join units along selected axis. - """ - if concat_axis == 0 and len(join_units) > 1: - # Concatenating join units along ax0 is handled in _merge_blocks. - raise AssertionError("Concatenating join units along axis0") - - empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units) - - to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype, - upcasted_na=upcasted_na) - for ju in join_units] - - if len(to_concat) == 1: - # Only one block, nothing to concatenate. 
- concat_values = to_concat[0] - if copy: - if isinstance(concat_values, np.ndarray): - # non-reindexed (=not yet copied) arrays are made into a view - # in JoinUnit.get_reindexed_values - if concat_values.base is not None: - concat_values = concat_values.copy() +# Also needed by managers +def _extend_blocks(result, blocks=None): + """ return a new extended blocks, givin the result """ + from .managers import BlockManager + if blocks is None: + blocks = [] + if isinstance(result, list): + for r in result: + if isinstance(r, list): + blocks.extend(r) else: - concat_values = concat_values.copy() + blocks.append(r) + elif isinstance(result, BlockManager): + blocks.extend(result.blocks) else: - concat_values = _concat._concat_compat(to_concat, axis=concat_axis) - - return concat_values + blocks.append(result) + return blocks -def get_mgr_concatenation_plan(mgr, indexers): +# Also needed by managers +def _safe_reshape(arr, new_shape): """ - Construct concatenation plan for given block manager and indexers. + If possible, reshape `arr` to have shape `new_shape`, + with a couple of exceptions (see gh-13012): + + 1) If `arr` is a ExtensionArray or Index, `arr` will be + returned as is. + 2) If `arr` is a Series, the `_values` attribute will + be reshaped and returned. Parameters ---------- - mgr : BlockManager - indexers : dict of {axis: indexer} - - Returns - ------- - plan : list of (BlockPlacement, JoinUnit) tuples - + arr : array-like, object to be reshaped + new_shape : int or tuple of ints, the new shape """ - # Calculate post-reindex shape , save for item axis which will be separate - # for each block anyway. - mgr_shape = list(mgr.shape) - for ax, indexer in indexers.items(): - mgr_shape[ax] = len(indexer) - mgr_shape = tuple(mgr_shape) - - if 0 in indexers: - ax0_indexer = indexers.pop(0) - blknos = algos.take_1d(mgr._blknos, ax0_indexer, fill_value=-1) - blklocs = algos.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1) - else: - - if mgr._is_single_block: - blk = mgr.blocks[0] - return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] - - ax0_indexer = None - blknos = mgr._blknos - blklocs = mgr._blklocs - - plan = [] - for blkno, placements in _get_blkno_placements(blknos, len(mgr.blocks), - group=False): - - assert placements.is_slice_like + if isinstance(arr, gt.ABCSeries): + arr = arr._values + if not isinstance(arr, gt.ABCExtensionArray): + arr = arr.reshape(new_shape) + return arr - join_unit_indexers = indexers.copy() - shape = list(mgr_shape) - shape[0] = len(placements) - shape = tuple(shape) +def _merge_blocks(blocks, dtype=None, _can_consolidate=True): - if blkno == -1: - unit = JoinUnit(None, shape) - else: - blk = mgr.blocks[blkno] - ax0_blk_indexer = blklocs[placements.indexer] - - unit_no_ax0_reindexing = (len(placements) == len(blk.mgr_locs) and - # Fastpath detection of join unit not - # needing to reindex its block: no ax0 - # reindexing took place and block - # placement was sequential before. - ((ax0_indexer is None and - blk.mgr_locs.is_slice_like and - blk.mgr_locs.as_slice.step == 1) or - # Slow-ish detection: all indexer locs - # are sequential (and length match is - # checked above). - (np.diff(ax0_blk_indexer) == 1).all())) - - # Omit indexer if no item reindexing is required. 
- if unit_no_ax0_reindexing: - join_unit_indexers.pop(0, None) - else: - join_unit_indexers[0] = ax0_blk_indexer + if len(blocks) == 1: + return blocks[0] - unit = JoinUnit(blk, shape, join_unit_indexers) + if _can_consolidate: - plan.append((placements, unit)) + if dtype is None: + if len({b.dtype for b in blocks}) != 1: + raise AssertionError("_merge_blocks are invalid!") + dtype = blocks[0].dtype - return plan + # FIXME: optimization potential in case all mgrs contain slices and + # combination of those slices is a slice, too. + new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks]) + new_values = _vstack([b.values for b in blocks], dtype) + argsort = np.argsort(new_mgr_locs) + new_values = new_values[argsort] + new_mgr_locs = new_mgr_locs[argsort] -def combine_concat_plans(plans, concat_axis): - """ - Combine multiple concatenation plans into one. + return make_block(new_values, placement=new_mgr_locs) - existing_plan is updated in-place. - """ - if len(plans) == 1: - for p in plans[0]: - yield p[0], [p[1]] + # no merge + return blocks - elif concat_axis == 0: - offset = 0 - for plan in plans: - last_plc = None - for plc, unit in plan: - yield plc.add(offset), [unit] - last_plc = plc +def _vstack(to_stack, dtype): - if last_plc is not None: - offset += last_plc.as_slice.stop + # work around NumPy 1.6 bug --> TODO: Still necessary? + if dtype == _NS_DTYPE or dtype == _TD_DTYPE: + new_values = np.vstack([x.view('i8') for x in to_stack]) + return new_values.view(dtype) else: - num_ended = [0] - - def _next_or_none(seq): - retval = next(seq, None) - if retval is None: - num_ended[0] += 1 - return retval - - plans = list(map(iter, plans)) - next_items = list(map(_next_or_none, plans)) - - while num_ended[0] != len(next_items): - if num_ended[0] > 0: - raise ValueError("Plan shapes are not aligned") - - placements, units = zip(*next_items) - - lengths = list(map(len, placements)) - min_len, max_len = min(lengths), max(lengths) - - if min_len == max_len: - yield placements[0], units - next_items[:] = map(_next_or_none, plans) - else: - yielded_placement = None - yielded_units = [None] * len(next_items) - for i, (plc, unit) in enumerate(next_items): - yielded_units[i] = unit - if len(plc) > min_len: - # trim_join_unit updates unit in place, so only - # placement needs to be sliced to skip min_len. - next_items[i] = (plc[min_len:], - trim_join_unit(unit, min_len)) - else: - yielded_placement = plc - next_items[i] = _next_or_none(plans[i]) - - yield yielded_placement, yielded_units + return np.vstack(to_stack) -def trim_join_unit(join_unit, length): +def _factor_indexer(shape, labels): """ - Reduce join_unit's shape along item axis to length. - - Extra items that didn't fit are returned as a separate block. 
+ given a tuple of shape and a list of Categorical labels, return the + expanded label indexer """ + mult = np.array(shape)[::-1].cumprod()[::-1] + return _ensure_platform_int( + np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T) - if 0 not in join_unit.indexers: - extra_indexers = join_unit.indexers - - if join_unit.block is None: - extra_block = None - else: - extra_block = join_unit.block.getitem_block(slice(length, None)) - join_unit.block = join_unit.block.getitem_block(slice(length)) - else: - extra_block = join_unit.block - - extra_indexers = copy.copy(join_unit.indexers) - extra_indexers[0] = extra_indexers[0][length:] - join_unit.indexers[0] = join_unit.indexers[0][:length] - - extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:] - join_unit.shape = (length,) + join_unit.shape[1:] - - return JoinUnit(block=extra_block, indexers=extra_indexers, - shape=extra_shape) - - -class JoinUnit(object): - - def __init__(self, block, shape, indexers=None): - # Passing shape explicitly is required for cases when block is None. - if indexers is None: - indexers = {} - self.block = block - self.indexers = indexers - self.shape = shape - - def __repr__(self): - return '{name}({block!r}, {indexers})'.format( - name=self.__class__.__name__, block=self.block, - indexers=self.indexers) - - @cache_readonly - def needs_filling(self): - for indexer in self.indexers.values(): - # FIXME: cache results of indexer == -1 checks. - if (indexer == -1).any(): - return True - - return False - - @cache_readonly - def dtype(self): - if self.block is None: - raise AssertionError("Block is None, no dtype") - - if not self.needs_filling: - return self.block.dtype - else: - return _get_dtype(maybe_promote(self.block.dtype, - self.block.fill_value)[0]) - - @cache_readonly - def is_na(self): - if self.block is None: - return True - - if not self.block._can_hold_na: - return False - - # Usually it's enough to check but a small fraction of values to see if - # a block is NOT null, chunks should help in such cases. 1000 value - # was chosen rather arbitrarily. 
- values = self.block.values - if self.block.is_categorical: - values_flat = values.categories - elif self.block.is_sparse: - # fill_value is not NaN and have holes - if not values._null_fill_value and values.sp_index.ngaps > 0: - return False - values_flat = values.ravel(order='K') - elif isinstance(self.block, ExtensionBlock): - values_flat = values - else: - values_flat = values.ravel(order='K') - total_len = values_flat.shape[0] - chunk_len = max(total_len // 40, 1000) - for i in range(0, total_len, chunk_len): - if not isna(values_flat[i:i + chunk_len]).all(): - return False - - return True - def get_reindexed_values(self, empty_dtype, upcasted_na): - if upcasted_na is None: - # No upcasting is necessary - fill_value = self.block.fill_value - values = self.block.get_values() - else: - fill_value = upcasted_na - - if self.is_na: - if getattr(self.block, 'is_object', False): - # we want to avoid filling with np.nan if we are - # using None; we already know that we are all - # nulls - values = self.block.values.ravel(order='K') - if len(values) and values[0] is None: - fill_value = None - - if getattr(self.block, 'is_datetimetz', False) or \ - is_datetimetz(empty_dtype): - pass - elif getattr(self.block, 'is_categorical', False): - pass - elif getattr(self.block, 'is_sparse', False): - pass - else: - missing_arr = np.empty(self.shape, dtype=empty_dtype) - missing_arr.fill(fill_value) - return missing_arr - - if not self.indexers: - if not self.block._can_consolidate: - # preserve these for validation in _concat_compat - return self.block.values - - if self.block.is_bool and not self.block.is_categorical: - # External code requested filling/upcasting, bool values must - # be upcasted to object to avoid being upcasted to numeric. - values = self.block.astype(np.object_).values - elif self.block.is_extension: - values = self.block.values - else: - # No dtype upcasting is done here, it will be performed during - # concatenation itself. - values = self.block.get_values() +# TODO: remove after Panel is gone +def _block2d_to_blocknd(values, placement, shape, labels, ref_items): + """ pivot to the labels shape """ + panel_shape = (len(placement),) + shape - if not self.indexers: - # If there's no indexing to be done, we want to signal outside - # code that this array must be copied explicitly. This is done - # by returning a view and checking `retval.base`. - values = values.view() + # TODO: lexsort depth needs to be 2!! - else: - for ax, indexer in self.indexers.items(): - values = algos.take_nd(values, indexer, axis=ax, - fill_value=fill_value) + # Create observation selection vector using major and minor + # labels, for converting to panel format. + selector = _factor_indexer(shape[1:], labels) + mask = np.zeros(np.prod(shape), dtype=bool) + mask.put(selector, True) - return values + if mask.all(): + pvalues = np.empty(panel_shape, dtype=values.dtype) + else: + dtype, fill_value = maybe_promote(values.dtype) + pvalues = np.empty(panel_shape, dtype=dtype) + pvalues.fill(fill_value) + for i in range(len(placement)): + pvalues[i].flat[mask] = values[:, i] -def _fast_count_smallints(arr): - """Faster version of set(arr) for sequences of small numbers.""" - if len(arr) == 0: - # Handle empty arr case separately: numpy 1.6 chokes on that. 
- return np.empty((0, 2), dtype=arr.dtype) - else: - counts = np.bincount(arr.astype(np.int_)) - nz = counts.nonzero()[0] - return np.c_[nz, counts[nz]] - - -def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill): - if isinstance(slice_or_indexer, slice): - return ('slice', slice_or_indexer, - libinternals.slice_len(slice_or_indexer, length)) - elif (isinstance(slice_or_indexer, np.ndarray) and - slice_or_indexer.dtype == np.bool_): - return 'mask', slice_or_indexer, slice_or_indexer.sum() - else: - indexer = np.asanyarray(slice_or_indexer, dtype=np.int64) - if not allow_fill: - indexer = maybe_convert_indices(indexer, length) - return 'fancy', indexer, len(indexer) + return make_block(pvalues, placement=placement) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py new file mode 100644 index 0000000000000..d574428b30525 --- /dev/null +++ b/pandas/core/internals/concat.py @@ -0,0 +1,519 @@ +# -*- coding: utf-8 -*- + +# TODO: Needs a new name; too many things are already called "concat" +import copy +from collections import defaultdict + +import numpy as np + +from pandas._libs import tslibs + +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.common import ( + is_timedelta64_dtype, + is_datetime64_dtype, is_datetimetz, + is_categorical_dtype, + is_numeric_dtype, + is_float_dtype, + _get_dtype) +from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.cast import maybe_promote +import pandas.core.dtypes.concat as _concat + +import pandas.core.algorithms as algos + +from .blocks import make_block +from .managers import BlockManager, _get_blkno_placements + + +class JoinUnit(object): + + def __init__(self, block, shape, indexers=None): + # Passing shape explicitly is required for cases when block is None. + if indexers is None: + indexers = {} + self.block = block + self.indexers = indexers + self.shape = shape + + def __repr__(self): + return '{name}({block!r}, {indexers})'.format( + name=self.__class__.__name__, block=self.block, + indexers=self.indexers) + + @cache_readonly + def needs_filling(self): + for indexer in self.indexers.values(): + # FIXME: cache results of indexer == -1 checks. + if (indexer == -1).any(): + return True + + return False + + @cache_readonly + def dtype(self): + if self.block is None: + raise AssertionError("Block is None, no dtype") + + if not self.needs_filling: + return self.block.dtype + else: + return _get_dtype(maybe_promote(self.block.dtype, + self.block.fill_value)[0]) + + @cache_readonly + def is_na(self): + if self.block is None: + return True + + if not self.block._can_hold_na: + return False + + # Usually it's enough to check but a small fraction of values to see if + # a block is NOT null, chunks should help in such cases. 1000 value + # was chosen rather arbitrarily. 
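+        # Editorial sketch (not part of the original patch): the scan below
+        # walks the flattened values in chunks of
+        #   chunk_len = max(total_len // 40, 1000)
+        # and bails out on the first chunk containing a real value; e.g.
+        # for 1_000_000 values, chunk_len == 25_000, so a non-null hit in
+        # the first chunk avoids touching the remaining 97.5% of the data.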
+ values = self.block.values + if self.block.is_categorical: + values_flat = values.categories + elif self.block.is_sparse: + # fill_value is not NaN and have holes + if not values._null_fill_value and values.sp_index.ngaps > 0: + return False + values_flat = values.ravel(order='K') + elif self.block.is_extension: + values_flat = values + else: + values_flat = values.ravel(order='K') + total_len = values_flat.shape[0] + chunk_len = max(total_len // 40, 1000) + for i in range(0, total_len, chunk_len): + if not isna(values_flat[i:i + chunk_len]).all(): + return False + + return True + + def get_reindexed_values(self, empty_dtype, upcasted_na): + if upcasted_na is None: + # No upcasting is necessary + fill_value = self.block.fill_value + values = self.block.get_values() + else: + fill_value = upcasted_na + + if self.is_na: + if getattr(self.block, 'is_object', False): + # we want to avoid filling with np.nan if we are + # using None; we already know that we are all + # nulls + values = self.block.values.ravel(order='K') + if len(values) and values[0] is None: + fill_value = None + + if getattr(self.block, 'is_datetimetz', False) or \ + is_datetimetz(empty_dtype): + pass + elif getattr(self.block, 'is_categorical', False): + pass + elif getattr(self.block, 'is_sparse', False): + pass + else: + missing_arr = np.empty(self.shape, dtype=empty_dtype) + missing_arr.fill(fill_value) + return missing_arr + + if not self.indexers: + if not self.block._can_consolidate: + # preserve these for validation in _concat_compat + return self.block.values + + if self.block.is_bool and not self.block.is_categorical: + # External code requested filling/upcasting, bool values must + # be upcasted to object to avoid being upcasted to numeric. + values = self.block.astype(np.object_).values + elif self.block.is_extension: + values = self.block.values + else: + # No dtype upcasting is done here, it will be performed during + # concatenation itself. + values = self.block.get_values() + + if not self.indexers: + # If there's no indexing to be done, we want to signal outside + # code that this array must be copied explicitly. This is done + # by returning a view and checking `retval.base`. + values = values.view() + + else: + for ax, indexer in self.indexers.items(): + values = algos.take_nd(values, indexer, axis=ax, + fill_value=fill_value) + + return values + + +def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): + """ + Concatenate block managers into one. 
+ + Parameters + ---------- + mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples + axes : list of Index + concat_axis : int + copy : bool + + """ + concat_plan = combine_concat_plans( + [get_mgr_concatenation_plan(mgr, indexers) + for mgr, indexers in mgrs_indexers], concat_axis) + + blocks = [] + + for placement, join_units in concat_plan: + + if len(join_units) == 1 and not join_units[0].indexers: + b = join_units[0].block + values = b.values + if copy: + values = values.copy() + elif not copy: + values = values.view() + b = b.make_block_same_class(values, placement=placement) + elif is_uniform_join_units(join_units): + b = join_units[0].block.concat_same_type( + [ju.block for ju in join_units], placement=placement) + else: + b = make_block( + concatenate_join_units(join_units, concat_axis, copy=copy), + placement=placement) + blocks.append(b) + + return BlockManager(blocks, axes) + + +def is_uniform_join_units(join_units): + """ + Check if the join units consist of blocks of uniform type that can + be concatenated using Block.concat_same_type instead of the generic + concatenate_join_units (which uses `_concat._concat_compat`). + + """ + return ( + # all blocks need to have the same type + all(type(ju.block) is type(join_units[0].block) for ju in join_units) and # noqa + # no blocks that would get missing values (can lead to type upcasts) + # unless we're an extension dtype. + all(not ju.is_na or ju.block.is_extension for ju in join_units) and + # no blocks with indexers (as then the dimensions do not fit) + all(not ju.indexers for ju in join_units) and + # disregard Panels + all(ju.block.ndim <= 2 for ju in join_units) and + # only use this path when there is something to concatenate + len(join_units) > 1) + + +def is_uniform_reindex(join_units): + return ( + # TODO: should this be ju.block._can_hold_na? + all(ju.block and ju.block.is_extension for ju in join_units) and + len(set(ju.block.dtype.name for ju in join_units)) == 1 + ) + + +def get_empty_dtype_and_na(join_units): + """ + Return dtype and N/A values to use when concatenating specified units. + + Returned N/A value may be None which means there was no casting involved. + + Returns + ------- + dtype + na + """ + + if len(join_units) == 1: + blk = join_units[0].block + if blk is None: + return np.float64, np.nan + + if is_uniform_reindex(join_units): + # XXX: integrate property + empty_dtype = join_units[0].block.dtype + upcasted_na = join_units[0].block.fill_value + return empty_dtype, upcasted_na + + has_none_blocks = False + dtypes = [None] * len(join_units) + for i, unit in enumerate(join_units): + if unit.block is None: + has_none_blocks = True + else: + dtypes[i] = unit.dtype + + upcast_classes = defaultdict(list) + null_upcast_classes = defaultdict(list) + for dtype, unit in zip(dtypes, join_units): + if dtype is None: + continue + + if is_categorical_dtype(dtype): + upcast_cls = 'category' + elif is_datetimetz(dtype): + upcast_cls = 'datetimetz' + elif issubclass(dtype.type, np.bool_): + upcast_cls = 'bool' + elif issubclass(dtype.type, np.object_): + upcast_cls = 'object' + elif is_datetime64_dtype(dtype): + upcast_cls = 'datetime' + elif is_timedelta64_dtype(dtype): + upcast_cls = 'timedelta' + elif is_float_dtype(dtype) or is_numeric_dtype(dtype): + upcast_cls = dtype.name + else: + upcast_cls = 'float' + + # Null blocks should not influence upcast class selection, unless there + # are only null blocks, when same upcasting rules must be applied to + # null upcast classes. 
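+        # Editorial sketch (not part of the original patch): a unit that is
+        # entirely null records its dtype in null_upcast_classes rather than
+        # upcast_classes, so it cannot drag the result dtype on its own;
+        # only when *every* unit is null (upcast_classes stays empty) do the
+        # same upcasting rules run against null_upcast_classes below.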
+ if unit.is_na: + null_upcast_classes[upcast_cls].append(dtype) + else: + upcast_classes[upcast_cls].append(dtype) + + if not upcast_classes: + upcast_classes = null_upcast_classes + + # create the result + if 'object' in upcast_classes: + return np.dtype(np.object_), np.nan + elif 'bool' in upcast_classes: + if has_none_blocks: + return np.dtype(np.object_), np.nan + else: + return np.dtype(np.bool_), None + elif 'category' in upcast_classes: + return np.dtype(np.object_), np.nan + elif 'datetimetz' in upcast_classes: + dtype = upcast_classes['datetimetz'] + return dtype[0], tslibs.iNaT + elif 'datetime' in upcast_classes: + return np.dtype('M8[ns]'), tslibs.iNaT + elif 'timedelta' in upcast_classes: + return np.dtype('m8[ns]'), tslibs.iNaT + else: # pragma + g = np.find_common_type(upcast_classes, []) + if is_float_dtype(g): + return g, g.type(np.nan) + elif is_numeric_dtype(g): + if has_none_blocks: + return np.float64, np.nan + else: + return g, None + + msg = "invalid dtype determination in get_concat_dtype" + raise AssertionError(msg) + + +def concatenate_join_units(join_units, concat_axis, copy): + """ + Concatenate values from several join units along selected axis. + """ + if concat_axis == 0 and len(join_units) > 1: + # Concatenating join units along ax0 is handled in _merge_blocks. + raise AssertionError("Concatenating join units along axis0") + + empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units) + + to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype, + upcasted_na=upcasted_na) + for ju in join_units] + + if len(to_concat) == 1: + # Only one block, nothing to concatenate. + concat_values = to_concat[0] + if copy: + if isinstance(concat_values, np.ndarray): + # non-reindexed (=not yet copied) arrays are made into a view + # in JoinUnit.get_reindexed_values + if concat_values.base is not None: + concat_values = concat_values.copy() + else: + concat_values = concat_values.copy() + else: + concat_values = _concat._concat_compat(to_concat, axis=concat_axis) + + return concat_values + + +def get_mgr_concatenation_plan(mgr, indexers): + """ + Construct concatenation plan for given block manager and indexers. + + Parameters + ---------- + mgr : BlockManager + indexers : dict of {axis: indexer} + + Returns + ------- + plan : list of (BlockPlacement, JoinUnit) tuples + + """ + # Calculate post-reindex shape , save for item axis which will be separate + # for each block anyway. 
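+    # Editorial sketch (not part of the original patch): e.g. for a
+    # (2, 4)-shaped manager with indexers == {1: np.array([0, 2, 3, -1, 1])},
+    # the loop below yields mgr_shape == (2, 5); the item axis (axis 0) is
+    # recomputed per block from its placements further down.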
+ mgr_shape = list(mgr.shape) + for ax, indexer in indexers.items(): + mgr_shape[ax] = len(indexer) + mgr_shape = tuple(mgr_shape) + + if 0 in indexers: + ax0_indexer = indexers.pop(0) + blknos = algos.take_1d(mgr._blknos, ax0_indexer, fill_value=-1) + blklocs = algos.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1) + else: + + if mgr._is_single_block: + blk = mgr.blocks[0] + return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] + + ax0_indexer = None + blknos = mgr._blknos + blklocs = mgr._blklocs + + plan = [] + for blkno, placements in _get_blkno_placements(blknos, len(mgr.blocks), + group=False): + + assert placements.is_slice_like + + join_unit_indexers = indexers.copy() + + shape = list(mgr_shape) + shape[0] = len(placements) + shape = tuple(shape) + + if blkno == -1: + unit = JoinUnit(None, shape) + else: + blk = mgr.blocks[blkno] + ax0_blk_indexer = blklocs[placements.indexer] + + unit_no_ax0_reindexing = (len(placements) == len(blk.mgr_locs) and + # Fastpath detection of join unit not + # needing to reindex its block: no ax0 + # reindexing took place and block + # placement was sequential before. + ((ax0_indexer is None and + blk.mgr_locs.is_slice_like and + blk.mgr_locs.as_slice.step == 1) or + # Slow-ish detection: all indexer locs + # are sequential (and length match is + # checked above). + (np.diff(ax0_blk_indexer) == 1).all())) + + # Omit indexer if no item reindexing is required. + if unit_no_ax0_reindexing: + join_unit_indexers.pop(0, None) + else: + join_unit_indexers[0] = ax0_blk_indexer + + unit = JoinUnit(blk, shape, join_unit_indexers) + + plan.append((placements, unit)) + + return plan + + +def combine_concat_plans(plans, concat_axis): + """ + Combine multiple concatenation plans into one. + + existing_plan is updated in-place. + """ + if len(plans) == 1: + for p in plans[0]: + yield p[0], [p[1]] + + elif concat_axis == 0: + offset = 0 + for plan in plans: + last_plc = None + + for plc, unit in plan: + yield plc.add(offset), [unit] + last_plc = plc + + if last_plc is not None: + offset += last_plc.as_slice.stop + + else: + num_ended = [0] + + def _next_or_none(seq): + retval = next(seq, None) + if retval is None: + num_ended[0] += 1 + return retval + + plans = list(map(iter, plans)) + next_items = list(map(_next_or_none, plans)) + + while num_ended[0] != len(next_items): + if num_ended[0] > 0: + raise ValueError("Plan shapes are not aligned") + + placements, units = zip(*next_items) + + lengths = list(map(len, placements)) + min_len, max_len = min(lengths), max(lengths) + + if min_len == max_len: + yield placements[0], units + next_items[:] = map(_next_or_none, plans) + else: + yielded_placement = None + yielded_units = [None] * len(next_items) + for i, (plc, unit) in enumerate(next_items): + yielded_units[i] = unit + if len(plc) > min_len: + # trim_join_unit updates unit in place, so only + # placement needs to be sliced to skip min_len. + next_items[i] = (plc[min_len:], + trim_join_unit(unit, min_len)) + else: + yielded_placement = plc + next_items[i] = _next_or_none(plans[i]) + + yield yielded_placement, yielded_units + + +def trim_join_unit(join_unit, length): + """ + Reduce join_unit's shape along item axis to length. + + Extra items that didn't fit are returned as a separate block. 
+ """ + + if 0 not in join_unit.indexers: + extra_indexers = join_unit.indexers + + if join_unit.block is None: + extra_block = None + else: + extra_block = join_unit.block.getitem_block(slice(length, None)) + join_unit.block = join_unit.block.getitem_block(slice(length)) + else: + extra_block = join_unit.block + + extra_indexers = copy.copy(join_unit.indexers) + extra_indexers[0] = extra_indexers[0][length:] + join_unit.indexers[0] = join_unit.indexers[0][:length] + + extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:] + join_unit.shape = (length,) + join_unit.shape[1:] + + return JoinUnit(block=extra_block, indexers=extra_indexers, + shape=extra_shape) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py new file mode 100644 index 0000000000000..beba2dd9839ac --- /dev/null +++ b/pandas/core/internals/managers.py @@ -0,0 +1,1892 @@ +# -*- coding: utf-8 -*- +from functools import partial +import itertools +import operator + +import numpy as np + +from pandas._libs import lib, internals as libinternals + +from pandas.util._validators import validate_bool_kwarg + +from pandas.compat import range, map, zip + +import pandas.core.dtypes.generic as gt +from pandas.core.dtypes.common import ( + _ensure_int64, + is_datetimelike_v_numeric, + is_numeric_v_string_like, is_extension_type, + is_extension_array_dtype, + is_scalar) +from pandas.core.dtypes.missing import isna +import pandas.core.dtypes.concat as _concat +from pandas.core.dtypes.dtypes import ( + ExtensionDtype, + PandasExtensionDtype) +from pandas.core.dtypes.cast import ( + maybe_promote, + infer_dtype_from_scalar, + find_common_type) + +import pandas.core.algorithms as algos +from pandas.core.base import PandasObject + +from pandas.core.index import Index, MultiIndex, _ensure_index +from pandas.core.indexing import maybe_convert_indices + +from pandas.io.formats.printing import pprint_thing + +from .blocks import ( + form_blocks, + Block, + make_block, + _extend_blocks, _safe_reshape, _merge_blocks) + + +# TODO: flexible with index=None and/or items=None + + +class BlockManager(PandasObject): + """ + Core internal data structure to implement DataFrame, Series, Panel, etc. + + Manage a bunch of labeled 2D mixed-type ndarrays. 
Essentially it's a + lightweight blocked set of labeled data to be manipulated by the DataFrame + public API class + + Attributes + ---------- + shape + ndim + axes + values + items + + Methods + ------- + set_axis(axis, new_labels) + copy(deep=True) + + get_dtype_counts + get_ftype_counts + get_dtypes + get_ftypes + + apply(func, axes, block_filter_fn) + + get_bool_data + get_numeric_data + + get_slice(slice_like, axis) + get(label) + iget(loc) + get_scalar(label_tup) + + take(indexer, axis) + reindex_axis(new_labels, axis) + reindex_indexer(new_labels, indexer, axis) + + delete(label) + insert(loc, label, value) + set(label, value) + + Parameters + ---------- + + + Notes + ----- + This is *not* a public API class + """ + __slots__ = ['axes', 'blocks', '_ndim', '_shape', '_known_consolidated', + '_is_consolidated', '_blknos', '_blklocs'] + + def __init__(self, blocks, axes, do_integrity_check=True): + self.axes = [_ensure_index(ax) for ax in axes] + self.blocks = tuple(blocks) + + for block in blocks: + if block.is_sparse: + if len(block.mgr_locs) != 1: + raise AssertionError("Sparse block refers to multiple " + "items") + else: + if self.ndim != block.ndim: + raise AssertionError( + 'Number of Block dimensions ({block}) must equal ' + 'number of axes ({self})'.format(block=block.ndim, + self=self.ndim)) + + if do_integrity_check: + self._verify_integrity() + + self._consolidate_check() + + self._rebuild_blknos_and_blklocs() + + def make_empty(self, axes=None): + """ return an empty BlockManager with the items axis of len 0 """ + if axes is None: + axes = [_ensure_index([])] + [_ensure_index(a) + for a in self.axes[1:]] + + # preserve dtype if possible + if self.ndim == 1: + blocks = np.array([], dtype=self.array_dtype) + else: + blocks = [] + return self.__class__(blocks, axes) + + def __nonzero__(self): + return True + + # Python3 compat + __bool__ = __nonzero__ + + @property + def shape(self): + return tuple(len(ax) for ax in self.axes) + + @property + def ndim(self): + return len(self.axes) + + def set_axis(self, axis, new_labels): + new_labels = _ensure_index(new_labels) + old_len = len(self.axes[axis]) + new_len = len(new_labels) + + if new_len != old_len: + raise ValueError( + 'Length mismatch: Expected axis has {old} elements, new ' + 'values have {new} elements'.format(old=old_len, new=new_len)) + + self.axes[axis] = new_labels + + def rename_axis(self, mapper, axis, copy=True, level=None): + """ + Rename one of axes. + + Parameters + ---------- + mapper : unary callable + axis : int + copy : boolean, default True + level : int, default None + + """ + obj = self.copy(deep=copy) + obj.set_axis(axis, _transform_index(self.axes[axis], mapper, level)) + return obj + + def add_prefix(self, prefix): + f = partial('{prefix}{}'.format, prefix=prefix) + return self.rename_axis(f, axis=0) + + def add_suffix(self, suffix): + f = partial('{}{suffix}'.format, suffix=suffix) + return self.rename_axis(f, axis=0) + + @property + def _is_single_block(self): + if self.ndim == 1: + return True + + if len(self.blocks) != 1: + return False + + blk = self.blocks[0] + return (blk.mgr_locs.is_slice_like and + blk.mgr_locs.as_slice == slice(0, len(self), 1)) + + def _rebuild_blknos_and_blklocs(self): + """ + Update mgr._blknos / mgr._blklocs. 
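+
+        Editorial note (not part of the original patch): _blknos[i] gives
+        the position in self.blocks of the block that owns item i, and
+        _blklocs[i] gives item i's position within that block.  For two
+        blocks placed at items [0, 2] and [1], this yields
+        _blknos == [0, 1, 0] and _blklocs == [0, 0, 1].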
+ """ + new_blknos = np.empty(self.shape[0], dtype=np.int64) + new_blklocs = np.empty(self.shape[0], dtype=np.int64) + new_blknos.fill(-1) + new_blklocs.fill(-1) + + for blkno, blk in enumerate(self.blocks): + rl = blk.mgr_locs + new_blknos[rl.indexer] = blkno + new_blklocs[rl.indexer] = np.arange(len(rl)) + + if (new_blknos == -1).any(): + raise AssertionError("Gaps in blk ref_locs") + + self._blknos = new_blknos + self._blklocs = new_blklocs + + # make items read only for now + def _get_items(self): + return self.axes[0] + + items = property(fget=_get_items) + + def _get_counts(self, f): + """ return a dict of the counts of the function in BlockManager """ + self._consolidate_inplace() + counts = dict() + for b in self.blocks: + v = f(b) + counts[v] = counts.get(v, 0) + b.shape[0] + return counts + + def get_dtype_counts(self): + return self._get_counts(lambda b: b.dtype.name) + + def get_ftype_counts(self): + return self._get_counts(lambda b: b.ftype) + + def get_dtypes(self): + dtypes = np.array([blk.dtype for blk in self.blocks]) + return algos.take_1d(dtypes, self._blknos, allow_fill=False) + + def get_ftypes(self): + ftypes = np.array([blk.ftype for blk in self.blocks]) + return algos.take_1d(ftypes, self._blknos, allow_fill=False) + + def __getstate__(self): + block_values = [b.values for b in self.blocks] + block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] + axes_array = [ax for ax in self.axes] + + extra_state = { + '0.14.1': { + 'axes': axes_array, + 'blocks': [dict(values=b.values, mgr_locs=b.mgr_locs.indexer) + for b in self.blocks] + } + } + + # First three elements of the state are to maintain forward + # compatibility with 0.13.1. + return axes_array, block_values, block_items, extra_state + + def __setstate__(self, state): + def unpickle_block(values, mgr_locs): + # numpy < 1.7 pickle compat + if values.dtype == 'M8[us]': + values = values.astype('M8[ns]') + return make_block(values, placement=mgr_locs) + + if (isinstance(state, tuple) and len(state) >= 4 and + '0.14.1' in state[3]): + state = state[3]['0.14.1'] + self.axes = [_ensure_index(ax) for ax in state['axes']] + self.blocks = tuple(unpickle_block(b['values'], b['mgr_locs']) + for b in state['blocks']) + else: + # discard anything after 3rd, support beta pickling format for a + # little while longer + ax_arrays, bvalues, bitems = state[:3] + + self.axes = [_ensure_index(ax) for ax in ax_arrays] + + if len(bitems) == 1 and self.axes[0].equals(bitems[0]): + # This is a workaround for pre-0.14.1 pickles that didn't + # support unpickling multi-block frames/panels with non-unique + # columns/items, because given a manager with items ["a", "b", + # "a"] there's no way of knowing which block's "a" is where. + # + # Single-block case can be supported under the assumption that + # block items corresponded to manager items 1-to-1. 
+ all_mgr_locs = [slice(0, len(bitems[0]))] + else: + all_mgr_locs = [self.axes[0].get_indexer(blk_items) + for blk_items in bitems] + + self.blocks = tuple( + unpickle_block(values, mgr_locs) + for values, mgr_locs in zip(bvalues, all_mgr_locs)) + + self._post_setstate() + + def _post_setstate(self): + self._is_consolidated = False + self._known_consolidated = False + self._rebuild_blknos_and_blklocs() + + def __len__(self): + return len(self.items) + + def __unicode__(self): + output = pprint_thing(self.__class__.__name__) + for i, ax in enumerate(self.axes): + if i == 0: + output += u'\nItems: {ax}'.format(ax=ax) + else: + output += u'\nAxis {i}: {ax}'.format(i=i, ax=ax) + + for block in self.blocks: + output += u'\n{block}'.format(block=pprint_thing(block)) + return output + + def _verify_integrity(self): + mgr_shape = self.shape + tot_items = sum(len(x.mgr_locs) for x in self.blocks) + for block in self.blocks: + if block._verify_integrity and block.shape[1:] != mgr_shape[1:]: + construction_error(tot_items, block.shape[1:], self.axes) + if len(self.items) != tot_items: + raise AssertionError('Number of manager items must equal union of ' + 'block items\n# manager items: {0}, # ' + 'tot_items: {1}'.format( + len(self.items), tot_items)) + + def apply(self, f, axes=None, filter=None, do_integrity_check=False, + consolidate=True, **kwargs): + """ + iterate over the blocks, collect and create a new block manager + + Parameters + ---------- + f : the callable or function name to operate on at the block level + axes : optional (if not supplied, use self.axes) + filter : list, if supplied, only call the block if the filter is in + the block + do_integrity_check : boolean, default False. Do the block manager + integrity check + consolidate: boolean, default True. Join together blocks having same + dtype + + Returns + ------- + Block Manager (new object) + + """ + + result_blocks = [] + + # filter kwarg is used in replace-* family of methods + if filter is not None: + filter_locs = set(self.items.get_indexer_for(filter)) + if len(filter_locs) == len(self.items): + # All items are included, as if there were no filtering + filter = None + else: + kwargs['filter'] = filter_locs + + if consolidate: + self._consolidate_inplace() + + if f == 'where': + align_copy = True + if kwargs.get('align', True): + align_keys = ['other', 'cond'] + else: + align_keys = ['cond'] + elif f == 'putmask': + align_copy = False + if kwargs.get('align', True): + align_keys = ['new', 'mask'] + else: + align_keys = ['mask'] + elif f == 'eval': + align_copy = False + align_keys = ['other'] + elif f == 'fillna': + # fillna internally does putmask, maybe it's better to do this + # at mgr, not block level? + align_copy = False + align_keys = ['value'] + else: + align_keys = [] + + # TODO(EA): may interfere with ExtensionBlock.setitem for blocks + # with a .values attribute. 
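`apply`, whose body continues here, dispatches by method name: each block resolves `f` via `getattr` and the per-block results are collected into a new manager. A toy model of that dispatch, with a hypothetical `DemoBlock` standing in for the real Block class:

class DemoBlock(object):
    def __init__(self, values):
        self.values = values

    def shift(self, periods=1):
        # pad the front, drop the tail -- a stand-in block operation
        return DemoBlock([None] * periods + self.values[:-periods])

def apply_all(blocks, f, **kwargs):
    return [getattr(b, f)(**kwargs) for b in blocks]

shifted = apply_all([DemoBlock([1, 2, 3])], 'shift', periods=1)
print(shifted[0].values)  # [None, 1, 2]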
+ aligned_args = dict((k, kwargs[k]) + for k in align_keys + if hasattr(kwargs[k], 'values') and + not isinstance(kwargs[k], gt.ABCExtensionArray)) + + for b in self.blocks: + if filter is not None: + if not b.mgr_locs.isin(filter_locs).any(): + result_blocks.append(b) + continue + + if aligned_args: + b_items = self.items[b.mgr_locs.indexer] + + for k, obj in aligned_args.items(): + axis = getattr(obj, '_info_axis_number', 0) + kwargs[k] = obj.reindex(b_items, axis=axis, + copy=align_copy) + + kwargs['mgr'] = self + applied = getattr(b, f)(**kwargs) + result_blocks = _extend_blocks(applied, result_blocks) + + if len(result_blocks) == 0: + return self.make_empty(axes or self.axes) + bm = self.__class__(result_blocks, axes or self.axes, + do_integrity_check=do_integrity_check) + bm._consolidate_inplace() + return bm + + def reduction(self, f, axis=0, consolidate=True, transposed=False, + **kwargs): + """ + iterate over the blocks, collect and create a new block manager. + This routine is intended for reduction type operations and + will do inference on the generated blocks. + + Parameters + ---------- + f: the callable or function name to operate on at the block level + axis: reduction axis, default 0 + consolidate: boolean, default True. Join together blocks having same + dtype + transposed: boolean, default False + we are holding transposed data + + Returns + ------- + Block Manager (new object) + + """ + + if consolidate: + self._consolidate_inplace() + + axes, blocks = [], [] + for b in self.blocks: + kwargs['mgr'] = self + axe, block = getattr(b, f)(axis=axis, **kwargs) + + axes.append(axe) + blocks.append(block) + + # note that some DatetimeTZ, Categorical are always ndim==1 + ndim = {b.ndim for b in blocks} + + if 2 in ndim: + + new_axes = list(self.axes) + + # multiple blocks that are reduced + if len(blocks) > 1: + new_axes[1] = axes[0] + + # reset the placement to the original + for b, sb in zip(blocks, self.blocks): + b.mgr_locs = sb.mgr_locs + + else: + new_axes[axis] = Index(np.concatenate( + [ax.values for ax in axes])) + + if transposed: + new_axes = new_axes[::-1] + blocks = [b.make_block(b.values.T, + placement=np.arange(b.shape[1]) + ) for b in blocks] + + return self.__class__(blocks, new_axes) + + # 0 ndim + if 0 in ndim and 1 not in ndim: + values = np.array([b.values for b in blocks]) + if len(values) == 1: + return values.item() + blocks = [make_block(values, ndim=1)] + axes = Index([ax[0] for ax in axes]) + + # single block + values = _concat._concat_compat([b.values for b in blocks]) + + # compute the orderings of our original data + if len(self.blocks) > 1: + + indexer = np.empty(len(self.axes[0]), dtype=np.intp) + i = 0 + for b in self.blocks: + for j in b.mgr_locs: + indexer[j] = i + i = i + 1 + + values = values.take(indexer) + + return SingleBlockManager( + [make_block(values, + ndim=1, + placement=np.arange(len(values)))], + axes[0]) + + def isna(self, func, **kwargs): + return self.apply('apply', func=func, **kwargs) + + def where(self, **kwargs): + return self.apply('where', **kwargs) + + def eval(self, **kwargs): + return self.apply('eval', **kwargs) + + def quantile(self, **kwargs): + return self.reduction('quantile', **kwargs) + + def setitem(self, **kwargs): + return self.apply('setitem', **kwargs) + + def putmask(self, **kwargs): + return self.apply('putmask', **kwargs) + + def diff(self, **kwargs): + return self.apply('diff', **kwargs) + + def interpolate(self, **kwargs): + return self.apply('interpolate', **kwargs) + + def shift(self, **kwargs): + 
return self.apply('shift', **kwargs) + + def fillna(self, **kwargs): + return self.apply('fillna', **kwargs) + + def downcast(self, **kwargs): + return self.apply('downcast', **kwargs) + + def astype(self, dtype, **kwargs): + return self.apply('astype', dtype=dtype, **kwargs) + + def convert(self, **kwargs): + return self.apply('convert', **kwargs) + + def replace(self, **kwargs): + return self.apply('replace', **kwargs) + + def replace_list(self, src_list, dest_list, inplace=False, regex=False, + mgr=None): + """ do a list replace """ + + inplace = validate_bool_kwarg(inplace, 'inplace') + + if mgr is None: + mgr = self + + # figure out our mask a-priori to avoid repeated replacements + values = self.as_array() + + def comp(s): + if isna(s): + return isna(values) + return _maybe_compare(values, getattr(s, 'asm8', s), operator.eq) + + masks = [comp(s) for i, s in enumerate(src_list)] + + result_blocks = [] + src_len = len(src_list) - 1 + for blk in self.blocks: + + # its possible to get multiple result blocks here + # replace ALWAYS will return a list + rb = [blk if inplace else blk.copy()] + for i, (s, d) in enumerate(zip(src_list, dest_list)): + new_rb = [] + for b in rb: + if b.dtype == np.object_: + convert = i == src_len + result = b.replace(s, d, inplace=inplace, regex=regex, + mgr=mgr, convert=convert) + new_rb = _extend_blocks(result, new_rb) + else: + # get our mask for this element, sized to this + # particular block + m = masks[i][b.mgr_locs.indexer] + if m.any(): + b = b.coerce_to_target_dtype(d) + new_rb.extend(b.putmask(m, d, inplace=True)) + else: + new_rb.append(b) + rb = new_rb + result_blocks.extend(rb) + + bm = self.__class__(result_blocks, self.axes) + bm._consolidate_inplace() + return bm + + # TODO: remove once Panel is gone + def reshape_nd(self, axes, **kwargs): + """ a 2d-nd reshape operation on a BlockManager """ + return self.apply('reshape_nd', axes=axes, **kwargs) + + def is_consolidated(self): + """ + Return True if more than one block with the same dtype + """ + if not self._known_consolidated: + self._consolidate_check() + return self._is_consolidated + + def _consolidate_check(self): + ftypes = [blk.ftype for blk in self.blocks] + self._is_consolidated = len(ftypes) == len(set(ftypes)) + self._known_consolidated = True + + @property + def is_mixed_type(self): + # Warning, consolidation needs to get checked upstairs + self._consolidate_inplace() + return len(self.blocks) > 1 + + @property + def is_numeric_mixed_type(self): + # Warning, consolidation needs to get checked upstairs + self._consolidate_inplace() + return all(block.is_numeric for block in self.blocks) + + @property + def is_datelike_mixed_type(self): + # Warning, consolidation needs to get checked upstairs + self._consolidate_inplace() + return any(block.is_datelike for block in self.blocks) + + @property + def any_extension_types(self): + """Whether any of the blocks in this manager are extension blocks""" + return any(block.is_extension for block in self.blocks) + + @property + def is_view(self): + """ return a boolean if we are a single block and are a view """ + if len(self.blocks) == 1: + return self.blocks[0].is_view + + # It is technically possible to figure out which blocks are views + # e.g. [ b.values.base is not None for b in self.blocks ] + # but then we have the case of possibly some blocks being a view + # and some blocks not. setting in theory is possible on the non-view + # blocks w/o causing a SettingWithCopy raise/warn. 
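Note the ordering in `replace_list` above: all comparison masks are computed against the original values before any block is modified, so an early rewrite (say 1 -> 2) cannot be picked up again by a later rule (2 -> 9). In miniature, on a plain ndarray:

import numpy as np

values = np.array([1, 2, 3, 2])
src_list, dest_list = [1, 2], [2, 9]

masks = [values == s for s in src_list]  # evaluated before any writes
for m, d in zip(masks, dest_list):
    values[m] = d

print(values)  # [2 9 3 9] -- the 2 produced by the first rule survives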
But this is a bit + # complicated + + return False + + def get_bool_data(self, copy=False): + """ + Parameters + ---------- + copy : boolean, default False + Whether to copy the blocks + """ + self._consolidate_inplace() + return self.combine([b for b in self.blocks if b.is_bool], copy) + + def get_numeric_data(self, copy=False): + """ + Parameters + ---------- + copy : boolean, default False + Whether to copy the blocks + """ + self._consolidate_inplace() + return self.combine([b for b in self.blocks if b.is_numeric], copy) + + def combine(self, blocks, copy=True): + """ return a new manager with the blocks """ + if len(blocks) == 0: + return self.make_empty() + + # FIXME: optimization potential + indexer = np.sort(np.concatenate([b.mgr_locs.as_array + for b in blocks])) + inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) + + new_blocks = [] + for b in blocks: + b = b.copy(deep=copy) + b.mgr_locs = algos.take_1d(inv_indexer, b.mgr_locs.as_array, + axis=0, allow_fill=False) + new_blocks.append(b) + + axes = list(self.axes) + axes[0] = self.items.take(indexer) + + return self.__class__(new_blocks, axes, do_integrity_check=False) + + def get_slice(self, slobj, axis=0): + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + if axis == 0: + new_blocks = self._slice_take_blocks_ax0(slobj) + else: + slicer = [slice(None)] * (axis + 1) + slicer[axis] = slobj + slicer = tuple(slicer) + new_blocks = [blk.getitem_block(slicer) for blk in self.blocks] + + new_axes = list(self.axes) + new_axes[axis] = new_axes[axis][slobj] + + bm = self.__class__(new_blocks, new_axes, do_integrity_check=False) + bm._consolidate_inplace() + return bm + + def __contains__(self, item): + return item in self.items + + @property + def nblocks(self): + return len(self.blocks) + + def copy(self, deep=True, mgr=None): + """ + Make deep or shallow copy of BlockManager + + Parameters + ---------- + deep : boolean o rstring, default True + If False, return shallow copy (do not copy data) + If 'all', copy data and a deep copy of the index + + Returns + ------- + copy : BlockManager + """ + + # this preserves the notion of view copying of axes + if deep: + if deep == 'all': + copy = lambda ax: ax.copy(deep=True) + else: + copy = lambda ax: ax.view() + new_axes = [copy(ax) for ax in self.axes] + else: + new_axes = list(self.axes) + return self.apply('copy', axes=new_axes, deep=deep, + do_integrity_check=False) + + def as_array(self, transpose=False, items=None): + """Convert the blockmanager data into an numpy array. + + Parameters + ---------- + transpose : boolean, default False + If True, transpose the return array + items : list of strings or None + Names of block items that will be included in the returned + array. 
``None`` means that all block items will be used + + Returns + ------- + arr : ndarray + """ + if len(self.blocks) == 0: + arr = np.empty(self.shape, dtype=float) + return arr.transpose() if transpose else arr + + if items is not None: + mgr = self.reindex_axis(items, axis=0) + else: + mgr = self + + if self._is_single_block or not self.is_mixed_type: + arr = mgr.blocks[0].get_values() + else: + arr = mgr._interleave() + + return arr.transpose() if transpose else arr + + def _interleave(self): + """ + Return ndarray from blocks with specified item order + Items must be contained in the blocks + """ + dtype = _interleaved_dtype(self.blocks) + + result = np.empty(self.shape, dtype=dtype) + + if result.shape[0] == 0: + # Workaround for numpy 1.7 bug: + # + # >>> a = np.empty((0,10)) + # >>> a[slice(0,0)] + # array([], shape=(0, 10), dtype=float64) + # >>> a[[]] + # Traceback (most recent call last): + # File "", line 1, in + # IndexError: index 0 is out of bounds for axis 0 with size 0 + return result + + itemmask = np.zeros(self.shape[0]) + + for blk in self.blocks: + rl = blk.mgr_locs + result[rl.indexer] = blk.get_values(dtype) + itemmask[rl.indexer] = 1 + + if not itemmask.all(): + raise AssertionError('Some items were not contained in blocks') + + return result + + def to_dict(self, copy=True): + """ + Return a dict of str(dtype) -> BlockManager + + Parameters + ---------- + copy : boolean, default True + + Returns + ------- + values : a dict of dtype -> BlockManager + + Notes + ----- + This consolidates based on str(dtype) + """ + self._consolidate_inplace() + + bd = {} + for b in self.blocks: + bd.setdefault(str(b.dtype), []).append(b) + + return {dtype: self.combine(blocks, copy=copy) + for dtype, blocks in bd.items()} + + def xs(self, key, axis=1, copy=True, takeable=False): + if axis < 1: + raise AssertionError( + 'Can only take xs across axis >= 1, got {ax}'.format(ax=axis)) + + # take by position + if takeable: + loc = key + else: + loc = self.axes[axis].get_loc(key) + + slicer = [slice(None, None) for _ in range(self.ndim)] + slicer[axis] = loc + slicer = tuple(slicer) + + new_axes = list(self.axes) + + # could be an array indexer! 
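`_interleave` above materializes a mixed-dtype manager by allocating the full result up front and scattering each block's rows back to the positions recorded in its `mgr_locs`. A standalone sketch, with (indexer, values) pairs as stand-ins for blocks:

import numpy as np

blocks = [
    (np.array([0, 2]), np.array([[1., 2.], [3., 4.]])),
    (np.array([1]), np.array([[10., 20.]])),
]
result = np.empty((3, 2), dtype=np.float64)
itemmask = np.zeros(3, dtype=bool)

for indexer, vals in blocks:
    result[indexer] = vals
    itemmask[indexer] = True

assert itemmask.all()  # "Some items were not contained in blocks"
print(result)          # rows come out in item order: [1 2], [10 20], [3 4]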
+ if isinstance(loc, (slice, np.ndarray)): + new_axes[axis] = new_axes[axis][loc] + else: + new_axes.pop(axis) + + new_blocks = [] + if len(self.blocks) > 1: + # we must copy here as we are mixed type + for blk in self.blocks: + newb = make_block(values=blk.values[slicer], + klass=blk.__class__, + placement=blk.mgr_locs) + new_blocks.append(newb) + elif len(self.blocks) == 1: + block = self.blocks[0] + vals = block.values[slicer] + if copy: + vals = vals.copy() + new_blocks = [make_block(values=vals, + placement=block.mgr_locs, + klass=block.__class__)] + + return self.__class__(new_blocks, new_axes) + + def fast_xs(self, loc): + """ + get a cross sectional for a given location in the + items ; handle dups + + return the result, is *could* be a view in the case of a + single block + """ + if len(self.blocks) == 1: + return self.blocks[0].iget((slice(None), loc)) + + items = self.items + + # non-unique (GH4726) + if not items.is_unique: + result = self._interleave() + if self.ndim == 2: + result = result.T + return result[loc] + + # unique + dtype = _interleaved_dtype(self.blocks) + n = len(items) + result = np.empty(n, dtype=dtype) + for blk in self.blocks: + # Such assignment may incorrectly coerce NaT to None + # result[blk.mgr_locs] = blk._slice((slice(None), loc)) + for i, rl in enumerate(blk.mgr_locs): + result[rl] = blk._try_coerce_result(blk.iget((i, loc))) + + return result + + def consolidate(self): + """ + Join together blocks having same dtype + + Returns + ------- + y : BlockManager + """ + if self.is_consolidated(): + return self + + bm = self.__class__(self.blocks, self.axes) + bm._is_consolidated = False + bm._consolidate_inplace() + return bm + + def _consolidate_inplace(self): + if not self.is_consolidated(): + self.blocks = tuple(_consolidate(self.blocks)) + self._is_consolidated = True + self._known_consolidated = True + self._rebuild_blknos_and_blklocs() + + def get(self, item, fastpath=True): + """ + Return values for selected item (ndarray or BlockManager). + """ + if self.items.is_unique: + + if not isna(item): + loc = self.items.get_loc(item) + else: + indexer = np.arange(len(self.items))[isna(self.items)] + + # allow a single nan location indexer + if not is_scalar(indexer): + if len(indexer) == 1: + loc = indexer.item() + else: + raise ValueError("cannot label index with a null key") + + return self.iget(loc, fastpath=fastpath) + else: + + if isna(item): + raise TypeError("cannot label index with a null key") + + indexer = self.items.get_indexer_for([item]) + return self.reindex_indexer(new_axis=self.items[indexer], + indexer=indexer, axis=0, + allow_dups=True) + + def iget(self, i, fastpath=True): + """ + Return the data as a SingleBlockManager if fastpath=True and possible + + Otherwise return as a ndarray + """ + block = self.blocks[self._blknos[i]] + values = block.iget(self._blklocs[i]) + if not fastpath or not block._box_to_block_values or values.ndim != 1: + return values + + # fastpath shortcut for select a single-dim from a 2-dim BM + return SingleBlockManager( + [block.make_block_same_class(values, + placement=slice(0, len(values)), + ndim=1)], + self.axes[1]) + + def get_scalar(self, tup): + """ + Retrieve single item + """ + full_loc = [ax.get_loc(x) for ax, x in zip(self.axes, tup)] + blk = self.blocks[self._blknos[full_loc[0]]] + values = blk.values + + # FIXME: this may return non-upcasted types? 
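The unique-items path of `fast_xs` above walks every block, pulls out column `loc`, and drops each value at the row dictated by the block's `mgr_locs`. In miniature, again with (mgr_locs, values) pairs as hypothetical stand-ins:

import numpy as np

loc = 1
blocks = [
    (np.array([0, 2]), np.array([[1., 2., 3.], [4., 5., 6.]])),
    (np.array([1]), np.array([[7., 8., 9.]])),
]
out = np.empty(3, dtype=np.float64)
for mgr_locs, values in blocks:
    for i, rl in enumerate(mgr_locs):
        out[rl] = values[i, loc]
print(out)  # [2. 8. 5.]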
+ if values.ndim == 1: + return values[full_loc[1]] + + full_loc[0] = self._blklocs[full_loc[0]] + return values[tuple(full_loc)] + + def delete(self, item): + """ + Delete selected item (items if non-unique) in-place. + """ + indexer = self.items.get_loc(item) + + is_deleted = np.zeros(self.shape[0], dtype=np.bool_) + is_deleted[indexer] = True + ref_loc_offset = -is_deleted.cumsum() + + is_blk_deleted = [False] * len(self.blocks) + + if isinstance(indexer, int): + affected_start = indexer + else: + affected_start = is_deleted.nonzero()[0][0] + + for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]): + blk = self.blocks[blkno] + bml = blk.mgr_locs + blk_del = is_deleted[bml.indexer].nonzero()[0] + + if len(blk_del) == len(bml): + is_blk_deleted[blkno] = True + continue + elif len(blk_del) != 0: + blk.delete(blk_del) + bml = blk.mgr_locs + + blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer]) + + # FIXME: use Index.delete as soon as it uses fastpath=True + self.axes[0] = self.items[~is_deleted] + self.blocks = tuple(b for blkno, b in enumerate(self.blocks) + if not is_blk_deleted[blkno]) + self._shape = None + self._rebuild_blknos_and_blklocs() + + def set(self, item, value, check=False): + """ + Set new item in-place. Does not consolidate. Adds new Block if not + contained in the current set of items + if check, then validate that we are not setting the same data in-place + """ + # FIXME: refactor, clearly separate broadcasting & zip-like assignment + # can prob also fix the various if tests for sparse/categorical + + # TODO(EA): Remove an is_extension_ when all extension types satisfy + # the interface + value_is_extension_type = (is_extension_type(value) or + is_extension_array_dtype(value)) + + # categorical/spares/datetimetz + if value_is_extension_type: + + def value_getitem(placement): + return value + else: + if value.ndim == self.ndim - 1: + value = _safe_reshape(value, (1,) + value.shape) + + def value_getitem(placement): + return value + else: + + def value_getitem(placement): + return value[placement.indexer] + + if value.shape[1:] != self.shape[1:]: + raise AssertionError('Shape of new values must be compatible ' + 'with manager shape') + + try: + loc = self.items.get_loc(item) + except KeyError: + # This item wasn't present, just insert at end + self.insert(len(self.items), item, value) + return + + if isinstance(loc, int): + loc = [loc] + + blknos = self._blknos[loc] + blklocs = self._blklocs[loc].copy() + + unfit_mgr_locs = [] + unfit_val_locs = [] + removed_blknos = [] + for blkno, val_locs in _get_blkno_placements(blknos, len(self.blocks), + group=True): + blk = self.blocks[blkno] + blk_locs = blklocs[val_locs.indexer] + if blk.should_store(value): + blk.set(blk_locs, value_getitem(val_locs), check=check) + else: + unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) + unfit_val_locs.append(val_locs) + + # If all block items are unfit, schedule the block for removal. 
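The placement arithmetic in `delete` above deserves a worked example: `ref_loc_offset = -is_deleted.cumsum()` shifts every surviving item up by the number of deletions that precede it.

import numpy as np

is_deleted = np.array([False, True, False, False])
ref_loc_offset = -is_deleted.cumsum()
survivors = np.array([0, 2, 3])  # old positions of the kept items

print(survivors + ref_loc_offset[survivors])  # [0 1 2]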
+ if len(val_locs) == len(blk.mgr_locs): + removed_blknos.append(blkno) + else: + self._blklocs[blk.mgr_locs.indexer] = -1 + blk.delete(blk_locs) + self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk)) + + if len(removed_blknos): + # Remove blocks & update blknos accordingly + is_deleted = np.zeros(self.nblocks, dtype=np.bool_) + is_deleted[removed_blknos] = True + + new_blknos = np.empty(self.nblocks, dtype=np.int64) + new_blknos.fill(-1) + new_blknos[~is_deleted] = np.arange(self.nblocks - + len(removed_blknos)) + self._blknos = algos.take_1d(new_blknos, self._blknos, axis=0, + allow_fill=False) + self.blocks = tuple(blk for i, blk in enumerate(self.blocks) + if i not in set(removed_blknos)) + + if unfit_val_locs: + unfit_mgr_locs = np.concatenate(unfit_mgr_locs) + unfit_count = len(unfit_mgr_locs) + + new_blocks = [] + if value_is_extension_type: + # This code (ab-)uses the fact that sparse blocks contain only + # one item. + new_blocks.extend( + make_block(values=value.copy(), ndim=self.ndim, + placement=slice(mgr_loc, mgr_loc + 1)) + for mgr_loc in unfit_mgr_locs) + + self._blknos[unfit_mgr_locs] = (np.arange(unfit_count) + + len(self.blocks)) + self._blklocs[unfit_mgr_locs] = 0 + + else: + # unfit_val_locs contains BlockPlacement objects + unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:]) + + new_blocks.append( + make_block(values=value_getitem(unfit_val_items), + ndim=self.ndim, placement=unfit_mgr_locs)) + + self._blknos[unfit_mgr_locs] = len(self.blocks) + self._blklocs[unfit_mgr_locs] = np.arange(unfit_count) + + self.blocks += tuple(new_blocks) + + # Newly created block's dtype may already be present. + self._known_consolidated = False + + def insert(self, loc, item, value, allow_duplicates=False): + """ + Insert item at selected position. + + Parameters + ---------- + loc : int + item : hashable + value : array_like + allow_duplicates: bool + If False, trying to insert non-unique item will raise + + """ + if not allow_duplicates and item in self.items: + # Should this be a different kind of error?? + raise ValueError('cannot insert {}, already exists'.format(item)) + + if not isinstance(loc, int): + raise TypeError("loc must be int") + + # insert to the axis; this could possibly raise a TypeError + new_axis = self.items.insert(loc, item) + + block = make_block(values=value, ndim=self.ndim, + placement=slice(loc, loc + 1)) + + for blkno, count in _fast_count_smallints(self._blknos[loc:]): + blk = self.blocks[blkno] + if count == len(blk.mgr_locs): + blk.mgr_locs = blk.mgr_locs.add(1) + else: + new_mgr_locs = blk.mgr_locs.as_array.copy() + new_mgr_locs[new_mgr_locs >= loc] += 1 + blk.mgr_locs = new_mgr_locs + + if loc == self._blklocs.shape[0]: + # np.append is a lot faster (at least in numpy 1.7.1), let's use it + # if we can. + self._blklocs = np.append(self._blklocs, 0) + self._blknos = np.append(self._blknos, len(self.blocks)) + else: + self._blklocs = np.insert(self._blklocs, loc, 0) + self._blknos = np.insert(self._blknos, loc, len(self.blocks)) + + self.axes[0] = new_axis + self.blocks += (block,) + self._shape = None + + self._known_consolidated = False + + if len(self.blocks) > 100: + self._consolidate_inplace() + + def reindex_axis(self, new_index, axis, method=None, limit=None, + fill_value=None, copy=True): + """ + Conform block manager to new index. 
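`insert` above makes room for the new block by shifting placements, not data: any existing manager location at or beyond the insertion point moves down by one. The core of it:

import numpy as np

new_mgr_locs = np.array([0, 1, 2, 3])
loc = 2
new_mgr_locs[new_mgr_locs >= loc] += 1
print(new_mgr_locs)  # [0 1 3 4] -- position 2 is now free for the new block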
+ """ + new_index = _ensure_index(new_index) + new_index, indexer = self.axes[axis].reindex(new_index, method=method, + limit=limit) + + return self.reindex_indexer(new_index, indexer, axis=axis, + fill_value=fill_value, copy=copy) + + def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, + allow_dups=False, copy=True): + """ + Parameters + ---------- + new_axis : Index + indexer : ndarray of int64 or None + axis : int + fill_value : object + allow_dups : bool + + pandas-indexer with -1's only. + """ + if indexer is None: + if new_axis is self.axes[axis] and not copy: + return self + + result = self.copy(deep=copy) + result.axes = list(self.axes) + result.axes[axis] = new_axis + return result + + self._consolidate_inplace() + + # some axes don't allow reindexing with dups + if not allow_dups: + self.axes[axis]._can_reindex(indexer) + + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + if axis == 0: + new_blocks = self._slice_take_blocks_ax0(indexer, + fill_tuple=(fill_value,)) + else: + new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=( + fill_value if fill_value is not None else blk.fill_value,)) + for blk in self.blocks] + + new_axes = list(self.axes) + new_axes[axis] = new_axis + return self.__class__(new_blocks, new_axes) + + def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): + """ + Slice/take blocks along axis=0. + + Overloaded for SingleBlock + + Returns + ------- + new_blocks : list of Block + + """ + + allow_fill = fill_tuple is not None + + sl_type, slobj, sllen = _preprocess_slice_or_indexer( + slice_or_indexer, self.shape[0], allow_fill=allow_fill) + + if self._is_single_block: + blk = self.blocks[0] + + if sl_type in ('slice', 'mask'): + return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] + elif not allow_fill or self.ndim == 1: + if allow_fill and fill_tuple[0] is None: + _, fill_value = maybe_promote(blk.dtype) + fill_tuple = (fill_value, ) + + return [blk.take_nd(slobj, axis=0, + new_mgr_locs=slice(0, sllen), + fill_tuple=fill_tuple)] + + if sl_type in ('slice', 'mask'): + blknos = self._blknos[slobj] + blklocs = self._blklocs[slobj] + else: + blknos = algos.take_1d(self._blknos, slobj, fill_value=-1, + allow_fill=allow_fill) + blklocs = algos.take_1d(self._blklocs, slobj, fill_value=-1, + allow_fill=allow_fill) + + # When filling blknos, make sure blknos is updated before appending to + # blocks list, that way new blkno is exactly len(blocks). + # + # FIXME: mgr_groupby_blknos must return mgr_locs in ascending order, + # pytables serialization will break otherwise. + blocks = [] + for blkno, mgr_locs in _get_blkno_placements(blknos, len(self.blocks), + group=True): + if blkno == -1: + # If we've got here, fill_tuple was not None. + fill_value = fill_tuple[0] + + blocks.append(self._make_na_block(placement=mgr_locs, + fill_value=fill_value)) + else: + blk = self.blocks[blkno] + + # Otherwise, slicing along items axis is necessary. + if not blk._can_consolidate: + # A non-consolidatable block, it's easy, because there's + # only one item and each mgr loc is a copy of that single + # item. 
+ for mgr_loc in mgr_locs: + newblk = blk.copy(deep=True) + newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1) + blocks.append(newblk) + + else: + blocks.append(blk.take_nd(blklocs[mgr_locs.indexer], + axis=0, new_mgr_locs=mgr_locs, + fill_tuple=None)) + + return blocks + + def _make_na_block(self, placement, fill_value=None): + # TODO: infer dtypes other than float64 from fill_value + + if fill_value is None: + fill_value = np.nan + block_shape = list(self.shape) + block_shape[0] = len(placement) + + dtype, fill_value = infer_dtype_from_scalar(fill_value) + block_values = np.empty(block_shape, dtype=dtype) + block_values.fill(fill_value) + return make_block(block_values, placement=placement) + + def take(self, indexer, axis=1, verify=True, convert=True): + """ + Take items along any axis. + """ + self._consolidate_inplace() + indexer = (np.arange(indexer.start, indexer.stop, indexer.step, + dtype='int64') + if isinstance(indexer, slice) + else np.asanyarray(indexer, dtype='int64')) + + n = self.shape[axis] + if convert: + indexer = maybe_convert_indices(indexer, n) + + if verify: + if ((indexer == -1) | (indexer >= n)).any(): + raise Exception('Indices must be nonzero and less than ' + 'the axis length') + + new_labels = self.axes[axis].take(indexer) + return self.reindex_indexer(new_axis=new_labels, indexer=indexer, + axis=axis, allow_dups=True) + + def merge(self, other, lsuffix='', rsuffix=''): + if not self._is_indexed_like(other): + raise AssertionError('Must have same axes to merge managers') + + l, r = items_overlap_with_suffix(left=self.items, lsuffix=lsuffix, + right=other.items, rsuffix=rsuffix) + new_items = _concat_indexes([l, r]) + + new_blocks = [blk.copy(deep=False) for blk in self.blocks] + + offset = self.shape[0] + for blk in other.blocks: + blk = blk.copy(deep=False) + blk.mgr_locs = blk.mgr_locs.add(offset) + new_blocks.append(blk) + + new_axes = list(self.axes) + new_axes[0] = new_items + + return self.__class__(_consolidate(new_blocks), new_axes) + + def _is_indexed_like(self, other): + """ + Check all axes except items + """ + if self.ndim != other.ndim: + raise AssertionError( + 'Number of dimensions must agree got {ndim} and ' + '{oth_ndim}'.format(ndim=self.ndim, oth_ndim=other.ndim)) + for ax, oax in zip(self.axes[1:], other.axes[1:]): + if not ax.equals(oax): + return False + return True + + def equals(self, other): + self_axes, other_axes = self.axes, other.axes + if len(self_axes) != len(other_axes): + return False + if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): + return False + self._consolidate_inplace() + other._consolidate_inplace() + if len(self.blocks) != len(other.blocks): + return False + + # canonicalize block order, using a tuple combining the type + # name and then mgr_locs because there might be unconsolidated + # blocks (say, Categorical) which can only be distinguished by + # the iteration order + def canonicalize(block): + return (block.dtype.name, block.mgr_locs.as_array.tolist()) + + self_blocks = sorted(self.blocks, key=canonicalize) + other_blocks = sorted(other.blocks, key=canonicalize) + return all(block.equals(oblock) + for block, oblock in zip(self_blocks, other_blocks)) + + def unstack(self, unstacker_func): + """Return a blockmanager with all blocks unstacked. + + Parameters + ---------- + unstacker_func : callable + A (partially-applied) ``pd.core.reshape._Unstacker`` class. 
+ + Returns + ------- + unstacked : BlockManager + """ + dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items) + new_columns = dummy.get_new_columns() + new_index = dummy.get_new_index() + new_blocks = [] + columns_mask = [] + + for blk in self.blocks: + blocks, mask = blk._unstack( + partial(unstacker_func, + value_columns=self.items[blk.mgr_locs.indexer]), + new_columns) + + new_blocks.extend(blocks) + columns_mask.extend(mask) + + new_columns = new_columns[columns_mask] + + bm = BlockManager(new_blocks, [new_columns, new_index]) + return bm + + +class SingleBlockManager(BlockManager): + """ manage a single block with """ + + ndim = 1 + _is_consolidated = True + _known_consolidated = True + __slots__ = () + + def __init__(self, block, axis, do_integrity_check=False, fastpath=False): + + if isinstance(axis, list): + if len(axis) != 1: + raise ValueError("cannot create SingleBlockManager with more " + "than 1 axis") + axis = axis[0] + + # passed from constructor, single block, single axis + if fastpath: + self.axes = [axis] + if isinstance(block, list): + + # empty block + if len(block) == 0: + block = [np.array([])] + elif len(block) != 1: + raise ValueError('Cannot create SingleBlockManager with ' + 'more than 1 block') + block = block[0] + else: + self.axes = [_ensure_index(axis)] + + # create the block here + if isinstance(block, list): + + # provide consolidation to the interleaved_dtype + if len(block) > 1: + dtype = _interleaved_dtype(block) + block = [b.astype(dtype) for b in block] + block = _consolidate(block) + + if len(block) != 1: + raise ValueError('Cannot create SingleBlockManager with ' + 'more than 1 block') + block = block[0] + + if not isinstance(block, Block): + block = make_block(block, placement=slice(0, len(axis)), ndim=1) + + self.blocks = [block] + + def _post_setstate(self): + pass + + @property + def _block(self): + return self.blocks[0] + + @property + def _values(self): + return self._block.values + + @property + def _blknos(self): + """ compat with BlockManager """ + return None + + @property + def _blklocs(self): + """ compat with BlockManager """ + return None + + def get_slice(self, slobj, axis=0): + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + return self.__class__(self._block._slice(slobj), + self.index[slobj], fastpath=True) + + @property + def index(self): + return self.axes[0] + + def convert(self, **kwargs): + """ convert the whole block as one """ + kwargs['by_item'] = False + return self.apply('convert', **kwargs) + + @property + def dtype(self): + return self._block.dtype + + @property + def array_dtype(self): + return self._block.array_dtype + + @property + def ftype(self): + return self._block.ftype + + def get_dtype_counts(self): + return {self.dtype.name: 1} + + def get_ftype_counts(self): + return {self.ftype: 1} + + def get_dtypes(self): + return np.array([self._block.dtype]) + + def get_ftypes(self): + return np.array([self._block.ftype]) + + def external_values(self): + return self._block.external_values() + + def internal_values(self): + return self._block.internal_values() + + def formatting_values(self): + """Return the internal values used by the DataFrame/SeriesFormatter""" + return self._block.formatting_values() + + def get_values(self): + """ return a dense type view """ + return np.array(self._block.to_dense(), copy=False) + + @property + def asobject(self): + """ + return a object dtype array. datetime/timedelta like values are boxed + to Timestamp/Timedelta instances. 
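For orientation: in the pandas versions this patch targets, every Series is backed by one of these SingleBlockManagers, reachable through the then-current `_data` attribute (internal, unstable API):

import pandas as pd

s = pd.Series([1.0, 2.0], index=['a', 'b'])
mgr = s._data  # SingleBlockManager in this era; internal API
print(type(mgr).__name__, mgr.nblocks, list(mgr.index))
# SingleBlockManager 1 ['a', 'b']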
+ """ + return self._block.get_values(dtype=object) + + @property + def _can_hold_na(self): + return self._block._can_hold_na + + def is_consolidated(self): + return True + + def _consolidate_check(self): + pass + + def _consolidate_inplace(self): + pass + + def delete(self, item): + """ + Delete single item from SingleBlockManager. + + Ensures that self.blocks doesn't become empty. + """ + loc = self.items.get_loc(item) + self._block.delete(loc) + self.axes[0] = self.axes[0].delete(loc) + + def fast_xs(self, loc): + """ + fast path for getting a cross-section + return a view of the data + """ + return self._block.values[loc] + + def concat(self, to_concat, new_axis): + """ + Concatenate a list of SingleBlockManagers into a single + SingleBlockManager. + + Used for pd.concat of Series objects with axis=0. + + Parameters + ---------- + to_concat : list of SingleBlockManagers + new_axis : Index of the result + + Returns + ------- + SingleBlockManager + + """ + non_empties = [x for x in to_concat if len(x) > 0] + + # check if all series are of the same block type: + if len(non_empties) > 0: + blocks = [obj.blocks[0] for obj in non_empties] + + if all(type(b) is type(blocks[0]) for b in blocks[1:]): # noqa + new_block = blocks[0].concat_same_type(blocks) + else: + values = [x.values for x in blocks] + values = _concat._concat_compat(values) + new_block = make_block( + values, placement=slice(0, len(values), 1)) + else: + values = [x._block.values for x in to_concat] + values = _concat._concat_compat(values) + new_block = make_block( + values, placement=slice(0, len(values), 1)) + + mgr = SingleBlockManager(new_block, new_axis) + return mgr + + +# ------------------------------------------------------------------ +# Manager Helpers - Used only by BlockManager Methods (or each other) + +def _consolidate(blocks): + """ + Merge blocks having same dtype, exclude non-consolidating blocks + """ + + # sort by _can_consolidate, dtype + gkey = lambda x: x._consolidate_key + grouper = itertools.groupby(sorted(blocks, key=gkey), gkey) + + new_blocks = [] + for (_can_consolidate, dtype), group_blocks in grouper: + merged_blocks = _merge_blocks(list(group_blocks), dtype=dtype, + _can_consolidate=_can_consolidate) + new_blocks = _extend_blocks(merged_blocks, new_blocks) + return new_blocks + + +def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill): + if isinstance(slice_or_indexer, slice): + return ('slice', slice_or_indexer, + libinternals.slice_len(slice_or_indexer, length)) + elif (isinstance(slice_or_indexer, np.ndarray) and + slice_or_indexer.dtype == np.bool_): + return 'mask', slice_or_indexer, slice_or_indexer.sum() + else: + indexer = np.asanyarray(slice_or_indexer, dtype=np.int64) + if not allow_fill: + indexer = maybe_convert_indices(indexer, length) + return 'fancy', indexer, len(indexer) + + +def _maybe_compare(a, b, op): + + is_a_array = isinstance(a, np.ndarray) + is_b_array = isinstance(b, np.ndarray) + + # numpy deprecation warning to have i8 vs integer comparisons + if is_datetimelike_v_numeric(a, b): + result = False + + # numpy deprecation warning if comparing numeric vs string-like + elif is_numeric_v_string_like(a, b): + result = False + + else: + result = op(a, b) + + if is_scalar(result) and (is_a_array or is_b_array): + type_names = [type(a).__name__, type(b).__name__] + + if is_a_array: + type_names[0] = 'ndarray(dtype={dtype})'.format(dtype=a.dtype) + + if is_b_array: + type_names[1] = 'ndarray(dtype={dtype})'.format(dtype=b.dtype) + + raise TypeError( + "Cannot 
compare types {a!r} and {b!r}".format(a=type_names[0], + b=type_names[1])) + return result + + +def _interleaved_dtype(blocks): + if not len(blocks): + return None + + dtype = find_common_type([b.dtype for b in blocks]) + + # only numpy compat + if isinstance(dtype, (PandasExtensionDtype, ExtensionDtype)): + dtype = np.object + + return dtype + + +def _fast_count_smallints(arr): + """Faster version of set(arr) for sequences of small numbers.""" + if len(arr) == 0: + # Handle empty arr case separately: numpy 1.6 chokes on that. + return np.empty((0, 2), dtype=arr.dtype) + else: + counts = np.bincount(arr.astype(np.int_)) + nz = counts.nonzero()[0] + return np.c_[nz, counts[nz]] + + +# ------------------------------------------------------------------ +# Externally Used + +# also used in concat +def _get_blkno_placements(blknos, blk_count, group=True): + """ + + Parameters + ---------- + blknos : array of int64 + blk_count : int + group : bool + + Returns + ------- + iterator + yield (BlockPlacement, blkno) + """ + + blknos = _ensure_int64(blknos) + + # FIXME: blk_count is unused, but it may avoid the use of dicts in cython + for blkno, indexer in libinternals.get_blkno_indexers(blknos, group): + yield blkno, libinternals.BlockPlacement(indexer) + + +# also used in reshape.merge +def items_overlap_with_suffix(left, lsuffix, right, rsuffix): + """ + If two indices overlap, add suffixes to overlapping entries. + + If corresponding suffix is empty, the entry is simply converted to string. + + """ + to_rename = left.intersection(right) + if len(to_rename) == 0: + return left, right + else: + if not lsuffix and not rsuffix: + raise ValueError('columns overlap but no suffix specified: ' + '{rename}'.format(rename=to_rename)) + + def lrenamer(x): + if x in to_rename: + return '{x}{lsuffix}'.format(x=x, lsuffix=lsuffix) + return x + + def rrenamer(x): + if x in to_rename: + return '{x}{rsuffix}'.format(x=x, rsuffix=rsuffix) + return x + + return (_transform_index(left, lrenamer), + _transform_index(right, rrenamer)) + + +# ------------------------------------------------------------------ +# Index-Related + +def _transform_index(index, func, level=None): + """ + Apply function to all values found in index. + + This includes transforming multiindex entries separately. + Only apply function to one level of the MultiIndex if level is specified. + + """ + if isinstance(index, MultiIndex): + if level is not None: + items = [tuple(func(y) if i == level else y + for i, y in enumerate(x)) for x in index] + else: + items = [tuple(func(y) for y in x) for x in index] + return MultiIndex.from_tuples(items, names=index.names) + else: + items = [func(x) for x in index] + return Index(items, name=index.name, tupleize_cols=False) + + +def _concat_indexes(indexes): + return indexes[0].append(indexes[1:]) + + +# ------------------------------------------------------------------ +# Constructor Helpers + +def create_block_manager_from_blocks(blocks, axes): + try: + if len(blocks) == 1 and not isinstance(blocks[0], Block): + # if blocks[0] is of length 0, return empty blocks + if not len(blocks[0]): + blocks = [] + else: + # It's OK if a single block is passed as values, its placement + # is basically "all items", but if there're many, don't bother + # converting, it's an error anyway. 
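`_fast_count_smallints` above is just bincount plus a nonzero filter; worked by hand:

import numpy as np

arr = np.array([0, 2, 2, 5])
counts = np.bincount(arr.astype(np.int_))
nz = counts.nonzero()[0]
print(np.c_[nz, counts[nz]])
# [[0 1]
#  [2 2]
#  [5 1]]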
+ blocks = [make_block(values=blocks[0], + placement=slice(0, len(axes[0])))] + + mgr = BlockManager(blocks, axes) + mgr._consolidate_inplace() + return mgr + + except (ValueError) as e: + blocks = [getattr(b, 'values', b) for b in blocks] + tot_items = sum(b.shape[0] for b in blocks) + construction_error(tot_items, blocks[0].shape[1:], axes, e) + + +def create_block_manager_from_arrays(arrays, names, axes): + + try: + blocks = form_blocks(arrays, names, axes) + mgr = BlockManager(blocks, axes) + mgr._consolidate_inplace() + return mgr + except ValueError as e: + construction_error(len(arrays), arrays[0].shape, axes, e) + + +def construction_error(tot_items, block_shape, axes, e=None): + """ raise a helpful message about our construction """ + passed = tuple(map(int, [tot_items] + list(block_shape))) + implied = tuple(map(int, [len(ax) for ax in axes])) + if passed == implied and e is not None: + raise e + if block_shape[0] == 0: + raise ValueError("Empty data passed with indices specified.") + raise ValueError("Shape of passed values is {0}, indices imply {1}".format( + passed, implied)) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 7fbf7ec05e91e..39418fb72bf4a 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -13,11 +13,12 @@ from pandas import (Index, MultiIndex, DataFrame, DatetimeIndex, Series, Categorical, TimedeltaIndex, SparseArray) from pandas.compat import OrderedDict, lrange -from pandas.core.internals import (BlockPlacement, SingleBlockManager, +from pandas.core.internals import (SingleBlockManager, make_block, BlockManager) import pandas.core.algorithms as algos import pandas.util.testing as tm import pandas as pd +from pandas._libs.internals import BlockPlacement from pandas.util.testing import (assert_almost_equal, assert_frame_equal, randn, assert_series_equal) from pandas.compat import zip, u From d67f05372dedffa7be60b19264d32deaf52a6119 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Fri, 20 Jul 2018 07:18:13 -0700 Subject: [PATCH 2/2] move internals.py to internals/__init__.py unchanged --- pandas/core/internals/__init__.py | 5929 ++++++++++++++++++++++++++++- pandas/core/internals/blocks.py | 3585 ----------------- pandas/core/internals/concat.py | 519 --- pandas/core/internals/managers.py | 1892 --------- 4 files changed, 5910 insertions(+), 6015 deletions(-) delete mode 100644 pandas/core/internals/blocks.py delete mode 100644 pandas/core/internals/concat.py delete mode 100644 pandas/core/internals/managers.py diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index f8c5bc2721ccc..5d16109fba296 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,20 +1,5911 @@ # -*- coding: utf-8 -*- -# flake8: noqa -from .blocks import ( - Block, - ObjectBlock, - DatetimeTZBlock, DatetimeBlock, TimeDeltaBlock, - CategoricalBlock, ExtensionBlock, - SparseBlock, - FloatBlock, IntBlock, - NonConsolidatableMixIn, # used by test_external_block - make_block, # generic, io.packers, io.pytables, test_internals - _block2d_to_blocknd, _factor_indexer, _block_shape, # io.pytables - _safe_reshape) # io.packers - -from .managers import ( - BlockManager, SingleBlockManager, - items_overlap_with_suffix, # reshape.merge - create_block_manager_from_arrays, # core.panel, core.frame, sparse.frame - create_block_manager_from_blocks) # core.frame, core.panel -from .concat import concatenate_block_managers +import warnings 
+import copy +from warnings import catch_warnings +import inspect +import itertools +import re +import operator +from datetime import datetime, timedelta, date +from collections import defaultdict +from functools import partial + +import numpy as np + +from pandas._libs import internals as libinternals + +from pandas.core.base import PandasObject + +from pandas.core.dtypes.dtypes import ( + ExtensionDtype, DatetimeTZDtype, + PandasExtensionDtype, + CategoricalDtype) +from pandas.core.dtypes.common import ( + _TD_DTYPE, _NS_DTYPE, + ensure_int64, ensure_platform_int, + is_integer, + is_dtype_equal, + is_timedelta64_dtype, + is_datetime64_dtype, is_datetimetz, is_sparse, + is_categorical, is_categorical_dtype, + is_integer_dtype, + is_datetime64tz_dtype, + is_bool_dtype, + is_object_dtype, + is_datetimelike_v_numeric, + is_float_dtype, is_numeric_dtype, + is_numeric_v_string_like, is_extension_type, + is_extension_array_dtype, + is_list_like, + is_re, + is_re_compilable, + is_scalar, + _get_dtype) +from pandas.core.dtypes.cast import ( + maybe_downcast_to_dtype, + maybe_upcast, + maybe_promote, + infer_dtype_from, + infer_dtype_from_scalar, + soft_convert_objects, + maybe_convert_objects, + astype_nansafe, + find_common_type, + maybe_infer_dtype_type) +from pandas.core.dtypes.missing import ( + isna, notna, array_equivalent, + _isna_compat, + is_null_datelike_scalar) +import pandas.core.dtypes.concat as _concat + +from pandas.core.dtypes.generic import ( + ABCSeries, + ABCDatetimeIndex, + ABCExtensionArray, + ABCIndexClass) +import pandas.core.common as com +import pandas.core.algorithms as algos + +from pandas.core.index import Index, MultiIndex, ensure_index +from pandas.core.indexing import maybe_convert_indices, check_setitem_lengths +from pandas.core.arrays import Categorical +from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.timedeltas import TimedeltaIndex +from pandas.io.formats.printing import pprint_thing + +import pandas.core.missing as missing +from pandas.core.sparse.array import _maybe_to_sparse, SparseArray +from pandas._libs import lib, tslib, tslibs +from pandas._libs.tslibs import conversion, Timedelta +from pandas._libs.internals import BlockPlacement + +from pandas.util._decorators import cache_readonly +from pandas.util._validators import validate_bool_kwarg +from pandas import compat +from pandas.compat import range, map, zip, u + + +class Block(PandasObject): + """ + Canonical n-dimensional unit of homogeneous dtype contained in a pandas + data structure + + Index-ignorant; let the container take care of that + """ + __slots__ = ['_mgr_locs', 'values', 'ndim'] + is_numeric = False + is_float = False + is_integer = False + is_complex = False + is_datetime = False + is_datetimetz = False + is_timedelta = False + is_bool = False + is_object = False + is_categorical = False + is_sparse = False + is_extension = False + _box_to_block_values = True + _can_hold_na = False + _can_consolidate = True + _verify_integrity = True + _validate_ndim = True + _ftype = 'dense' + _concatenator = staticmethod(np.concatenate) + + def __init__(self, values, placement, ndim=None): + self.ndim = self._check_ndim(values, ndim) + self.mgr_locs = placement + self.values = values + + if (self._validate_ndim and self.ndim and + len(self.mgr_locs) != len(self.values)): + raise ValueError( + 'Wrong number of items passed {val}, placement implies ' + '{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs))) + + def _check_ndim(self, values, ndim): + """ndim 
inference and validation. + + Infers ndim from 'values' if not provided to __init__. + Validates that values.ndim and ndim are consistent if and only if + the class variable '_validate_ndim' is True. + + Parameters + ---------- + values : array-like + ndim : int or None + + Returns + ------- + ndim : int + + Raises + ------ + ValueError : the number of dimensions do not match + """ + if ndim is None: + ndim = values.ndim + + if self._validate_ndim and values.ndim != ndim: + msg = ("Wrong number of dimensions. values.ndim != ndim " + "[{} != {}]") + raise ValueError(msg.format(values.ndim, ndim)) + + return ndim + + @property + def _holder(self): + """The array-like that can hold the underlying values. + + None for 'Block', overridden by subclasses that don't + use an ndarray. + """ + return None + + @property + def _consolidate_key(self): + return (self._can_consolidate, self.dtype.name) + + @property + def _is_single_block(self): + return self.ndim == 1 + + @property + def is_view(self): + """ return a boolean if I am possibly a view """ + return self.values.base is not None + + @property + def is_datelike(self): + """ return True if I am a non-datelike """ + return self.is_datetime or self.is_timedelta + + def is_categorical_astype(self, dtype): + """ + validate that we have a astypeable to categorical, + returns a boolean if we are a categorical + """ + if dtype is Categorical or dtype is CategoricalDtype: + # this is a pd.Categorical, but is not + # a valid type for astypeing + raise TypeError("invalid type {0} for astype".format(dtype)) + + elif is_categorical_dtype(dtype): + return True + + return False + + def external_values(self, dtype=None): + """ return an outside world format, currently just the ndarray """ + return self.values + + def internal_values(self, dtype=None): + """ return an internal format, currently just the ndarray + this should be the pure internal API format + """ + return self.values + + def formatting_values(self): + """Return the internal values used by the DataFrame/SeriesFormatter""" + return self.internal_values() + + def get_values(self, dtype=None): + """ + return an internal format, currently just the ndarray + this is often overridden to handle to_dense like operations + """ + if is_object_dtype(dtype): + return self.values.astype(object) + return self.values + + def to_dense(self): + return self.values.view() + + @property + def _na_value(self): + return np.nan + + @property + def fill_value(self): + return np.nan + + @property + def mgr_locs(self): + return self._mgr_locs + + @mgr_locs.setter + def mgr_locs(self, new_mgr_locs): + if not isinstance(new_mgr_locs, BlockPlacement): + new_mgr_locs = BlockPlacement(new_mgr_locs) + + self._mgr_locs = new_mgr_locs + + @property + def array_dtype(self): + """ the dtype to return if I want to construct this block as an + array + """ + return self.dtype + + def make_block(self, values, placement=None, ndim=None): + """ + Create a new block, with type inference propagate any values that are + not specified + """ + if placement is None: + placement = self.mgr_locs + if ndim is None: + ndim = self.ndim + + return make_block(values, placement=placement, ndim=ndim) + + def make_block_scalar(self, values): + """ + Create a ScalarBlock + """ + return ScalarBlock(values) + + def make_block_same_class(self, values, placement=None, ndim=None, + dtype=None): + """ Wrap given values in a block of same type as self. 
""" + if dtype is not None: + # issue 19431 fastparquet is passing this + warnings.warn("dtype argument is deprecated, will be removed " + "in a future release.", DeprecationWarning) + if placement is None: + placement = self.mgr_locs + return make_block(values, placement=placement, ndim=ndim, + klass=self.__class__, dtype=dtype) + + def __unicode__(self): + + # don't want to print out all of the items here + name = pprint_thing(self.__class__.__name__) + if self._is_single_block: + + result = '{name}: {len} dtype: {dtype}'.format( + name=name, len=len(self), dtype=self.dtype) + + else: + + shape = ' x '.join(pprint_thing(s) for s in self.shape) + result = '{name}: {index}, {shape}, dtype: {dtype}'.format( + name=name, index=pprint_thing(self.mgr_locs.indexer), + shape=shape, dtype=self.dtype) + + return result + + def __len__(self): + return len(self.values) + + def __getstate__(self): + return self.mgr_locs.indexer, self.values + + def __setstate__(self, state): + self.mgr_locs = BlockPlacement(state[0]) + self.values = state[1] + self.ndim = self.values.ndim + + def _slice(self, slicer): + """ return a slice of my values """ + return self.values[slicer] + + def reshape_nd(self, labels, shape, ref_items, mgr=None): + """ + Parameters + ---------- + labels : list of new axis labels + shape : new shape + ref_items : new ref_items + + return a new block that is transformed to a nd block + """ + return _block2d_to_blocknd(values=self.get_values().T, + placement=self.mgr_locs, shape=shape, + labels=labels, ref_items=ref_items) + + def getitem_block(self, slicer, new_mgr_locs=None): + """ + Perform __getitem__-like, return result as block. + + As of now, only supports slices that preserve dimensionality. + """ + if new_mgr_locs is None: + if isinstance(slicer, tuple): + axis0_slicer = slicer[0] + else: + axis0_slicer = slicer + new_mgr_locs = self.mgr_locs[axis0_slicer] + + new_values = self._slice(slicer) + + if self._validate_ndim and new_values.ndim != self.ndim: + raise ValueError("Only same dim slicing is allowed") + + return self.make_block_same_class(new_values, new_mgr_locs) + + @property + def shape(self): + return self.values.shape + + @property + def dtype(self): + return self.values.dtype + + @property + def ftype(self): + return "{dtype}:{ftype}".format(dtype=self.dtype, ftype=self._ftype) + + def merge(self, other): + return _merge_blocks([self, other]) + + def concat_same_type(self, to_concat, placement=None): + """ + Concatenate list of single blocks of the same type. + """ + values = self._concatenator([blk.values for blk in to_concat], + axis=self.ndim - 1) + return self.make_block_same_class( + values, placement=placement or slice(0, len(values), 1)) + + def iget(self, i): + return self.values[i] + + def set(self, locs, values, check=False): + """ + Modify Block in-place with new item value + + Returns + ------- + None + """ + self.values[locs] = values + + def delete(self, loc): + """ + Delete given loc(-s) from block in-place. + """ + self.values = np.delete(self.values, loc, 0) + self.mgr_locs = self.mgr_locs.delete(loc) + + def apply(self, func, mgr=None, **kwargs): + """ apply the function to my values; return a block if we are not + one + """ + with np.errstate(all='ignore'): + result = func(self.values, **kwargs) + if not isinstance(result, Block): + result = self.make_block(values=_block_shape(result, + ndim=self.ndim)) + + return result + + def fillna(self, value, limit=None, inplace=False, downcast=None, + mgr=None): + """ fillna on the block with the value. 
If we fail, then convert to + ObjectBlock and try again + """ + inplace = validate_bool_kwarg(inplace, 'inplace') + + if not self._can_hold_na: + if inplace: + return self + else: + return self.copy() + + mask = isna(self.values) + if limit is not None: + if not is_integer(limit): + raise ValueError('Limit must be an integer') + if limit < 1: + raise ValueError('Limit must be greater than 0') + if self.ndim > 2: + raise NotImplementedError("number of dimensions for 'fillna' " + "is currently limited to 2") + mask[mask.cumsum(self.ndim - 1) > limit] = False + + # fillna, but if we cannot coerce, then try again as an ObjectBlock + try: + values, _, _, _ = self._try_coerce_args(self.values, value) + blocks = self.putmask(mask, value, inplace=inplace) + blocks = [b.make_block(values=self._try_coerce_result(b.values)) + for b in blocks] + return self._maybe_downcast(blocks, downcast) + except (TypeError, ValueError): + + # we can't process the value, but nothing to do + if not mask.any(): + return self if inplace else self.copy() + + # operate column-by-column + def f(m, v, i): + block = self.coerce_to_target_dtype(value) + + # slice out our block + if i is not None: + block = block.getitem_block(slice(i, i + 1)) + return block.fillna(value, + limit=limit, + inplace=inplace, + downcast=None) + + return self.split_and_operate(mask, f, inplace) + + def split_and_operate(self, mask, f, inplace): + """ + split the block per-column, and apply the callable f + per-column, return a new block for each. Handle + masking which will not change a block unless needed. + + Parameters + ---------- + mask : 2-d boolean mask + f : callable accepting (1d-mask, 1d values, indexer) + inplace : boolean + + Returns + ------- + list of blocks + """ + + if mask is None: + mask = np.ones(self.shape, dtype=bool) + new_values = self.values + + def make_a_block(nv, ref_loc): + if isinstance(nv, Block): + block = nv + elif isinstance(nv, list): + block = nv[0] + else: + # Put back the dimension that was taken from it and make + # a block out of the result. 
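`split_and_operate` (its per-item loop follows below) is the generic fallback: visit one row of `values` at a time and only invoke the callable where that row's mask actually hits. Its control flow, reduced to a toy with a stand-in `f`:

import numpy as np

values = np.array([[1., np.nan], [3., 4.]])
mask = np.isnan(values)

def f(m, v, i):
    return np.where(m, 0.0, v)  # stand-in per-item operation

out = [f(mask[i], values[i], i) if mask[i].any() else values[i]
       for i in range(len(values))]
print(out)  # [array([1., 0.]), array([3., 4.])]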
+ try: + nv = _block_shape(nv, ndim=self.ndim) + except (AttributeError, NotImplementedError): + pass + block = self.make_block(values=nv, + placement=ref_loc) + return block + + # ndim == 1 + if self.ndim == 1: + if mask.any(): + nv = f(mask, new_values, None) + else: + nv = new_values if inplace else new_values.copy() + block = make_a_block(nv, self.mgr_locs) + return [block] + + # ndim > 1 + new_blocks = [] + for i, ref_loc in enumerate(self.mgr_locs): + m = mask[i] + v = new_values[i] + + # need a new block + if m.any(): + nv = f(m, v, i) + else: + nv = v if inplace else v.copy() + + block = make_a_block(nv, [ref_loc]) + new_blocks.append(block) + + return new_blocks + + def _maybe_downcast(self, blocks, downcast=None): + + # no need to downcast our float + # unless indicated + if downcast is None and self.is_float: + return blocks + elif downcast is None and (self.is_timedelta or self.is_datetime): + return blocks + + if not isinstance(blocks, list): + blocks = [blocks] + return _extend_blocks([b.downcast(downcast) for b in blocks]) + + def downcast(self, dtypes=None, mgr=None): + """ try to downcast each item to the dict of dtypes if present """ + + # turn it off completely + if dtypes is False: + return self + + values = self.values + + # single block handling + if self._is_single_block: + + # try to cast all non-floats here + if dtypes is None: + dtypes = 'infer' + + nv = maybe_downcast_to_dtype(values, dtypes) + return self.make_block(nv) + + # ndim > 1 + if dtypes is None: + return self + + if not (dtypes == 'infer' or isinstance(dtypes, dict)): + raise ValueError("downcast must have a dictionary or 'infer' as " + "its argument") + + # operate column-by-column + # this is expensive as it splits the blocks items-by-item + def f(m, v, i): + + if dtypes == 'infer': + dtype = 'infer' + else: + raise AssertionError("dtypes as dict is not supported yet") + + if dtype is not None: + v = maybe_downcast_to_dtype(v, dtype) + return v + + return self.split_and_operate(None, f, False) + + def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs): + return self._astype(dtype, copy=copy, errors=errors, values=values, + **kwargs) + + def _astype(self, dtype, copy=False, errors='raise', values=None, + klass=None, mgr=None, **kwargs): + """Coerce to the new type + + Parameters + ---------- + dtype : str, dtype convertible + copy : boolean, default False + copy if indicated + errors : str, {'raise', 'ignore'}, default 'ignore' + - ``raise`` : allow exceptions to be raised + - ``ignore`` : suppress exceptions. On error return original object + + Returns + ------- + Block + """ + errors_legal_values = ('raise', 'ignore') + + if errors not in errors_legal_values: + invalid_arg = ("Expected value of kwarg 'errors' to be one of {}. " + "Supplied value is '{}'".format( + list(errors_legal_values), errors)) + raise ValueError(invalid_arg) + + if (inspect.isclass(dtype) and + issubclass(dtype, (PandasExtensionDtype, ExtensionDtype))): + msg = ("Expected an instance of {}, but got the class instead. " + "Try instantiating 'dtype'.".format(dtype.__name__)) + raise TypeError(msg) + + # may need to convert to categorical + if self.is_categorical_astype(dtype): + + # deprecated 17636 + if ('categories' in kwargs or 'ordered' in kwargs): + if isinstance(dtype, CategoricalDtype): + raise TypeError( + "Cannot specify a CategoricalDtype and also " + "`categories` or `ordered`. 
Use " + "`dtype=CategoricalDtype(categories, ordered)`" + " instead.") + warnings.warn("specifying 'categories' or 'ordered' in " + ".astype() is deprecated; pass a " + "CategoricalDtype instead", + FutureWarning, stacklevel=7) + + categories = kwargs.get('categories', None) + ordered = kwargs.get('ordered', None) + if com._any_not_none(categories, ordered): + dtype = CategoricalDtype(categories, ordered) + + if is_categorical_dtype(self.values): + # GH 10696/18593: update an existing categorical efficiently + return self.make_block(self.values.astype(dtype, copy=copy)) + + return self.make_block(Categorical(self.values, dtype=dtype)) + + # astype processing + if not is_extension_array_dtype(dtype): + dtype = np.dtype(dtype) + if is_dtype_equal(self.dtype, dtype): + if copy: + return self.copy() + return self + + if klass is None: + if dtype == np.object_: + klass = ObjectBlock + try: + # force the copy here + if values is None: + + if issubclass(dtype.type, + (compat.text_type, compat.string_types)): + + # use native type formatting for datetime/tz/timedelta + if self.is_datelike: + values = self.to_native_types() + + # astype formatting + else: + values = self.get_values() + + else: + values = self.get_values(dtype=dtype) + + # _astype_nansafe works fine with 1-d only + values = astype_nansafe(values.ravel(), dtype, copy=True) + + # TODO(extension) + # should we make this attribute? + try: + values = values.reshape(self.shape) + except AttributeError: + pass + + newb = make_block(values, placement=self.mgr_locs, + klass=klass) + except: + if errors == 'raise': + raise + newb = self.copy() if copy else self + + if newb.is_numeric and self.is_numeric: + if newb.shape != self.shape: + raise TypeError( + "cannot set astype for copy = [{copy}] for dtype " + "({dtype} [{itemsize}]) with smaller itemsize than " + "current ({newb_dtype} [{newb_size}])".format( + copy=copy, dtype=self.dtype.name, + itemsize=self.itemsize, newb_dtype=newb.dtype.name, + newb_size=newb.itemsize)) + return newb + + def convert(self, copy=True, **kwargs): + """ attempt to coerce any object types to better types return a copy + of the block (if copy = True) by definition we are not an ObjectBlock + here! 
+ """ + + return self.copy() if copy else self + + def _can_hold_element(self, element): + """ require the same dtype as ourselves """ + dtype = self.values.dtype.type + tipo = maybe_infer_dtype_type(element) + if tipo is not None: + return issubclass(tipo.type, dtype) + return isinstance(element, dtype) + + def _try_cast_result(self, result, dtype=None): + """ try to cast the result to our original type, we may have + roundtripped thru object in the mean-time + """ + if dtype is None: + dtype = self.dtype + + if self.is_integer or self.is_bool or self.is_datetime: + pass + elif self.is_float and result.dtype == self.dtype: + + # protect against a bool/object showing up here + if isinstance(dtype, compat.string_types) and dtype == 'infer': + return result + if not isinstance(dtype, type): + dtype = dtype.type + if issubclass(dtype, (np.bool_, np.object_)): + if issubclass(dtype, np.bool_): + if isna(result).all(): + return result.astype(np.bool_) + else: + result = result.astype(np.object_) + result[result == 1] = True + result[result == 0] = False + return result + else: + return result.astype(np.object_) + + return result + + # may need to change the dtype here + return maybe_downcast_to_dtype(result, dtype) + + def _try_coerce_args(self, values, other): + """ provide coercion to our input arguments """ + + if np.any(notna(other)) and not self._can_hold_element(other): + # coercion issues + # let higher levels handle + raise TypeError("cannot convert {} to an {}".format( + type(other).__name__, + type(self).__name__.lower().replace('Block', ''))) + + return values, False, other, False + + def _try_coerce_result(self, result): + """ reverse of try_coerce_args """ + return result + + def _try_coerce_and_cast_result(self, result, dtype=None): + result = self._try_coerce_result(result) + result = self._try_cast_result(result, dtype=dtype) + return result + + def to_native_types(self, slicer=None, na_rep='nan', quoting=None, + **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.get_values() + + if slicer is not None: + values = values[:, slicer] + mask = isna(values) + + if not self.is_object and not quoting: + values = values.astype(str) + else: + values = np.array(values, dtype='object') + + values[mask] = na_rep + return values + + # block actions #### + def copy(self, deep=True, mgr=None): + """ copy constructor """ + values = self.values + if deep: + values = values.copy() + return self.make_block_same_class(values) + + def replace(self, to_replace, value, inplace=False, filter=None, + regex=False, convert=True, mgr=None): + """ replace the to_replace value with value, possible to create new + blocks here this is just a call to putmask. regex is not used here. + It is used in ObjectBlocks. It is here for API + compatibility. 
+ """ + + inplace = validate_bool_kwarg(inplace, 'inplace') + original_to_replace = to_replace + + # try to replace, if we raise an error, convert to ObjectBlock and + # retry + try: + values, _, to_replace, _ = self._try_coerce_args(self.values, + to_replace) + mask = missing.mask_missing(values, to_replace) + if filter is not None: + filtered_out = ~self.mgr_locs.isin(filter) + mask[filtered_out.nonzero()[0]] = False + + blocks = self.putmask(mask, value, inplace=inplace) + if convert: + blocks = [b.convert(by_item=True, numeric=False, + copy=not inplace) for b in blocks] + return blocks + except (TypeError, ValueError): + + # try again with a compatible block + block = self.astype(object) + return block.replace( + to_replace=original_to_replace, value=value, inplace=inplace, + filter=filter, regex=regex, convert=convert) + + def _replace_single(self, *args, **kwargs): + """ no-op on a non-ObjectBlock """ + return self if kwargs['inplace'] else self.copy() + + def setitem(self, indexer, value, mgr=None): + """Set the value inplace, returning a a maybe different typed block. + + Parameters + ---------- + indexer : tuple, list-like, array-like, slice + The subset of self.values to set + value : object + The value being set + mgr : BlockPlacement, optional + + Returns + ------- + Block + + Notes + ----- + `indexer` is a direct slice/positional indexer. `value` must + be a compatible shape. + """ + # coerce None values, if appropriate + if value is None: + if self.is_numeric: + value = np.nan + + # coerce if block dtype can store value + values = self.values + try: + values, _, value, _ = self._try_coerce_args(values, value) + # can keep its own dtype + if hasattr(value, 'dtype') and is_dtype_equal(values.dtype, + value.dtype): + dtype = self.dtype + else: + dtype = 'infer' + + except (TypeError, ValueError): + # current dtype cannot store value, coerce to common dtype + find_dtype = False + + if hasattr(value, 'dtype'): + dtype = value.dtype + find_dtype = True + + elif is_scalar(value): + if isna(value): + # NaN promotion is handled in latter path + dtype = False + else: + dtype, _ = infer_dtype_from_scalar(value, + pandas_dtype=True) + find_dtype = True + else: + dtype = 'infer' + + if find_dtype: + dtype = find_common_type([values.dtype, dtype]) + if not is_dtype_equal(self.dtype, dtype): + b = self.astype(dtype) + return b.setitem(indexer, value, mgr=mgr) + + # value must be storeable at this moment + arr_value = np.array(value) + + # cast the values to a type that can hold nan (if necessary) + if not self._can_hold_element(value): + dtype, _ = maybe_promote(arr_value.dtype) + values = values.astype(dtype) + + transf = (lambda x: x.T) if self.ndim == 2 else (lambda x: x) + values = transf(values) + + # length checking + check_setitem_lengths(indexer, value, values) + + def _is_scalar_indexer(indexer): + # return True if we are all scalar indexers + + if arr_value.ndim == 1: + if not isinstance(indexer, tuple): + indexer = tuple([indexer]) + return any(isinstance(idx, np.ndarray) and len(idx) == 0 + for idx in indexer) + return False + + def _is_empty_indexer(indexer): + # return a boolean if we have an empty indexer + + if is_list_like(indexer) and not len(indexer): + return True + if arr_value.ndim == 1: + if not isinstance(indexer, tuple): + indexer = tuple([indexer]) + return any(isinstance(idx, np.ndarray) and len(idx) == 0 + for idx in indexer) + return False + + # empty indexers + # 8669 (empty) + if _is_empty_indexer(indexer): + pass + + # setting a single element for each dim 
and with a rhs that could + # be say a list + # GH 6043 + elif _is_scalar_indexer(indexer): + values[indexer] = value + + # if we are an exact match (ex-broadcasting), + # then use the resultant dtype + elif (len(arr_value.shape) and + arr_value.shape[0] == values.shape[0] and + np.prod(arr_value.shape) == np.prod(values.shape)): + values[indexer] = value + try: + values = values.astype(arr_value.dtype) + except ValueError: + pass + + # set + else: + values[indexer] = value + + # coerce and try to infer the dtypes of the result + values = self._try_coerce_and_cast_result(values, dtype) + block = self.make_block(transf(values)) + return block + + def putmask(self, mask, new, align=True, inplace=False, axis=0, + transpose=False, mgr=None): + """ putmask the data to the block; it is possible that we may create a + new dtype of block + + return the resulting block(s) + + Parameters + ---------- + mask : the condition to respect + new : a ndarray/object + align : boolean, perform alignment on other/cond, default is True + inplace : perform inplace modification, default is False + axis : int + transpose : boolean + Set to True if self is stored with axes reversed + + Returns + ------- + a list of new blocks, the result of the putmask + """ + + new_values = self.values if inplace else self.values.copy() + + new = getattr(new, 'values', new) + mask = getattr(mask, 'values', mask) + + # if we are passed a scalar None, convert it here + if not is_list_like(new) and isna(new) and not self.is_object: + new = self.fill_value + + if self._can_hold_element(new): + _, _, new, _ = self._try_coerce_args(new_values, new) + + if transpose: + new_values = new_values.T + + # If the default repeat behavior in np.putmask would go in the + # wrong direction, then explicitly repeat and reshape new instead + if getattr(new, 'ndim', 0) >= 1: + if self.ndim - 1 == new.ndim and axis == 1: + new = np.repeat( + new, new_values.shape[-1]).reshape(self.shape) + new = new.astype(new_values.dtype) + + # we require exact matches between the len of the + # values we are setting (or is compat). np.putmask + # doesn't check this and will simply truncate / pad + # the output, but we want sane error messages + # + # TODO: this prob needs some better checking + # for 2D cases + if ((is_list_like(new) and + np.any(mask[mask]) and + getattr(new, 'ndim', 1) == 1)): + + if not (mask.shape[-1] == len(new) or + mask[mask].shape[-1] == len(new) or + len(new) == 1): + raise ValueError("cannot assign mismatch " + "length to masked array") + + np.putmask(new_values, mask, new) + + # maybe upcast me + elif mask.any(): + if transpose: + mask = mask.T + if isinstance(new, np.ndarray): + new = new.T + axis = new_values.ndim - axis - 1 + + # Pseudo-broadcast + if getattr(new, 'ndim', 0) >= 1: + if self.ndim - 1 == new.ndim: + new_shape = list(new.shape) + new_shape.insert(axis, 1) + new = new.reshape(tuple(new_shape)) + + # operate column-by-column + def f(m, v, i): + + if i is None: + # ndim==1 case. 
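# For context on the length check above: np.putmask does not validate the
# length of `new` against the mask; it recycles values instead. A minimal
# sketch of the numpy behavior being guarded against:
#   >>> import numpy as np
#   >>> a = np.array([1, 2, 3, 4])
#   >>> np.putmask(a, a > 2, [10, 20])
#   >>> a
#   array([ 1,  2, 10, 20])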
+ n = new + else: + + if isinstance(new, np.ndarray): + n = np.squeeze(new[i % new.shape[0]]) + else: + n = np.array(new) + + # type of the new block + dtype, _ = maybe_promote(n.dtype) + + # we need to explicitly astype here to make a copy + n = n.astype(dtype) + + nv = _putmask_smart(v, m, n) + return nv + + new_blocks = self.split_and_operate(mask, f, inplace) + return new_blocks + + if inplace: + return [self] + + if transpose: + new_values = new_values.T + + return [self.make_block(new_values)] + + def coerce_to_target_dtype(self, other): + """ + coerce the current block to a dtype compat for other + we will return a block, possibly object, and not raise + + we can also safely try to coerce to the same dtype + and will receive the same block + """ + + # if we cannot then coerce to object + dtype, _ = infer_dtype_from(other, pandas_dtype=True) + + if is_dtype_equal(self.dtype, dtype): + return self + + if self.is_bool or is_object_dtype(dtype) or is_bool_dtype(dtype): + # we don't upcast to bool + return self.astype(object) + + elif ((self.is_float or self.is_complex) and + (is_integer_dtype(dtype) or is_float_dtype(dtype))): + # don't coerce float/complex to int + return self + + elif (self.is_datetime or + is_datetime64_dtype(dtype) or + is_datetime64tz_dtype(dtype)): + + # not a datetime + if not ((is_datetime64_dtype(dtype) or + is_datetime64tz_dtype(dtype)) and self.is_datetime): + return self.astype(object) + + # don't upcast timezone with different timezone or no timezone + mytz = getattr(self.dtype, 'tz', None) + othertz = getattr(dtype, 'tz', None) + + if str(mytz) != str(othertz): + return self.astype(object) + + raise AssertionError("possible recursion in " + "coerce_to_target_dtype: {} {}".format( + self, other)) + + elif (self.is_timedelta or is_timedelta64_dtype(dtype)): + + # not a timedelta + if not (is_timedelta64_dtype(dtype) and self.is_timedelta): + return self.astype(object) + + raise AssertionError("possible recursion in " + "coerce_to_target_dtype: {} {}".format( + self, other)) + + try: + return self.astype(dtype) + except (ValueError, TypeError): + pass + + return self.astype(object) + + def interpolate(self, method='pad', axis=0, index=None, values=None, + inplace=False, limit=None, limit_direction='forward', + limit_area=None, fill_value=None, coerce=False, + downcast=None, mgr=None, **kwargs): + + inplace = validate_bool_kwarg(inplace, 'inplace') + + def check_int_bool(self, inplace): + # Only FloatBlocks will contain NaNs. 
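# A user-level sketch of coerce_to_target_dtype above (assuming pandas as of
# this patch): a float block combined with a string `other` falls back to
# object rather than raising:
#   >>> import pandas as pd
#   >>> s = pd.Series([1.0, 2.0])
#   >>> s.where(s > 1.0, "low").dtype
#   dtype('O')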
+ # timedelta subclasses IntBlock + if (self.is_bool or self.is_integer) and not self.is_timedelta: + if inplace: + return self + else: + return self.copy() + + # a fill na type method + try: + m = missing.clean_fill_method(method) + except: + m = None + + if m is not None: + r = check_int_bool(self, inplace) + if r is not None: + return r + return self._interpolate_with_fill(method=m, axis=axis, + inplace=inplace, limit=limit, + fill_value=fill_value, + coerce=coerce, + downcast=downcast, mgr=mgr) + # try an interp method + try: + m = missing.clean_interp_method(method, **kwargs) + except: + m = None + + if m is not None: + r = check_int_bool(self, inplace) + if r is not None: + return r + return self._interpolate(method=m, index=index, values=values, + axis=axis, limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + fill_value=fill_value, inplace=inplace, + downcast=downcast, mgr=mgr, **kwargs) + + raise ValueError("invalid method '{0}' to interpolate.".format(method)) + + def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, + limit=None, fill_value=None, coerce=False, + downcast=None, mgr=None): + """ fillna but using the interpolate machinery """ + + inplace = validate_bool_kwarg(inplace, 'inplace') + + # if we are coercing, then don't force the conversion + # if the block can't hold the type + if coerce: + if not self._can_hold_na: + if inplace: + return [self] + else: + return [self.copy()] + + values = self.values if inplace else self.values.copy() + values, _, fill_value, _ = self._try_coerce_args(values, fill_value) + values = missing.interpolate_2d(values, method=method, axis=axis, + limit=limit, fill_value=fill_value, + dtype=self.dtype) + values = self._try_coerce_result(values) + + blocks = [self.make_block_same_class(values, ndim=self.ndim)] + return self._maybe_downcast(blocks, downcast) + + def _interpolate(self, method=None, index=None, values=None, + fill_value=None, axis=0, limit=None, + limit_direction='forward', limit_area=None, + inplace=False, downcast=None, mgr=None, **kwargs): + """ interpolate using scipy wrappers """ + + inplace = validate_bool_kwarg(inplace, 'inplace') + data = self.values if inplace else self.values.copy() + + # only deal with floats + if not self.is_float: + if not self.is_integer: + return self + data = data.astype(np.float64) + + if fill_value is None: + fill_value = self.fill_value + + if method in ('krogh', 'piecewise_polynomial', 'pchip'): + if not index.is_monotonic: + raise ValueError("{0} interpolation requires that the " + "index be monotonic.".format(method)) + # process 1-d slices in the axis direction + + def func(x): + + # process a 1-d slice, returning it + # should the axis argument be handled below in apply_along_axis? + # i.e. 
not an arg to missing.interpolate_1d
+            return missing.interpolate_1d(index, x, method=method, limit=limit,
+                                          limit_direction=limit_direction,
+                                          limit_area=limit_area,
+                                          fill_value=fill_value,
+                                          bounds_error=False, **kwargs)
+
+        # interp each column independently
+        interp_values = np.apply_along_axis(func, axis, data)
+
+        blocks = [self.make_block_same_class(interp_values)]
+        return self._maybe_downcast(blocks, downcast)
+
+    def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None):
+        """
+        Take values according to indexer and return them as a block.
+        """
+
+        # algos.take_nd dispatches for DatetimeTZBlock, CategoricalBlock
+        # so need to preserve types
+        # sparse is treated like an ndarray, but needs .get_values() shaping
+
+        values = self.values
+        if self.is_sparse:
+            values = self.get_values()
+
+        if fill_tuple is None:
+            fill_value = self.fill_value
+            new_values = algos.take_nd(values, indexer, axis=axis,
+                                       allow_fill=False)
+        else:
+            fill_value = fill_tuple[0]
+            new_values = algos.take_nd(values, indexer, axis=axis,
+                                       allow_fill=True, fill_value=fill_value)
+
+        if new_mgr_locs is None:
+            if axis == 0:
+                slc = libinternals.indexer_as_slice(indexer)
+                if slc is not None:
+                    new_mgr_locs = self.mgr_locs[slc]
+                else:
+                    new_mgr_locs = self.mgr_locs[indexer]
+            else:
+                new_mgr_locs = self.mgr_locs
+
+        if not is_dtype_equal(new_values.dtype, self.dtype):
+            return self.make_block(new_values, new_mgr_locs)
+        else:
+            return self.make_block_same_class(new_values, new_mgr_locs)
+
+    def diff(self, n, axis=1, mgr=None):
+        """ return block for the diff of the values """
+        new_values = algos.diff(self.values, n, axis=axis)
+        return [self.make_block(values=new_values)]
+
+    def shift(self, periods, axis=0, mgr=None):
+        """ shift the block by periods, possibly upcast """
+
+        # convert integer to float if necessary. need to do a lot more than
+        # that, handle boolean etc also
+        new_values, fill_value = maybe_upcast(self.values)
+
+        # make sure array sent to np.roll is c_contiguous
+        f_ordered = new_values.flags.f_contiguous
+        if f_ordered:
+            new_values = new_values.T
+            axis = new_values.ndim - axis - 1
+
+        if np.prod(new_values.shape):
+            new_values = np.roll(new_values, _ensure_platform_int(periods),
+                                 axis=axis)
+
+        axis_indexer = [slice(None)] * self.ndim
+        if periods > 0:
+            axis_indexer[axis] = slice(None, periods)
+        else:
+            axis_indexer[axis] = slice(periods, None)
+        new_values[tuple(axis_indexer)] = fill_value
+
+        # restore original order
+        if f_ordered:
+            new_values = new_values.T
+
+        return [self.make_block(new_values)]
+
+    def eval(self, func, other, errors='raise', try_cast=False, mgr=None):
+        """
+        evaluate the block; return result block from the result
+
+        Parameters
+        ----------
+        func : how to combine self, other
+        other : a ndarray/object
+        errors : str, {'raise', 'ignore'}, default 'raise'
+            - ``raise`` : allow exceptions to be raised
+            - ``ignore`` : suppress exceptions. On error return original object
+
+        try_cast : try casting the results to the input type
+
+        Returns
+        -------
+        a new block, the result of the func
+        """
+        orig_other = other
+        values = self.values
+
+        other = getattr(other, 'values', other)
+
+        # make sure that we can broadcast
+        is_transposed = False
+        if hasattr(other, 'ndim') and hasattr(values, 'ndim'):
+            if values.ndim != other.ndim:
+                is_transposed = True
+            else:
+                if values.shape == other.shape[::-1]:
+                    is_transposed = True
+                elif values.shape[0] == other.shape[-1]:
+                    is_transposed = True
+                else:
+                    # this is a broadcast error here
+                    raise ValueError(
+                        "cannot broadcast shape [{t_shape}] with "
+                        "block values [{oth_shape}]".format(
+                            t_shape=values.T.shape, oth_shape=other.shape))
+
+        transf = (lambda x: x.T) if is_transposed else (lambda x: x)
+
+        # coerce/transpose the args if needed
+        try:
+            values, values_mask, other, other_mask = self._try_coerce_args(
+                transf(values), other)
+        except TypeError:
+            block = self.coerce_to_target_dtype(orig_other)
+            return block.eval(func, orig_other,
+                              errors=errors,
+                              try_cast=try_cast, mgr=mgr)
+
+        # get the result, may need to transpose the other
+        def get_result(other):
+
+            # avoid numpy warning of comparisons against None
+            if other is None:
+                result = not func.__name__ == 'eq'
+
+            # avoid numpy warning of elementwise comparisons to object
+            elif is_numeric_v_string_like(values, other):
+                result = False
+
+            # avoid numpy warning of elementwise comparisons
+            elif func.__name__ == 'eq':
+                if is_list_like(other) and not isinstance(other, np.ndarray):
+                    other = np.asarray(other)
+
+                    # if we can broadcast, then ok
+                    if values.shape[-1] != other.shape[-1]:
+                        return False
+                result = func(values, other)
+            else:
+                result = func(values, other)
+
+            # mask if needed
+            if isinstance(values_mask, np.ndarray) and values_mask.any():
+                result = result.astype('float64', copy=False)
+                result[values_mask] = np.nan
+            if other_mask is True:
+                result = result.astype('float64', copy=False)
+                result[:] = np.nan
+            elif isinstance(other_mask, np.ndarray) and other_mask.any():
+                result = result.astype('float64', copy=False)
+                result[other_mask.ravel()] = np.nan
+
+            return result
+
+        # error handler if we have an issue operating with the function
+        def handle_error():
+
+            if errors == 'raise':
+                # The 'detail' variable is defined in outer scope.
+                raise TypeError(
+                    'Could not operate {other!r} with block values '
+                    '{detail!s}'.format(other=other, detail=detail))  # noqa
+            else:
+                # return the values
+                result = np.empty(values.shape, dtype='O')
+                result.fill(np.nan)
+                return result
+
+        # get the result
+        try:
+            with np.errstate(all='ignore'):
+                result = get_result(other)
+
+        # if we have an invalid shape/broadcast error
+        # GH4576, so raise instead of allowing to pass through
+        except ValueError as detail:
+            raise
+        except Exception as detail:
+            result = handle_error()
+
+        # technically a broadcast error in numpy can 'work' by returning a
+        # boolean False
+        if not isinstance(result, np.ndarray):
+
+            # differentiate between an invalid ndarray-ndarray comparison
+            # and an invalid type comparison
+            if isinstance(values, np.ndarray) and is_list_like(other):
+                raise ValueError(
+                    'Invalid broadcasting comparison [{other!r}] with '
+                    'block values'.format(other=other))
+
+            raise TypeError('Could not compare [{other!r}] '
+                            'with block values'.format(other=other))
+
+        # transpose if needed
+        result = transf(result)
+
+        # try to cast if requested
+        if try_cast:
+            result = self._try_cast_result(result)
+
+        result = _block_shape(result, ndim=self.ndim)
+        return [self.make_block(result)]
+
+    def where(self, other, cond, align=True, errors='raise',
+              try_cast=False, axis=0, transpose=False, mgr=None):
+        """
+        evaluate the block; return result block(s) from the result
+
+        Parameters
+        ----------
+        other : a ndarray/object
+        cond : the condition to respect
+        align : boolean, perform alignment on other/cond
+        errors : str, {'raise', 'ignore'}, default 'raise'
+            - ``raise`` : allow exceptions to be raised
+            - ``ignore`` : suppress exceptions. On error return original object
+
+        axis : int
+        transpose : boolean
+            Set to True if self is stored with axes reversed
+
+        Returns
+        -------
+        a new block(s), the result of the func
+        """
+        import pandas.core.computation.expressions as expressions
+        assert errors in ['raise', 'ignore']
+
+        values = self.values
+        orig_other = other
+        if transpose:
+            values = values.T
+
+        other = getattr(other, '_values', getattr(other, 'values', other))
+        cond = getattr(cond, 'values', cond)
+
+        # If the default broadcasting would go in the wrong direction, then
+        # explicitly reshape other instead
+        if getattr(other, 'ndim', 0) >= 1:
+            if values.ndim - 1 == other.ndim and axis == 1:
+                other = other.reshape(tuple(other.shape + (1, )))
+        elif transpose and values.ndim == self.ndim - 1:
+            cond = cond.T
+
+        if not hasattr(cond, 'shape'):
+            raise ValueError("where must have a condition that is ndarray "
+                             "like")
+
+        # our where function
+        def func(cond, values, other):
+            if cond.ravel().all():
+                return values
+
+            values, values_mask, other, other_mask = self._try_coerce_args(
+                values, other)
+
+            try:
+                return self._try_coerce_result(expressions.where(
+                    cond, values, other))
+            except Exception as detail:
+                if errors == 'raise':
+                    raise TypeError(
+                        'Could not operate [{other!r}] with block values '
+                        '[{detail!s}]'.format(other=other, detail=detail))
+                else:
+                    # return the values
+                    result = np.empty(values.shape, dtype='float64')
+                    result.fill(np.nan)
+                    return result
+
+        # see if we can operate on the entire block, or need item-by-item
+        # or if we are a single block (ndim == 1)
+        try:
+            result = func(cond, values, other)
+        except TypeError:
+
+            # we cannot coerce, return a compat dtype
+            # we are explicitly ignoring errors
+            block = self.coerce_to_target_dtype(other)
+            blocks = block.where(orig_other, cond, align=align,
+                                 errors=errors,
+                                 try_cast=try_cast, axis=axis,
+                                 transpose=transpose)
+            return self._maybe_downcast(blocks, 'infer')
+
+        if self._can_hold_na or self.ndim == 1:
+
+            if transpose:
+                result = result.T
+
+            # try to cast if requested
+            if try_cast:
+                result = self._try_cast_result(result)
+
+            return self.make_block(result)
+
+        # might need to separate out blocks
+        axis = cond.ndim - 1
+        cond = cond.swapaxes(axis, 0)
+        mask = np.array([cond[i].all() for i in range(cond.shape[0])],
+                        dtype=bool)
+
+        result_blocks = []
+        for m in [mask, ~mask]:
+            if m.any():
+                r = self._try_cast_result(result.take(m.nonzero()[0],
+                                                      axis=axis))
+                result_blocks.append(
+                    self.make_block(r.T, placement=self.mgr_locs[m]))
+
+        return result_blocks
+
+    def equals(self, other):
+        if self.dtype != other.dtype or self.shape != other.shape:
+            return False
+        return array_equivalent(self.values, other.values)
+
+    def _unstack(self, unstacker_func, new_columns):
+        """Return a list of unstacked blocks of self
+
+        Parameters
+        ----------
+        unstacker_func : callable
+            Partially applied unstacker.
+        new_columns : Index
+            All columns of the unstacked BlockManager.
+
+        Returns
+        -------
+        blocks : list of Block
+            New blocks of unstacked values.
+        mask : array_like of bool
+            The mask of columns of `blocks` we should keep.
+        """
+        unstacker = unstacker_func(self.values.T)
+        new_items = unstacker.get_new_columns()
+        new_placement = new_columns.get_indexer(new_items)
+        new_values, mask = unstacker.get_new_values()
+
+        mask = mask.any(0)
+        new_values = new_values.T[mask]
+        new_placement = new_placement[mask]
+
+        blocks = [make_block(new_values, placement=new_placement)]
+        return blocks, mask
+
+    def quantile(self, qs, interpolation='linear', axis=0, mgr=None):
+        """
+        compute the quantiles of the block
+
+        Parameters
+        ----------
+        qs: a scalar or list of the quantiles to be computed
+        interpolation: type of interpolation, default 'linear'
+        axis: axis to compute, default 0
+
+        Returns
+        -------
+        tuple of (axis, block)
+
+        """
+        kw = {'interpolation': interpolation}
+        values = self.get_values()
+        values, _, _, _ = self._try_coerce_args(values, values)
+
+        def _nanpercentile1D(values, mask, q, **kw):
+            values = values[~mask]
+
+            if len(values) == 0:
+                if is_scalar(q):
+                    return self._na_value
+                else:
+                    return np.array([self._na_value] * len(q),
+                                    dtype=values.dtype)
+
+            return np.percentile(values, q, **kw)
+
+        def _nanpercentile(values, q, axis, **kw):
+
+            mask = isna(self.values)
+            if not is_scalar(mask) and mask.any():
+                if self.ndim == 1:
+                    return _nanpercentile1D(values, mask, q, **kw)
+                else:
+                    # for nonconsolidatable blocks mask is 1D, but values 2D
+                    if mask.ndim < values.ndim:
+                        mask = mask.reshape(values.shape)
+                    if axis == 0:
+                        values = values.T
+                        mask = mask.T
+                    result = [_nanpercentile1D(val, m, q, **kw) for (val, m)
+                              in zip(list(values), list(mask))]
+                    result = np.array(result, dtype=values.dtype, copy=False).T
+                    return result
+            else:
+                return np.percentile(values, q, axis=axis, **kw)
+
+        from pandas import Float64Index
+        is_empty = values.shape[axis] == 0
+        if is_list_like(qs):
+            ax = Float64Index(qs)
+
+            if is_empty:
+                if self.ndim == 1:
+                    result = self._na_value
+                else:
+                    # create the array of na_values
+                    # 2d len(values) * len(qs)
+                    result = np.repeat(np.array([self._na_value] * len(qs)),
+                                       len(values)).reshape(len(values),
+                                                            len(qs))
+            else:
+
+                try:
+                    result = _nanpercentile(values, np.array(qs) * 100,
+                                            axis=axis, **kw)
+                except ValueError:
+
+                    # older numpies don't handle an
array for q + result = [_nanpercentile(values, q * 100, + axis=axis, **kw) for q in qs] + + result = np.array(result, copy=False) + if self.ndim > 1: + result = result.T + + else: + + if self.ndim == 1: + ax = Float64Index([qs]) + else: + ax = mgr.axes[0] + + if is_empty: + if self.ndim == 1: + result = self._na_value + else: + result = np.array([self._na_value] * len(self)) + else: + result = _nanpercentile(values, qs * 100, axis=axis, **kw) + + ndim = getattr(result, 'ndim', None) or 0 + result = self._try_coerce_result(result) + if is_scalar(result): + return ax, self.make_block_scalar(result) + return ax, make_block(result, + placement=np.arange(len(result)), + ndim=ndim) + + +class ScalarBlock(Block): + """ + a scalar compat Block + """ + __slots__ = ['_mgr_locs', 'values', 'ndim'] + + def __init__(self, values): + self.ndim = 0 + self.mgr_locs = [0] + self.values = values + + @property + def dtype(self): + return type(self.values) + + @property + def shape(self): + return tuple([0]) + + def __len__(self): + return 0 + + +class NonConsolidatableMixIn(object): + """ hold methods for the nonconsolidatable blocks """ + _can_consolidate = False + _verify_integrity = False + _validate_ndim = False + + def __init__(self, values, placement, ndim=None): + """Initialize a non-consolidatable block. + + 'ndim' may be inferred from 'placement'. + + This will call continue to call __init__ for the other base + classes mixed in with this Mixin. + """ + # Placement must be converted to BlockPlacement so that we can check + # its length + if not isinstance(placement, BlockPlacement): + placement = BlockPlacement(placement) + + # Maybe infer ndim from placement + if ndim is None: + if len(placement) != 1: + ndim = 1 + else: + ndim = 2 + super(NonConsolidatableMixIn, self).__init__(values, placement, + ndim=ndim) + + @property + def shape(self): + if self.ndim == 1: + return (len(self.values)), + return (len(self.mgr_locs), len(self.values)) + + def get_values(self, dtype=None): + """ need to to_dense myself (and always return a ndim sized object) """ + values = self.values.to_dense() + if values.ndim == self.ndim - 1: + values = values.reshape((1,) + values.shape) + return values + + def iget(self, col): + + if self.ndim == 2 and isinstance(col, tuple): + col, loc = col + if not com.is_null_slice(col) and col != 0: + raise IndexError("{0} only contains one item".format(self)) + return self.values[loc] + else: + if col != 0: + raise IndexError("{0} only contains one item".format(self)) + return self.values + + def should_store(self, value): + return isinstance(value, self._holder) + + def set(self, locs, values, check=False): + assert locs.tolist() == [0] + self.values = values + + def putmask(self, mask, new, align=True, inplace=False, axis=0, + transpose=False, mgr=None): + """ + putmask the data to the block; we must be a single block and not + generate other blocks + + return the resulting block + + Parameters + ---------- + mask : the condition to respect + new : a ndarray/object + align : boolean, perform alignment on other/cond, default is True + inplace : perform inplace modification, default is False + + Returns + ------- + a new block, the result of the putmask + """ + inplace = validate_bool_kwarg(inplace, 'inplace') + + # use block's copy logic. 
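# A user-level sketch of the quantile paths above (assuming pandas as of
# this patch): scalar vs list-like qs select the two branches:
#   >>> import pandas as pd
#   >>> s = pd.Series([1.0, 2.0, 3.0, 4.0])
#   >>> s.quantile(0.5)             # scalar q -> scalar result
#   2.5
#   >>> s.quantile([0.25, 0.75])    # list-like q -> axis is Float64Index([0.25, 0.75])
#   0.25    1.75
#   0.75    3.25
#   dtype: float64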
+ # .values may be an Index which does shallow copy by default + new_values = self.values if inplace else self.copy().values + new_values, _, new, _ = self._try_coerce_args(new_values, new) + + if isinstance(new, np.ndarray) and len(new) == len(mask): + new = new[mask] + + mask = _safe_reshape(mask, new_values.shape) + + new_values[mask] = new + new_values = self._try_coerce_result(new_values) + return [self.make_block(values=new_values)] + + def _slice(self, slicer): + """ return a slice of my values (but densify first) """ + return self.get_values()[slicer] + + def _try_cast_result(self, result, dtype=None): + return result + + def _unstack(self, unstacker_func, new_columns): + """Return a list of unstacked blocks of self + + Parameters + ---------- + unstacker_func : callable + Partially applied unstacker. + new_columns : Index + All columns of the unstacked BlockManager. + + Returns + ------- + blocks : list of Block + New blocks of unstacked values. + mask : array_like of bool + The mask of columns of `blocks` we should keep. + """ + # NonConsolidatable blocks can have a single item only, so we return + # one block per item + unstacker = unstacker_func(self.values.T) + new_items = unstacker.get_new_columns() + new_placement = new_columns.get_indexer(new_items) + new_values, mask = unstacker.get_new_values() + + mask = mask.any(0) + new_values = new_values.T[mask] + new_placement = new_placement[mask] + + blocks = [self.make_block_same_class(vals, [place]) + for vals, place in zip(new_values, new_placement)] + return blocks, mask + + +class ExtensionBlock(NonConsolidatableMixIn, Block): + """Block for holding extension types. + + Notes + ----- + This holds all 3rd-party extension array types. It's also the immediate + parent class for our internal extension types' blocks, CategoricalBlock. + + ExtensionArrays are limited to 1-D. + """ + is_extension = True + + def __init__(self, values, placement, ndim=None): + values = self._maybe_coerce_values(values) + super(ExtensionBlock, self).__init__(values, placement, ndim) + + def _maybe_coerce_values(self, values): + """Unbox to an extension array. + + This will unbox an ExtensionArray stored in an Index or Series. + ExtensionArrays pass through. No dtype coercion is done. + + Parameters + ---------- + values : Index, Series, ExtensionArray + + Returns + ------- + ExtensionArray + """ + if isinstance(values, (ABCIndexClass, ABCSeries)): + values = values._values + return values + + @property + def _holder(self): + # For extension blocks, the holder is values-dependent. + return type(self.values) + + @property + def fill_value(self): + # Used in reindex_indexer + return self.values.dtype.na_value + + @property + def _can_hold_na(self): + # The default ExtensionArray._can_hold_na is True + return self._holder._can_hold_na + + @property + def is_view(self): + """Extension arrays are never treated as views.""" + return False + + def setitem(self, indexer, value, mgr=None): + """Set the value inplace, returning a same-typed block. + + This differs from Block.setitem by not allowing setitem to change + the dtype of the Block. + + Parameters + ---------- + indexer : tuple, list-like, array-like, slice + The subset of self.values to set + value : object + The value being set + mgr : BlockPlacement, optional + + Returns + ------- + Block + + Notes + ----- + `indexer` is a direct slice/positional indexer. `value` must + be a compatible shape. 
+ """ + if isinstance(indexer, tuple): + # we are always 1-D + indexer = indexer[0] + + check_setitem_lengths(indexer, value, self.values) + self.values[indexer] = value + return self + + def get_values(self, dtype=None): + # ExtensionArrays must be iterable, so this works. + values = np.asarray(self.values) + if values.ndim == self.ndim - 1: + values = values.reshape((1,) + values.shape) + return values + + def to_dense(self): + return np.asarray(self.values) + + def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): + """ + Take values according to indexer and return them as a block. + """ + if fill_tuple is None: + fill_value = None + else: + fill_value = fill_tuple[0] + + # axis doesn't matter; we are really a single-dim object + # but are passed the axis depending on the calling routing + # if its REALLY axis 0, then this will be a reindex and not a take + new_values = self.values.take(indexer, fill_value=fill_value, + allow_fill=True) + + # if we are a 1-dim object, then always place at 0 + if self.ndim == 1: + new_mgr_locs = [0] + else: + if new_mgr_locs is None: + new_mgr_locs = self.mgr_locs + + return self.make_block_same_class(new_values, new_mgr_locs) + + def _can_hold_element(self, element): + # XXX: We may need to think about pushing this onto the array. + # We're doing the same as CategoricalBlock here. + return True + + def _slice(self, slicer): + """ return a slice of my values """ + + # slice the category + # return same dims as we currently have + + if isinstance(slicer, tuple) and len(slicer) == 2: + if not com.is_null_slice(slicer[0]): + raise AssertionError("invalid slicing for a 1-ndim " + "categorical") + slicer = slicer[1] + + return self.values[slicer] + + def formatting_values(self): + return self.values._formatting_values() + + def concat_same_type(self, to_concat, placement=None): + """ + Concatenate list of single blocks of the same type. 
+ """ + values = self._holder._concat_same_type( + [blk.values for blk in to_concat]) + placement = placement or slice(0, len(values), 1) + return self.make_block_same_class(values, ndim=self.ndim, + placement=placement) + + def fillna(self, value, limit=None, inplace=False, downcast=None, + mgr=None): + values = self.values if inplace else self.values.copy() + values = values.fillna(value=value, limit=limit) + return [self.make_block_same_class(values=values, + placement=self.mgr_locs, + ndim=self.ndim)] + + def interpolate(self, method='pad', axis=0, inplace=False, limit=None, + fill_value=None, **kwargs): + + values = self.values if inplace else self.values.copy() + return self.make_block_same_class( + values=values.fillna(value=fill_value, method=method, + limit=limit), + placement=self.mgr_locs) + + +class NumericBlock(Block): + __slots__ = () + is_numeric = True + _can_hold_na = True + + +class FloatOrComplexBlock(NumericBlock): + __slots__ = () + + def equals(self, other): + if self.dtype != other.dtype or self.shape != other.shape: + return False + left, right = self.values, other.values + return ((left == right) | (np.isnan(left) & np.isnan(right))).all() + + +class FloatBlock(FloatOrComplexBlock): + __slots__ = () + is_float = True + + def _can_hold_element(self, element): + tipo = maybe_infer_dtype_type(element) + if tipo is not None: + return (issubclass(tipo.type, (np.floating, np.integer)) and + not issubclass(tipo.type, (np.datetime64, np.timedelta64))) + return ( + isinstance( + element, (float, int, np.floating, np.int_, compat.long)) + and not isinstance(element, (bool, np.bool_, datetime, timedelta, + np.datetime64, np.timedelta64))) + + def to_native_types(self, slicer=None, na_rep='', float_format=None, + decimal='.', quoting=None, **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + values = values[:, slicer] + + # see gh-13418: no special formatting is desired at the + # output (important for appropriate 'quoting' behaviour), + # so do not pass it through the FloatArrayFormatter + if float_format is None and decimal == '.': + mask = isna(values) + + if not quoting: + values = values.astype(str) + else: + values = np.array(values, dtype='object') + + values[mask] = na_rep + return values + + from pandas.io.formats.format import FloatArrayFormatter + formatter = FloatArrayFormatter(values, na_rep=na_rep, + float_format=float_format, + decimal=decimal, quoting=quoting, + fixed_width=False) + return formatter.get_result_as_array() + + def should_store(self, value): + # when inserting a column should not coerce integers to floats + # unnecessarily + return (issubclass(value.dtype.type, np.floating) and + value.dtype == self.dtype) + + +class ComplexBlock(FloatOrComplexBlock): + __slots__ = () + is_complex = True + + def _can_hold_element(self, element): + tipo = maybe_infer_dtype_type(element) + if tipo is not None: + return issubclass(tipo.type, + (np.floating, np.integer, np.complexfloating)) + return ( + isinstance( + element, + (float, int, complex, np.float_, np.int_, compat.long)) + and not isinstance(element, (bool, np.bool_))) + + def should_store(self, value): + return issubclass(value.dtype.type, np.complexfloating) + + +class IntBlock(NumericBlock): + __slots__ = () + is_integer = True + _can_hold_na = False + + def _can_hold_element(self, element): + tipo = maybe_infer_dtype_type(element) + if tipo is not None: + return (issubclass(tipo.type, np.integer) and + not issubclass(tipo.type, 
(np.datetime64, + np.timedelta64)) and + self.dtype.itemsize >= tipo.itemsize) + return is_integer(element) + + def should_store(self, value): + return is_integer_dtype(value) and value.dtype == self.dtype + + +class DatetimeLikeBlockMixin(object): + """Mixin class for DatetimeBlock and DatetimeTZBlock.""" + + @property + def _holder(self): + return DatetimeIndex + + @property + def _na_value(self): + return tslibs.NaT + + @property + def fill_value(self): + return tslibs.iNaT + + def get_values(self, dtype=None): + """ + return object dtype as boxed values, such as Timestamps/Timedelta + """ + if is_object_dtype(dtype): + return lib.map_infer(self.values.ravel(), + self._box_func).reshape(self.values.shape) + return self.values + + +class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): + __slots__ = () + is_timedelta = True + _can_hold_na = True + is_numeric = False + + def __init__(self, values, placement, ndim=None): + if values.dtype != _TD_DTYPE: + values = conversion.ensure_timedelta64ns(values) + + super(TimeDeltaBlock, self).__init__(values, + placement=placement, ndim=ndim) + + @property + def _holder(self): + return TimedeltaIndex + + @property + def _box_func(self): + return lambda x: Timedelta(x, unit='ns') + + def _can_hold_element(self, element): + tipo = maybe_infer_dtype_type(element) + if tipo is not None: + return issubclass(tipo.type, np.timedelta64) + return is_integer(element) or isinstance( + element, (timedelta, np.timedelta64)) + + def fillna(self, value, **kwargs): + + # allow filling with integers to be + # interpreted as seconds + if is_integer(value) and not isinstance(value, np.timedelta64): + value = Timedelta(value, unit='s') + return super(TimeDeltaBlock, self).fillna(value, **kwargs) + + def _try_coerce_args(self, values, other): + """ + Coerce values and other to int64, with null values converted to + iNaT. 
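The integer-as-seconds special case in TimeDeltaBlock.fillna above is user-visible; a minimal sketch, assuming pandas as of this patch:

    import pandas as pd

    s = pd.to_timedelta(pd.Series([1, None]), unit="d")
    s.fillna(1)   # the integer is wrapped as Timedelta(1, unit='s'), i.e. one second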
values is always ndarray-like, other may not be + + Parameters + ---------- + values : ndarray-like + other : ndarray-like or scalar + + Returns + ------- + base-type values, values mask, base-type other, other mask + """ + + values_mask = isna(values) + values = values.view('i8') + other_mask = False + + if isinstance(other, bool): + raise TypeError + elif is_null_datelike_scalar(other): + other = tslibs.iNaT + other_mask = True + elif isinstance(other, Timedelta): + other_mask = isna(other) + other = other.value + elif isinstance(other, timedelta): + other = Timedelta(other).value + elif isinstance(other, np.timedelta64): + other_mask = isna(other) + other = Timedelta(other).value + elif hasattr(other, 'dtype') and is_timedelta64_dtype(other): + other_mask = isna(other) + other = other.astype('i8', copy=False).view('i8') + else: + # coercion issues + # let higher levels handle + raise TypeError + + return values, values_mask, other, other_mask + + def _try_coerce_result(self, result): + """ reverse of try_coerce_args / try_operate """ + if isinstance(result, np.ndarray): + mask = isna(result) + if result.dtype.kind in ['i', 'f', 'O']: + result = result.astype('m8[ns]') + result[mask] = tslibs.iNaT + elif isinstance(result, (np.integer, np.float)): + result = self._box_func(result) + return result + + def should_store(self, value): + return issubclass(value.dtype.type, np.timedelta64) + + def to_native_types(self, slicer=None, na_rep=None, quoting=None, + **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + values = values[:, slicer] + mask = isna(values) + + rvalues = np.empty(values.shape, dtype=object) + if na_rep is None: + na_rep = 'NaT' + rvalues[mask] = na_rep + imask = (~mask).ravel() + + # FIXME: + # should use the formats.format.Timedelta64Formatter here + # to figure what format to pass to the Timedelta + # e.g. 
to not show the decimals say + rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all') + for val in values.ravel()[imask]], + dtype=object) + return rvalues + + +class BoolBlock(NumericBlock): + __slots__ = () + is_bool = True + _can_hold_na = False + + def _can_hold_element(self, element): + tipo = maybe_infer_dtype_type(element) + if tipo is not None: + return issubclass(tipo.type, np.bool_) + return isinstance(element, (bool, np.bool_)) + + def should_store(self, value): + return issubclass(value.dtype.type, np.bool_) + + def replace(self, to_replace, value, inplace=False, filter=None, + regex=False, convert=True, mgr=None): + inplace = validate_bool_kwarg(inplace, 'inplace') + to_replace_values = np.atleast_1d(to_replace) + if not np.can_cast(to_replace_values, bool): + return self + return super(BoolBlock, self).replace(to_replace, value, + inplace=inplace, filter=filter, + regex=regex, convert=convert, + mgr=mgr) + + +class ObjectBlock(Block): + __slots__ = () + is_object = True + _can_hold_na = True + + def __init__(self, values, placement=None, ndim=2): + if issubclass(values.dtype.type, compat.string_types): + values = np.array(values, dtype=object) + + super(ObjectBlock, self).__init__(values, ndim=ndim, + placement=placement) + + @property + def is_bool(self): + """ we can be a bool if we have only bool values but are of type + object + """ + return lib.is_bool_array(self.values.ravel()) + + # TODO: Refactor when convert_objects is removed since there will be 1 path + def convert(self, *args, **kwargs): + """ attempt to coerce any object types to better types return a copy of + the block (if copy = True) by definition we ARE an ObjectBlock!!!!! + + can return multiple blocks! + """ + + if args: + raise NotImplementedError + by_item = True if 'by_item' not in kwargs else kwargs['by_item'] + + new_inputs = ['coerce', 'datetime', 'numeric', 'timedelta'] + new_style = False + for kw in new_inputs: + new_style |= kw in kwargs + + if new_style: + fn = soft_convert_objects + fn_inputs = new_inputs + else: + fn = maybe_convert_objects + fn_inputs = ['convert_dates', 'convert_numeric', + 'convert_timedeltas'] + fn_inputs += ['copy'] + + fn_kwargs = {} + for key in fn_inputs: + if key in kwargs: + fn_kwargs[key] = kwargs[key] + + # operate column-by-column + def f(m, v, i): + shape = v.shape + values = fn(v.ravel(), **fn_kwargs) + try: + values = values.reshape(shape) + values = _block_shape(values, ndim=self.ndim) + except (AttributeError, NotImplementedError): + pass + + return values + + if by_item and not self._is_single_block: + blocks = self.split_and_operate(None, f, False) + else: + values = f(None, self.values.ravel(), None) + blocks = [make_block(values, ndim=self.ndim, + placement=self.mgr_locs)] + + return blocks + + def set(self, locs, values, check=False): + """ + Modify Block in-place with new item value + + Returns + ------- + None + """ + + # GH6026 + if check: + try: + if (self.values[locs] == values).all(): + return + except: + pass + try: + self.values[locs] = values + except (ValueError): + + # broadcasting error + # see GH6171 + new_shape = list(values.shape) + new_shape[0] = len(self.items) + self.values = np.empty(tuple(new_shape), dtype=self.dtype) + self.values.fill(np.nan) + self.values[locs] = values + + def _maybe_downcast(self, blocks, downcast=None): + + if downcast is not None: + return blocks + + # split and convert the blocks + return _extend_blocks([b.convert(datetime=True, numeric=False) + for b in blocks]) + + def 
_can_hold_element(self, element): + return True + + def _try_coerce_args(self, values, other): + """ provide coercion to our input arguments """ + + if isinstance(other, ABCDatetimeIndex): + # to store DatetimeTZBlock as object + other = other.astype(object).values + + return values, False, other, False + + def should_store(self, value): + return not (issubclass(value.dtype.type, + (np.integer, np.floating, np.complexfloating, + np.datetime64, np.bool_)) or + # TODO(ExtensionArray): remove is_extension_type + # when all extension arrays have been ported. + is_extension_type(value) or + is_extension_array_dtype(value)) + + def replace(self, to_replace, value, inplace=False, filter=None, + regex=False, convert=True, mgr=None): + to_rep_is_list = is_list_like(to_replace) + value_is_list = is_list_like(value) + both_lists = to_rep_is_list and value_is_list + either_list = to_rep_is_list or value_is_list + + result_blocks = [] + blocks = [self] + + if not either_list and is_re(to_replace): + return self._replace_single(to_replace, value, inplace=inplace, + filter=filter, regex=True, + convert=convert, mgr=mgr) + elif not (either_list or regex): + return super(ObjectBlock, self).replace(to_replace, value, + inplace=inplace, + filter=filter, regex=regex, + convert=convert, mgr=mgr) + elif both_lists: + for to_rep, v in zip(to_replace, value): + result_blocks = [] + for b in blocks: + result = b._replace_single(to_rep, v, inplace=inplace, + filter=filter, regex=regex, + convert=convert, mgr=mgr) + result_blocks = _extend_blocks(result, result_blocks) + blocks = result_blocks + return result_blocks + + elif to_rep_is_list and regex: + for to_rep in to_replace: + result_blocks = [] + for b in blocks: + result = b._replace_single(to_rep, value, inplace=inplace, + filter=filter, regex=regex, + convert=convert, mgr=mgr) + result_blocks = _extend_blocks(result, result_blocks) + blocks = result_blocks + return result_blocks + + return self._replace_single(to_replace, value, inplace=inplace, + filter=filter, convert=convert, + regex=regex, mgr=mgr) + + def _replace_single(self, to_replace, value, inplace=False, filter=None, + regex=False, convert=True, mgr=None): + + inplace = validate_bool_kwarg(inplace, 'inplace') + + # to_replace is regex compilable + to_rep_re = regex and is_re_compilable(to_replace) + + # regex is regex compilable + regex_re = is_re_compilable(regex) + + # only one will survive + if to_rep_re and regex_re: + raise AssertionError('only one of to_replace and regex can be ' + 'regex compilable') + + # if regex was passed as something that can be a regex (rather than a + # boolean) + if regex_re: + to_replace = regex + + regex = regex_re or to_rep_re + + # try to get the pattern attribute (compiled re) or it's a string + try: + pattern = to_replace.pattern + except AttributeError: + pattern = to_replace + + # if the pattern is not empty and to_replace is either a string or a + # regex + if regex and pattern: + rx = re.compile(to_replace) + else: + # if the thing to replace is not a string or compiled regex call + # the superclass method -> to_replace is some kind of object + return super(ObjectBlock, self).replace(to_replace, value, + inplace=inplace, + filter=filter, regex=regex, + mgr=mgr) + + new_values = self.values if inplace else self.values.copy() + + # deal with replacing values with objects (strings) that match but + # whose replacement is not a string (numeric, nan, object) + if isna(value) or not isinstance(value, compat.string_types): + + def re_replacer(s): + try: + return 
value if rx.search(s) is not None else s + except TypeError: + return s + else: + # value is guaranteed to be a string here, s can be either a string + # or null if it's null it gets returned + def re_replacer(s): + try: + return rx.sub(value, s) + except TypeError: + return s + + f = np.vectorize(re_replacer, otypes=[self.dtype]) + + if filter is None: + filt = slice(None) + else: + filt = self.mgr_locs.isin(filter).nonzero()[0] + + new_values[filt] = f(new_values[filt]) + + # convert + block = self.make_block(new_values) + if convert: + block = block.convert(by_item=True, numeric=False) + + return block + + +class CategoricalBlock(ExtensionBlock): + __slots__ = () + is_categorical = True + _verify_integrity = True + _can_hold_na = True + _concatenator = staticmethod(_concat._concat_categorical) + + def __init__(self, values, placement, ndim=None): + from pandas.core.arrays.categorical import _maybe_to_categorical + + # coerce to categorical if we can + super(CategoricalBlock, self).__init__(_maybe_to_categorical(values), + placement=placement, + ndim=ndim) + + @property + def _holder(self): + return Categorical + + @property + def array_dtype(self): + """ the dtype to return if I want to construct this block as an + array + """ + return np.object_ + + def _try_coerce_result(self, result): + """ reverse of try_coerce_args """ + + # GH12564: CategoricalBlock is 1-dim only + # while returned results could be any dim + if ((not is_categorical_dtype(result)) and + isinstance(result, np.ndarray)): + result = _block_shape(result, ndim=self.ndim) + + return result + + def shift(self, periods, axis=0, mgr=None): + return self.make_block_same_class(values=self.values.shift(periods), + placement=self.mgr_locs) + + def to_dense(self): + # Categorical.get_values returns a DatetimeIndex for datetime + # categories, so we can't simply use `np.asarray(self.values)` like + # other types. + return self.values.get_values() + + def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + # Categorical is always one dimension + values = values[slicer] + mask = isna(values) + values = np.array(values, dtype='object') + values[mask] = na_rep + + # we are expected to return a 2-d ndarray + return values.reshape(1, len(values)) + + def concat_same_type(self, to_concat, placement=None): + """ + Concatenate list of single blocks of the same type. + + Note that this CategoricalBlock._concat_same_type *may* not + return a CategoricalBlock. When the categories in `to_concat` + differ, this will return an object ndarray. + + If / when we decide we don't like that behavior: + + 1. Change Categorical._concat_same_type to use union_categoricals + 2. Delete this method. + """ + values = self._concatenator([blk.values for blk in to_concat], + axis=self.ndim - 1) + # not using self.make_block_same_class as values can be object dtype + return make_block( + values, placement=placement or slice(0, len(values), 1), + ndim=self.ndim) + + +class DatetimeBlock(DatetimeLikeBlockMixin, Block): + __slots__ = () + is_datetime = True + _can_hold_na = True + + def __init__(self, values, placement, ndim=None): + values = self._maybe_coerce_values(values) + super(DatetimeBlock, self).__init__(values, + placement=placement, ndim=ndim) + + def _maybe_coerce_values(self, values): + """Input validation for values passed to __init__. Ensure that + we have datetime64ns, coercing if necessary. 
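The two re_replacer branches above correspond to string and non-string replacement values; a minimal sketch:

    import pandas as pd

    s = pd.Series(["foo", "bar"])
    s.replace(r"^f.*", "F", regex=True)    # string value: applied via rx.sub
    s.replace(r"^f.*", 1.5, regex=True)    # non-string value: value if rx.search(s) else s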
+ + Parameters + ---------- + values : array-like + Must be convertible to datetime64 + + Returns + ------- + values : ndarray[datetime64ns] + + Overridden by DatetimeTZBlock. + """ + if values.dtype != _NS_DTYPE: + values = conversion.ensure_datetime64ns(values) + return values + + def _astype(self, dtype, mgr=None, **kwargs): + """ + these automatically copy, so copy=True has no effect + raise on an except if raise == True + """ + + # if we are passed a datetime64[ns, tz] + if is_datetime64tz_dtype(dtype): + dtype = DatetimeTZDtype(dtype) + + values = self.values + if getattr(values, 'tz', None) is None: + values = DatetimeIndex(values).tz_localize('UTC') + values = values.tz_convert(dtype.tz) + return self.make_block(values) + + # delegate + return super(DatetimeBlock, self)._astype(dtype=dtype, **kwargs) + + def _can_hold_element(self, element): + tipo = maybe_infer_dtype_type(element) + if tipo is not None: + # TODO: this still uses asarray, instead of dtype.type + element = np.array(element) + return element.dtype == _NS_DTYPE or element.dtype == np.int64 + return (is_integer(element) or isinstance(element, datetime) or + isna(element)) + + def _try_coerce_args(self, values, other): + """ + Coerce values and other to dtype 'i8'. NaN and NaT convert to + the smallest i8, and will correctly round-trip to NaT if converted + back in _try_coerce_result. values is always ndarray-like, other + may not be + + Parameters + ---------- + values : ndarray-like + other : ndarray-like or scalar + + Returns + ------- + base-type values, values mask, base-type other, other mask + """ + + values_mask = isna(values) + values = values.view('i8') + other_mask = False + + if isinstance(other, bool): + raise TypeError + elif is_null_datelike_scalar(other): + other = tslibs.iNaT + other_mask = True + elif isinstance(other, (datetime, np.datetime64, date)): + other = self._box_func(other) + if getattr(other, 'tz') is not None: + raise TypeError("cannot coerce a Timestamp with a tz on a " + "naive Block") + other_mask = isna(other) + other = other.asm8.view('i8') + elif hasattr(other, 'dtype') and is_datetime64_dtype(other): + other_mask = isna(other) + other = other.astype('i8', copy=False).view('i8') + else: + # coercion issues + # let higher levels handle + raise TypeError + + return values, values_mask, other, other_mask + + def _try_coerce_result(self, result): + """ reverse of try_coerce_args """ + if isinstance(result, np.ndarray): + if result.dtype.kind in ['i', 'f', 'O']: + try: + result = result.astype('M8[ns]') + except ValueError: + pass + elif isinstance(result, (np.integer, np.float, np.datetime64)): + result = self._box_func(result) + return result + + @property + def _box_func(self): + return tslibs.Timestamp + + def to_native_types(self, slicer=None, na_rep=None, date_format=None, + quoting=None, **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + values = values[..., slicer] + + from pandas.io.formats.format import _get_format_datetime64_from_values + format = _get_format_datetime64_from_values(values, date_format) + + result = tslib.format_array_from_datetime( + values.view('i8').ravel(), tz=getattr(self.values, 'tz', None), + format=format, na_rep=na_rep).reshape(values.shape) + return np.atleast_2d(result) + + def should_store(self, value): + return (issubclass(value.dtype.type, np.datetime64) and + not is_datetimetz(value)) + + def set(self, locs, values, check=False): + """ + Modify Block in-place with new item 
value + + Returns + ------- + None + """ + if values.dtype != _NS_DTYPE: + # Workaround for numpy 1.6 bug + values = conversion.ensure_datetime64ns(values) + + self.values[locs] = values + + +class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock): + """ implement a datetime64 block with a tz attribute """ + __slots__ = () + _concatenator = staticmethod(_concat._concat_datetime) + is_datetimetz = True + + def __init__(self, values, placement, ndim=2, dtype=None): + # XXX: This will end up calling _maybe_coerce_values twice + # when dtype is not None. It's relatively cheap (just an isinstance) + # but it'd be nice to avoid. + # + # If we can remove dtype from __init__, and push that conversion + # onto the callers, then we can remove this entire __init__ + # and just use DatetimeBlock's. + if dtype is not None: + values = self._maybe_coerce_values(values, dtype=dtype) + super(DatetimeTZBlock, self).__init__(values, placement=placement, + ndim=ndim) + + def _maybe_coerce_values(self, values, dtype=None): + """Input validation for values passed to __init__. Ensure that + we have datetime64TZ, coercing if necessary. + + Parameters + ---------- + values : array-like + Must be convertible to datetime64 + dtype : string or DatetimeTZDtype, optional + Does a shallow copy to this tz + + Returns + ------- + values : ndarray[datetime64ns] + """ + if not isinstance(values, self._holder): + values = self._holder(values) + + if dtype is not None: + if isinstance(dtype, compat.string_types): + dtype = DatetimeTZDtype.construct_from_string(dtype) + values = values._shallow_copy(tz=dtype.tz) + + if values.tz is None: + raise ValueError("cannot create a DatetimeTZBlock without a tz") + + return values + + @property + def is_view(self): + """ return a boolean if I am possibly a view """ + # check the ndarray values of the DatetimeIndex values + return self.values.values.base is not None + + def copy(self, deep=True, mgr=None): + """ copy constructor """ + values = self.values + if deep: + values = values.copy(deep=True) + return self.make_block_same_class(values) + + def external_values(self): + """ we internally represent the data as a DatetimeIndex, but for + external compat with ndarray, export as a ndarray of Timestamps + """ + return self.values.astype('datetime64[ns]').values + + def get_values(self, dtype=None): + # return object dtype as Timestamps with the zones + if is_object_dtype(dtype): + return lib.map_infer( + self.values.ravel(), self._box_func).reshape(self.values.shape) + return self.values + + def _slice(self, slicer): + """ return a slice of my values """ + if isinstance(slicer, tuple): + col, loc = slicer + if not com.is_null_slice(col) and col != 0: + raise IndexError("{0} only contains one item".format(self)) + return self.values[loc] + return self.values[slicer] + + def _try_coerce_args(self, values, other): + """ + localize and return i8 for the values + + Parameters + ---------- + values : ndarray-like + other : ndarray-like or scalar + + Returns + ------- + base-type values, values mask, base-type other, other mask + """ + values_mask = _block_shape(isna(values), ndim=self.ndim) + # asi8 is a view, needs copy + values = _block_shape(values.asi8, ndim=self.ndim) + other_mask = False + + if isinstance(other, ABCSeries): + other = self._holder(other) + other_mask = isna(other) + + if isinstance(other, bool): + raise TypeError + elif (is_null_datelike_scalar(other) or + (is_scalar(other) and isna(other))): + other = tslibs.iNaT + other_mask = True + elif isinstance(other,
self._holder): + if other.tz != self.values.tz: + raise ValueError("incompatible or non tz-aware value") + other_mask = _block_shape(isna(other), ndim=self.ndim) + other = _block_shape(other.asi8, ndim=self.ndim) + elif isinstance(other, (np.datetime64, datetime, date)): + other = tslibs.Timestamp(other) + tz = getattr(other, 'tz', None) + + # test we can have an equal time zone + if tz is None or str(tz) != str(self.values.tz): + raise ValueError("incompatible or non tz-aware value") + other_mask = isna(other) + other = other.value + else: + raise TypeError + + return values, values_mask, other, other_mask + + def _try_coerce_result(self, result): + """ reverse of try_coerce_args """ + if isinstance(result, np.ndarray): + if result.dtype.kind in ['i', 'f', 'O']: + result = result.astype('M8[ns]') + elif isinstance(result, (np.integer, np.float, np.datetime64)): + result = tslibs.Timestamp(result, tz=self.values.tz) + if isinstance(result, np.ndarray): + # allow passing of > 1dim if its trivial + if result.ndim > 1: + result = result.reshape(np.prod(result.shape)) + result = self.values._shallow_copy(result) + + return result + + @property + def _box_func(self): + return lambda x: tslibs.Timestamp(x, tz=self.dtype.tz) + + def shift(self, periods, axis=0, mgr=None): + """ shift the block by periods """ + + # think about moving this to the DatetimeIndex. This is a non-freq + # (number of periods) shift ### + + N = len(self) + indexer = np.zeros(N, dtype=int) + if periods > 0: + indexer[periods:] = np.arange(N - periods) + else: + indexer[:periods] = np.arange(-periods, N) + + new_values = self.values.asi8.take(indexer) + + if periods > 0: + new_values[:periods] = tslibs.iNaT + else: + new_values[periods:] = tslibs.iNaT + + new_values = self.values._shallow_copy(new_values) + return [self.make_block_same_class(new_values, + placement=self.mgr_locs)] + + def diff(self, n, axis=0, mgr=None): + """1st discrete difference + + Parameters + ---------- + n : int, number of periods to diff + axis : int, axis to diff upon. default 0 + mgr : default None + + Return + ------ + A list with a new TimeDeltaBlock. + + Note + ---- + The arguments here are mimicking shift so they are called correctly + by apply. + """ + if axis == 0: + # Cannot currently calculate diff across multiple blocks since this + # function is invoked via apply + raise NotImplementedError + new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8 + + # Reshape the new_values like how algos.diff does for timedelta data + new_values = new_values.reshape(1, len(new_values)) + new_values = new_values.astype('timedelta64[ns]') + return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] + + def concat_same_type(self, to_concat, placement=None): + """ + Concatenate list of single blocks of the same type. + """ + values = self._concatenator([blk.values for blk in to_concat], + axis=self.ndim - 1) + # not using self.make_block_same_class as values can be non-tz dtype + return make_block( + values, placement=placement or slice(0, len(values), 1)) + + +class SparseBlock(NonConsolidatableMixIn, Block): + """ implement as a list of sparse arrays of the same dtype """ + __slots__ = () + is_sparse = True + is_numeric = True + _box_to_block_values = False + _can_hold_na = True + _ftype = 'sparse' + _concatenator = staticmethod(_concat._concat_sparse) + + def __init__(self, values, placement, ndim=None): + # Ensure that we have the underlying SparseArray here... 
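+ # e.g. (illustrative sketch): a SparseSeries passed in contributes its + # .values, which is already a SparseArray, so the assert below holds; + # a plain dense ndarray is rejected by design.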
+ if isinstance(values, ABCSeries): + values = values.values + assert isinstance(values, SparseArray) + super(SparseBlock, self).__init__(values, placement, ndim=ndim) + + @property + def _holder(self): + return SparseArray + + @property + def shape(self): + return (len(self.mgr_locs), self.sp_index.length) + + @property + def fill_value(self): + # return np.nan + return self.values.fill_value + + @fill_value.setter + def fill_value(self, v): + self.values.fill_value = v + + def to_dense(self): + return self.values.to_dense().view() + + @property + def sp_values(self): + return self.values.sp_values + + @sp_values.setter + def sp_values(self, v): + # reset the sparse values + self.values = SparseArray(v, sparse_index=self.sp_index, + kind=self.kind, dtype=v.dtype, + fill_value=self.values.fill_value, + copy=False) + + @property + def sp_index(self): + return self.values.sp_index + + @property + def kind(self): + return self.values.kind + + def _astype(self, dtype, copy=False, errors='raise', values=None, + klass=None, mgr=None, **kwargs): + if values is None: + values = self.values + values = values.astype(dtype, copy=copy) + return self.make_block_same_class(values=values, + placement=self.mgr_locs) + + def __len__(self): + try: + return self.sp_index.length + except: + return 0 + + def copy(self, deep=True, mgr=None): + return self.make_block_same_class(values=self.values, + sparse_index=self.sp_index, + kind=self.kind, copy=deep, + placement=self.mgr_locs) + + def make_block_same_class(self, values, placement, sparse_index=None, + kind=None, dtype=None, fill_value=None, + copy=False, ndim=None): + """ return a new block """ + if dtype is None: + dtype = values.dtype + if fill_value is None and not isinstance(values, SparseArray): + fill_value = self.values.fill_value + + # if not isinstance(values, SparseArray) and values.ndim != self.ndim: + # raise ValueError("ndim mismatch") + + if values.ndim == 2: + nitems = values.shape[0] + + if nitems == 0: + # kludgy, but SparseBlocks cannot handle slices, where the + # output is 0-item, so let's convert it to a dense block: it + # won't take space since there's 0 items, plus it will preserve + # the dtype. 
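+ # Sketch of when this triggers: slicing a sparse column down to zero + # items leaves values.shape == (0, n), which cannot be represented + # sparsely, so an empty dense block of the same dtype is returned.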
+ return self.make_block(np.empty(values.shape, dtype=dtype), + placement) + elif nitems > 1: + raise ValueError("Only 1-item 2d sparse blocks are supported") + else: + values = values.reshape(values.shape[1]) + + new_values = SparseArray(values, sparse_index=sparse_index, + kind=kind or self.kind, dtype=dtype, + fill_value=fill_value, copy=copy) + return self.make_block(new_values, + placement=placement) + + def interpolate(self, method='pad', axis=0, inplace=False, limit=None, + fill_value=None, **kwargs): + + values = missing.interpolate_2d(self.values.to_dense(), method, axis, + limit, fill_value) + return self.make_block_same_class(values=values, + placement=self.mgr_locs) + + def fillna(self, value, limit=None, inplace=False, downcast=None, + mgr=None): + # we may need to upcast our fill to match our dtype + if limit is not None: + raise NotImplementedError("specifying a limit for 'fillna' has " + "not been implemented yet") + values = self.values if inplace else self.values.copy() + values = values.fillna(value, downcast=downcast) + return [self.make_block_same_class(values=values, + placement=self.mgr_locs)] + + def shift(self, periods, axis=0, mgr=None): + """ shift the block by periods """ + N = len(self.values.T) + indexer = np.zeros(N, dtype=int) + if periods > 0: + indexer[periods:] = np.arange(N - periods) + else: + indexer[:periods] = np.arange(-periods, N) + new_values = self.values.to_dense().take(indexer) + # convert integer to float if necessary. need to do a lot more than + # that, handle boolean etc also + new_values, fill_value = maybe_upcast(new_values) + if periods > 0: + new_values[:periods] = fill_value + else: + new_values[periods:] = fill_value + return [self.make_block_same_class(new_values, + placement=self.mgr_locs)] + + def sparse_reindex(self, new_index): + """ sparse reindex and return a new block + current reindex only works for float64 dtype! """ + values = self.values + values = values.sp_index.to_int_index().reindex( + values.sp_values.astype('float64'), values.fill_value, new_index) + return self.make_block_same_class(values, sparse_index=new_index, + placement=self.mgr_locs) + + +def get_block_type(values, dtype=None): + """ + Find the appropriate Block subclass to use for the given values and dtype. 
+ + Parameters + ---------- + values : ndarray-like + dtype : numpy or pandas dtype + + Returns + ------- + cls : class, subclass of Block + """ + dtype = dtype or values.dtype + vtype = dtype.type + + if is_sparse(values): + cls = SparseBlock + elif issubclass(vtype, np.floating): + cls = FloatBlock + elif issubclass(vtype, np.timedelta64): + assert issubclass(vtype, np.integer) + cls = TimeDeltaBlock + elif issubclass(vtype, np.complexfloating): + cls = ComplexBlock + elif is_categorical(values): + cls = CategoricalBlock + elif is_extension_array_dtype(values): + cls = ExtensionBlock + elif issubclass(vtype, np.datetime64): + assert not is_datetimetz(values) + cls = DatetimeBlock + elif is_datetimetz(values): + cls = DatetimeTZBlock + elif issubclass(vtype, np.integer): + cls = IntBlock + elif dtype == np.bool_: + cls = BoolBlock + else: + cls = ObjectBlock + return cls + + +def make_block(values, placement, klass=None, ndim=None, dtype=None, + fastpath=None): + if fastpath is not None: + # GH#19265 pyarrow is passing this + warnings.warn("fastpath argument is deprecated, will be removed " + "in a future release.", DeprecationWarning) + if klass is None: + dtype = dtype or values.dtype + klass = get_block_type(values, dtype) + + elif klass is DatetimeTZBlock and not is_datetimetz(values): + return klass(values, ndim=ndim, + placement=placement, dtype=dtype) + + return klass(values, ndim=ndim, placement=placement) + +# TODO: flexible with index=None and/or items=None + + +class BlockManager(PandasObject): + """ + Core internal data structure to implement DataFrame, Series, Panel, etc. + + Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a + lightweight blocked set of labeled data to be manipulated by the DataFrame + public API class + + Attributes + ---------- + shape + ndim + axes + values + items + + Methods + ------- + set_axis(axis, new_labels) + copy(deep=True) + + get_dtype_counts + get_ftype_counts + get_dtypes + get_ftypes + + apply(func, axes, block_filter_fn) + + get_bool_data + get_numeric_data + + get_slice(slice_like, axis) + get(label) + iget(loc) + get_scalar(label_tup) + + take(indexer, axis) + reindex_axis(new_labels, axis) + reindex_indexer(new_labels, indexer, axis) + + delete(label) + insert(loc, label, value) + set(label, value) + + Parameters + ---------- + + + Notes + ----- + This is *not* a public API class + """ + __slots__ = ['axes', 'blocks', '_ndim', '_shape', '_known_consolidated', + '_is_consolidated', '_blknos', '_blklocs'] + + def __init__(self, blocks, axes, do_integrity_check=True): + self.axes = [ensure_index(ax) for ax in axes] + self.blocks = tuple(blocks) + + for block in blocks: + if block.is_sparse: + if len(block.mgr_locs) != 1: + raise AssertionError("Sparse block refers to multiple " + "items") + else: + if self.ndim != block.ndim: + raise AssertionError( + 'Number of Block dimensions ({block}) must equal ' + 'number of axes ({self})'.format(block=block.ndim, + self=self.ndim)) + + if do_integrity_check: + self._verify_integrity() + + self._consolidate_check() + + self._rebuild_blknos_and_blklocs() + + def make_empty(self, axes=None): + """ return an empty BlockManager with the items axis of len 0 """ + if axes is None: + axes = [ensure_index([])] + [ensure_index(a) + for a in self.axes[1:]] + + # preserve dtype if possible + if self.ndim == 1: + blocks = np.array([], dtype=self.array_dtype) + else: + blocks = [] + return self.__class__(blocks, axes) + + def __nonzero__(self): + return True + + # Python3 compat + __bool__ = 
__nonzero__ + + @property + def shape(self): + return tuple(len(ax) for ax in self.axes) + + @property + def ndim(self): + return len(self.axes) + + def set_axis(self, axis, new_labels): + new_labels = ensure_index(new_labels) + old_len = len(self.axes[axis]) + new_len = len(new_labels) + + if new_len != old_len: + raise ValueError( + 'Length mismatch: Expected axis has {old} elements, new ' + 'values have {new} elements'.format(old=old_len, new=new_len)) + + self.axes[axis] = new_labels + + def rename_axis(self, mapper, axis, copy=True, level=None): + """ + Rename one of axes. + + Parameters + ---------- + mapper : unary callable + axis : int + copy : boolean, default True + level : int, default None + + """ + obj = self.copy(deep=copy) + obj.set_axis(axis, _transform_index(self.axes[axis], mapper, level)) + return obj + + def add_prefix(self, prefix): + f = partial('{prefix}{}'.format, prefix=prefix) + return self.rename_axis(f, axis=0) + + def add_suffix(self, suffix): + f = partial('{}{suffix}'.format, suffix=suffix) + return self.rename_axis(f, axis=0) + + @property + def _is_single_block(self): + if self.ndim == 1: + return True + + if len(self.blocks) != 1: + return False + + blk = self.blocks[0] + return (blk.mgr_locs.is_slice_like and + blk.mgr_locs.as_slice == slice(0, len(self), 1)) + + def _rebuild_blknos_and_blklocs(self): + """ + Update mgr._blknos / mgr._blklocs. + """ + new_blknos = np.empty(self.shape[0], dtype=np.int64) + new_blklocs = np.empty(self.shape[0], dtype=np.int64) + new_blknos.fill(-1) + new_blklocs.fill(-1) + + for blkno, blk in enumerate(self.blocks): + rl = blk.mgr_locs + new_blknos[rl.indexer] = blkno + new_blklocs[rl.indexer] = np.arange(len(rl)) + + if (new_blknos == -1).any(): + raise AssertionError("Gaps in blk ref_locs") + + self._blknos = new_blknos + self._blklocs = new_blklocs + + # make items read only for now + def _get_items(self): + return self.axes[0] + + items = property(fget=_get_items) + + def _get_counts(self, f): + """ return a dict of the counts of the function in BlockManager """ + self._consolidate_inplace() + counts = dict() + for b in self.blocks: + v = f(b) + counts[v] = counts.get(v, 0) + b.shape[0] + return counts + + def get_dtype_counts(self): + return self._get_counts(lambda b: b.dtype.name) + + def get_ftype_counts(self): + return self._get_counts(lambda b: b.ftype) + + def get_dtypes(self): + dtypes = np.array([blk.dtype for blk in self.blocks]) + return algos.take_1d(dtypes, self._blknos, allow_fill=False) + + def get_ftypes(self): + ftypes = np.array([blk.ftype for blk in self.blocks]) + return algos.take_1d(ftypes, self._blknos, allow_fill=False) + + def __getstate__(self): + block_values = [b.values for b in self.blocks] + block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] + axes_array = [ax for ax in self.axes] + + extra_state = { + '0.14.1': { + 'axes': axes_array, + 'blocks': [dict(values=b.values, mgr_locs=b.mgr_locs.indexer) + for b in self.blocks] + } + } + + # First three elements of the state are to maintain forward + # compatibility with 0.13.1. 
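+ # Illustrative layout (sketch): pre-0.14.1 readers unpack + # axes, block_values, block_items = state[:3] + # and ignore extra_state, while newer readers look for the versioned + # dict in state[3] (see __setstate__ below).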
+ return axes_array, block_values, block_items, extra_state + + def __setstate__(self, state): + def unpickle_block(values, mgr_locs): + # numpy < 1.7 pickle compat + if values.dtype == 'M8[us]': + values = values.astype('M8[ns]') + return make_block(values, placement=mgr_locs) + + if (isinstance(state, tuple) and len(state) >= 4 and + '0.14.1' in state[3]): + state = state[3]['0.14.1'] + self.axes = [ensure_index(ax) for ax in state['axes']] + self.blocks = tuple(unpickle_block(b['values'], b['mgr_locs']) + for b in state['blocks']) + else: + # discard anything after 3rd, support beta pickling format for a + # little while longer + ax_arrays, bvalues, bitems = state[:3] + + self.axes = [ensure_index(ax) for ax in ax_arrays] + + if len(bitems) == 1 and self.axes[0].equals(bitems[0]): + # This is a workaround for pre-0.14.1 pickles that didn't + # support unpickling multi-block frames/panels with non-unique + # columns/items, because given a manager with items ["a", "b", + # "a"] there's no way of knowing which block's "a" is where. + # + # Single-block case can be supported under the assumption that + # block items corresponded to manager items 1-to-1. + all_mgr_locs = [slice(0, len(bitems[0]))] + else: + all_mgr_locs = [self.axes[0].get_indexer(blk_items) + for blk_items in bitems] + + self.blocks = tuple( + unpickle_block(values, mgr_locs) + for values, mgr_locs in zip(bvalues, all_mgr_locs)) + + self._post_setstate() + + def _post_setstate(self): + self._is_consolidated = False + self._known_consolidated = False + self._rebuild_blknos_and_blklocs() + + def __len__(self): + return len(self.items) + + def __unicode__(self): + output = pprint_thing(self.__class__.__name__) + for i, ax in enumerate(self.axes): + if i == 0: + output += u('\nItems: {ax}'.format(ax=ax)) + else: + output += u('\nAxis {i}: {ax}'.format(i=i, ax=ax)) + + for block in self.blocks: + output += u('\n{block}'.format(block=pprint_thing(block))) + return output + + def _verify_integrity(self): + mgr_shape = self.shape + tot_items = sum(len(x.mgr_locs) for x in self.blocks) + for block in self.blocks: + if block._verify_integrity and block.shape[1:] != mgr_shape[1:]: + construction_error(tot_items, block.shape[1:], self.axes) + if len(self.items) != tot_items: + raise AssertionError('Number of manager items must equal union of ' + 'block items\n# manager items: {0}, # ' + 'tot_items: {1}'.format( + len(self.items), tot_items)) + + def apply(self, f, axes=None, filter=None, do_integrity_check=False, + consolidate=True, **kwargs): + """ + iterate over the blocks, collect and create a new block manager + + Parameters + ---------- + f : the callable or function name to operate on at the block level + axes : optional (if not supplied, use self.axes) + filter : list, if supplied, only call the block if the filter is in + the block + do_integrity_check : boolean, default False. Do the block manager + integrity check + consolidate: boolean, default True. 
Join together blocks having same + dtype + + Returns + ------- + Block Manager (new object) + + """ + + result_blocks = [] + + # filter kwarg is used in replace-* family of methods + if filter is not None: + filter_locs = set(self.items.get_indexer_for(filter)) + if len(filter_locs) == len(self.items): + # All items are included, as if there were no filtering + filter = None + else: + kwargs['filter'] = filter_locs + + if consolidate: + self._consolidate_inplace() + + if f == 'where': + align_copy = True + if kwargs.get('align', True): + align_keys = ['other', 'cond'] + else: + align_keys = ['cond'] + elif f == 'putmask': + align_copy = False + if kwargs.get('align', True): + align_keys = ['new', 'mask'] + else: + align_keys = ['mask'] + elif f == 'eval': + align_copy = False + align_keys = ['other'] + elif f == 'fillna': + # fillna internally does putmask, maybe it's better to do this + # at mgr, not block level? + align_copy = False + align_keys = ['value'] + else: + align_keys = [] + + # TODO(EA): may interfere with ExtensionBlock.setitem for blocks + # with a .values attribute. + aligned_args = dict((k, kwargs[k]) + for k in align_keys + if hasattr(kwargs[k], 'values') and + not isinstance(kwargs[k], ABCExtensionArray)) + + for b in self.blocks: + if filter is not None: + if not b.mgr_locs.isin(filter_locs).any(): + result_blocks.append(b) + continue + + if aligned_args: + b_items = self.items[b.mgr_locs.indexer] + + for k, obj in aligned_args.items(): + axis = getattr(obj, '_info_axis_number', 0) + kwargs[k] = obj.reindex(b_items, axis=axis, + copy=align_copy) + + kwargs['mgr'] = self + applied = getattr(b, f)(**kwargs) + result_blocks = _extend_blocks(applied, result_blocks) + + if len(result_blocks) == 0: + return self.make_empty(axes or self.axes) + bm = self.__class__(result_blocks, axes or self.axes, + do_integrity_check=do_integrity_check) + bm._consolidate_inplace() + return bm + + def reduction(self, f, axis=0, consolidate=True, transposed=False, + **kwargs): + """ + iterate over the blocks, collect and create a new block manager. + This routine is intended for reduction type operations and + will do inference on the generated blocks. + + Parameters + ---------- + f: the callable or function name to operate on at the block level + axis: reduction axis, default 0 + consolidate: boolean, default True. 
Join together blocks having same + dtype + transposed: boolean, default False + we are holding transposed data + + Returns + ------- + Block Manager (new object) + + """ + + if consolidate: + self._consolidate_inplace() + + axes, blocks = [], [] + for b in self.blocks: + kwargs['mgr'] = self + axe, block = getattr(b, f)(axis=axis, **kwargs) + + axes.append(axe) + blocks.append(block) + + # note that some DatetimeTZ, Categorical are always ndim==1 + ndim = {b.ndim for b in blocks} + + if 2 in ndim: + + new_axes = list(self.axes) + + # multiple blocks that are reduced + if len(blocks) > 1: + new_axes[1] = axes[0] + + # reset the placement to the original + for b, sb in zip(blocks, self.blocks): + b.mgr_locs = sb.mgr_locs + + else: + new_axes[axis] = Index(np.concatenate( + [ax.values for ax in axes])) + + if transposed: + new_axes = new_axes[::-1] + blocks = [b.make_block(b.values.T, + placement=np.arange(b.shape[1]) + ) for b in blocks] + + return self.__class__(blocks, new_axes) + + # 0 ndim + if 0 in ndim and 1 not in ndim: + values = np.array([b.values for b in blocks]) + if len(values) == 1: + return values.item() + blocks = [make_block(values, ndim=1)] + axes = Index([ax[0] for ax in axes]) + + # single block + values = _concat._concat_compat([b.values for b in blocks]) + + # compute the orderings of our original data + if len(self.blocks) > 1: + + indexer = np.empty(len(self.axes[0]), dtype=np.intp) + i = 0 + for b in self.blocks: + for j in b.mgr_locs: + indexer[j] = i + i = i + 1 + + values = values.take(indexer) + + return SingleBlockManager( + [make_block(values, + ndim=1, + placement=np.arange(len(values)))], + axes[0]) + + def isna(self, func, **kwargs): + return self.apply('apply', func=func, **kwargs) + + def where(self, **kwargs): + return self.apply('where', **kwargs) + + def eval(self, **kwargs): + return self.apply('eval', **kwargs) + + def quantile(self, **kwargs): + return self.reduction('quantile', **kwargs) + + def setitem(self, **kwargs): + return self.apply('setitem', **kwargs) + + def putmask(self, **kwargs): + return self.apply('putmask', **kwargs) + + def diff(self, **kwargs): + return self.apply('diff', **kwargs) + + def interpolate(self, **kwargs): + return self.apply('interpolate', **kwargs) + + def shift(self, **kwargs): + return self.apply('shift', **kwargs) + + def fillna(self, **kwargs): + return self.apply('fillna', **kwargs) + + def downcast(self, **kwargs): + return self.apply('downcast', **kwargs) + + def astype(self, dtype, **kwargs): + return self.apply('astype', dtype=dtype, **kwargs) + + def convert(self, **kwargs): + return self.apply('convert', **kwargs) + + def replace(self, **kwargs): + return self.apply('replace', **kwargs) + + def replace_list(self, src_list, dest_list, inplace=False, regex=False, + mgr=None): + """ do a list replace """ + + inplace = validate_bool_kwarg(inplace, 'inplace') + + if mgr is None: + mgr = self + + # figure out our mask a-priori to avoid repeated replacements + values = self.as_array() + + def comp(s): + if isna(s): + return isna(values) + return _maybe_compare(values, getattr(s, 'asm8', s), operator.eq) + + masks = [comp(s) for i, s in enumerate(src_list)] + + result_blocks = [] + src_len = len(src_list) - 1 + for blk in self.blocks: + + # its possible to get multiple result blocks here + # replace ALWAYS will return a list + rb = [blk if inplace else blk.copy()] + for i, (s, d) in enumerate(zip(src_list, dest_list)): + new_rb = [] + for b in rb: + if b.dtype == np.object_: + convert = i == src_len + result = 
b.replace(s, d, inplace=inplace, regex=regex, + mgr=mgr, convert=convert) + new_rb = _extend_blocks(result, new_rb) + else: + # get our mask for this element, sized to this + # particular block + m = masks[i][b.mgr_locs.indexer] + if m.any(): + b = b.coerce_to_target_dtype(d) + new_rb.extend(b.putmask(m, d, inplace=True)) + else: + new_rb.append(b) + rb = new_rb + result_blocks.extend(rb) + + bm = self.__class__(result_blocks, self.axes) + bm._consolidate_inplace() + return bm + + def reshape_nd(self, axes, **kwargs): + """ a 2d-nd reshape operation on a BlockManager """ + return self.apply('reshape_nd', axes=axes, **kwargs) + + def is_consolidated(self): + """ + Return True if more than one block with the same dtype + """ + if not self._known_consolidated: + self._consolidate_check() + return self._is_consolidated + + def _consolidate_check(self): + ftypes = [blk.ftype for blk in self.blocks] + self._is_consolidated = len(ftypes) == len(set(ftypes)) + self._known_consolidated = True + + @property + def is_mixed_type(self): + # Warning, consolidation needs to get checked upstairs + self._consolidate_inplace() + return len(self.blocks) > 1 + + @property + def is_numeric_mixed_type(self): + # Warning, consolidation needs to get checked upstairs + self._consolidate_inplace() + return all(block.is_numeric for block in self.blocks) + + @property + def is_datelike_mixed_type(self): + # Warning, consolidation needs to get checked upstairs + self._consolidate_inplace() + return any(block.is_datelike for block in self.blocks) + + @property + def any_extension_types(self): + """Whether any of the blocks in this manager are extension blocks""" + return any(block.is_extension for block in self.blocks) + + @property + def is_view(self): + """ return a boolean if we are a single block and are a view """ + if len(self.blocks) == 1: + return self.blocks[0].is_view + + # It is technically possible to figure out which blocks are views + # e.g. [ b.values.base is not None for b in self.blocks ] + # but then we have the case of possibly some blocks being a view + # and some blocks not. setting in theory is possible on the non-view + # blocks w/o causing a SettingWithCopy raise/warn. 
But this is a bit + complicated + + return False + + def get_bool_data(self, copy=False): + """ + Parameters + ---------- + copy : boolean, default False + Whether to copy the blocks + """ + self._consolidate_inplace() + return self.combine([b for b in self.blocks if b.is_bool], copy) + + def get_numeric_data(self, copy=False): + """ + Parameters + ---------- + copy : boolean, default False + Whether to copy the blocks + """ + self._consolidate_inplace() + return self.combine([b for b in self.blocks if b.is_numeric], copy) + + def combine(self, blocks, copy=True): + """ return a new manager with the blocks """ + if len(blocks) == 0: + return self.make_empty() + + # FIXME: optimization potential + indexer = np.sort(np.concatenate([b.mgr_locs.as_array + for b in blocks])) + inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) + + new_blocks = [] + for b in blocks: + b = b.copy(deep=copy) + b.mgr_locs = algos.take_1d(inv_indexer, b.mgr_locs.as_array, + axis=0, allow_fill=False) + new_blocks.append(b) + + axes = list(self.axes) + axes[0] = self.items.take(indexer) + + return self.__class__(new_blocks, axes, do_integrity_check=False) + + def get_slice(self, slobj, axis=0): + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + if axis == 0: + new_blocks = self._slice_take_blocks_ax0(slobj) + else: + slicer = [slice(None)] * (axis + 1) + slicer[axis] = slobj + slicer = tuple(slicer) + new_blocks = [blk.getitem_block(slicer) for blk in self.blocks] + + new_axes = list(self.axes) + new_axes[axis] = new_axes[axis][slobj] + + bm = self.__class__(new_blocks, new_axes, do_integrity_check=False) + bm._consolidate_inplace() + return bm + + def __contains__(self, item): + return item in self.items + + @property + def nblocks(self): + return len(self.blocks) + + def copy(self, deep=True, mgr=None): + """ + Make deep or shallow copy of BlockManager + + Parameters + ---------- + deep : boolean or string, default True + If False, return shallow copy (do not copy data) + If 'all', copy data and a deep copy of the index + + Returns + ------- + copy : BlockManager + """ + + # this preserves the notion of view copying of axes + if deep: + if deep == 'all': + copy = lambda ax: ax.copy(deep=True) + else: + copy = lambda ax: ax.view() + new_axes = [copy(ax) for ax in self.axes] + else: + new_axes = list(self.axes) + return self.apply('copy', axes=new_axes, deep=deep, + do_integrity_check=False) + + def as_array(self, transpose=False, items=None): + """Convert the blockmanager data into a numpy array. + + Parameters + ---------- + transpose : boolean, default False + If True, transpose the return array + items : list of strings or None + Names of block items that will be included in the returned + array.
``None`` means that all block items will be used + + Returns + ------- + arr : ndarray + """ + if len(self.blocks) == 0: + arr = np.empty(self.shape, dtype=float) + return arr.transpose() if transpose else arr + + if items is not None: + mgr = self.reindex_axis(items, axis=0) + else: + mgr = self + + if self._is_single_block or not self.is_mixed_type: + arr = mgr.blocks[0].get_values() + else: + arr = mgr._interleave() + + return arr.transpose() if transpose else arr + + def _interleave(self): + """ + Return ndarray from blocks with specified item order + Items must be contained in the blocks + """ + dtype = _interleaved_dtype(self.blocks) + + result = np.empty(self.shape, dtype=dtype) + + if result.shape[0] == 0: + # Workaround for numpy 1.7 bug: + # + # >>> a = np.empty((0,10)) + # >>> a[slice(0,0)] + # array([], shape=(0, 10), dtype=float64) + # >>> a[[]] + # Traceback (most recent call last): + # File "", line 1, in + # IndexError: index 0 is out of bounds for axis 0 with size 0 + return result + + itemmask = np.zeros(self.shape[0]) + + for blk in self.blocks: + rl = blk.mgr_locs + result[rl.indexer] = blk.get_values(dtype) + itemmask[rl.indexer] = 1 + + if not itemmask.all(): + raise AssertionError('Some items were not contained in blocks') + + return result + + def to_dict(self, copy=True): + """ + Return a dict of str(dtype) -> BlockManager + + Parameters + ---------- + copy : boolean, default True + + Returns + ------- + values : a dict of dtype -> BlockManager + + Notes + ----- + This consolidates based on str(dtype) + """ + self._consolidate_inplace() + + bd = {} + for b in self.blocks: + bd.setdefault(str(b.dtype), []).append(b) + + return {dtype: self.combine(blocks, copy=copy) + for dtype, blocks in bd.items()} + + def xs(self, key, axis=1, copy=True, takeable=False): + if axis < 1: + raise AssertionError( + 'Can only take xs across axis >= 1, got {ax}'.format(ax=axis)) + + # take by position + if takeable: + loc = key + else: + loc = self.axes[axis].get_loc(key) + + slicer = [slice(None, None) for _ in range(self.ndim)] + slicer[axis] = loc + slicer = tuple(slicer) + + new_axes = list(self.axes) + + # could be an array indexer! 
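+ # e.g. with duplicate labels, get_loc can return a slice or a boolean + # ndarray rather than a single int, in which case the axis is sliced + # below instead of being dropped.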
+ if isinstance(loc, (slice, np.ndarray)): + new_axes[axis] = new_axes[axis][loc] + else: + new_axes.pop(axis) + + new_blocks = [] + if len(self.blocks) > 1: + # we must copy here as we are mixed type + for blk in self.blocks: + newb = make_block(values=blk.values[slicer], + klass=blk.__class__, + placement=blk.mgr_locs) + new_blocks.append(newb) + elif len(self.blocks) == 1: + block = self.blocks[0] + vals = block.values[slicer] + if copy: + vals = vals.copy() + new_blocks = [make_block(values=vals, + placement=block.mgr_locs, + klass=block.__class__)] + + return self.__class__(new_blocks, new_axes) + + def fast_xs(self, loc): + """ + get a cross sectional for a given location in the + items; handle dups + + return the result, it *could* be a view in the case of a + single block + """ + if len(self.blocks) == 1: + return self.blocks[0].iget((slice(None), loc)) + + items = self.items + + # non-unique (GH4726) + if not items.is_unique: + result = self._interleave() + if self.ndim == 2: + result = result.T + return result[loc] + + # unique + dtype = _interleaved_dtype(self.blocks) + n = len(items) + result = np.empty(n, dtype=dtype) + for blk in self.blocks: + # Such assignment may incorrectly coerce NaT to None + # result[blk.mgr_locs] = blk._slice((slice(None), loc)) + for i, rl in enumerate(blk.mgr_locs): + result[rl] = blk._try_coerce_result(blk.iget((i, loc))) + + return result + + def consolidate(self): + """ + Join together blocks having the same dtype + + Returns + ------- + y : BlockManager + """ + if self.is_consolidated(): + return self + + bm = self.__class__(self.blocks, self.axes) + bm._is_consolidated = False + bm._consolidate_inplace() + return bm + + def _consolidate_inplace(self): + if not self.is_consolidated(): + self.blocks = tuple(_consolidate(self.blocks)) + self._is_consolidated = True + self._known_consolidated = True + self._rebuild_blknos_and_blklocs() + + def get(self, item, fastpath=True): + """ + Return values for selected item (ndarray or BlockManager). + """ + if self.items.is_unique: + + if not isna(item): + loc = self.items.get_loc(item) + else: + indexer = np.arange(len(self.items))[isna(self.items)] + + # allow a single nan location indexer + if not is_scalar(indexer): + if len(indexer) == 1: + loc = indexer.item() + else: + raise ValueError("cannot label index with a null key") + + return self.iget(loc, fastpath=fastpath) + else: + + if isna(item): + raise TypeError("cannot label index with a null key") + + indexer = self.items.get_indexer_for([item]) + return self.reindex_indexer(new_axis=self.items[indexer], + indexer=indexer, axis=0, + allow_dups=True) + + def iget(self, i, fastpath=True): + """ + Return the data as a SingleBlockManager if fastpath=True and possible + + Otherwise return as an ndarray + """ + block = self.blocks[self._blknos[i]] + values = block.iget(self._blklocs[i]) + if not fastpath or not block._box_to_block_values or values.ndim != 1: + return values + + # fastpath shortcut for selecting a single-dim from a 2-dim BM + return SingleBlockManager( + [block.make_block_same_class(values, + placement=slice(0, len(values)), + ndim=1)], + self.axes[1]) + + def get_scalar(self, tup): + """ + Retrieve single item + """ + full_loc = [ax.get_loc(x) for ax, x in zip(self.axes, tup)] + blk = self.blocks[self._blknos[full_loc[0]]] + values = blk.values + + # FIXME: this may return non-upcasted types?
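+ # Sketch: full_loc[0] is the manager-level item position; for a 2-dim + # block it is translated to the block-internal row via self._blklocs + # before indexing into blk.values.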
+ if values.ndim == 1: + return values[full_loc[1]] + + full_loc[0] = self._blklocs[full_loc[0]] + return values[tuple(full_loc)] + + def delete(self, item): + """ + Delete selected item (items if non-unique) in-place. + """ + indexer = self.items.get_loc(item) + + is_deleted = np.zeros(self.shape[0], dtype=np.bool_) + is_deleted[indexer] = True + ref_loc_offset = -is_deleted.cumsum() + + is_blk_deleted = [False] * len(self.blocks) + + if isinstance(indexer, int): + affected_start = indexer + else: + affected_start = is_deleted.nonzero()[0][0] + + for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]): + blk = self.blocks[blkno] + bml = blk.mgr_locs + blk_del = is_deleted[bml.indexer].nonzero()[0] + + if len(blk_del) == len(bml): + is_blk_deleted[blkno] = True + continue + elif len(blk_del) != 0: + blk.delete(blk_del) + bml = blk.mgr_locs + + blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer]) + + # FIXME: use Index.delete as soon as it uses fastpath=True + self.axes[0] = self.items[~is_deleted] + self.blocks = tuple(b for blkno, b in enumerate(self.blocks) + if not is_blk_deleted[blkno]) + self._shape = None + self._rebuild_blknos_and_blklocs() + + def set(self, item, value, check=False): + """ + Set new item in-place. Does not consolidate. Adds new Block if not + contained in the current set of items. + If check, then validate that we are not setting the same data in-place + """ + # FIXME: refactor, clearly separate broadcasting & zip-like assignment + # can prob also fix the various if tests for sparse/categorical + + # TODO(EA): Remove an is_extension_ when all extension types satisfy + # the interface + value_is_extension_type = (is_extension_type(value) or + is_extension_array_dtype(value)) + + # categorical/sparse/datetimetz + if value_is_extension_type: + + def value_getitem(placement): + return value + else: + if value.ndim == self.ndim - 1: + value = _safe_reshape(value, (1,) + value.shape) + + def value_getitem(placement): + return value + else: + + def value_getitem(placement): + return value[placement.indexer] + + if value.shape[1:] != self.shape[1:]: + raise AssertionError('Shape of new values must be compatible ' + 'with manager shape') + + try: + loc = self.items.get_loc(item) + except KeyError: + # This item wasn't present, just insert at end + self.insert(len(self.items), item, value) + return + + if isinstance(loc, int): + loc = [loc] + + blknos = self._blknos[loc] + blklocs = self._blklocs[loc].copy() + + unfit_mgr_locs = [] + unfit_val_locs = [] + removed_blknos = [] + for blkno, val_locs in _get_blkno_placements(blknos, len(self.blocks), + group=True): + blk = self.blocks[blkno] + blk_locs = blklocs[val_locs.indexer] + if blk.should_store(value): + blk.set(blk_locs, value_getitem(val_locs), check=check) + else: + unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) + unfit_val_locs.append(val_locs) + + # If all block items are unfit, schedule the block for removal.
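+ # e.g. (illustrative): assigning strings over what was an IntBlock + # leaves nothing the block can store, so the block is dropped here + # and the new values re-enter below as a fresh ObjectBlock.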
+ if len(val_locs) == len(blk.mgr_locs): + removed_blknos.append(blkno) + else: + self._blklocs[blk.mgr_locs.indexer] = -1 + blk.delete(blk_locs) + self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk)) + + if len(removed_blknos): + # Remove blocks & update blknos accordingly + is_deleted = np.zeros(self.nblocks, dtype=np.bool_) + is_deleted[removed_blknos] = True + + new_blknos = np.empty(self.nblocks, dtype=np.int64) + new_blknos.fill(-1) + new_blknos[~is_deleted] = np.arange(self.nblocks - + len(removed_blknos)) + self._blknos = algos.take_1d(new_blknos, self._blknos, axis=0, + allow_fill=False) + self.blocks = tuple(blk for i, blk in enumerate(self.blocks) + if i not in set(removed_blknos)) + + if unfit_val_locs: + unfit_mgr_locs = np.concatenate(unfit_mgr_locs) + unfit_count = len(unfit_mgr_locs) + + new_blocks = [] + if value_is_extension_type: + # This code (ab-)uses the fact that sparse blocks contain only + # one item. + new_blocks.extend( + make_block(values=value.copy(), ndim=self.ndim, + placement=slice(mgr_loc, mgr_loc + 1)) + for mgr_loc in unfit_mgr_locs) + + self._blknos[unfit_mgr_locs] = (np.arange(unfit_count) + + len(self.blocks)) + self._blklocs[unfit_mgr_locs] = 0 + + else: + # unfit_val_locs contains BlockPlacement objects + unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:]) + + new_blocks.append( + make_block(values=value_getitem(unfit_val_items), + ndim=self.ndim, placement=unfit_mgr_locs)) + + self._blknos[unfit_mgr_locs] = len(self.blocks) + self._blklocs[unfit_mgr_locs] = np.arange(unfit_count) + + self.blocks += tuple(new_blocks) + + # Newly created block's dtype may already be present. + self._known_consolidated = False + + def insert(self, loc, item, value, allow_duplicates=False): + """ + Insert item at selected position. + + Parameters + ---------- + loc : int + item : hashable + value : array_like + allow_duplicates: bool + If False, trying to insert non-unique item will raise + + """ + if not allow_duplicates and item in self.items: + # Should this be a different kind of error?? + raise ValueError('cannot insert {}, already exists'.format(item)) + + if not isinstance(loc, int): + raise TypeError("loc must be int") + + # insert to the axis; this could possibly raise a TypeError + new_axis = self.items.insert(loc, item) + + block = make_block(values=value, ndim=self.ndim, + placement=slice(loc, loc + 1)) + + for blkno, count in _fast_count_smallints(self._blknos[loc:]): + blk = self.blocks[blkno] + if count == len(blk.mgr_locs): + blk.mgr_locs = blk.mgr_locs.add(1) + else: + new_mgr_locs = blk.mgr_locs.as_array.copy() + new_mgr_locs[new_mgr_locs >= loc] += 1 + blk.mgr_locs = new_mgr_locs + + if loc == self._blklocs.shape[0]: + # np.append is a lot faster (at least in numpy 1.7.1), let's use it + # if we can. + self._blklocs = np.append(self._blklocs, 0) + self._blknos = np.append(self._blknos, len(self.blocks)) + else: + self._blklocs = np.insert(self._blklocs, loc, 0) + self._blknos = np.insert(self._blknos, loc, len(self.blocks)) + + self.axes[0] = new_axis + self.blocks += (block,) + self._shape = None + + self._known_consolidated = False + + if len(self.blocks) > 100: + self._consolidate_inplace() + + def reindex_axis(self, new_index, axis, method=None, limit=None, + fill_value=None, copy=True): + """ + Conform block manager to new index. 
+ """ + new_index = ensure_index(new_index) + new_index, indexer = self.axes[axis].reindex(new_index, method=method, + limit=limit) + + return self.reindex_indexer(new_index, indexer, axis=axis, + fill_value=fill_value, copy=copy) + + def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, + allow_dups=False, copy=True): + """ + Parameters + ---------- + new_axis : Index + indexer : ndarray of int64 or None + axis : int + fill_value : object + allow_dups : bool + + pandas-indexer with -1's only. + """ + if indexer is None: + if new_axis is self.axes[axis] and not copy: + return self + + result = self.copy(deep=copy) + result.axes = list(self.axes) + result.axes[axis] = new_axis + return result + + self._consolidate_inplace() + + # some axes don't allow reindexing with dups + if not allow_dups: + self.axes[axis]._can_reindex(indexer) + + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + if axis == 0: + new_blocks = self._slice_take_blocks_ax0(indexer, + fill_tuple=(fill_value,)) + else: + new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=( + fill_value if fill_value is not None else blk.fill_value,)) + for blk in self.blocks] + + new_axes = list(self.axes) + new_axes[axis] = new_axis + return self.__class__(new_blocks, new_axes) + + def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): + """ + Slice/take blocks along axis=0. + + Overloaded for SingleBlock + + Returns + ------- + new_blocks : list of Block + + """ + + allow_fill = fill_tuple is not None + + sl_type, slobj, sllen = _preprocess_slice_or_indexer( + slice_or_indexer, self.shape[0], allow_fill=allow_fill) + + if self._is_single_block: + blk = self.blocks[0] + + if sl_type in ('slice', 'mask'): + return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] + elif not allow_fill or self.ndim == 1: + if allow_fill and fill_tuple[0] is None: + _, fill_value = maybe_promote(blk.dtype) + fill_tuple = (fill_value, ) + + return [blk.take_nd(slobj, axis=0, + new_mgr_locs=slice(0, sllen), + fill_tuple=fill_tuple)] + + if sl_type in ('slice', 'mask'): + blknos = self._blknos[slobj] + blklocs = self._blklocs[slobj] + else: + blknos = algos.take_1d(self._blknos, slobj, fill_value=-1, + allow_fill=allow_fill) + blklocs = algos.take_1d(self._blklocs, slobj, fill_value=-1, + allow_fill=allow_fill) + + # When filling blknos, make sure blknos is updated before appending to + # blocks list, that way new blkno is exactly len(blocks). + # + # FIXME: mgr_groupby_blknos must return mgr_locs in ascending order, + # pytables serialization will break otherwise. + blocks = [] + for blkno, mgr_locs in _get_blkno_placements(blknos, len(self.blocks), + group=True): + if blkno == -1: + # If we've got here, fill_tuple was not None. + fill_value = fill_tuple[0] + + blocks.append(self._make_na_block(placement=mgr_locs, + fill_value=fill_value)) + else: + blk = self.blocks[blkno] + + # Otherwise, slicing along items axis is necessary. + if not blk._can_consolidate: + # A non-consolidatable block, it's easy, because there's + # only one item and each mgr loc is a copy of that single + # item. 
+ for mgr_loc in mgr_locs: + newblk = blk.copy(deep=True) + newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1) + blocks.append(newblk) + + else: + blocks.append(blk.take_nd(blklocs[mgr_locs.indexer], + axis=0, new_mgr_locs=mgr_locs, + fill_tuple=None)) + + return blocks + + def _make_na_block(self, placement, fill_value=None): + # TODO: infer dtypes other than float64 from fill_value + + if fill_value is None: + fill_value = np.nan + block_shape = list(self.shape) + block_shape[0] = len(placement) + + dtype, fill_value = infer_dtype_from_scalar(fill_value) + block_values = np.empty(block_shape, dtype=dtype) + block_values.fill(fill_value) + return make_block(block_values, placement=placement) + + def take(self, indexer, axis=1, verify=True, convert=True): + """ + Take items along any axis. + """ + self._consolidate_inplace() + indexer = (np.arange(indexer.start, indexer.stop, indexer.step, + dtype='int64') + if isinstance(indexer, slice) + else np.asanyarray(indexer, dtype='int64')) + + n = self.shape[axis] + if convert: + indexer = maybe_convert_indices(indexer, n) + + if verify: + if ((indexer == -1) | (indexer >= n)).any(): + raise Exception('Indices must be nonzero and less than ' + 'the axis length') + + new_labels = self.axes[axis].take(indexer) + return self.reindex_indexer(new_axis=new_labels, indexer=indexer, + axis=axis, allow_dups=True) + + def merge(self, other, lsuffix='', rsuffix=''): + if not self._is_indexed_like(other): + raise AssertionError('Must have same axes to merge managers') + + l, r = items_overlap_with_suffix(left=self.items, lsuffix=lsuffix, + right=other.items, rsuffix=rsuffix) + new_items = _concat_indexes([l, r]) + + new_blocks = [blk.copy(deep=False) for blk in self.blocks] + + offset = self.shape[0] + for blk in other.blocks: + blk = blk.copy(deep=False) + blk.mgr_locs = blk.mgr_locs.add(offset) + new_blocks.append(blk) + + new_axes = list(self.axes) + new_axes[0] = new_items + + return self.__class__(_consolidate(new_blocks), new_axes) + + def _is_indexed_like(self, other): + """ + Check all axes except items + """ + if self.ndim != other.ndim: + raise AssertionError( + 'Number of dimensions must agree got {ndim} and ' + '{oth_ndim}'.format(ndim=self.ndim, oth_ndim=other.ndim)) + for ax, oax in zip(self.axes[1:], other.axes[1:]): + if not ax.equals(oax): + return False + return True + + def equals(self, other): + self_axes, other_axes = self.axes, other.axes + if len(self_axes) != len(other_axes): + return False + if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): + return False + self._consolidate_inplace() + other._consolidate_inplace() + if len(self.blocks) != len(other.blocks): + return False + + # canonicalize block order, using a tuple combining the type + # name and then mgr_locs because there might be unconsolidated + # blocks (say, Categorical) which can only be distinguished by + # the iteration order + def canonicalize(block): + return (block.dtype.name, block.mgr_locs.as_array.tolist()) + + self_blocks = sorted(self.blocks, key=canonicalize) + other_blocks = sorted(other.blocks, key=canonicalize) + return all(block.equals(oblock) + for block, oblock in zip(self_blocks, other_blocks)) + + def unstack(self, unstacker_func): + """Return a blockmanager with all blocks unstacked. + + Parameters + ---------- + unstacker_func : callable + A (partially-applied) ``pd.core.reshape._Unstacker`` class. 
+ + Returns + ------- + unstacked : BlockManager + """ + dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items) + new_columns = dummy.get_new_columns() + new_index = dummy.get_new_index() + new_blocks = [] + columns_mask = [] + + for blk in self.blocks: + blocks, mask = blk._unstack( + partial(unstacker_func, + value_columns=self.items[blk.mgr_locs.indexer]), + new_columns) + + new_blocks.extend(blocks) + columns_mask.extend(mask) + + new_columns = new_columns[columns_mask] + + bm = BlockManager(new_blocks, [new_columns, new_index]) + return bm + + +class SingleBlockManager(BlockManager): + """ manage a single block with a single axis """ + + ndim = 1 + _is_consolidated = True + _known_consolidated = True + __slots__ = () + + def __init__(self, block, axis, do_integrity_check=False, fastpath=False): + + if isinstance(axis, list): + if len(axis) != 1: + raise ValueError("cannot create SingleBlockManager with more " + "than 1 axis") + axis = axis[0] + + # passed from constructor, single block, single axis + if fastpath: + self.axes = [axis] + if isinstance(block, list): + + # empty block + if len(block) == 0: + block = [np.array([])] + elif len(block) != 1: + raise ValueError('Cannot create SingleBlockManager with ' + 'more than 1 block') + block = block[0] + else: + self.axes = [ensure_index(axis)] + + # create the block here + if isinstance(block, list): + + # provide consolidation to the interleaved_dtype + if len(block) > 1: + dtype = _interleaved_dtype(block) + block = [b.astype(dtype) for b in block] + block = _consolidate(block) + + if len(block) != 1: + raise ValueError('Cannot create SingleBlockManager with ' + 'more than 1 block') + block = block[0] + + if not isinstance(block, Block): + block = make_block(block, placement=slice(0, len(axis)), ndim=1) + + self.blocks = [block] + + def _post_setstate(self): + pass + + @property + def _block(self): + return self.blocks[0] + + @property + def _values(self): + return self._block.values + + @property + def _blknos(self): + """ compat with BlockManager """ + return None + + @property + def _blklocs(self): + """ compat with BlockManager """ + return None + + def get_slice(self, slobj, axis=0): + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + return self.__class__(self._block._slice(slobj), + self.index[slobj], fastpath=True) + + @property + def index(self): + return self.axes[0] + + def convert(self, **kwargs): + """ convert the whole block as one """ + kwargs['by_item'] = False + return self.apply('convert', **kwargs) + + @property + def dtype(self): + return self._block.dtype + + @property + def array_dtype(self): + return self._block.array_dtype + + @property + def ftype(self): + return self._block.ftype + + def get_dtype_counts(self): + return {self.dtype.name: 1} + + def get_ftype_counts(self): + return {self.ftype: 1} + + def get_dtypes(self): + return np.array([self._block.dtype]) + + def get_ftypes(self): + return np.array([self._block.ftype]) + + def external_values(self): + return self._block.external_values() + + def internal_values(self): + return self._block.internal_values() + + def formatting_values(self): + """Return the internal values used by the DataFrame/SeriesFormatter""" + return self._block.formatting_values() + + def get_values(self): + """ return a dense type view """ + return np.array(self._block.to_dense(), copy=False) + + @property + def asobject(self): + """ + return an object dtype array. datetime/timedelta like values are boxed + to Timestamp/Timedelta instances.
+ """ + return self._block.get_values(dtype=object) + + @property + def _can_hold_na(self): + return self._block._can_hold_na + + def is_consolidated(self): + return True + + def _consolidate_check(self): + pass + + def _consolidate_inplace(self): + pass + + def delete(self, item): + """ + Delete single item from SingleBlockManager. + + Ensures that self.blocks doesn't become empty. + """ + loc = self.items.get_loc(item) + self._block.delete(loc) + self.axes[0] = self.axes[0].delete(loc) + + def fast_xs(self, loc): + """ + fast path for getting a cross-section + return a view of the data + """ + return self._block.values[loc] + + def concat(self, to_concat, new_axis): + """ + Concatenate a list of SingleBlockManagers into a single + SingleBlockManager. + + Used for pd.concat of Series objects with axis=0. + + Parameters + ---------- + to_concat : list of SingleBlockManagers + new_axis : Index of the result + + Returns + ------- + SingleBlockManager + + """ + non_empties = [x for x in to_concat if len(x) > 0] + + # check if all series are of the same block type: + if len(non_empties) > 0: + blocks = [obj.blocks[0] for obj in non_empties] + + if all(type(b) is type(blocks[0]) for b in blocks[1:]): # noqa + new_block = blocks[0].concat_same_type(blocks) + else: + values = [x.values for x in blocks] + values = _concat._concat_compat(values) + new_block = make_block( + values, placement=slice(0, len(values), 1)) + else: + values = [x._block.values for x in to_concat] + values = _concat._concat_compat(values) + new_block = make_block( + values, placement=slice(0, len(values), 1)) + + mgr = SingleBlockManager(new_block, new_axis) + return mgr + + +def construction_error(tot_items, block_shape, axes, e=None): + """ raise a helpful message about our construction """ + passed = tuple(map(int, [tot_items] + list(block_shape))) + implied = tuple(map(int, [len(ax) for ax in axes])) + if passed == implied and e is not None: + raise e + if block_shape[0] == 0: + raise ValueError("Empty data passed with indices specified.") + raise ValueError("Shape of passed values is {0}, indices imply {1}".format( + passed, implied)) + + +def create_block_manager_from_blocks(blocks, axes): + try: + if len(blocks) == 1 and not isinstance(blocks[0], Block): + # if blocks[0] is of length 0, return empty blocks + if not len(blocks[0]): + blocks = [] + else: + # It's OK if a single block is passed as values, its placement + # is basically "all items", but if there're many, don't bother + # converting, it's an error anyway. + blocks = [make_block(values=blocks[0], + placement=slice(0, len(axes[0])))] + + mgr = BlockManager(blocks, axes) + mgr._consolidate_inplace() + return mgr + + except (ValueError) as e: + blocks = [getattr(b, 'values', b) for b in blocks] + tot_items = sum(b.shape[0] for b in blocks) + construction_error(tot_items, blocks[0].shape[1:], axes, e) + + +def create_block_manager_from_arrays(arrays, names, axes): + + try: + blocks = form_blocks(arrays, names, axes) + mgr = BlockManager(blocks, axes) + mgr._consolidate_inplace() + return mgr + except ValueError as e: + construction_error(len(arrays), arrays[0].shape, axes, e) + + +def form_blocks(arrays, names, axes): + # put "leftover" items in float bucket, where else? + # generalize? 
+ items_dict = defaultdict(list) + extra_locs = [] + + names_idx = ensure_index(names) + if names_idx.equals(axes[0]): + names_indexer = np.arange(len(names_idx)) + else: + assert names_idx.intersection(axes[0]).is_unique + names_indexer = names_idx.get_indexer_for(axes[0]) + + for i, name_idx in enumerate(names_indexer): + if name_idx == -1: + extra_locs.append(i) + continue + + k = names[name_idx] + v = arrays[name_idx] + + block_type = get_block_type(v) + items_dict[block_type.__name__].append((i, k, v)) + + blocks = [] + if len(items_dict['FloatBlock']): + float_blocks = _multi_blockify(items_dict['FloatBlock']) + blocks.extend(float_blocks) + + if len(items_dict['ComplexBlock']): + complex_blocks = _multi_blockify(items_dict['ComplexBlock']) + blocks.extend(complex_blocks) + + if len(items_dict['TimeDeltaBlock']): + timedelta_blocks = _multi_blockify(items_dict['TimeDeltaBlock']) + blocks.extend(timedelta_blocks) + + if len(items_dict['IntBlock']): + int_blocks = _multi_blockify(items_dict['IntBlock']) + blocks.extend(int_blocks) + + if len(items_dict['DatetimeBlock']): + datetime_blocks = _simple_blockify(items_dict['DatetimeBlock'], + _NS_DTYPE) + blocks.extend(datetime_blocks) + + if len(items_dict['DatetimeTZBlock']): + dttz_blocks = [make_block(array, + klass=DatetimeTZBlock, + placement=[i]) + for i, _, array in items_dict['DatetimeTZBlock']] + blocks.extend(dttz_blocks) + + if len(items_dict['BoolBlock']): + bool_blocks = _simple_blockify(items_dict['BoolBlock'], np.bool_) + blocks.extend(bool_blocks) + + if len(items_dict['ObjectBlock']) > 0: + object_blocks = _simple_blockify(items_dict['ObjectBlock'], np.object_) + blocks.extend(object_blocks) + + if len(items_dict['SparseBlock']) > 0: + sparse_blocks = _sparse_blockify(items_dict['SparseBlock']) + blocks.extend(sparse_blocks) + + if len(items_dict['CategoricalBlock']) > 0: + cat_blocks = [make_block(array, klass=CategoricalBlock, placement=[i]) + for i, _, array in items_dict['CategoricalBlock']] + blocks.extend(cat_blocks) + + if len(items_dict['ExtensionBlock']): + + external_blocks = [ + make_block(array, klass=ExtensionBlock, placement=[i]) + for i, _, array in items_dict['ExtensionBlock'] + ] + + blocks.extend(external_blocks) + + if len(extra_locs): + shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:]) + + # empty items -> dtype object + block_values = np.empty(shape, dtype=object) + block_values.fill(np.nan) + + na_block = make_block(block_values, placement=extra_locs) + blocks.append(na_block) + + return blocks + + +def _simple_blockify(tuples, dtype): + """ return a single array of a block that has a single dtype; if dtype is + not None, coerce to this dtype + """ + values, placement = _stack_arrays(tuples, dtype) + + # CHECK DTYPE? 
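# --- illustrative sketch (not part of the diff) ----------------------------
# form_blocks above buckets arrays by block type, so a mixed-dtype frame
# ends up with one consolidated block per dtype bucket; the order of
# `blocks` is an implementation detail, hence "roughly" below.
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4],
                   'c': [1.5, 2.5], 'd': ['x', 'y']})
for blk in df._data.blocks:
    print(blk.dtype, blk.mgr_locs.as_array)
# roughly: float64 [2], int64 [0 1], object [3]
# ----------------------------------------------------------------------------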
+ if dtype is not None and values.dtype != dtype: # pragma: no cover + values = values.astype(dtype) + + block = make_block(values, placement=placement) + return [block] + + +def _multi_blockify(tuples, dtype=None): + """ return an array of blocks that potentially have different dtypes """ + + # group by dtype + grouper = itertools.groupby(tuples, lambda x: x[2].dtype) + + new_blocks = [] + for dtype, tup_block in grouper: + + values, placement = _stack_arrays(list(tup_block), dtype) + + block = make_block(values, placement=placement) + new_blocks.append(block) + + return new_blocks + + +def _sparse_blockify(tuples, dtype=None): + """ return an array of blocks that potentially have different dtypes (and + are sparse) + """ + + new_blocks = [] + for i, names, array in tuples: + array = _maybe_to_sparse(array) + block = make_block(array, klass=SparseBlock, placement=[i]) + new_blocks.append(block) + + return new_blocks + + +def _stack_arrays(tuples, dtype): + + # fml + def _asarray_compat(x): + if isinstance(x, ABCSeries): + return x._values + else: + return np.asarray(x) + + def _shape_compat(x): + if isinstance(x, ABCSeries): + return len(x), + else: + return x.shape + + placement, names, arrays = zip(*tuples) + + first = arrays[0] + shape = (len(arrays),) + _shape_compat(first) + + stacked = np.empty(shape, dtype=dtype) + for i, arr in enumerate(arrays): + stacked[i] = _asarray_compat(arr) + + return stacked, placement + + +def _interleaved_dtype(blocks): + if not len(blocks): + return None + + dtype = find_common_type([b.dtype for b in blocks]) + + # only numpy compat + if isinstance(dtype, (PandasExtensionDtype, ExtensionDtype)): + dtype = np.object + + return dtype + + +def _consolidate(blocks): + """ + Merge blocks having same dtype, exclude non-consolidating blocks + """ + + # sort by _can_consolidate, dtype + gkey = lambda x: x._consolidate_key + grouper = itertools.groupby(sorted(blocks, key=gkey), gkey) + + new_blocks = [] + for (_can_consolidate, dtype), group_blocks in grouper: + merged_blocks = _merge_blocks(list(group_blocks), dtype=dtype, + _can_consolidate=_can_consolidate) + new_blocks = _extend_blocks(merged_blocks, new_blocks) + return new_blocks + + +def _merge_blocks(blocks, dtype=None, _can_consolidate=True): + + if len(blocks) == 1: + return blocks[0] + + if _can_consolidate: + + if dtype is None: + if len({b.dtype for b in blocks}) != 1: + raise AssertionError("_merge_blocks are invalid!") + dtype = blocks[0].dtype + + # FIXME: optimization potential in case all mgrs contain slices and + # combination of those slices is a slice, too. 
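# --- illustrative sketch (not part of the diff) ----------------------------
# _multi_blockify above relies on itertools.groupby, which only merges
# *adjacent* runs of equal dtype; the non-adjacent int arrays below land in
# two separate groups, which is one reason managers are consolidated again
# afterwards.
import itertools
import numpy as np

arrays = [(0, 'a', np.array([1, 2])),
          (1, 'b', np.array([1.5, 2.5])),
          (2, 'c', np.array([3, 4]))]

for dtype, group in itertools.groupby(arrays, lambda t: t[2].dtype):
    group = list(group)
    print(dtype, [t[0] for t in group],
          np.vstack([t[2] for t in group]).shape)
# int64 [0] (1, 2) / float64 [1] (1, 2) / int64 [2] (1, 2)
# ----------------------------------------------------------------------------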
+ new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks]) + new_values = _vstack([b.values for b in blocks], dtype) + + argsort = np.argsort(new_mgr_locs) + new_values = new_values[argsort] + new_mgr_locs = new_mgr_locs[argsort] + + return make_block(new_values, placement=new_mgr_locs) + + # no merge + return blocks + + +def _extend_blocks(result, blocks=None): + """ return a new extended blocks, givin the result """ + if blocks is None: + blocks = [] + if isinstance(result, list): + for r in result: + if isinstance(r, list): + blocks.extend(r) + else: + blocks.append(r) + elif isinstance(result, BlockManager): + blocks.extend(result.blocks) + else: + blocks.append(result) + return blocks + + +def _block_shape(values, ndim=1, shape=None): + """ guarantee the shape of the values to be at least 1 d """ + if values.ndim < ndim: + if shape is None: + shape = values.shape + values = values.reshape(tuple((1, ) + shape)) + return values + + +def _vstack(to_stack, dtype): + + # work around NumPy 1.6 bug + if dtype == _NS_DTYPE or dtype == _TD_DTYPE: + new_values = np.vstack([x.view('i8') for x in to_stack]) + return new_values.view(dtype) + + else: + return np.vstack(to_stack) + + +def _maybe_compare(a, b, op): + + is_a_array = isinstance(a, np.ndarray) + is_b_array = isinstance(b, np.ndarray) + + # numpy deprecation warning to have i8 vs integer comparisons + if is_datetimelike_v_numeric(a, b): + result = False + + # numpy deprecation warning if comparing numeric vs string-like + elif is_numeric_v_string_like(a, b): + result = False + + else: + result = op(a, b) + + if is_scalar(result) and (is_a_array or is_b_array): + type_names = [type(a).__name__, type(b).__name__] + + if is_a_array: + type_names[0] = 'ndarray(dtype={dtype})'.format(dtype=a.dtype) + + if is_b_array: + type_names[1] = 'ndarray(dtype={dtype})'.format(dtype=b.dtype) + + raise TypeError( + "Cannot compare types {a!r} and {b!r}".format(a=type_names[0], + b=type_names[1])) + return result + + +def _concat_indexes(indexes): + return indexes[0].append(indexes[1:]) + + +def _block2d_to_blocknd(values, placement, shape, labels, ref_items): + """ pivot to the labels shape """ + panel_shape = (len(placement),) + shape + + # TODO: lexsort depth needs to be 2!! + + # Create observation selection vector using major and minor + # labels, for converting to panel format. 
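# --- illustrative sketch (not part of the diff) ----------------------------
# the argsort step in _merge_blocks above: after vstacking the blocks, rows
# are reordered so the values line up with their manager locations again.
import numpy as np

new_mgr_locs = np.array([2, 0, 1])
new_values = np.array([[20., 21.], [0., 1.], [10., 11.]])
order = np.argsort(new_mgr_locs)
print(new_mgr_locs[order])   # [0 1 2]
print(new_values[order])     # rows back in mgr_locs order
# ----------------------------------------------------------------------------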
+ selector = _factor_indexer(shape[1:], labels) + mask = np.zeros(np.prod(shape), dtype=bool) + mask.put(selector, True) + + if mask.all(): + pvalues = np.empty(panel_shape, dtype=values.dtype) + else: + dtype, fill_value = maybe_promote(values.dtype) + pvalues = np.empty(panel_shape, dtype=dtype) + pvalues.fill(fill_value) + + for i in range(len(placement)): + pvalues[i].flat[mask] = values[:, i] + + return make_block(pvalues, placement=placement) + + +def _factor_indexer(shape, labels): + """ + given a tuple of shape and a list of Categorical labels, return the + expanded label indexer + """ + mult = np.array(shape)[::-1].cumprod()[::-1] + return ensure_platform_int( + np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T) + + +def _get_blkno_placements(blknos, blk_count, group=True): + """ + + Parameters + ---------- + blknos : array of int64 + blk_count : int + group : bool + + Returns + ------- + iterator + yield (BlockPlacement, blkno) + + """ + + blknos = ensure_int64(blknos) + + # FIXME: blk_count is unused, but it may avoid the use of dicts in cython + for blkno, indexer in libinternals.get_blkno_indexers(blknos, group): + yield blkno, BlockPlacement(indexer) + + +def items_overlap_with_suffix(left, lsuffix, right, rsuffix): + """ + If two indices overlap, add suffixes to overlapping entries. + + If corresponding suffix is empty, the entry is simply converted to string. + + """ + to_rename = left.intersection(right) + if len(to_rename) == 0: + return left, right + else: + if not lsuffix and not rsuffix: + raise ValueError('columns overlap but no suffix specified: ' + '{rename}'.format(rename=to_rename)) + + def lrenamer(x): + if x in to_rename: + return '{x}{lsuffix}'.format(x=x, lsuffix=lsuffix) + return x + + def rrenamer(x): + if x in to_rename: + return '{x}{rsuffix}'.format(x=x, rsuffix=rsuffix) + return x + + return (_transform_index(left, lrenamer), + _transform_index(right, rrenamer)) + + +def _safe_reshape(arr, new_shape): + """ + If possible, reshape `arr` to have shape `new_shape`, + with a couple of exceptions (see gh-13012): + + 1) If `arr` is a ExtensionArray or Index, `arr` will be + returned as is. + 2) If `arr` is a Series, the `_values` attribute will + be reshaped and returned. + + Parameters + ---------- + arr : array-like, object to be reshaped + new_shape : int or tuple of ints, the new shape + """ + if isinstance(arr, ABCSeries): + arr = arr._values + if not isinstance(arr, ABCExtensionArray): + arr = arr.reshape(new_shape) + return arr + + +def _transform_index(index, func, level=None): + """ + Apply function to all values found in index. + + This includes transforming multiindex entries separately. + Only apply function to one level of the MultiIndex if level is specified. + + """ + if isinstance(index, MultiIndex): + if level is not None: + items = [tuple(func(y) if i == level else y + for i, y in enumerate(x)) for x in index] + else: + items = [tuple(func(y) for y in x) for x in index] + return MultiIndex.from_tuples(items, names=index.names) + else: + items = [func(x) for x in index] + return Index(items, name=index.name, tupleize_cols=False) + + +def _putmask_smart(v, m, n): + """ + Return a new ndarray, try to preserve dtype if possible. 
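# --- illustrative sketch (not part of the diff) ----------------------------
# items_overlap_with_suffix above is the path pd.merge hits for overlapping
# column names: suffixes are appended, and empty suffixes are rejected.
import pandas as pd

left = pd.DataFrame({'key': [1, 2], 'val': [10, 20]})
right = pd.DataFrame({'key': [1, 2], 'val': [30, 40]})
print(pd.merge(left, right, on='key').columns.tolist())
# ['key', 'val_x', 'val_y']
try:
    pd.merge(left, right, on='key', suffixes=('', ''))
except ValueError as err:
    print(err)   # columns overlap but no suffix specified: ...
# ----------------------------------------------------------------------------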
+ + Parameters + ---------- + v : `values`, updated in-place (array like) + m : `mask`, applies to both sides (array like) + n : `new values` either scalar or an array like aligned with `values` + + Returns + ------- + values : ndarray with updated values + this *may* be a copy of the original + + See Also + -------- + ndarray.putmask + """ + + # we cannot use np.asarray() here as we cannot have conversions + # that numpy does when numeric are mixed with strings + + # n should be the length of the mask or a scalar here + if not is_list_like(n): + n = np.repeat(n, len(m)) + elif isinstance(n, np.ndarray) and n.ndim == 0: # numpy scalar + n = np.repeat(np.array(n, ndmin=1), len(m)) + + # see if we are only masking values that if putted + # will work in the current dtype + try: + nn = n[m] + + # make sure that we have a nullable type + # if we have nulls + if not _isna_compat(v, nn[0]): + raise ValueError + + # we ignore ComplexWarning here + with catch_warnings(record=True): + nn_at = nn.astype(v.dtype) + + # avoid invalid dtype comparisons + # between numbers & strings + + # only compare integers/floats + # don't compare integers to datetimelikes + if (not is_numeric_v_string_like(nn, nn_at) and + (is_float_dtype(nn.dtype) or + is_integer_dtype(nn.dtype) and + is_float_dtype(nn_at.dtype) or + is_integer_dtype(nn_at.dtype))): + + comp = (nn == nn_at) + if is_list_like(comp) and comp.all(): + nv = v.copy() + nv[m] = nn_at + return nv + except (ValueError, IndexError, TypeError): + pass + + n = np.asarray(n) + + def _putmask_preserve(nv, n): + try: + nv[m] = n[m] + except (IndexError, ValueError): + nv[m] = n + return nv + + # preserves dtype if possible + if v.dtype.kind == n.dtype.kind: + return _putmask_preserve(v, n) + + # change the dtype if needed + dtype, _ = maybe_promote(n.dtype) + + if is_extension_type(v.dtype) and is_object_dtype(dtype): + v = v.get_values(dtype) + else: + v = v.astype(dtype) + + return _putmask_preserve(v, n) + + +def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): + """ + Concatenate block managers into one. + + Parameters + ---------- + mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples + axes : list of Index + concat_axis : int + copy : bool + + """ + concat_plan = combine_concat_plans( + [get_mgr_concatenation_plan(mgr, indexers) + for mgr, indexers in mgrs_indexers], concat_axis) + + blocks = [] + + for placement, join_units in concat_plan: + + if len(join_units) == 1 and not join_units[0].indexers: + b = join_units[0].block + values = b.values + if copy: + values = values.copy() + elif not copy: + values = values.view() + b = b.make_block_same_class(values, placement=placement) + elif is_uniform_join_units(join_units): + b = join_units[0].block.concat_same_type( + [ju.block for ju in join_units], placement=placement) + else: + b = make_block( + concatenate_join_units(join_units, concat_axis, copy=copy), + placement=placement) + blocks.append(b) + + return BlockManager(blocks, axes) + + +def is_uniform_join_units(join_units): + """ + Check if the join units consist of blocks of uniform type that can + be concatenated using Block.concat_same_type instead of the generic + concatenate_join_units (which uses `_concat._concat_compat`). + + """ + return ( + # all blocks need to have the same type + all(type(ju.block) is type(join_units[0].block) for ju in join_units) and # noqa + # no blocks that would get missing values (can lead to type upcasts) + # unless we're an extension dtype. 
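# --- illustrative sketch (not part of the diff) ----------------------------
# observable effect of _putmask_smart above: a masked assignment keeps the
# block dtype when the new values round-trip losslessly through it, and
# upcasts when they don't.
import pandas as pd

s = pd.Series([1, 2, 3])
s[s > 1] = 9.0          # 9.0 == 9 after casting back, dtype preserved
print(s.dtype)          # int64
s[s > 1] = 9.5          # not representable as int -> promoted
print(s.dtype)          # float64
# ----------------------------------------------------------------------------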
+ all(not ju.is_na or ju.block.is_extension for ju in join_units) and + # no blocks with indexers (as then the dimensions do not fit) + all(not ju.indexers for ju in join_units) and + # disregard Panels + all(ju.block.ndim <= 2 for ju in join_units) and + # only use this path when there is something to concatenate + len(join_units) > 1) + + +def is_uniform_reindex(join_units): + return ( + # TODO: should this be ju.block._can_hold_na? + all(ju.block and ju.block.is_extension for ju in join_units) and + len(set(ju.block.dtype.name for ju in join_units)) == 1 + ) + + +def get_empty_dtype_and_na(join_units): + """ + Return dtype and N/A values to use when concatenating specified units. + + Returned N/A value may be None which means there was no casting involved. + + Returns + ------- + dtype + na + """ + + if len(join_units) == 1: + blk = join_units[0].block + if blk is None: + return np.float64, np.nan + + if is_uniform_reindex(join_units): + # XXX: integrate property + empty_dtype = join_units[0].block.dtype + upcasted_na = join_units[0].block.fill_value + return empty_dtype, upcasted_na + + has_none_blocks = False + dtypes = [None] * len(join_units) + for i, unit in enumerate(join_units): + if unit.block is None: + has_none_blocks = True + else: + dtypes[i] = unit.dtype + + upcast_classes = defaultdict(list) + null_upcast_classes = defaultdict(list) + for dtype, unit in zip(dtypes, join_units): + if dtype is None: + continue + + if is_categorical_dtype(dtype): + upcast_cls = 'category' + elif is_datetimetz(dtype): + upcast_cls = 'datetimetz' + elif issubclass(dtype.type, np.bool_): + upcast_cls = 'bool' + elif issubclass(dtype.type, np.object_): + upcast_cls = 'object' + elif is_datetime64_dtype(dtype): + upcast_cls = 'datetime' + elif is_timedelta64_dtype(dtype): + upcast_cls = 'timedelta' + elif is_float_dtype(dtype) or is_numeric_dtype(dtype): + upcast_cls = dtype.name + else: + upcast_cls = 'float' + + # Null blocks should not influence upcast class selection, unless there + # are only null blocks, when same upcasting rules must be applied to + # null upcast classes. + if unit.is_na: + null_upcast_classes[upcast_cls].append(dtype) + else: + upcast_classes[upcast_cls].append(dtype) + + if not upcast_classes: + upcast_classes = null_upcast_classes + + # create the result + if 'object' in upcast_classes: + return np.dtype(np.object_), np.nan + elif 'bool' in upcast_classes: + if has_none_blocks: + return np.dtype(np.object_), np.nan + else: + return np.dtype(np.bool_), None + elif 'category' in upcast_classes: + return np.dtype(np.object_), np.nan + elif 'datetimetz' in upcast_classes: + dtype = upcast_classes['datetimetz'] + return dtype[0], tslibs.iNaT + elif 'datetime' in upcast_classes: + return np.dtype('M8[ns]'), tslibs.iNaT + elif 'timedelta' in upcast_classes: + return np.dtype('m8[ns]'), tslibs.iNaT + else: # pragma + g = np.find_common_type(upcast_classes, []) + if is_float_dtype(g): + return g, g.type(np.nan) + elif is_numeric_dtype(g): + if has_none_blocks: + return np.float64, np.nan + else: + return g, None + + msg = "invalid dtype determination in get_concat_dtype" + raise AssertionError(msg) + + +def concatenate_join_units(join_units, concat_axis, copy): + """ + Concatenate values from several join units along selected axis. + """ + if concat_axis == 0 and len(join_units) > 1: + # Concatenating join units along ax0 is handled in _merge_blocks. 
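# --- illustrative sketch (not part of the diff) ----------------------------
# two of the upcasting rules in get_empty_dtype_and_na above, seen from user
# code: int meeting float concatenates to the common float64, and a bool
# block that would acquire missing values is upcast to object.
import pandas as pd

print(pd.concat([pd.DataFrame({'x': [1, 2]}),
                 pd.DataFrame({'x': [1.5]})]).dtypes)         # x: float64

left = pd.DataFrame({'k': [1, 2], 'flag': [True, False]})
right = pd.DataFrame({'k': [2, 3]})
print(left.merge(right, on='k', how='outer')['flag'].dtype)   # object
# ----------------------------------------------------------------------------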
+ raise AssertionError("Concatenating join units along axis0") + + empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units) + + to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype, + upcasted_na=upcasted_na) + for ju in join_units] + + if len(to_concat) == 1: + # Only one block, nothing to concatenate. + concat_values = to_concat[0] + if copy: + if isinstance(concat_values, np.ndarray): + # non-reindexed (=not yet copied) arrays are made into a view + # in JoinUnit.get_reindexed_values + if concat_values.base is not None: + concat_values = concat_values.copy() + else: + concat_values = concat_values.copy() + else: + concat_values = _concat._concat_compat(to_concat, axis=concat_axis) + + return concat_values + + +def get_mgr_concatenation_plan(mgr, indexers): + """ + Construct concatenation plan for given block manager and indexers. + + Parameters + ---------- + mgr : BlockManager + indexers : dict of {axis: indexer} + + Returns + ------- + plan : list of (BlockPlacement, JoinUnit) tuples + + """ + # Calculate post-reindex shape , save for item axis which will be separate + # for each block anyway. + mgr_shape = list(mgr.shape) + for ax, indexer in indexers.items(): + mgr_shape[ax] = len(indexer) + mgr_shape = tuple(mgr_shape) + + if 0 in indexers: + ax0_indexer = indexers.pop(0) + blknos = algos.take_1d(mgr._blknos, ax0_indexer, fill_value=-1) + blklocs = algos.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1) + else: + + if mgr._is_single_block: + blk = mgr.blocks[0] + return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] + + ax0_indexer = None + blknos = mgr._blknos + blklocs = mgr._blklocs + + plan = [] + for blkno, placements in _get_blkno_placements(blknos, len(mgr.blocks), + group=False): + + assert placements.is_slice_like + + join_unit_indexers = indexers.copy() + + shape = list(mgr_shape) + shape[0] = len(placements) + shape = tuple(shape) + + if blkno == -1: + unit = JoinUnit(None, shape) + else: + blk = mgr.blocks[blkno] + ax0_blk_indexer = blklocs[placements.indexer] + + unit_no_ax0_reindexing = (len(placements) == len(blk.mgr_locs) and + # Fastpath detection of join unit not + # needing to reindex its block: no ax0 + # reindexing took place and block + # placement was sequential before. + ((ax0_indexer is None and + blk.mgr_locs.is_slice_like and + blk.mgr_locs.as_slice.step == 1) or + # Slow-ish detection: all indexer locs + # are sequential (and length match is + # checked above). + (np.diff(ax0_blk_indexer) == 1).all())) + + # Omit indexer if no item reindexing is required. + if unit_no_ax0_reindexing: + join_unit_indexers.pop(0, None) + else: + join_unit_indexers[0] = ax0_blk_indexer + + unit = JoinUnit(blk, shape, join_unit_indexers) + + plan.append((placements, unit)) + + return plan + + +def combine_concat_plans(plans, concat_axis): + """ + Combine multiple concatenation plans into one. + + existing_plan is updated in-place. 
+ """ + if len(plans) == 1: + for p in plans[0]: + yield p[0], [p[1]] + + elif concat_axis == 0: + offset = 0 + for plan in plans: + last_plc = None + + for plc, unit in plan: + yield plc.add(offset), [unit] + last_plc = plc + + if last_plc is not None: + offset += last_plc.as_slice.stop + + else: + num_ended = [0] + + def _next_or_none(seq): + retval = next(seq, None) + if retval is None: + num_ended[0] += 1 + return retval + + plans = list(map(iter, plans)) + next_items = list(map(_next_or_none, plans)) + + while num_ended[0] != len(next_items): + if num_ended[0] > 0: + raise ValueError("Plan shapes are not aligned") + + placements, units = zip(*next_items) + + lengths = list(map(len, placements)) + min_len, max_len = min(lengths), max(lengths) + + if min_len == max_len: + yield placements[0], units + next_items[:] = map(_next_or_none, plans) + else: + yielded_placement = None + yielded_units = [None] * len(next_items) + for i, (plc, unit) in enumerate(next_items): + yielded_units[i] = unit + if len(plc) > min_len: + # trim_join_unit updates unit in place, so only + # placement needs to be sliced to skip min_len. + next_items[i] = (plc[min_len:], + trim_join_unit(unit, min_len)) + else: + yielded_placement = plc + next_items[i] = _next_or_none(plans[i]) + + yield yielded_placement, yielded_units + + +def trim_join_unit(join_unit, length): + """ + Reduce join_unit's shape along item axis to length. + + Extra items that didn't fit are returned as a separate block. + """ + + if 0 not in join_unit.indexers: + extra_indexers = join_unit.indexers + + if join_unit.block is None: + extra_block = None + else: + extra_block = join_unit.block.getitem_block(slice(length, None)) + join_unit.block = join_unit.block.getitem_block(slice(length)) + else: + extra_block = join_unit.block + + extra_indexers = copy.copy(join_unit.indexers) + extra_indexers[0] = extra_indexers[0][length:] + join_unit.indexers[0] = join_unit.indexers[0][:length] + + extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:] + join_unit.shape = (length,) + join_unit.shape[1:] + + return JoinUnit(block=extra_block, indexers=extra_indexers, + shape=extra_shape) + + +class JoinUnit(object): + + def __init__(self, block, shape, indexers=None): + # Passing shape explicitly is required for cases when block is None. + if indexers is None: + indexers = {} + self.block = block + self.indexers = indexers + self.shape = shape + + def __repr__(self): + return '{name}({block!r}, {indexers})'.format( + name=self.__class__.__name__, block=self.block, + indexers=self.indexers) + + @cache_readonly + def needs_filling(self): + for indexer in self.indexers.values(): + # FIXME: cache results of indexer == -1 checks. + if (indexer == -1).any(): + return True + + return False + + @cache_readonly + def dtype(self): + if self.block is None: + raise AssertionError("Block is None, no dtype") + + if not self.needs_filling: + return self.block.dtype + else: + return _get_dtype(maybe_promote(self.block.dtype, + self.block.fill_value)[0]) + + @cache_readonly + def is_na(self): + if self.block is None: + return True + + if not self.block._can_hold_na: + return False + + # Usually it's enough to check but a small fraction of values to see if + # a block is NOT null, chunks should help in such cases. 1000 value + # was chosen rather arbitrarily. 
+ values = self.block.values + if self.block.is_categorical: + values_flat = values.categories + elif self.block.is_sparse: + # fill_value is not NaN and have holes + if not values._null_fill_value and values.sp_index.ngaps > 0: + return False + values_flat = values.ravel(order='K') + elif isinstance(self.block, ExtensionBlock): + values_flat = values + else: + values_flat = values.ravel(order='K') + total_len = values_flat.shape[0] + chunk_len = max(total_len // 40, 1000) + for i in range(0, total_len, chunk_len): + if not isna(values_flat[i:i + chunk_len]).all(): + return False + + return True + + def get_reindexed_values(self, empty_dtype, upcasted_na): + if upcasted_na is None: + # No upcasting is necessary + fill_value = self.block.fill_value + values = self.block.get_values() + else: + fill_value = upcasted_na + + if self.is_na: + if getattr(self.block, 'is_object', False): + # we want to avoid filling with np.nan if we are + # using None; we already know that we are all + # nulls + values = self.block.values.ravel(order='K') + if len(values) and values[0] is None: + fill_value = None + + if getattr(self.block, 'is_datetimetz', False) or \ + is_datetimetz(empty_dtype): + pass + elif getattr(self.block, 'is_categorical', False): + pass + elif getattr(self.block, 'is_sparse', False): + pass + else: + missing_arr = np.empty(self.shape, dtype=empty_dtype) + missing_arr.fill(fill_value) + return missing_arr + + if not self.indexers: + if not self.block._can_consolidate: + # preserve these for validation in _concat_compat + return self.block.values + + if self.block.is_bool and not self.block.is_categorical: + # External code requested filling/upcasting, bool values must + # be upcasted to object to avoid being upcasted to numeric. + values = self.block.astype(np.object_).values + elif self.block.is_extension: + values = self.block.values + else: + # No dtype upcasting is done here, it will be performed during + # concatenation itself. + values = self.block.get_values() + + if not self.indexers: + # If there's no indexing to be done, we want to signal outside + # code that this array must be copied explicitly. This is done + # by returning a view and checking `retval.base`. + values = values.view() + + else: + for ax, indexer in self.indexers.items(): + values = algos.take_nd(values, indexer, axis=ax, + fill_value=fill_value) + + return values + + +def _fast_count_smallints(arr): + """Faster version of set(arr) for sequences of small numbers.""" + if len(arr) == 0: + # Handle empty arr case separately: numpy 1.6 chokes on that. 
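# --- illustrative sketch (not part of the diff) ----------------------------
# the chunked scan used by JoinUnit.is_na above: examine roughly 1/40th of
# the values at a time and bail out on the first non-null hit.
import numpy as np

values = np.full(100000, np.nan)
values[-1] = 0.0
total = values.shape[0]
chunk = max(total // 40, 1000)
all_na = True
for i in range(0, total, chunk):
    if not np.isnan(values[i:i + chunk]).all():
        all_na = False
        break
print(all_na)   # False -- the non-null turned up in the final chunk
# ----------------------------------------------------------------------------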
+ return np.empty((0, 2), dtype=arr.dtype) + else: + counts = np.bincount(arr.astype(np.int_)) + nz = counts.nonzero()[0] + return np.c_[nz, counts[nz]] + + +def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill): + if isinstance(slice_or_indexer, slice): + return ('slice', slice_or_indexer, + libinternals.slice_len(slice_or_indexer, length)) + elif (isinstance(slice_or_indexer, np.ndarray) and + slice_or_indexer.dtype == np.bool_): + return 'mask', slice_or_indexer, slice_or_indexer.sum() + else: + indexer = np.asanyarray(slice_or_indexer, dtype=np.int64) + if not allow_fill: + indexer = maybe_convert_indices(indexer, length) + return 'fancy', indexer, len(indexer) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py deleted file mode 100644 index 23f7204572b75..0000000000000 --- a/pandas/core/internals/blocks.py +++ /dev/null @@ -1,3585 +0,0 @@ -# -*- coding: utf-8 -*- -from collections import defaultdict -from datetime import date, datetime, timedelta -import inspect -import itertools -import re -import warnings - -import numpy as np - -from pandas._libs import ( - lib, tslibs, tslib, Timedelta, - internals as libinternals) -from pandas._libs.tslibs import conversion - -from pandas import compat -from pandas.compat import range, zip - -from pandas.util._validators import validate_bool_kwarg - -import pandas.core.dtypes.generic as gt -from pandas.core.dtypes.common import ( - _TD_DTYPE, _NS_DTYPE, - ensure_platform_int, - is_integer, - is_dtype_equal, - is_timedelta64_dtype, - is_datetime64_dtype, is_datetimetz, is_sparse, - is_categorical, is_categorical_dtype, - is_integer_dtype, - is_datetime64tz_dtype, - is_bool_dtype, - is_object_dtype, - is_float_dtype, - is_numeric_v_string_like, is_extension_type, - is_extension_array_dtype, - is_list_like, - is_re, - is_re_compilable, - is_scalar) -from pandas.core.dtypes.dtypes import ( - ExtensionDtype, DatetimeTZDtype, - PandasExtensionDtype, - CategoricalDtype) -from pandas.core.dtypes.cast import ( - maybe_downcast_to_dtype, - maybe_upcast, - maybe_promote, - infer_dtype_from, - infer_dtype_from_scalar, - soft_convert_objects, - maybe_convert_objects, - astype_nansafe, - find_common_type, - maybe_infer_dtype_type) -from pandas.core.dtypes.missing import ( - isna, notna, array_equivalent, - _isna_compat, - is_null_datelike_scalar) -import pandas.core.dtypes.concat as _concat - -import pandas.core.common as com -import pandas.core.algorithms as algos -from pandas.core.base import PandasObject -from pandas.core import missing - -from pandas.core.arrays import Categorical -from pandas.core.sparse.array import SparseArray, _maybe_to_sparse - -from pandas.core.index import ensure_index -from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas.core.indexing import check_setitem_lengths - -from pandas.io.formats.printing import pprint_thing - - -class Block(PandasObject): - """ - Canonical n-dimensional unit of homogeneous dtype contained in a pandas - data structure - - Index-ignorant; let the container take care of that - """ - __slots__ = ['_mgr_locs', 'values', 'ndim'] - is_numeric = False - is_float = False - is_integer = False - is_complex = False - is_datetime = False - is_datetimetz = False - is_timedelta = False - is_bool = False - is_object = False - is_categorical = False - is_sparse = False - is_extension = False - _box_to_block_values = True - _can_hold_na = False - _can_consolidate = True - _verify_integrity = True - 
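# --- illustrative sketch (not part of the diff) ----------------------------
# a simplified mirror of _preprocess_slice_or_indexer above: indexers are
# classified as a slice, a boolean mask, or a fancy integer indexer.
import numpy as np

def classify(slice_or_indexer):
    if isinstance(slice_or_indexer, slice):
        return 'slice'
    arr = np.asanyarray(slice_or_indexer)
    return 'mask' if arr.dtype == np.bool_ else 'fancy'

print(classify(slice(0, 3)))               # slice
print(classify(np.array([True, False])))   # mask
print(classify([0, 2, 5]))                 # fancy
# ----------------------------------------------------------------------------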
_validate_ndim = True - _ftype = 'dense' - _concatenator = staticmethod(np.concatenate) - - def __init__(self, values, placement, ndim=None): - self.ndim = self._check_ndim(values, ndim) - self.mgr_locs = placement - self.values = values - - if (self._validate_ndim and self.ndim and - len(self.mgr_locs) != len(self.values)): - raise ValueError( - 'Wrong number of items passed {val}, placement implies ' - '{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs))) - - def _check_ndim(self, values, ndim): - """ndim inference and validation. - - Infers ndim from 'values' if not provided to __init__. - Validates that values.ndim and ndim are consistent if and only if - the class variable '_validate_ndim' is True. - - Parameters - ---------- - values : array-like - ndim : int or None - - Returns - ------- - ndim : int - - Raises - ------ - ValueError : the number of dimensions do not match - """ - if ndim is None: - ndim = values.ndim - - if self._validate_ndim and values.ndim != ndim: - msg = ("Wrong number of dimensions. values.ndim != ndim " - "[{} != {}]") - raise ValueError(msg.format(values.ndim, ndim)) - - return ndim - - @property - def _holder(self): - """The array-like that can hold the underlying values. - - None for 'Block', overridden by subclasses that don't - use an ndarray. - """ - return None - - @property - def _consolidate_key(self): - return (self._can_consolidate, self.dtype.name) - - @property - def _is_single_block(self): - return self.ndim == 1 - - @property - def is_view(self): - """ return a boolean if I am possibly a view """ - return self.values.base is not None - - @property - def is_datelike(self): - """ return True if I am a non-datelike """ - return self.is_datetime or self.is_timedelta - - def is_categorical_astype(self, dtype): - """ - validate that we have a astypeable to categorical, - returns a boolean if we are a categorical - """ - if dtype is Categorical or dtype is CategoricalDtype: - # this is a pd.Categorical, but is not - # a valid type for astypeing - raise TypeError("invalid type {0} for astype".format(dtype)) - - elif is_categorical_dtype(dtype): - return True - - return False - - def external_values(self, dtype=None): - """ return an outside world format, currently just the ndarray """ - return self.values - - def internal_values(self, dtype=None): - """ return an internal format, currently just the ndarray - this should be the pure internal API format - """ - return self.values - - def formatting_values(self): - """Return the internal values used by the DataFrame/SeriesFormatter""" - return self.internal_values() - - def get_values(self, dtype=None): - """ - return an internal format, currently just the ndarray - this is often overridden to handle to_dense like operations - """ - if is_object_dtype(dtype): - return self.values.astype(object) - return self.values - - def to_dense(self): - return self.values.view() - - @property - def _na_value(self): - return np.nan - - @property - def fill_value(self): - return np.nan - - @property - def mgr_locs(self): - return self._mgr_locs - - @mgr_locs.setter - def mgr_locs(self, new_mgr_locs): - if not isinstance(new_mgr_locs, libinternals.BlockPlacement): - new_mgr_locs = libinternals.BlockPlacement(new_mgr_locs) - - self._mgr_locs = new_mgr_locs - - @property - def array_dtype(self): - """ the dtype to return if I want to construct this block as an - array - """ - return self.dtype - - def make_block(self, values, placement=None, ndim=None): - """ - Create a new block, with type inference propagate any 
values that are - not specified - """ - if placement is None: - placement = self.mgr_locs - if ndim is None: - ndim = self.ndim - - return make_block(values, placement=placement, ndim=ndim) - - def make_block_scalar(self, values): - """ - Create a ScalarBlock - """ - return ScalarBlock(values) - - def make_block_same_class(self, values, placement=None, ndim=None, - dtype=None): - """ Wrap given values in a block of same type as self. """ - if dtype is not None: - # issue 19431 fastparquet is passing this - warnings.warn("dtype argument is deprecated, will be removed " - "in a future release.", DeprecationWarning) - if placement is None: - placement = self.mgr_locs - return make_block(values, placement=placement, ndim=ndim, - klass=self.__class__, dtype=dtype) - - def __unicode__(self): - - # don't want to print out all of the items here - name = pprint_thing(self.__class__.__name__) - if self._is_single_block: - - result = '{name}: {len} dtype: {dtype}'.format( - name=name, len=len(self), dtype=self.dtype) - - else: - - shape = ' x '.join(pprint_thing(s) for s in self.shape) - result = '{name}: {index}, {shape}, dtype: {dtype}'.format( - name=name, index=pprint_thing(self.mgr_locs.indexer), - shape=shape, dtype=self.dtype) - - return result - - def __len__(self): - return len(self.values) - - def __getstate__(self): - return self.mgr_locs.indexer, self.values - - def __setstate__(self, state): - self.mgr_locs = libinternals.BlockPlacement(state[0]) - self.values = state[1] - self.ndim = self.values.ndim - - def _slice(self, slicer): - """ return a slice of my values """ - return self.values[slicer] - - # TODO: remove after Panel is gone - def reshape_nd(self, labels, shape, ref_items, mgr=None): - """ - Parameters - ---------- - labels : list of new axis labels - shape : new shape - ref_items : new ref_items - - return a new block that is transformed to a nd block - """ - return _block2d_to_blocknd(values=self.get_values().T, - placement=self.mgr_locs, shape=shape, - labels=labels, ref_items=ref_items) - - def getitem_block(self, slicer, new_mgr_locs=None): - """ - Perform __getitem__-like, return result as block. - - As of now, only supports slices that preserve dimensionality. - """ - if new_mgr_locs is None: - if isinstance(slicer, tuple): - axis0_slicer = slicer[0] - else: - axis0_slicer = slicer - new_mgr_locs = self.mgr_locs[axis0_slicer] - - new_values = self._slice(slicer) - - if self._validate_ndim and new_values.ndim != self.ndim: - raise ValueError("Only same dim slicing is allowed") - - return self.make_block_same_class(new_values, new_mgr_locs) - - @property - def shape(self): - return self.values.shape - - @property - def dtype(self): - return self.values.dtype - - @property - def ftype(self): - return "{dtype}:{ftype}".format(dtype=self.dtype, ftype=self._ftype) - - def merge(self, other): - return _merge_blocks([self, other]) - - def concat_same_type(self, to_concat, placement=None): - """ - Concatenate list of single blocks of the same type. - """ - values = self._concatenator([blk.values for blk in to_concat], - axis=self.ndim - 1) - return self.make_block_same_class( - values, placement=placement or slice(0, len(values), 1)) - - def iget(self, i): - return self.values[i] - - def set(self, locs, values, check=False): - """ - Modify Block in-place with new item value - - Returns - ------- - None - """ - self.values[locs] = values - - def delete(self, loc): - """ - Delete given loc(-s) from block in-place. 
- """ - self.values = np.delete(self.values, loc, 0) - self.mgr_locs = self.mgr_locs.delete(loc) - - def apply(self, func, mgr=None, **kwargs): - """ apply the function to my values; return a block if we are not - one - """ - with np.errstate(all='ignore'): - result = func(self.values, **kwargs) - if not isinstance(result, Block): - result = self.make_block(values=_block_shape(result, - ndim=self.ndim)) - - return result - - def fillna(self, value, limit=None, inplace=False, downcast=None, - mgr=None): - """ fillna on the block with the value. If we fail, then convert to - ObjectBlock and try again - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - - if not self._can_hold_na: - if inplace: - return self - else: - return self.copy() - - mask = isna(self.values) - if limit is not None: - if not is_integer(limit): - raise ValueError('Limit must be an integer') - if limit < 1: - raise ValueError('Limit must be greater than 0') - if self.ndim > 2: - raise NotImplementedError("number of dimensions for 'fillna' " - "is currently limited to 2") - mask[mask.cumsum(self.ndim - 1) > limit] = False - - # fillna, but if we cannot coerce, then try again as an ObjectBlock - try: - values, _, _, _ = self._try_coerce_args(self.values, value) - blocks = self.putmask(mask, value, inplace=inplace) - blocks = [b.make_block(values=self._try_coerce_result(b.values)) - for b in blocks] - return self._maybe_downcast(blocks, downcast) - except (TypeError, ValueError): - - # we can't process the value, but nothing to do - if not mask.any(): - return self if inplace else self.copy() - - # operate column-by-column - def f(m, v, i): - block = self.coerce_to_target_dtype(value) - - # slice out our block - if i is not None: - block = block.getitem_block(slice(i, i + 1)) - return block.fillna(value, - limit=limit, - inplace=inplace, - downcast=None) - - return self.split_and_operate(mask, f, inplace) - - def split_and_operate(self, mask, f, inplace): - """ - split the block per-column, and apply the callable f - per-column, return a new block for each. Handle - masking which will not change a block unless needed. - - Parameters - ---------- - mask : 2-d boolean mask - f : callable accepting (1d-mask, 1d values, indexer) - inplace : boolean - - Returns - ------- - list of blocks - """ - - if mask is None: - mask = np.ones(self.shape, dtype=bool) - new_values = self.values - - def make_a_block(nv, ref_loc): - if isinstance(nv, Block): - block = nv - elif isinstance(nv, list): - block = nv[0] - else: - # Put back the dimension that was taken from it and make - # a block out of the result. 
- try: - nv = _block_shape(nv, ndim=self.ndim) - except (AttributeError, NotImplementedError): - pass - block = self.make_block(values=nv, - placement=ref_loc) - return block - - # ndim == 1 - if self.ndim == 1: - if mask.any(): - nv = f(mask, new_values, None) - else: - nv = new_values if inplace else new_values.copy() - block = make_a_block(nv, self.mgr_locs) - return [block] - - # ndim > 1 - new_blocks = [] - for i, ref_loc in enumerate(self.mgr_locs): - m = mask[i] - v = new_values[i] - - # need a new block - if m.any(): - nv = f(m, v, i) - else: - nv = v if inplace else v.copy() - - block = make_a_block(nv, [ref_loc]) - new_blocks.append(block) - - return new_blocks - - def _maybe_downcast(self, blocks, downcast=None): - - # no need to downcast our float - # unless indicated - if downcast is None and self.is_float: - return blocks - elif downcast is None and (self.is_timedelta or self.is_datetime): - return blocks - - if not isinstance(blocks, list): - blocks = [blocks] - return _extend_blocks([b.downcast(downcast) for b in blocks]) - - def downcast(self, dtypes=None, mgr=None): - """ try to downcast each item to the dict of dtypes if present """ - - # turn it off completely - if dtypes is False: - return self - - values = self.values - - # single block handling - if self._is_single_block: - - # try to cast all non-floats here - if dtypes is None: - dtypes = 'infer' - - nv = maybe_downcast_to_dtype(values, dtypes) - return self.make_block(nv) - - # ndim > 1 - if dtypes is None: - return self - - if not (dtypes == 'infer' or isinstance(dtypes, dict)): - raise ValueError("downcast must have a dictionary or 'infer' as " - "its argument") - - # operate column-by-column - # this is expensive as it splits the blocks items-by-item - def f(m, v, i): - - if dtypes == 'infer': - dtype = 'infer' - else: - raise AssertionError("dtypes as dict is not supported yet") - - if dtype is not None: - v = maybe_downcast_to_dtype(v, dtype) - return v - - return self.split_and_operate(None, f, False) - - def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs): - return self._astype(dtype, copy=copy, errors=errors, values=values, - **kwargs) - - def _astype(self, dtype, copy=False, errors='raise', values=None, - klass=None, mgr=None, **kwargs): - """Coerce to the new type - - Parameters - ---------- - dtype : str, dtype convertible - copy : boolean, default False - copy if indicated - errors : str, {'raise', 'ignore'}, default 'ignore' - - ``raise`` : allow exceptions to be raised - - ``ignore`` : suppress exceptions. On error return original object - - Returns - ------- - Block - """ - errors_legal_values = ('raise', 'ignore') - - if errors not in errors_legal_values: - invalid_arg = ("Expected value of kwarg 'errors' to be one of {}. " - "Supplied value is '{}'".format( - list(errors_legal_values), errors)) - raise ValueError(invalid_arg) - - if (inspect.isclass(dtype) and - issubclass(dtype, (PandasExtensionDtype, ExtensionDtype))): - msg = ("Expected an instance of {}, but got the class instead. " - "Try instantiating 'dtype'.".format(dtype.__name__)) - raise TypeError(msg) - - # may need to convert to categorical - if self.is_categorical_astype(dtype): - - # deprecated 17636 - if ('categories' in kwargs or 'ordered' in kwargs): - if isinstance(dtype, CategoricalDtype): - raise TypeError( - "Cannot specify a CategoricalDtype and also " - "`categories` or `ordered`. 
Use " - "`dtype=CategoricalDtype(categories, ordered)`" - " instead.") - warnings.warn("specifying 'categories' or 'ordered' in " - ".astype() is deprecated; pass a " - "CategoricalDtype instead", - FutureWarning, stacklevel=7) - - categories = kwargs.get('categories', None) - ordered = kwargs.get('ordered', None) - if com._any_not_none(categories, ordered): - dtype = CategoricalDtype(categories, ordered) - - if is_categorical_dtype(self.values): - # GH 10696/18593: update an existing categorical efficiently - return self.make_block(self.values.astype(dtype, copy=copy)) - - return self.make_block(Categorical(self.values, dtype=dtype)) - - # astype processing - if not is_extension_array_dtype(dtype): - dtype = np.dtype(dtype) - if is_dtype_equal(self.dtype, dtype): - if copy: - return self.copy() - return self - - if klass is None: - if dtype == np.object_: - klass = ObjectBlock - try: - # force the copy here - if values is None: - - if issubclass(dtype.type, - (compat.text_type, compat.string_types)): - - # use native type formatting for datetime/tz/timedelta - if self.is_datelike: - values = self.to_native_types() - - # astype formatting - else: - values = self.get_values() - - else: - values = self.get_values(dtype=dtype) - - # _astype_nansafe works fine with 1-d only - values = astype_nansafe(values.ravel(), dtype, copy=True) - - # TODO(extension) - # should we make this attribute? - try: - values = values.reshape(self.shape) - except AttributeError: - pass - - newb = make_block(values, placement=self.mgr_locs, - klass=klass) - except: - if errors == 'raise': - raise - newb = self.copy() if copy else self - - if newb.is_numeric and self.is_numeric: - if newb.shape != self.shape: - raise TypeError( - "cannot set astype for copy = [{copy}] for dtype " - "({dtype} [{itemsize}]) with smaller itemsize than " - "current ({newb_dtype} [{newb_size}])".format( - copy=copy, dtype=self.dtype.name, - itemsize=self.itemsize, newb_dtype=newb.dtype.name, - newb_size=newb.itemsize)) - return newb - - def convert(self, copy=True, **kwargs): - """ attempt to coerce any object types to better types return a copy - of the block (if copy = True) by definition we are not an ObjectBlock - here! 
- """ - - return self.copy() if copy else self - - def _can_hold_element(self, element): - """ require the same dtype as ourselves """ - dtype = self.values.dtype.type - tipo = maybe_infer_dtype_type(element) - if tipo is not None: - return issubclass(tipo.type, dtype) - return isinstance(element, dtype) - - def _try_cast_result(self, result, dtype=None): - """ try to cast the result to our original type, we may have - roundtripped thru object in the mean-time - """ - if dtype is None: - dtype = self.dtype - - if self.is_integer or self.is_bool or self.is_datetime: - pass - elif self.is_float and result.dtype == self.dtype: - - # protect against a bool/object showing up here - if isinstance(dtype, compat.string_types) and dtype == 'infer': - return result - if not isinstance(dtype, type): - dtype = dtype.type - if issubclass(dtype, (np.bool_, np.object_)): - if issubclass(dtype, np.bool_): - if isna(result).all(): - return result.astype(np.bool_) - else: - result = result.astype(np.object_) - result[result == 1] = True - result[result == 0] = False - return result - else: - return result.astype(np.object_) - - return result - - # may need to change the dtype here - return maybe_downcast_to_dtype(result, dtype) - - def _try_coerce_args(self, values, other): - """ provide coercion to our input arguments """ - - if np.any(notna(other)) and not self._can_hold_element(other): - # coercion issues - # let higher levels handle - raise TypeError("cannot convert {} to an {}".format( - type(other).__name__, - type(self).__name__.lower().replace('Block', ''))) - - return values, False, other, False - - def _try_coerce_result(self, result): - """ reverse of try_coerce_args """ - return result - - def _try_coerce_and_cast_result(self, result, dtype=None): - result = self._try_coerce_result(result) - result = self._try_cast_result(result, dtype=dtype) - return result - - def to_native_types(self, slicer=None, na_rep='nan', quoting=None, - **kwargs): - """ convert to our native types format, slicing if desired """ - - values = self.get_values() - - if slicer is not None: - values = values[:, slicer] - mask = isna(values) - - if not self.is_object and not quoting: - values = values.astype(str) - else: - values = np.array(values, dtype='object') - - values[mask] = na_rep - return values - - # block actions #### - def copy(self, deep=True, mgr=None): - """ copy constructor """ - values = self.values - if deep: - values = values.copy() - return self.make_block_same_class(values) - - def replace(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True, mgr=None): - """ replace the to_replace value with value, possible to create new - blocks here this is just a call to putmask. regex is not used here. - It is used in ObjectBlocks. It is here for API - compatibility. 
- """ - - inplace = validate_bool_kwarg(inplace, 'inplace') - original_to_replace = to_replace - - # try to replace, if we raise an error, convert to ObjectBlock and - # retry - try: - values, _, to_replace, _ = self._try_coerce_args(self.values, - to_replace) - mask = missing.mask_missing(values, to_replace) - if filter is not None: - filtered_out = ~self.mgr_locs.isin(filter) - mask[filtered_out.nonzero()[0]] = False - - blocks = self.putmask(mask, value, inplace=inplace) - if convert: - blocks = [b.convert(by_item=True, numeric=False, - copy=not inplace) for b in blocks] - return blocks - except (TypeError, ValueError): - - # try again with a compatible block - block = self.astype(object) - return block.replace( - to_replace=original_to_replace, value=value, inplace=inplace, - filter=filter, regex=regex, convert=convert) - - def _replace_single(self, *args, **kwargs): - """ no-op on a non-ObjectBlock """ - return self if kwargs['inplace'] else self.copy() - - def setitem(self, indexer, value, mgr=None): - """Set the value inplace, returning a a maybe different typed block. - - Parameters - ---------- - indexer : tuple, list-like, array-like, slice - The subset of self.values to set - value : object - The value being set - mgr : BlockPlacement, optional - - Returns - ------- - Block - - Notes - ----- - `indexer` is a direct slice/positional indexer. `value` must - be a compatible shape. - """ - # coerce None values, if appropriate - if value is None: - if self.is_numeric: - value = np.nan - - # coerce if block dtype can store value - values = self.values - try: - values, _, value, _ = self._try_coerce_args(values, value) - # can keep its own dtype - if hasattr(value, 'dtype') and is_dtype_equal(values.dtype, - value.dtype): - dtype = self.dtype - else: - dtype = 'infer' - - except (TypeError, ValueError): - # current dtype cannot store value, coerce to common dtype - find_dtype = False - - if hasattr(value, 'dtype'): - dtype = value.dtype - find_dtype = True - - elif is_scalar(value): - if isna(value): - # NaN promotion is handled in latter path - dtype = False - else: - dtype, _ = infer_dtype_from_scalar(value, - pandas_dtype=True) - find_dtype = True - else: - dtype = 'infer' - - if find_dtype: - dtype = find_common_type([values.dtype, dtype]) - if not is_dtype_equal(self.dtype, dtype): - b = self.astype(dtype) - return b.setitem(indexer, value, mgr=mgr) - - # value must be storeable at this moment - arr_value = np.array(value) - - # cast the values to a type that can hold nan (if necessary) - if not self._can_hold_element(value): - dtype, _ = maybe_promote(arr_value.dtype) - values = values.astype(dtype) - - transf = (lambda x: x.T) if self.ndim == 2 else (lambda x: x) - values = transf(values) - - # length checking - check_setitem_lengths(indexer, value, values) - - def _is_scalar_indexer(indexer): - # return True if we are all scalar indexers - - if arr_value.ndim == 1: - if not isinstance(indexer, tuple): - indexer = tuple([indexer]) - return any(isinstance(idx, np.ndarray) and len(idx) == 0 - for idx in indexer) - return False - - def _is_empty_indexer(indexer): - # return a boolean if we have an empty indexer - - if is_list_like(indexer) and not len(indexer): - return True - if arr_value.ndim == 1: - if not isinstance(indexer, tuple): - indexer = tuple([indexer]) - return any(isinstance(idx, np.ndarray) and len(idx) == 0 - for idx in indexer) - return False - - # empty indexers - # 8669 (empty) - if _is_empty_indexer(indexer): - pass - - # setting a single element for each dim 
and with a rhs that could - # be say a list - # GH 6043 - elif _is_scalar_indexer(indexer): - values[indexer] = value - - # if we are an exact match (ex-broadcasting), - # then use the resultant dtype - elif (len(arr_value.shape) and - arr_value.shape[0] == values.shape[0] and - np.prod(arr_value.shape) == np.prod(values.shape)): - values[indexer] = value - try: - values = values.astype(arr_value.dtype) - except ValueError: - pass - - # set - else: - values[indexer] = value - - # coerce and try to infer the dtypes of the result - values = self._try_coerce_and_cast_result(values, dtype) - block = self.make_block(transf(values)) - return block - - def putmask(self, mask, new, align=True, inplace=False, axis=0, - transpose=False, mgr=None): - """ putmask the data to the block; it is possible that we may create a - new dtype of block - - return the resulting block(s) - - Parameters - ---------- - mask : the condition to respect - new : a ndarray/object - align : boolean, perform alignment on other/cond, default is True - inplace : perform inplace modification, default is False - axis : int - transpose : boolean - Set to True if self is stored with axes reversed - - Returns - ------- - a list of new blocks, the result of the putmask - """ - - new_values = self.values if inplace else self.values.copy() - - new = getattr(new, 'values', new) - mask = getattr(mask, 'values', mask) - - # if we are passed a scalar None, convert it here - if not is_list_like(new) and isna(new) and not self.is_object: - new = self.fill_value - - if self._can_hold_element(new): - _, _, new, _ = self._try_coerce_args(new_values, new) - - if transpose: - new_values = new_values.T - - # If the default repeat behavior in np.putmask would go in the - # wrong direction, then explicitly repeat and reshape new instead - if getattr(new, 'ndim', 0) >= 1: - if self.ndim - 1 == new.ndim and axis == 1: - new = np.repeat( - new, new_values.shape[-1]).reshape(self.shape) - new = new.astype(new_values.dtype) - - # we require exact matches between the len of the - # values we are setting (or is compat). np.putmask - # doesn't check this and will simply truncate / pad - # the output, but we want sane error messages - # - # TODO: this prob needs some better checking - # for 2D cases - if ((is_list_like(new) and - np.any(mask[mask]) and - getattr(new, 'ndim', 1) == 1)): - - if not (mask.shape[-1] == len(new) or - mask[mask].shape[-1] == len(new) or - len(new) == 1): - raise ValueError("cannot assign mismatch " - "length to masked array") - - np.putmask(new_values, mask, new) - - # maybe upcast me - elif mask.any(): - if transpose: - mask = mask.T - if isinstance(new, np.ndarray): - new = new.T - axis = new_values.ndim - axis - 1 - - # Pseudo-broadcast - if getattr(new, 'ndim', 0) >= 1: - if self.ndim - 1 == new.ndim: - new_shape = list(new.shape) - new_shape.insert(axis, 1) - new = new.reshape(tuple(new_shape)) - - # operate column-by-column - def f(m, v, i): - - if i is None: - # ndim==1 case. 
- n = new - else: - - if isinstance(new, np.ndarray): - n = np.squeeze(new[i % new.shape[0]]) - else: - n = np.array(new) - - # type of the new block - dtype, _ = maybe_promote(n.dtype) - - # we need to explicitly astype here to make a copy - n = n.astype(dtype) - - nv = _putmask_smart(v, m, n) - return nv - - new_blocks = self.split_and_operate(mask, f, inplace) - return new_blocks - - if inplace: - return [self] - - if transpose: - new_values = new_values.T - - return [self.make_block(new_values)] - - def coerce_to_target_dtype(self, other): - """ - coerce the current block to a dtype compat for other - we will return a block, possibly object, and not raise - - we can also safely try to coerce to the same dtype - and will receive the same block - """ - - # if we cannot then coerce to object - dtype, _ = infer_dtype_from(other, pandas_dtype=True) - - if is_dtype_equal(self.dtype, dtype): - return self - - if self.is_bool or is_object_dtype(dtype) or is_bool_dtype(dtype): - # we don't upcast to bool - return self.astype(object) - - elif ((self.is_float or self.is_complex) and - (is_integer_dtype(dtype) or is_float_dtype(dtype))): - # don't coerce float/complex to int - return self - - elif (self.is_datetime or - is_datetime64_dtype(dtype) or - is_datetime64tz_dtype(dtype)): - - # not a datetime - if not ((is_datetime64_dtype(dtype) or - is_datetime64tz_dtype(dtype)) and self.is_datetime): - return self.astype(object) - - # don't upcast timezone with different timezone or no timezone - mytz = getattr(self.dtype, 'tz', None) - othertz = getattr(dtype, 'tz', None) - - if str(mytz) != str(othertz): - return self.astype(object) - - raise AssertionError("possible recursion in " - "coerce_to_target_dtype: {} {}".format( - self, other)) - - elif (self.is_timedelta or is_timedelta64_dtype(dtype)): - - # not a timedelta - if not (is_timedelta64_dtype(dtype) and self.is_timedelta): - return self.astype(object) - - raise AssertionError("possible recursion in " - "coerce_to_target_dtype: {} {}".format( - self, other)) - - try: - return self.astype(dtype) - except (ValueError, TypeError): - pass - - return self.astype(object) - - def interpolate(self, method='pad', axis=0, index=None, values=None, - inplace=False, limit=None, limit_direction='forward', - limit_area=None, fill_value=None, coerce=False, - downcast=None, mgr=None, **kwargs): - - inplace = validate_bool_kwarg(inplace, 'inplace') - - def check_int_bool(self, inplace): - # Only FloatBlocks will contain NaNs. 
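            # For example (an illustrative sketch only, using the public
            # Series API): an int64 Series cannot hold NaN, so there is
            # nothing to interpolate and the data comes back unchanged:
            #
            #   >>> import pandas as pd
            #   >>> pd.Series([1, 2, 3]).interpolate()  # still int64, no-op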
- # timedelta subclasses IntBlock - if (self.is_bool or self.is_integer) and not self.is_timedelta: - if inplace: - return self - else: - return self.copy() - - # a fill na type method - try: - m = missing.clean_fill_method(method) - except: - m = None - - if m is not None: - r = check_int_bool(self, inplace) - if r is not None: - return r - return self._interpolate_with_fill(method=m, axis=axis, - inplace=inplace, limit=limit, - fill_value=fill_value, - coerce=coerce, - downcast=downcast, mgr=mgr) - # try an interp method - try: - m = missing.clean_interp_method(method, **kwargs) - except: - m = None - - if m is not None: - r = check_int_bool(self, inplace) - if r is not None: - return r - return self._interpolate(method=m, index=index, values=values, - axis=axis, limit=limit, - limit_direction=limit_direction, - limit_area=limit_area, - fill_value=fill_value, inplace=inplace, - downcast=downcast, mgr=mgr, **kwargs) - - raise ValueError("invalid method '{0}' to interpolate.".format(method)) - - def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, - limit=None, fill_value=None, coerce=False, - downcast=None, mgr=None): - """ fillna but using the interpolate machinery """ - - inplace = validate_bool_kwarg(inplace, 'inplace') - - # if we are coercing, then don't force the conversion - # if the block can't hold the type - if coerce: - if not self._can_hold_na: - if inplace: - return [self] - else: - return [self.copy()] - - values = self.values if inplace else self.values.copy() - values, _, fill_value, _ = self._try_coerce_args(values, fill_value) - values = missing.interpolate_2d(values, method=method, axis=axis, - limit=limit, fill_value=fill_value, - dtype=self.dtype) - values = self._try_coerce_result(values) - - blocks = [self.make_block_same_class(values, ndim=self.ndim)] - return self._maybe_downcast(blocks, downcast) - - def _interpolate(self, method=None, index=None, values=None, - fill_value=None, axis=0, limit=None, - limit_direction='forward', limit_area=None, - inplace=False, downcast=None, mgr=None, **kwargs): - """ interpolate using scipy wrappers """ - - inplace = validate_bool_kwarg(inplace, 'inplace') - data = self.values if inplace else self.values.copy() - - # only deal with floats - if not self.is_float: - if not self.is_integer: - return self - data = data.astype(np.float64) - - if fill_value is None: - fill_value = self.fill_value - - if method in ('krogh', 'piecewise_polynomial', 'pchip'): - if not index.is_monotonic: - raise ValueError("{0} interpolation requires that the " - "index be monotonic.".format(method)) - # process 1-d slices in the axis direction - - def func(x): - - # process a 1-d slice, returning it - # should the axis argument be handled below in apply_along_axis? - # i.e. 
not an arg to missing.interpolate_1d
-            return missing.interpolate_1d(index, x, method=method, limit=limit,
-                                          limit_direction=limit_direction,
-                                          limit_area=limit_area,
-                                          fill_value=fill_value,
-                                          bounds_error=False, **kwargs)
-
-        # interp each column independently
-        interp_values = np.apply_along_axis(func, axis, data)
-
-        blocks = [self.make_block_same_class(interp_values)]
-        return self._maybe_downcast(blocks, downcast)
-
-    def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None):
-        """
-        Take values according to indexer and return them as a block.
-
-        """
-
-        # algos.take_nd dispatches for DatetimeTZBlock, CategoricalBlock
-        # so need to preserve types
-        # sparse is treated like an ndarray, but needs .get_values() shaping
-
-        values = self.values
-        if self.is_sparse:
-            values = self.get_values()
-
-        if fill_tuple is None:
-            fill_value = self.fill_value
-            new_values = algos.take_nd(values, indexer, axis=axis,
-                                       allow_fill=False)
-        else:
-            fill_value = fill_tuple[0]
-            new_values = algos.take_nd(values, indexer, axis=axis,
-                                       allow_fill=True, fill_value=fill_value)
-
-        if new_mgr_locs is None:
-            if axis == 0:
-                slc = libinternals.indexer_as_slice(indexer)
-                if slc is not None:
-                    new_mgr_locs = self.mgr_locs[slc]
-                else:
-                    new_mgr_locs = self.mgr_locs[indexer]
-            else:
-                new_mgr_locs = self.mgr_locs
-
-        if not is_dtype_equal(new_values.dtype, self.dtype):
-            return self.make_block(new_values, new_mgr_locs)
-        else:
-            return self.make_block_same_class(new_values, new_mgr_locs)
-
-    def diff(self, n, axis=1, mgr=None):
-        """ return block for the diff of the values """
-        new_values = algos.diff(self.values, n, axis=axis)
-        return [self.make_block(values=new_values)]
-
-    def shift(self, periods, axis=0, mgr=None):
-        """ shift the block by periods, possibly upcast """
-
-        # convert integer to float if necessary. need to do a lot more than
-        # that, handle boolean etc also
-        new_values, fill_value = maybe_upcast(self.values)
-
-        # make sure array sent to np.roll is c_contiguous
-        f_ordered = new_values.flags.f_contiguous
-        if f_ordered:
-            new_values = new_values.T
-            axis = new_values.ndim - axis - 1
-
-        if np.prod(new_values.shape):
-            new_values = np.roll(new_values, _ensure_platform_int(periods),
-                                 axis=axis)
-
-        axis_indexer = [slice(None)] * self.ndim
-        if periods > 0:
-            axis_indexer[axis] = slice(None, periods)
-        else:
-            axis_indexer[axis] = slice(periods, None)
-        new_values[tuple(axis_indexer)] = fill_value
-
-        # restore original order
-        if f_ordered:
-            new_values = new_values.T
-
-        return [self.make_block(new_values)]
-
-    def eval(self, func, other, errors='raise', try_cast=False, mgr=None):
-        """
-        evaluate the block; return result block from the result
-
-        Parameters
-        ----------
-        func : how to combine self, other
-        other : a ndarray/object
-        errors : str, {'raise', 'ignore'}, default 'raise'
-            - ``raise`` : allow exceptions to be raised
-            - ``ignore`` : suppress exceptions. On error return original object
-
-        try_cast : try casting the results to the input type
-
-        Returns
-        -------
-        a new block, the result of the func
-        """
-        orig_other = other
-        values = self.values
-
-        other = getattr(other, 'values', other)
-
-        # make sure that we can broadcast
-        is_transposed = False
-        if hasattr(other, 'ndim') and hasattr(values, 'ndim'):
-            if values.ndim != other.ndim:
-                is_transposed = True
-            else:
-                if values.shape == other.shape[::-1]:
-                    is_transposed = True
-                elif values.shape[0] == other.shape[-1]:
-                    is_transposed = True
-                else:
-                    # this is a broadcast error here
-                    raise ValueError(
-                        "cannot broadcast shape [{t_shape}] with "
-                        "block values [{oth_shape}]".format(
-                            t_shape=values.T.shape, oth_shape=other.shape))
-
-        transf = (lambda x: x.T) if is_transposed else (lambda x: x)
-
-        # coerce/transpose the args if needed
-        try:
-            values, values_mask, other, other_mask = self._try_coerce_args(
-                transf(values), other)
-        except TypeError:
-            block = self.coerce_to_target_dtype(orig_other)
-            return block.eval(func, orig_other,
-                              errors=errors,
-                              try_cast=try_cast, mgr=mgr)
-
-        # get the result, may need to transpose the other
-        def get_result(other):
-
-            # avoid numpy warning of comparisons against None
-            if other is None:
-                result = not func.__name__ == 'eq'
-
-            # avoid numpy warning of elementwise comparisons to object
-            elif is_numeric_v_string_like(values, other):
-                result = False
-
-            # avoid numpy warning of elementwise comparisons
-            elif func.__name__ == 'eq':
-                if is_list_like(other) and not isinstance(other, np.ndarray):
-                    other = np.asarray(other)
-
-                    # if we can broadcast, then ok
-                    if values.shape[-1] != other.shape[-1]:
-                        return False
-                result = func(values, other)
-            else:
-                result = func(values, other)
-
-            # mask if needed
-            if isinstance(values_mask, np.ndarray) and values_mask.any():
-                result = result.astype('float64', copy=False)
-                result[values_mask] = np.nan
-            if other_mask is True:
-                result = result.astype('float64', copy=False)
-                result[:] = np.nan
-            elif isinstance(other_mask, np.ndarray) and other_mask.any():
-                result = result.astype('float64', copy=False)
-                result[other_mask.ravel()] = np.nan
-
-            return result
-
-        # error handler if we have an issue operating with the function
-        def handle_error():
-
-            if errors == 'raise':
-                # The 'detail' variable is defined in outer scope.
-                raise TypeError(
-                    'Could not operate {other!r} with block values '
-                    '{detail!s}'.format(other=other, detail=detail))  # noqa
-            else:
-                # return the values
-                result = np.empty(values.shape, dtype='O')
-                result.fill(np.nan)
-                return result
-
-        # get the result
-        try:
-            with np.errstate(all='ignore'):
-                result = get_result(other)
-
-        # if we have an invalid shape/broadcast error
-        # GH4576, so raise instead of allowing to pass through
-        except ValueError as detail:
-            raise
-        except Exception as detail:
-            result = handle_error()
-
-        # technically a broadcast error in numpy can 'work' by returning a
-        # boolean False
-        if not isinstance(result, np.ndarray):
-
-            # differentiate between an invalid ndarray-ndarray comparison
-            # and an invalid type comparison
-            if isinstance(values, np.ndarray) and is_list_like(other):
-                raise ValueError(
-                    'Invalid broadcasting comparison [{other!r}] with '
-                    'block values'.format(other=other))
-
-            raise TypeError('Could not compare [{other!r}] '
-                            'with block values'.format(other=other))
-
-        # transpose if needed
-        result = transf(result)
-
-        # try to cast if requested
-        if try_cast:
-            result = self._try_cast_result(result)
-
-        result = _block_shape(result, ndim=self.ndim)
-        return [self.make_block(result)]
-
-    def where(self, other, cond, align=True, errors='raise',
-              try_cast=False, axis=0, transpose=False, mgr=None):
-        """
-        evaluate the block; return result block(s) from the result
-
-        Parameters
-        ----------
-        other : a ndarray/object
-        cond : the condition to respect
-        align : boolean, perform alignment on other/cond
-        errors : str, {'raise', 'ignore'}, default 'raise'
-            - ``raise`` : allow exceptions to be raised
-            - ``ignore`` : suppress exceptions. On error return original object
-
-        axis : int
-        transpose : boolean
-            Set to True if self is stored with axes reversed
-
-        Returns
-        -------
-        a new block(s), the result of the func
-        """
-        import pandas.core.computation.expressions as expressions
-        assert errors in ['raise', 'ignore']
-
-        values = self.values
-        orig_other = other
-        if transpose:
-            values = values.T
-
-        other = getattr(other, '_values', getattr(other, 'values', other))
-        cond = getattr(cond, 'values', cond)
-
-        # If the default broadcasting would go in the wrong direction, then
-        # explicitly reshape other instead
-        if getattr(other, 'ndim', 0) >= 1:
-            if values.ndim - 1 == other.ndim and axis == 1:
-                other = other.reshape(tuple(other.shape + (1, )))
-            elif transpose and values.ndim == self.ndim - 1:
-                cond = cond.T
-
-        if not hasattr(cond, 'shape'):
-            raise ValueError("where must have a condition that is ndarray "
-                             "like")
-
-        # our where function
-        def func(cond, values, other):
-            if cond.ravel().all():
-                return values
-
-            values, values_mask, other, other_mask = self._try_coerce_args(
-                values, other)
-
-            try:
-                return self._try_coerce_result(expressions.where(
-                    cond, values, other))
-            except Exception as detail:
-                if errors == 'raise':
-                    raise TypeError(
-                        'Could not operate [{other!r}] with block values '
-                        '[{detail!s}]'.format(other=other, detail=detail))
-                else:
-                    # return the values
-                    result = np.empty(values.shape, dtype='float64')
-                    result.fill(np.nan)
-                    return result
-
-        # see if we can operate on the entire block, or need item-by-item
-        # or if we are a single block (ndim == 1)
-        try:
-            result = func(cond, values, other)
-        except TypeError:
-
-            # we cannot coerce, return a compat dtype
-            # we are explicitly ignoring errors
-            block = self.coerce_to_target_dtype(other)
-            blocks = block.where(orig_other, cond, align=align,
-                                 errors=errors,
-                                 try_cast=try_cast, axis=axis,
-                                 transpose=transpose)
-            return self._maybe_downcast(blocks, 'infer')
-
-        if self._can_hold_na or self.ndim == 1:
-
-            if transpose:
-                result = result.T
-
-            # try to cast if requested
-            if try_cast:
-                result = self._try_cast_result(result)
-
-            return self.make_block(result)
-
-        # might need to separate out blocks
-        axis = cond.ndim - 1
-        cond = cond.swapaxes(axis, 0)
-        mask = np.array([cond[i].all() for i in range(cond.shape[0])],
-                        dtype=bool)
-
-        result_blocks = []
-        for m in [mask, ~mask]:
-            if m.any():
-                r = self._try_cast_result(result.take(m.nonzero()[0],
-                                                      axis=axis))
-                result_blocks.append(
-                    self.make_block(r.T, placement=self.mgr_locs[m]))
-
-        return result_blocks
-
-    def equals(self, other):
-        if self.dtype != other.dtype or self.shape != other.shape:
-            return False
-        return array_equivalent(self.values, other.values)
-
-    def _unstack(self, unstacker_func, new_columns):
-        """Return a list of unstacked blocks of self
-
-        Parameters
-        ----------
-        unstacker_func : callable
-            Partially applied unstacker.
-        new_columns : Index
-            All columns of the unstacked BlockManager.
-
-        Returns
-        -------
-        blocks : list of Block
-            New blocks of unstacked values.
-        mask : array_like of bool
-            The mask of columns of `blocks` we should keep.
-        """
-        unstacker = unstacker_func(self.values.T)
-        new_items = unstacker.get_new_columns()
-        new_placement = new_columns.get_indexer(new_items)
-        new_values, mask = unstacker.get_new_values()
-
-        mask = mask.any(0)
-        new_values = new_values.T[mask]
-        new_placement = new_placement[mask]
-
-        blocks = [make_block(new_values, placement=new_placement)]
-        return blocks, mask
-
-    def quantile(self, qs, interpolation='linear', axis=0, mgr=None):
-        """
-        compute the quantiles of the block
-
-        Parameters
-        ----------
-        qs: a scalar or list of the quantiles to be computed
-        interpolation: type of interpolation, default 'linear'
-        axis: axis to compute, default 0
-
-        Returns
-        -------
-        tuple of (axis, block)
-
-        """
-        kw = {'interpolation': interpolation}
-        values = self.get_values()
-        values, _, _, _ = self._try_coerce_args(values, values)
-
-        def _nanpercentile1D(values, mask, q, **kw):
-            values = values[~mask]
-
-            if len(values) == 0:
-                if is_scalar(q):
-                    return self._na_value
-                else:
-                    return np.array([self._na_value] * len(q),
-                                    dtype=values.dtype)
-
-            return np.percentile(values, q, **kw)
-
-        def _nanpercentile(values, q, axis, **kw):
-
-            mask = isna(self.values)
-            if not is_scalar(mask) and mask.any():
-                if self.ndim == 1:
-                    return _nanpercentile1D(values, mask, q, **kw)
-                else:
-                    # for nonconsolidatable blocks mask is 1D, but values 2D
-                    if mask.ndim < values.ndim:
-                        mask = mask.reshape(values.shape)
-                    if axis == 0:
-                        values = values.T
-                        mask = mask.T
-                    result = [_nanpercentile1D(val, m, q, **kw) for (val, m)
-                              in zip(list(values), list(mask))]
-                    result = np.array(result, dtype=values.dtype, copy=False).T
-                    return result
-            else:
-                return np.percentile(values, q, axis=axis, **kw)
-
-        from pandas import Float64Index
-        is_empty = values.shape[axis] == 0
-        if is_list_like(qs):
-            ax = Float64Index(qs)
-
-            if is_empty:
-                if self.ndim == 1:
-                    result = self._na_value
-                else:
-                    # create the array of na_values
-                    # 2d len(values) * len(qs)
-                    result = np.repeat(np.array([self._na_value] * len(qs)),
-                                       len(values)).reshape(len(values),
-                                                            len(qs))
-            else:
-
-                try:
-                    result = _nanpercentile(values, np.array(qs) * 100,
-                                            axis=axis, **kw)
-                except ValueError:
-
-                    # older numpies don't handle an array for q
-                    result = [_nanpercentile(values, q * 100,
-                                             axis=axis, **kw) for q in qs]
-
-                    result = np.array(result, copy=False)
-                    if self.ndim > 1:
-                        result = result.T
-
-        else:
-
-            if self.ndim == 1:
-                ax = Float64Index([qs])
-            else:
-                ax = mgr.axes[0]
-
-            if is_empty:
-                if self.ndim == 1:
-                    result = self._na_value
-                else:
-                    result = np.array([self._na_value] * len(self))
-            else:
-                result = _nanpercentile(values, qs * 100, axis=axis, **kw)
-
-        ndim = getattr(result, 'ndim', None) or 0
-        result = self._try_coerce_result(result)
-        if is_scalar(result):
-            return ax, self.make_block_scalar(result)
-        return ax, make_block(result,
-                              placement=np.arange(len(result)),
-                              ndim=ndim)
-
-
-class ScalarBlock(Block):
-    """
-    a scalar compat Block
-    """
-    __slots__ = ['_mgr_locs', 'values', 'ndim']
-
-    def __init__(self, values):
-        self.ndim = 0
-        self.mgr_locs = [0]
-        self.values = values
-
-    @property
-    def dtype(self):
-        return type(self.values)
-
-    @property
-    def shape(self):
-        return tuple([0])
-
-    def __len__(self):
-        return 0
-
-
-class NonConsolidatableMixIn(object):
-    """ hold methods for the nonconsolidatable blocks """
-    _can_consolidate = False
-    _verify_integrity = False
-    _validate_ndim = False
-
-    def __init__(self, values, placement, ndim=None):
-        """Initialize a non-consolidatable block.
-
-        'ndim' may be inferred from 'placement'.
-
-        This will continue to call __init__ for the other base
-        classes mixed in with this Mixin.
-        """
-        # Placement must be converted to BlockPlacement so that we can check
-        # its length
-        if not isinstance(placement, libinternals.BlockPlacement):
-            placement = libinternals.BlockPlacement(placement)
-
-        # Maybe infer ndim from placement
-        if ndim is None:
-            if len(placement) != 1:
-                ndim = 1
-            else:
-                ndim = 2
-        super(NonConsolidatableMixIn, self).__init__(values, placement,
-                                                     ndim=ndim)
-
-    @property
-    def shape(self):
-        if self.ndim == 1:
-            return (len(self.values),)
-        return (len(self.mgr_locs), len(self.values))
-
-    def get_values(self, dtype=None):
-        """ need to to_dense myself (and always return a ndim sized object) """
-        values = self.values.to_dense()
-        if values.ndim == self.ndim - 1:
-            values = values.reshape((1,) + values.shape)
-        return values
-
-    def iget(self, col):
-
-        if self.ndim == 2 and isinstance(col, tuple):
-            col, loc = col
-            if not com.is_null_slice(col) and col != 0:
-                raise IndexError("{0} only contains one item".format(self))
-            return self.values[loc]
-        else:
-            if col != 0:
-                raise IndexError("{0} only contains one item".format(self))
-            return self.values
-
-    def should_store(self, value):
-        return isinstance(value, self._holder)
-
-    def set(self, locs, values, check=False):
-        assert locs.tolist() == [0]
-        self.values = values
-
-    def putmask(self, mask, new, align=True, inplace=False, axis=0,
-                transpose=False, mgr=None):
-        """
-        putmask the data to the block; we must be a single block and not
-        generate other blocks
-
-        return the resulting block
-
-        Parameters
-        ----------
-        mask : the condition to respect
-        new : a ndarray/object
-        align : boolean, perform alignment on other/cond, default is True
-        inplace : perform inplace modification, default is False
-
-        Returns
-        -------
-        a new block, the result of the putmask
-        """
-        inplace = validate_bool_kwarg(inplace, 'inplace')
-
-        # use block's copy logic.
- # .values may be an Index which does shallow copy by default - new_values = self.values if inplace else self.copy().values - new_values, _, new, _ = self._try_coerce_args(new_values, new) - - if isinstance(new, np.ndarray) and len(new) == len(mask): - new = new[mask] - - mask = _safe_reshape(mask, new_values.shape) - - new_values[mask] = new - new_values = self._try_coerce_result(new_values) - return [self.make_block(values=new_values)] - - def _slice(self, slicer): - """ return a slice of my values (but densify first) """ - return self.get_values()[slicer] - - def _try_cast_result(self, result, dtype=None): - return result - - def _unstack(self, unstacker_func, new_columns): - """Return a list of unstacked blocks of self - - Parameters - ---------- - unstacker_func : callable - Partially applied unstacker. - new_columns : Index - All columns of the unstacked BlockManager. - - Returns - ------- - blocks : list of Block - New blocks of unstacked values. - mask : array_like of bool - The mask of columns of `blocks` we should keep. - """ - # NonConsolidatable blocks can have a single item only, so we return - # one block per item - unstacker = unstacker_func(self.values.T) - new_items = unstacker.get_new_columns() - new_placement = new_columns.get_indexer(new_items) - new_values, mask = unstacker.get_new_values() - - mask = mask.any(0) - new_values = new_values.T[mask] - new_placement = new_placement[mask] - - blocks = [self.make_block_same_class(vals, [place]) - for vals, place in zip(new_values, new_placement)] - return blocks, mask - - -class ExtensionBlock(NonConsolidatableMixIn, Block): - """Block for holding extension types. - - Notes - ----- - This holds all 3rd-party extension array types. It's also the immediate - parent class for our internal extension types' blocks, CategoricalBlock. - - ExtensionArrays are limited to 1-D. - """ - is_extension = True - - def __init__(self, values, placement, ndim=None): - values = self._maybe_coerce_values(values) - super(ExtensionBlock, self).__init__(values, placement, ndim) - - def _maybe_coerce_values(self, values): - """Unbox to an extension array. - - This will unbox an ExtensionArray stored in an Index or Series. - ExtensionArrays pass through. No dtype coercion is done. - - Parameters - ---------- - values : Index, Series, ExtensionArray - - Returns - ------- - ExtensionArray - """ - if isinstance(values, (gt.ABCIndexClass, gt.ABCSeries)): - values = values._values - return values - - @property - def _holder(self): - # For extension blocks, the holder is values-dependent. - return type(self.values) - - @property - def fill_value(self): - # Used in reindex_indexer - return self.values.dtype.na_value - - @property - def _can_hold_na(self): - # The default ExtensionArray._can_hold_na is True - return self._holder._can_hold_na - - @property - def is_view(self): - """Extension arrays are never treated as views.""" - return False - - def setitem(self, indexer, value, mgr=None): - """Set the value inplace, returning a same-typed block. - - This differs from Block.setitem by not allowing setitem to change - the dtype of the Block. - - Parameters - ---------- - indexer : tuple, list-like, array-like, slice - The subset of self.values to set - value : object - The value being set - mgr : BlockPlacement, optional - - Returns - ------- - Block - - Notes - ----- - `indexer` is a direct slice/positional indexer. `value` must - be a compatible shape. 
- """ - if isinstance(indexer, tuple): - # we are always 1-D - indexer = indexer[0] - - check_setitem_lengths(indexer, value, self.values) - self.values[indexer] = value - return self - - def get_values(self, dtype=None): - # ExtensionArrays must be iterable, so this works. - values = np.asarray(self.values) - if values.ndim == self.ndim - 1: - values = values.reshape((1,) + values.shape) - return values - - def to_dense(self): - return np.asarray(self.values) - - def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): - """ - Take values according to indexer and return them as a block. - """ - if fill_tuple is None: - fill_value = None - else: - fill_value = fill_tuple[0] - - # axis doesn't matter; we are really a single-dim object - # but are passed the axis depending on the calling routing - # if its REALLY axis 0, then this will be a reindex and not a take - new_values = self.values.take(indexer, fill_value=fill_value, - allow_fill=True) - - # if we are a 1-dim object, then always place at 0 - if self.ndim == 1: - new_mgr_locs = [0] - else: - if new_mgr_locs is None: - new_mgr_locs = self.mgr_locs - - return self.make_block_same_class(new_values, new_mgr_locs) - - def _can_hold_element(self, element): - # XXX: We may need to think about pushing this onto the array. - # We're doing the same as CategoricalBlock here. - return True - - def _slice(self, slicer): - """ return a slice of my values """ - - # slice the category - # return same dims as we currently have - - if isinstance(slicer, tuple) and len(slicer) == 2: - if not com.is_null_slice(slicer[0]): - raise AssertionError("invalid slicing for a 1-ndim " - "categorical") - slicer = slicer[1] - - return self.values[slicer] - - def formatting_values(self): - return self.values._formatting_values() - - def concat_same_type(self, to_concat, placement=None): - """ - Concatenate list of single blocks of the same type. 
- """ - values = self._holder._concat_same_type( - [blk.values for blk in to_concat]) - placement = placement or slice(0, len(values), 1) - return self.make_block_same_class(values, ndim=self.ndim, - placement=placement) - - def fillna(self, value, limit=None, inplace=False, downcast=None, - mgr=None): - values = self.values if inplace else self.values.copy() - values = values.fillna(value=value, limit=limit) - return [self.make_block_same_class(values=values, - placement=self.mgr_locs, - ndim=self.ndim)] - - def interpolate(self, method='pad', axis=0, inplace=False, limit=None, - fill_value=None, **kwargs): - - values = self.values if inplace else self.values.copy() - return self.make_block_same_class( - values=values.fillna(value=fill_value, method=method, - limit=limit), - placement=self.mgr_locs) - - -class NumericBlock(Block): - __slots__ = () - is_numeric = True - _can_hold_na = True - - -class FloatOrComplexBlock(NumericBlock): - __slots__ = () - - def equals(self, other): - if self.dtype != other.dtype or self.shape != other.shape: - return False - left, right = self.values, other.values - return ((left == right) | (np.isnan(left) & np.isnan(right))).all() - - -class FloatBlock(FloatOrComplexBlock): - __slots__ = () - is_float = True - - def _can_hold_element(self, element): - tipo = maybe_infer_dtype_type(element) - if tipo is not None: - return (issubclass(tipo.type, (np.floating, np.integer)) and - not issubclass(tipo.type, (np.datetime64, np.timedelta64))) - return ( - isinstance( - element, (float, int, np.floating, np.int_, compat.long)) - and not isinstance(element, (bool, np.bool_, datetime, timedelta, - np.datetime64, np.timedelta64))) - - def to_native_types(self, slicer=None, na_rep='', float_format=None, - decimal='.', quoting=None, **kwargs): - """ convert to our native types format, slicing if desired """ - - values = self.values - if slicer is not None: - values = values[:, slicer] - - # see gh-13418: no special formatting is desired at the - # output (important for appropriate 'quoting' behaviour), - # so do not pass it through the FloatArrayFormatter - if float_format is None and decimal == '.': - mask = isna(values) - - if not quoting: - values = values.astype(str) - else: - values = np.array(values, dtype='object') - - values[mask] = na_rep - return values - - from pandas.io.formats.format import FloatArrayFormatter - formatter = FloatArrayFormatter(values, na_rep=na_rep, - float_format=float_format, - decimal=decimal, quoting=quoting, - fixed_width=False) - return formatter.get_result_as_array() - - def should_store(self, value): - # when inserting a column should not coerce integers to floats - # unnecessarily - return (issubclass(value.dtype.type, np.floating) and - value.dtype == self.dtype) - - -class ComplexBlock(FloatOrComplexBlock): - __slots__ = () - is_complex = True - - def _can_hold_element(self, element): - tipo = maybe_infer_dtype_type(element) - if tipo is not None: - return issubclass(tipo.type, - (np.floating, np.integer, np.complexfloating)) - return ( - isinstance( - element, - (float, int, complex, np.float_, np.int_, compat.long)) - and not isinstance(element, (bool, np.bool_))) - - def should_store(self, value): - return issubclass(value.dtype.type, np.complexfloating) - - -class IntBlock(NumericBlock): - __slots__ = () - is_integer = True - _can_hold_na = False - - def _can_hold_element(self, element): - tipo = maybe_infer_dtype_type(element) - if tipo is not None: - return (issubclass(tipo.type, np.integer) and - not issubclass(tipo.type, 
(np.datetime64, - np.timedelta64)) and - self.dtype.itemsize >= tipo.itemsize) - return is_integer(element) - - def should_store(self, value): - return is_integer_dtype(value) and value.dtype == self.dtype - - -class BoolBlock(NumericBlock): - __slots__ = () - is_bool = True - _can_hold_na = False - - def _can_hold_element(self, element): - tipo = maybe_infer_dtype_type(element) - if tipo is not None: - return issubclass(tipo.type, np.bool_) - return isinstance(element, (bool, np.bool_)) - - def should_store(self, value): - return issubclass(value.dtype.type, np.bool_) - - def replace(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True, mgr=None): - inplace = validate_bool_kwarg(inplace, 'inplace') - to_replace_values = np.atleast_1d(to_replace) - if not np.can_cast(to_replace_values, bool): - return self - return super(BoolBlock, self).replace(to_replace, value, - inplace=inplace, filter=filter, - regex=regex, convert=convert, - mgr=mgr) - - -class ObjectBlock(Block): - __slots__ = () - is_object = True - _can_hold_na = True - - def __init__(self, values, placement=None, ndim=2): - if issubclass(values.dtype.type, compat.string_types): - values = np.array(values, dtype=object) - - super(ObjectBlock, self).__init__(values, ndim=ndim, - placement=placement) - - @property - def is_bool(self): - """ we can be a bool if we have only bool values but are of type - object - """ - return lib.is_bool_array(self.values.ravel()) - - # TODO: Refactor when convert_objects is removed since there will be 1 path - def convert(self, *args, **kwargs): - """ attempt to coerce any object types to better types return a copy of - the block (if copy = True) by definition we ARE an ObjectBlock!!!!! - - can return multiple blocks! - """ - - if args: - raise NotImplementedError - by_item = True if 'by_item' not in kwargs else kwargs['by_item'] - - new_inputs = ['coerce', 'datetime', 'numeric', 'timedelta'] - new_style = False - for kw in new_inputs: - new_style |= kw in kwargs - - if new_style: - fn = soft_convert_objects - fn_inputs = new_inputs - else: - fn = maybe_convert_objects - fn_inputs = ['convert_dates', 'convert_numeric', - 'convert_timedeltas'] - fn_inputs += ['copy'] - - fn_kwargs = {} - for key in fn_inputs: - if key in kwargs: - fn_kwargs[key] = kwargs[key] - - # operate column-by-column - def f(m, v, i): - shape = v.shape - values = fn(v.ravel(), **fn_kwargs) - try: - values = values.reshape(shape) - values = _block_shape(values, ndim=self.ndim) - except (AttributeError, NotImplementedError): - pass - - return values - - if by_item and not self._is_single_block: - blocks = self.split_and_operate(None, f, False) - else: - values = f(None, self.values.ravel(), None) - blocks = [make_block(values, ndim=self.ndim, - placement=self.mgr_locs)] - - return blocks - - def set(self, locs, values, check=False): - """ - Modify Block in-place with new item value - - Returns - ------- - None - """ - - # GH6026 - if check: - try: - if (self.values[locs] == values).all(): - return - except: - pass - try: - self.values[locs] = values - except (ValueError): - - # broadcasting error - # see GH6171 - new_shape = list(values.shape) - new_shape[0] = len(self.items) - self.values = np.empty(tuple(new_shape), dtype=self.dtype) - self.values.fill(np.nan) - self.values[locs] = values - - def _maybe_downcast(self, blocks, downcast=None): - - if downcast is not None: - return blocks - - # split and convert the blocks - return _extend_blocks([b.convert(datetime=True, numeric=False) - for b in 
blocks]) - - def _can_hold_element(self, element): - return True - - def _try_coerce_args(self, values, other): - """ provide coercion to our input arguments """ - - if isinstance(other, gt.ABCDatetimeIndex): - # to store DatetimeTZBlock as object - other = other.astype(object).values - - return values, False, other, False - - def should_store(self, value): - return not (issubclass(value.dtype.type, - (np.integer, np.floating, np.complexfloating, - np.datetime64, np.bool_)) or - # TODO(ExtensionArray): remove is_extension_type - # when all extension arrays have been ported. - is_extension_type(value) or - is_extension_array_dtype(value)) - - def replace(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True, mgr=None): - to_rep_is_list = is_list_like(to_replace) - value_is_list = is_list_like(value) - both_lists = to_rep_is_list and value_is_list - either_list = to_rep_is_list or value_is_list - - result_blocks = [] - blocks = [self] - - if not either_list and is_re(to_replace): - return self._replace_single(to_replace, value, inplace=inplace, - filter=filter, regex=True, - convert=convert, mgr=mgr) - elif not (either_list or regex): - return super(ObjectBlock, self).replace(to_replace, value, - inplace=inplace, - filter=filter, regex=regex, - convert=convert, mgr=mgr) - elif both_lists: - for to_rep, v in zip(to_replace, value): - result_blocks = [] - for b in blocks: - result = b._replace_single(to_rep, v, inplace=inplace, - filter=filter, regex=regex, - convert=convert, mgr=mgr) - result_blocks = _extend_blocks(result, result_blocks) - blocks = result_blocks - return result_blocks - - elif to_rep_is_list and regex: - for to_rep in to_replace: - result_blocks = [] - for b in blocks: - result = b._replace_single(to_rep, value, inplace=inplace, - filter=filter, regex=regex, - convert=convert, mgr=mgr) - result_blocks = _extend_blocks(result, result_blocks) - blocks = result_blocks - return result_blocks - - return self._replace_single(to_replace, value, inplace=inplace, - filter=filter, convert=convert, - regex=regex, mgr=mgr) - - def _replace_single(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True, mgr=None): - - inplace = validate_bool_kwarg(inplace, 'inplace') - - # to_replace is regex compilable - to_rep_re = regex and is_re_compilable(to_replace) - - # regex is regex compilable - regex_re = is_re_compilable(regex) - - # only one will survive - if to_rep_re and regex_re: - raise AssertionError('only one of to_replace and regex can be ' - 'regex compilable') - - # if regex was passed as something that can be a regex (rather than a - # boolean) - if regex_re: - to_replace = regex - - regex = regex_re or to_rep_re - - # try to get the pattern attribute (compiled re) or it's a string - try: - pattern = to_replace.pattern - except AttributeError: - pattern = to_replace - - # if the pattern is not empty and to_replace is either a string or a - # regex - if regex and pattern: - rx = re.compile(to_replace) - else: - # if the thing to replace is not a string or compiled regex call - # the superclass method -> to_replace is some kind of object - return super(ObjectBlock, self).replace(to_replace, value, - inplace=inplace, - filter=filter, regex=regex, - mgr=mgr) - - new_values = self.values if inplace else self.values.copy() - - # deal with replacing values with objects (strings) that match but - # whose replacement is not a string (numeric, nan, object) - if isna(value) or not isinstance(value, compat.string_types): - - def 
re_replacer(s): - try: - return value if rx.search(s) is not None else s - except TypeError: - return s - else: - # value is guaranteed to be a string here, s can be either a string - # or null if it's null it gets returned - def re_replacer(s): - try: - return rx.sub(value, s) - except TypeError: - return s - - f = np.vectorize(re_replacer, otypes=[self.dtype]) - - if filter is None: - filt = slice(None) - else: - filt = self.mgr_locs.isin(filter).nonzero()[0] - - new_values[filt] = f(new_values[filt]) - - # convert - block = self.make_block(new_values) - if convert: - block = block.convert(by_item=True, numeric=False) - - return block - - -class CategoricalBlock(ExtensionBlock): - __slots__ = () - is_categorical = True - _verify_integrity = True - _can_hold_na = True - _concatenator = staticmethod(_concat._concat_categorical) - - def __init__(self, values, placement, ndim=None): - from pandas.core.arrays.categorical import _maybe_to_categorical - - # coerce to categorical if we can - super(CategoricalBlock, self).__init__(_maybe_to_categorical(values), - placement=placement, - ndim=ndim) - - @property - def _holder(self): - return Categorical - - @property - def array_dtype(self): - """ the dtype to return if I want to construct this block as an - array - """ - return np.object_ - - def _try_coerce_result(self, result): - """ reverse of try_coerce_args """ - - # GH12564: CategoricalBlock is 1-dim only - # while returned results could be any dim - if ((not is_categorical_dtype(result)) and - isinstance(result, np.ndarray)): - result = _block_shape(result, ndim=self.ndim) - - return result - - def shift(self, periods, axis=0, mgr=None): - return self.make_block_same_class(values=self.values.shift(periods), - placement=self.mgr_locs) - - def to_dense(self): - # Categorical.get_values returns a DatetimeIndex for datetime - # categories, so we can't simply use `np.asarray(self.values)` like - # other types. - return self.values.get_values() - - def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): - """ convert to our native types format, slicing if desired """ - - values = self.values - if slicer is not None: - # Categorical is always one dimension - values = values[slicer] - mask = isna(values) - values = np.array(values, dtype='object') - values[mask] = na_rep - - # we are expected to return a 2-d ndarray - return values.reshape(1, len(values)) - - def concat_same_type(self, to_concat, placement=None): - """ - Concatenate list of single blocks of the same type. - - Note that this CategoricalBlock._concat_same_type *may* not - return a CategoricalBlock. When the categories in `to_concat` - differ, this will return an object ndarray. - - If / when we decide we don't like that behavior: - - 1. Change Categorical._concat_same_type to use union_categoricals - 2. Delete this method. 
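
        Examples
        --------
        A minimal sketch of the object fallback described above, exercised
        through the public ``pd.concat`` (illustrative; the categories
        differ, so the result is object dtype):

        >>> import pandas as pd
        >>> a = pd.Series(pd.Categorical(['x']))
        >>> b = pd.Series(pd.Categorical(['y']))
        >>> pd.concat([a, b]).dtype  # categories differ -> object
        dtype('O')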
- """ - values = self._concatenator([blk.values for blk in to_concat], - axis=self.ndim - 1) - # not using self.make_block_same_class as values can be object dtype - return make_block( - values, placement=placement or slice(0, len(values), 1), - ndim=self.ndim) - - -class DatetimeLikeBlockMixin(object): - """Mixin class for DatetimeBlock and DatetimeTZBlock.""" - - @property - def _holder(self): - return DatetimeIndex - - @property - def _na_value(self): - return tslibs.NaT - - @property - def fill_value(self): - return tslibs.iNaT - - def get_values(self, dtype=None): - """ - return object dtype as boxed values, such as Timestamps/Timedelta - """ - if is_object_dtype(dtype): - return lib.map_infer(self.values.ravel(), - self._box_func).reshape(self.values.shape) - return self.values - - -class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): - __slots__ = () - is_timedelta = True - _can_hold_na = True - is_numeric = False - - def __init__(self, values, placement, ndim=None): - if values.dtype != _TD_DTYPE: - values = conversion.ensure_timedelta64ns(values) - - super(TimeDeltaBlock, self).__init__(values, - placement=placement, ndim=ndim) - - @property - def _holder(self): - return TimedeltaIndex - - @property - def _box_func(self): - return lambda x: Timedelta(x, unit='ns') - - def _can_hold_element(self, element): - tipo = maybe_infer_dtype_type(element) - if tipo is not None: - return issubclass(tipo.type, np.timedelta64) - return is_integer(element) or isinstance( - element, (timedelta, np.timedelta64)) - - def fillna(self, value, **kwargs): - - # allow filling with integers to be - # interpreted as seconds - if is_integer(value) and not isinstance(value, np.timedelta64): - value = Timedelta(value, unit='s') - return super(TimeDeltaBlock, self).fillna(value, **kwargs) - - def _try_coerce_args(self, values, other): - """ - Coerce values and other to int64, with null values converted to - iNaT. 
values is always ndarray-like, other may not be - - Parameters - ---------- - values : ndarray-like - other : ndarray-like or scalar - - Returns - ------- - base-type values, values mask, base-type other, other mask - """ - - values_mask = isna(values) - values = values.view('i8') - other_mask = False - - if isinstance(other, bool): - raise TypeError - elif is_null_datelike_scalar(other): - other = tslibs.iNaT - other_mask = True - elif isinstance(other, Timedelta): - other_mask = isna(other) - other = other.value - elif isinstance(other, timedelta): - other = Timedelta(other).value - elif isinstance(other, np.timedelta64): - other_mask = isna(other) - other = Timedelta(other).value - elif hasattr(other, 'dtype') and is_timedelta64_dtype(other): - other_mask = isna(other) - other = other.astype('i8', copy=False).view('i8') - else: - # coercion issues - # let higher levels handle - raise TypeError - - return values, values_mask, other, other_mask - - def _try_coerce_result(self, result): - """ reverse of try_coerce_args / try_operate """ - if isinstance(result, np.ndarray): - mask = isna(result) - if result.dtype.kind in ['i', 'f', 'O']: - result = result.astype('m8[ns]') - result[mask] = tslibs.iNaT - elif isinstance(result, (np.integer, np.float)): - result = self._box_func(result) - return result - - def should_store(self, value): - return issubclass(value.dtype.type, np.timedelta64) - - def to_native_types(self, slicer=None, na_rep=None, quoting=None, - **kwargs): - """ convert to our native types format, slicing if desired """ - - values = self.values - if slicer is not None: - values = values[:, slicer] - mask = isna(values) - - rvalues = np.empty(values.shape, dtype=object) - if na_rep is None: - na_rep = 'NaT' - rvalues[mask] = na_rep - imask = (~mask).ravel() - - # FIXME: - # should use the formats.format.Timedelta64Formatter here - # to figure what format to pass to the Timedelta - # e.g. to not show the decimals say - rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all') - for val in values.ravel()[imask]], - dtype=object) - return rvalues - - -class DatetimeBlock(DatetimeLikeBlockMixin, Block): - __slots__ = () - is_datetime = True - _can_hold_na = True - - def __init__(self, values, placement, ndim=None): - values = self._maybe_coerce_values(values) - super(DatetimeBlock, self).__init__(values, - placement=placement, ndim=ndim) - - def _maybe_coerce_values(self, values): - """Input validation for values passed to __init__. Ensure that - we have datetime64ns, coercing if necessary. - - Parameters - ---------- - values : array-like - Must be convertible to datetime64 - - Returns - ------- - values : ndarray[datetime64ns] - - Overridden by DatetimeTZBlock. 
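
        Examples
        --------
        A rough sketch of the normalization this relies on (via the
        ``conversion.ensure_datetime64ns`` helper imported above;
        illustrative only):

        >>> import numpy as np
        >>> arr = np.array(['2018-01-01'], dtype='datetime64[D]')
        >>> conversion.ensure_datetime64ns(arr).dtype
        dtype('<M8[ns]')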
- """ - if values.dtype != _NS_DTYPE: - values = conversion.ensure_datetime64ns(values) - return values - - def _astype(self, dtype, mgr=None, **kwargs): - """ - these automatically copy, so copy=True has no effect - raise on an except if raise == True - """ - - # if we are passed a datetime64[ns, tz] - if is_datetime64tz_dtype(dtype): - dtype = DatetimeTZDtype(dtype) - - values = self.values - if getattr(values, 'tz', None) is None: - values = DatetimeIndex(values).tz_localize('UTC') - values = values.tz_convert(dtype.tz) - return self.make_block(values) - - # delegate - return super(DatetimeBlock, self)._astype(dtype=dtype, **kwargs) - - def _can_hold_element(self, element): - tipo = maybe_infer_dtype_type(element) - if tipo is not None: - # TODO: this still uses asarray, instead of dtype.type - element = np.array(element) - return element.dtype == _NS_DTYPE or element.dtype == np.int64 - return (is_integer(element) or isinstance(element, datetime) or - isna(element)) - - def _try_coerce_args(self, values, other): - """ - Coerce values and other to dtype 'i8'. NaN and NaT convert to - the smallest i8, and will correctly round-trip to NaT if converted - back in _try_coerce_result. values is always ndarray-like, other - may not be - - Parameters - ---------- - values : ndarray-like - other : ndarray-like or scalar - - Returns - ------- - base-type values, values mask, base-type other, other mask - """ - - values_mask = isna(values) - values = values.view('i8') - other_mask = False - - if isinstance(other, bool): - raise TypeError - elif is_null_datelike_scalar(other): - other = tslibs.iNaT - other_mask = True - elif isinstance(other, (datetime, np.datetime64, date)): - other = self._box_func(other) - if getattr(other, 'tz') is not None: - raise TypeError("cannot coerce a Timestamp with a tz on a " - "naive Block") - other_mask = isna(other) - other = other.asm8.view('i8') - elif hasattr(other, 'dtype') and is_datetime64_dtype(other): - other_mask = isna(other) - other = other.astype('i8', copy=False).view('i8') - else: - # coercion issues - # let higher levels handle - raise TypeError - - return values, values_mask, other, other_mask - - def _try_coerce_result(self, result): - """ reverse of try_coerce_args """ - if isinstance(result, np.ndarray): - if result.dtype.kind in ['i', 'f', 'O']: - try: - result = result.astype('M8[ns]') - except ValueError: - pass - elif isinstance(result, (np.integer, np.float, np.datetime64)): - result = self._box_func(result) - return result - - @property - def _box_func(self): - return tslibs.Timestamp - - def to_native_types(self, slicer=None, na_rep=None, date_format=None, - quoting=None, **kwargs): - """ convert to our native types format, slicing if desired """ - - values = self.values - if slicer is not None: - values = values[..., slicer] - - from pandas.io.formats.format import _get_format_datetime64_from_values - format = _get_format_datetime64_from_values(values, date_format) - - result = tslib.format_array_from_datetime( - values.view('i8').ravel(), tz=getattr(self.values, 'tz', None), - format=format, na_rep=na_rep).reshape(values.shape) - return np.atleast_2d(result) - - def should_store(self, value): - return (issubclass(value.dtype.type, np.datetime64) and - not is_datetimetz(value)) - - def set(self, locs, values, check=False): - """ - Modify Block in-place with new item value - - Returns - ------- - None - """ - if values.dtype != _NS_DTYPE: - # Workaround for numpy 1.6 bug - values = conversion.ensure_datetime64ns(values) - - 
self.values[locs] = values
-
-
-class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock):
-    """ implement a datetime64 block with a tz attribute """
-    __slots__ = ()
-    _concatenator = staticmethod(_concat._concat_datetime)
-    is_datetimetz = True
-
-    def __init__(self, values, placement, ndim=2, dtype=None):
-        # XXX: This will end up calling _maybe_coerce_values twice
-        # when dtype is not None. It's relatively cheap (just an isinstance)
-        # but it'd be nice to avoid.
-        #
-        # If we can remove dtype from __init__, and push that conversion
-        # onto the callers, then we can remove this entire __init__
-        # and just use DatetimeBlock's.
-        if dtype is not None:
-            values = self._maybe_coerce_values(values, dtype=dtype)
-        super(DatetimeTZBlock, self).__init__(values, placement=placement,
-                                              ndim=ndim)
-
-    def _maybe_coerce_values(self, values, dtype=None):
-        """Input validation for values passed to __init__. Ensure that
-        we have datetime64TZ, coercing if necessary.
-
-        Parameters
-        ----------
-        values : array-like
-            Must be convertible to datetime64
-        dtype : string or DatetimeTZDtype, optional
-            Does a shallow copy to this tz
-
-        Returns
-        -------
-        values : ndarray[datetime64ns]
-        """
-        if not isinstance(values, self._holder):
-            values = self._holder(values)
-
-        if dtype is not None:
-            if isinstance(dtype, compat.string_types):
-                dtype = DatetimeTZDtype.construct_from_string(dtype)
-            values = values._shallow_copy(tz=dtype.tz)
-
-        if values.tz is None:
-            raise ValueError("cannot create a DatetimeTZBlock without a tz")
-
-        return values
-
-    @property
-    def is_view(self):
-        """ return a boolean if I am possibly a view """
-        # check the ndarray values of the DatetimeIndex values
-        return self.values.values.base is not None
-
-    def copy(self, deep=True, mgr=None):
-        """ copy constructor """
-        values = self.values
-        if deep:
-            values = values.copy(deep=True)
-        return self.make_block_same_class(values)
-
-    def external_values(self):
-        """ we internally represent the data as a DatetimeIndex, but for
-        external compat with ndarray, export as a ndarray of Timestamps
-        """
-        return self.values.astype('datetime64[ns]').values
-
-    def get_values(self, dtype=None):
-        # return object dtype as Timestamps with the zones
-        if is_object_dtype(dtype):
-            return lib.map_infer(
-                self.values.ravel(), self._box_func).reshape(self.values.shape)
-        return self.values
-
-    def _slice(self, slicer):
-        """ return a slice of my values """
-        if isinstance(slicer, tuple):
-            col, loc = slicer
-            if not com.is_null_slice(col) and col != 0:
-                raise IndexError("{0} only contains one item".format(self))
-            return self.values[loc]
-        return self.values[slicer]
-
-    def _try_coerce_args(self, values, other):
-        """
-        localize and return i8 for the values
-
-        Parameters
-        ----------
-        values : ndarray-like
-        other : ndarray-like or scalar
-
-        Returns
-        -------
-        base-type values, values mask, base-type other, other mask
-        """
-        values_mask = _block_shape(isna(values), ndim=self.ndim)
-        # asi8 is a view, needs copy
-        values = _block_shape(values.asi8, ndim=self.ndim)
-        other_mask = False
-
-        if isinstance(other, gt.ABCSeries):
-            other = self._holder(other)
-            other_mask = isna(other)
-
-        if isinstance(other, bool):
-            raise TypeError
-        elif (is_null_datelike_scalar(other) or
-              (is_scalar(other) and isna(other))):
-            other = tslibs.iNaT
-            other_mask = True
-        elif isinstance(other, self._holder):
-            if other.tz != self.values.tz:
-                raise ValueError("incompatible or non tz-aware value")
-            other_mask = _block_shape(isna(other), ndim=self.ndim)
- other = _block_shape(other.asi8, ndim=self.ndim) - elif isinstance(other, (np.datetime64, datetime, date)): - other = tslibs.Timestamp(other) - tz = getattr(other, 'tz', None) - - # test we can have an equal time zone - if tz is None or str(tz) != str(self.values.tz): - raise ValueError("incompatible or non tz-aware value") - other_mask = isna(other) - other = other.value - else: - raise TypeError - - return values, values_mask, other, other_mask - - def _try_coerce_result(self, result): - """ reverse of try_coerce_args """ - if isinstance(result, np.ndarray): - if result.dtype.kind in ['i', 'f', 'O']: - result = result.astype('M8[ns]') - elif isinstance(result, (np.integer, np.float, np.datetime64)): - result = tslibs.Timestamp(result, tz=self.values.tz) - if isinstance(result, np.ndarray): - # allow passing of > 1dim if its trivial - if result.ndim > 1: - result = result.reshape(np.prod(result.shape)) - result = self.values._shallow_copy(result) - - return result - - @property - def _box_func(self): - return lambda x: tslibs.Timestamp(x, tz=self.dtype.tz) - - def shift(self, periods, axis=0, mgr=None): - """ shift the block by periods """ - - # think about moving this to the DatetimeIndex. This is a non-freq - # (number of periods) shift ### - - N = len(self) - indexer = np.zeros(N, dtype=int) - if periods > 0: - indexer[periods:] = np.arange(N - periods) - else: - indexer[:periods] = np.arange(-periods, N) - - new_values = self.values.asi8.take(indexer) - - if periods > 0: - new_values[:periods] = tslibs.iNaT - else: - new_values[periods:] = tslibs.iNaT - - new_values = self.values._shallow_copy(new_values) - return [self.make_block_same_class(new_values, - placement=self.mgr_locs)] - - def diff(self, n, axis=0, mgr=None): - """1st discrete difference - - Parameters - ---------- - n : int, number of periods to diff - axis : int, axis to diff upon. default 0 - mgr : default None - - Return - ------ - A list with a new TimeDeltaBlock. - - Note - ---- - The arguments here are mimicking shift so they are called correctly - by apply. - """ - if axis == 0: - # Cannot currently calculate diff across multiple blocks since this - # function is invoked via apply - raise NotImplementedError - new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8 - - # Reshape the new_values like how algos.diff does for timedelta data - new_values = new_values.reshape(1, len(new_values)) - new_values = new_values.astype('timedelta64[ns]') - return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] - - def concat_same_type(self, to_concat, placement=None): - """ - Concatenate list of single blocks of the same type. - """ - values = self._concatenator([blk.values for blk in to_concat], - axis=self.ndim - 1) - # not using self.make_block_same_class as values can be non-tz dtype - return make_block( - values, placement=placement or slice(0, len(values), 1)) - - -class SparseBlock(NonConsolidatableMixIn, Block): - """ implement as a list of sparse arrays of the same dtype """ - __slots__ = () - is_sparse = True - is_numeric = True - _box_to_block_values = False - _can_hold_na = True - _ftype = 'sparse' - _concatenator = staticmethod(_concat._concat_sparse) - - def __init__(self, values, placement, ndim=None): - # Ensure that we have the underlying SparseArray here... 
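        # For instance (an illustrative sketch only), both of these would
        # end up holding a SparseArray, since a SparseSeries unwraps to its
        # ``.values`` below:
        #   SparseBlock(pd.SparseArray([0, 0, 1.0]), placement=[0])
        #   SparseBlock(pd.SparseSeries([0, 0, 1.0]), placement=[0])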
- if isinstance(values, gt.ABCSeries): - values = values.values - assert isinstance(values, SparseArray) - super(SparseBlock, self).__init__(values, placement, ndim=ndim) - - @property - def _holder(self): - return SparseArray - - @property - def shape(self): - return (len(self.mgr_locs), self.sp_index.length) - - @property - def fill_value(self): - # return np.nan - return self.values.fill_value - - @fill_value.setter - def fill_value(self, v): - self.values.fill_value = v - - def to_dense(self): - return self.values.to_dense().view() - - @property - def sp_values(self): - return self.values.sp_values - - @sp_values.setter - def sp_values(self, v): - # reset the sparse values - self.values = SparseArray(v, sparse_index=self.sp_index, - kind=self.kind, dtype=v.dtype, - fill_value=self.values.fill_value, - copy=False) - - @property - def sp_index(self): - return self.values.sp_index - - @property - def kind(self): - return self.values.kind - - def _astype(self, dtype, copy=False, errors='raise', values=None, - klass=None, mgr=None, **kwargs): - if values is None: - values = self.values - values = values.astype(dtype, copy=copy) - return self.make_block_same_class(values=values, - placement=self.mgr_locs) - - def __len__(self): - try: - return self.sp_index.length - except: - return 0 - - def copy(self, deep=True, mgr=None): - return self.make_block_same_class(values=self.values, - sparse_index=self.sp_index, - kind=self.kind, copy=deep, - placement=self.mgr_locs) - - def make_block_same_class(self, values, placement, sparse_index=None, - kind=None, dtype=None, fill_value=None, - copy=False, ndim=None): - """ return a new block """ - if dtype is None: - dtype = values.dtype - if fill_value is None and not isinstance(values, SparseArray): - fill_value = self.values.fill_value - - # if not isinstance(values, SparseArray) and values.ndim != self.ndim: - # raise ValueError("ndim mismatch") - - if values.ndim == 2: - nitems = values.shape[0] - - if nitems == 0: - # kludgy, but SparseBlocks cannot handle slices, where the - # output is 0-item, so let's convert it to a dense block: it - # won't take space since there's 0 items, plus it will preserve - # the dtype. 
- return self.make_block(np.empty(values.shape, dtype=dtype), - placement) - elif nitems > 1: - raise ValueError("Only 1-item 2d sparse blocks are supported") - else: - values = values.reshape(values.shape[1]) - - new_values = SparseArray(values, sparse_index=sparse_index, - kind=kind or self.kind, dtype=dtype, - fill_value=fill_value, copy=copy) - return self.make_block(new_values, - placement=placement) - - def interpolate(self, method='pad', axis=0, inplace=False, limit=None, - fill_value=None, **kwargs): - - values = missing.interpolate_2d(self.values.to_dense(), method, axis, - limit, fill_value) - return self.make_block_same_class(values=values, - placement=self.mgr_locs) - - def fillna(self, value, limit=None, inplace=False, downcast=None, - mgr=None): - # we may need to upcast our fill to match our dtype - if limit is not None: - raise NotImplementedError("specifying a limit for 'fillna' has " - "not been implemented yet") - values = self.values if inplace else self.values.copy() - values = values.fillna(value, downcast=downcast) - return [self.make_block_same_class(values=values, - placement=self.mgr_locs)] - - def shift(self, periods, axis=0, mgr=None): - """ shift the block by periods """ - N = len(self.values.T) - indexer = np.zeros(N, dtype=int) - if periods > 0: - indexer[periods:] = np.arange(N - periods) - else: - indexer[:periods] = np.arange(-periods, N) - new_values = self.values.to_dense().take(indexer) - # convert integer to float if necessary. need to do a lot more than - # that, handle boolean etc also - new_values, fill_value = maybe_upcast(new_values) - if periods > 0: - new_values[:periods] = fill_value - else: - new_values[periods:] = fill_value - return [self.make_block_same_class(new_values, - placement=self.mgr_locs)] - - def sparse_reindex(self, new_index): - """ sparse reindex and return a new block - current reindex only works for float64 dtype! """ - values = self.values - values = values.sp_index.to_int_index().reindex( - values.sp_values.astype('float64'), values.fill_value, new_index) - return self.make_block_same_class(values, sparse_index=new_index, - placement=self.mgr_locs) - - -# ------------------------------------------------------------------- -# Constructor Helpers - -def get_block_type(values, dtype=None): - """ - Find the appropriate Block subclass to use for the given values and dtype. 
- - Parameters - ---------- - values : ndarray-like - dtype : numpy or pandas dtype - - Returns - ------- - cls : class, subclass of Block - """ - dtype = dtype or values.dtype - vtype = dtype.type - - if is_sparse(values): - cls = SparseBlock - elif issubclass(vtype, np.floating): - cls = FloatBlock - elif issubclass(vtype, np.timedelta64): - assert issubclass(vtype, np.integer) - cls = TimeDeltaBlock - elif issubclass(vtype, np.complexfloating): - cls = ComplexBlock - elif is_categorical(values): - cls = CategoricalBlock - elif is_extension_array_dtype(values): - cls = ExtensionBlock - elif issubclass(vtype, np.datetime64): - assert not is_datetimetz(values) - cls = DatetimeBlock - elif is_datetimetz(values): - cls = DatetimeTZBlock - elif issubclass(vtype, np.integer): - cls = IntBlock - elif dtype == np.bool_: - cls = BoolBlock - else: - cls = ObjectBlock - return cls - - -def make_block(values, placement, klass=None, ndim=None, dtype=None, - fastpath=None): - if fastpath is not None: - # GH#19265 pyarrow is passing this - warnings.warn("fastpath argument is deprecated, will be removed " - "in a future release.", DeprecationWarning) - if klass is None: - dtype = dtype or values.dtype - klass = get_block_type(values, dtype) - - elif klass is DatetimeTZBlock and not is_datetimetz(values): - return klass(values, ndim=ndim, - placement=placement, dtype=dtype) - - return klass(values, ndim=ndim, placement=placement) - - -def form_blocks(arrays, names, axes): - # put "leftover" items in float bucket, where else? - # generalize? - items_dict = defaultdict(list) - extra_locs = [] - - names_idx = ensure_index(names) - if names_idx.equals(axes[0]): - names_indexer = np.arange(len(names_idx)) - else: - assert names_idx.intersection(axes[0]).is_unique - names_indexer = names_idx.get_indexer_for(axes[0]) - - for i, name_idx in enumerate(names_indexer): - if name_idx == -1: - extra_locs.append(i) - continue - - k = names[name_idx] - v = arrays[name_idx] - - block_type = get_block_type(v) - items_dict[block_type.__name__].append((i, k, v)) - - blocks = [] - if len(items_dict['FloatBlock']): - float_blocks = _multi_blockify(items_dict['FloatBlock']) - blocks.extend(float_blocks) - - if len(items_dict['ComplexBlock']): - complex_blocks = _multi_blockify(items_dict['ComplexBlock']) - blocks.extend(complex_blocks) - - if len(items_dict['TimeDeltaBlock']): - timedelta_blocks = _multi_blockify(items_dict['TimeDeltaBlock']) - blocks.extend(timedelta_blocks) - - if len(items_dict['IntBlock']): - int_blocks = _multi_blockify(items_dict['IntBlock']) - blocks.extend(int_blocks) - - if len(items_dict['DatetimeBlock']): - datetime_blocks = _simple_blockify(items_dict['DatetimeBlock'], - _NS_DTYPE) - blocks.extend(datetime_blocks) - - if len(items_dict['DatetimeTZBlock']): - dttz_blocks = [make_block(array, - klass=DatetimeTZBlock, - placement=[i]) - for i, _, array in items_dict['DatetimeTZBlock']] - blocks.extend(dttz_blocks) - - if len(items_dict['BoolBlock']): - bool_blocks = _simple_blockify(items_dict['BoolBlock'], np.bool_) - blocks.extend(bool_blocks) - - if len(items_dict['ObjectBlock']) > 0: - object_blocks = _simple_blockify(items_dict['ObjectBlock'], np.object_) - blocks.extend(object_blocks) - - if len(items_dict['SparseBlock']) > 0: - sparse_blocks = _sparse_blockify(items_dict['SparseBlock']) - blocks.extend(sparse_blocks) - - if len(items_dict['CategoricalBlock']) > 0: - cat_blocks = [make_block(array, klass=CategoricalBlock, placement=[i]) - for i, _, array in items_dict['CategoricalBlock']] - 
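# --- illustrative sketch (editorial, not part of this patch) -----------
# The loop above is the heart of form_blocks: a single pass buckets every
# column under the Block subclass that get_block_type picks for it, and
# the blocks.extend(...) calls around this point then emit one
# consolidated block per bucket. A standalone analogue using dtype.kind
# as a stand-in for the Block classes:
from collections import defaultdict
import numpy as np

def bucket_columns(arrays):
    items_dict = defaultdict(list)
    for i, arr in enumerate(arrays):
        items_dict[np.asarray(arr).dtype.kind].append((i, arr))
    return items_dict

# bucket_columns([np.arange(3), np.ones(3), np.array(['a', 'b', 'c'])])
# -> keys 'i', 'f', 'U', each mapping to its (position, column) pairs
# -----------------------------------------------------------------------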
blocks.extend(cat_blocks) - - if len(items_dict['ExtensionBlock']): - - external_blocks = [ - make_block(array, klass=ExtensionBlock, placement=[i]) - for i, _, array in items_dict['ExtensionBlock'] - ] - - blocks.extend(external_blocks) - - if len(extra_locs): - shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:]) - - # empty items -> dtype object - block_values = np.empty(shape, dtype=object) - block_values.fill(np.nan) - - na_block = make_block(block_values, placement=extra_locs) - blocks.append(na_block) - - return blocks - - -def _simple_blockify(tuples, dtype): - """ return a single array of a block that has a single dtype; if dtype is - not None, coerce to this dtype - """ - values, placement = _stack_arrays(tuples, dtype) - - # CHECK DTYPE? - if dtype is not None and values.dtype != dtype: # pragma: no cover - values = values.astype(dtype) - - block = make_block(values, placement=placement) - return [block] - - -def _multi_blockify(tuples, dtype=None): - """ return an array of blocks that potentially have different dtypes """ - - # group by dtype - grouper = itertools.groupby(tuples, lambda x: x[2].dtype) - - new_blocks = [] - for dtype, tup_block in grouper: - - values, placement = _stack_arrays(list(tup_block), dtype) - - block = make_block(values, placement=placement) - new_blocks.append(block) - - return new_blocks - - -def _sparse_blockify(tuples, dtype=None): - """ return an array of blocks that potentially have different dtypes (and - are sparse) - """ - - new_blocks = [] - for i, names, array in tuples: - array = _maybe_to_sparse(array) - block = make_block(array, klass=SparseBlock, placement=[i]) - new_blocks.append(block) - - return new_blocks - - -def _stack_arrays(tuples, dtype): - - # fml - def _asarray_compat(x): - if isinstance(x, gt.ABCSeries): - return x._values - else: - return np.asarray(x) - - def _shape_compat(x): - if isinstance(x, gt.ABCSeries): - return len(x), - else: - return x.shape - - placement, names, arrays = zip(*tuples) - - first = arrays[0] - shape = (len(arrays),) + _shape_compat(first) - - stacked = np.empty(shape, dtype=dtype) - for i, arr in enumerate(arrays): - stacked[i] = _asarray_compat(arr) - - return stacked, placement - - -# ------------------------------------------------------------------- - -def _block_shape(values, ndim=1, shape=None): - """ guarantee the shape of the values to be at least 1 d """ - if values.ndim < ndim: - if shape is None: - shape = values.shape - values = values.reshape(tuple((1, ) + shape)) - return values - - -def _putmask_smart(v, m, n): - """ - Return a new ndarray, try to preserve dtype if possible. 
- - Parameters - ---------- - v : `values`, updated in-place (array like) - m : `mask`, applies to both sides (array like) - n : `new values` either scalar or an array like aligned with `values` - - Returns - ------- - values : ndarray with updated values - this *may* be a copy of the original - - See Also - -------- - ndarray.putmask - """ - - # we cannot use np.asarray() here as we cannot have conversions - # that numpy does when numeric are mixed with strings - - # n should be the length of the mask or a scalar here - if not is_list_like(n): - n = np.repeat(n, len(m)) - elif isinstance(n, np.ndarray) and n.ndim == 0: # numpy scalar - n = np.repeat(np.array(n, ndmin=1), len(m)) - - # see if we are only masking values that if putted - # will work in the current dtype - try: - nn = n[m] - - # make sure that we have a nullable type - # if we have nulls - if not _isna_compat(v, nn[0]): - raise ValueError - - with warnings.catch_warnings(record=True): - # we ignore ComplexWarning here - nn_at = nn.astype(v.dtype) - - # avoid invalid dtype comparisons - # between numbers & strings - - # only compare integers/floats - # don't compare integers to datetimelikes - if (not is_numeric_v_string_like(nn, nn_at) and - (is_float_dtype(nn.dtype) or - is_integer_dtype(nn.dtype) and - is_float_dtype(nn_at.dtype) or - is_integer_dtype(nn_at.dtype))): - - comp = (nn == nn_at) - if is_list_like(comp) and comp.all(): - nv = v.copy() - nv[m] = nn_at - return nv - except (ValueError, IndexError, TypeError): - pass - - n = np.asarray(n) - - def _putmask_preserve(nv, n): - try: - nv[m] = n[m] - except (IndexError, ValueError): - nv[m] = n - return nv - - # preserves dtype if possible - if v.dtype.kind == n.dtype.kind: - return _putmask_preserve(v, n) - - # change the dtype if needed - dtype, _ = maybe_promote(n.dtype) - - if is_extension_type(v.dtype) and is_object_dtype(dtype): - v = v.get_values(dtype) - else: - v = v.astype(dtype) - - return _putmask_preserve(v, n) - - -# Also needed by managers -def _extend_blocks(result, blocks=None): - """ return a new extended blocks, givin the result """ - from .managers import BlockManager - if blocks is None: - blocks = [] - if isinstance(result, list): - for r in result: - if isinstance(r, list): - blocks.extend(r) - else: - blocks.append(r) - elif isinstance(result, BlockManager): - blocks.extend(result.blocks) - else: - blocks.append(result) - return blocks - - -# Also needed by managers -def _safe_reshape(arr, new_shape): - """ - If possible, reshape `arr` to have shape `new_shape`, - with a couple of exceptions (see gh-13012): - - 1) If `arr` is a ExtensionArray or Index, `arr` will be - returned as is. - 2) If `arr` is a Series, the `_values` attribute will - be reshaped and returned. - - Parameters - ---------- - arr : array-like, object to be reshaped - new_shape : int or tuple of ints, the new shape - """ - if isinstance(arr, gt.ABCSeries): - arr = arr._values - if not isinstance(arr, gt.ABCExtensionArray): - arr = arr.reshape(new_shape) - return arr - - -def _merge_blocks(blocks, dtype=None, _can_consolidate=True): - - if len(blocks) == 1: - return blocks[0] - - if _can_consolidate: - - if dtype is None: - if len({b.dtype for b in blocks}) != 1: - raise AssertionError("_merge_blocks are invalid!") - dtype = blocks[0].dtype - - # FIXME: optimization potential in case all mgrs contain slices and - # combination of those slices is a slice, too. 
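# --- illustrative sketch (editorial, not part of this patch) -----------
# _putmask_smart above boils down to: attempt the masked assignment in
# the current dtype, and if the new values cannot be represented there,
# promote to a dtype that can hold both sides first. A condensed
# numpy-only analogue (np.promote_types approximates maybe_promote):
import numpy as np

def putmask_promote(values, mask, new):
    new = np.asarray(new)
    if values.dtype.kind != new.dtype.kind:
        values = values.astype(np.promote_types(values.dtype, new.dtype))
    else:
        values = values.copy()
    values[mask] = new[mask] if new.shape == values.shape else new
    return values

# putmask_promote(np.arange(3), np.array([True, False, True]), 2.5)
# -> array([2.5, 1. , 2.5])
# -----------------------------------------------------------------------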
- new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks]) - new_values = _vstack([b.values for b in blocks], dtype) - - argsort = np.argsort(new_mgr_locs) - new_values = new_values[argsort] - new_mgr_locs = new_mgr_locs[argsort] - - return make_block(new_values, placement=new_mgr_locs) - - # no merge - return blocks - - -def _vstack(to_stack, dtype): - - # work around NumPy 1.6 bug --> TODO: Still necessary? - if dtype == _NS_DTYPE or dtype == _TD_DTYPE: - new_values = np.vstack([x.view('i8') for x in to_stack]) - return new_values.view(dtype) - - else: - return np.vstack(to_stack) - - -def _factor_indexer(shape, labels): - """ - given a tuple of shape and a list of Categorical labels, return the - expanded label indexer - """ - mult = np.array(shape)[::-1].cumprod()[::-1] - return ensure_platform_int( - np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T) - - -# TODO: remove after Panel is gone -def _block2d_to_blocknd(values, placement, shape, labels, ref_items): - """ pivot to the labels shape """ - panel_shape = (len(placement),) + shape - - # TODO: lexsort depth needs to be 2!! - - # Create observation selection vector using major and minor - # labels, for converting to panel format. - selector = _factor_indexer(shape[1:], labels) - mask = np.zeros(np.prod(shape), dtype=bool) - mask.put(selector, True) - - if mask.all(): - pvalues = np.empty(panel_shape, dtype=values.dtype) - else: - dtype, fill_value = maybe_promote(values.dtype) - pvalues = np.empty(panel_shape, dtype=dtype) - pvalues.fill(fill_value) - - for i in range(len(placement)): - pvalues[i].flat[mask] = values[:, i] - - return make_block(pvalues, placement=placement) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py deleted file mode 100644 index d574428b30525..0000000000000 --- a/pandas/core/internals/concat.py +++ /dev/null @@ -1,519 +0,0 @@ -# -*- coding: utf-8 -*- - -# TODO: Needs a new name; too many things are already called "concat" -import copy -from collections import defaultdict - -import numpy as np - -from pandas._libs import tslibs - -from pandas.util._decorators import cache_readonly - -from pandas.core.dtypes.common import ( - is_timedelta64_dtype, - is_datetime64_dtype, is_datetimetz, - is_categorical_dtype, - is_numeric_dtype, - is_float_dtype, - _get_dtype) -from pandas.core.dtypes.missing import isna -from pandas.core.dtypes.cast import maybe_promote -import pandas.core.dtypes.concat as _concat - -import pandas.core.algorithms as algos - -from .blocks import make_block -from .managers import BlockManager, _get_blkno_placements - - -class JoinUnit(object): - - def __init__(self, block, shape, indexers=None): - # Passing shape explicitly is required for cases when block is None. - if indexers is None: - indexers = {} - self.block = block - self.indexers = indexers - self.shape = shape - - def __repr__(self): - return '{name}({block!r}, {indexers})'.format( - name=self.__class__.__name__, block=self.block, - indexers=self.indexers) - - @cache_readonly - def needs_filling(self): - for indexer in self.indexers.values(): - # FIXME: cache results of indexer == -1 checks. 
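# --- illustrative sketch (editorial, not part of this patch) -----------
# A convention used throughout this module: a reindexing indexer is an
# integer array in which -1 means "no source row". needs_filling above
# only has to ask whether any -1 occurs, because that is exactly when
# concatenation must introduce NA values (and possibly upcast the dtype):
import numpy as np

indexer = np.array([0, 2, -1, 1])             # position 2 has no source
needs_filling = bool((indexer == -1).any())   # -> True
# -----------------------------------------------------------------------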
- if (indexer == -1).any(): - return True - - return False - - @cache_readonly - def dtype(self): - if self.block is None: - raise AssertionError("Block is None, no dtype") - - if not self.needs_filling: - return self.block.dtype - else: - return _get_dtype(maybe_promote(self.block.dtype, - self.block.fill_value)[0]) - - @cache_readonly - def is_na(self): - if self.block is None: - return True - - if not self.block._can_hold_na: - return False - - # Usually it's enough to check but a small fraction of values to see if - # a block is NOT null, chunks should help in such cases. 1000 value - # was chosen rather arbitrarily. - values = self.block.values - if self.block.is_categorical: - values_flat = values.categories - elif self.block.is_sparse: - # fill_value is not NaN and have holes - if not values._null_fill_value and values.sp_index.ngaps > 0: - return False - values_flat = values.ravel(order='K') - elif self.block.is_extension: - values_flat = values - else: - values_flat = values.ravel(order='K') - total_len = values_flat.shape[0] - chunk_len = max(total_len // 40, 1000) - for i in range(0, total_len, chunk_len): - if not isna(values_flat[i:i + chunk_len]).all(): - return False - - return True - - def get_reindexed_values(self, empty_dtype, upcasted_na): - if upcasted_na is None: - # No upcasting is necessary - fill_value = self.block.fill_value - values = self.block.get_values() - else: - fill_value = upcasted_na - - if self.is_na: - if getattr(self.block, 'is_object', False): - # we want to avoid filling with np.nan if we are - # using None; we already know that we are all - # nulls - values = self.block.values.ravel(order='K') - if len(values) and values[0] is None: - fill_value = None - - if getattr(self.block, 'is_datetimetz', False) or \ - is_datetimetz(empty_dtype): - pass - elif getattr(self.block, 'is_categorical', False): - pass - elif getattr(self.block, 'is_sparse', False): - pass - else: - missing_arr = np.empty(self.shape, dtype=empty_dtype) - missing_arr.fill(fill_value) - return missing_arr - - if not self.indexers: - if not self.block._can_consolidate: - # preserve these for validation in _concat_compat - return self.block.values - - if self.block.is_bool and not self.block.is_categorical: - # External code requested filling/upcasting, bool values must - # be upcasted to object to avoid being upcasted to numeric. - values = self.block.astype(np.object_).values - elif self.block.is_extension: - values = self.block.values - else: - # No dtype upcasting is done here, it will be performed during - # concatenation itself. - values = self.block.get_values() - - if not self.indexers: - # If there's no indexing to be done, we want to signal outside - # code that this array must be copied explicitly. This is done - # by returning a view and checking `retval.base`. - values = values.view() - - else: - for ax, indexer in self.indexers.items(): - values = algos.take_nd(values, indexer, axis=ax, - fill_value=fill_value) - - return values - - -def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): - """ - Concatenate block managers into one. 
- - Parameters - ---------- - mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples - axes : list of Index - concat_axis : int - copy : bool - - """ - concat_plan = combine_concat_plans( - [get_mgr_concatenation_plan(mgr, indexers) - for mgr, indexers in mgrs_indexers], concat_axis) - - blocks = [] - - for placement, join_units in concat_plan: - - if len(join_units) == 1 and not join_units[0].indexers: - b = join_units[0].block - values = b.values - if copy: - values = values.copy() - elif not copy: - values = values.view() - b = b.make_block_same_class(values, placement=placement) - elif is_uniform_join_units(join_units): - b = join_units[0].block.concat_same_type( - [ju.block for ju in join_units], placement=placement) - else: - b = make_block( - concatenate_join_units(join_units, concat_axis, copy=copy), - placement=placement) - blocks.append(b) - - return BlockManager(blocks, axes) - - -def is_uniform_join_units(join_units): - """ - Check if the join units consist of blocks of uniform type that can - be concatenated using Block.concat_same_type instead of the generic - concatenate_join_units (which uses `_concat._concat_compat`). - - """ - return ( - # all blocks need to have the same type - all(type(ju.block) is type(join_units[0].block) for ju in join_units) and # noqa - # no blocks that would get missing values (can lead to type upcasts) - # unless we're an extension dtype. - all(not ju.is_na or ju.block.is_extension for ju in join_units) and - # no blocks with indexers (as then the dimensions do not fit) - all(not ju.indexers for ju in join_units) and - # disregard Panels - all(ju.block.ndim <= 2 for ju in join_units) and - # only use this path when there is something to concatenate - len(join_units) > 1) - - -def is_uniform_reindex(join_units): - return ( - # TODO: should this be ju.block._can_hold_na? - all(ju.block and ju.block.is_extension for ju in join_units) and - len(set(ju.block.dtype.name for ju in join_units)) == 1 - ) - - -def get_empty_dtype_and_na(join_units): - """ - Return dtype and N/A values to use when concatenating specified units. - - Returned N/A value may be None which means there was no casting involved. - - Returns - ------- - dtype - na - """ - - if len(join_units) == 1: - blk = join_units[0].block - if blk is None: - return np.float64, np.nan - - if is_uniform_reindex(join_units): - # XXX: integrate property - empty_dtype = join_units[0].block.dtype - upcasted_na = join_units[0].block.fill_value - return empty_dtype, upcasted_na - - has_none_blocks = False - dtypes = [None] * len(join_units) - for i, unit in enumerate(join_units): - if unit.block is None: - has_none_blocks = True - else: - dtypes[i] = unit.dtype - - upcast_classes = defaultdict(list) - null_upcast_classes = defaultdict(list) - for dtype, unit in zip(dtypes, join_units): - if dtype is None: - continue - - if is_categorical_dtype(dtype): - upcast_cls = 'category' - elif is_datetimetz(dtype): - upcast_cls = 'datetimetz' - elif issubclass(dtype.type, np.bool_): - upcast_cls = 'bool' - elif issubclass(dtype.type, np.object_): - upcast_cls = 'object' - elif is_datetime64_dtype(dtype): - upcast_cls = 'datetime' - elif is_timedelta64_dtype(dtype): - upcast_cls = 'timedelta' - elif is_float_dtype(dtype) or is_numeric_dtype(dtype): - upcast_cls = dtype.name - else: - upcast_cls = 'float' - - # Null blocks should not influence upcast class selection, unless there - # are only null blocks, when same upcasting rules must be applied to - # null upcast classes. 
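# --- illustrative sketch (editorial, not part of this patch) -----------
# The classification above feeds a priority rule: bucket every dtype into
# a coarse "upcast class", then let the highest-priority class present
# decide the result dtype and NA value (object beats everything, bool
# mixed with anything else degrades to object, and so on). A heavily
# stripped-down analogue over plain numpy dtypes:
import numpy as np

def common_concat_dtype(dtypes):
    kinds = {np.dtype(dt).kind for dt in dtypes}
    if 'O' in kinds or ('b' in kinds and kinds != {'b'}):
        return np.dtype(object)
    if kinds == {'b'}:
        return np.dtype(bool)
    return np.find_common_type(list(dtypes), [])

# common_concat_dtype([np.int64, np.float32]) -> dtype('float64')
# common_concat_dtype([np.bool_, np.int64])   -> dtype('O')
# -----------------------------------------------------------------------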
- if unit.is_na: - null_upcast_classes[upcast_cls].append(dtype) - else: - upcast_classes[upcast_cls].append(dtype) - - if not upcast_classes: - upcast_classes = null_upcast_classes - - # create the result - if 'object' in upcast_classes: - return np.dtype(np.object_), np.nan - elif 'bool' in upcast_classes: - if has_none_blocks: - return np.dtype(np.object_), np.nan - else: - return np.dtype(np.bool_), None - elif 'category' in upcast_classes: - return np.dtype(np.object_), np.nan - elif 'datetimetz' in upcast_classes: - dtype = upcast_classes['datetimetz'] - return dtype[0], tslibs.iNaT - elif 'datetime' in upcast_classes: - return np.dtype('M8[ns]'), tslibs.iNaT - elif 'timedelta' in upcast_classes: - return np.dtype('m8[ns]'), tslibs.iNaT - else: # pragma - g = np.find_common_type(upcast_classes, []) - if is_float_dtype(g): - return g, g.type(np.nan) - elif is_numeric_dtype(g): - if has_none_blocks: - return np.float64, np.nan - else: - return g, None - - msg = "invalid dtype determination in get_concat_dtype" - raise AssertionError(msg) - - -def concatenate_join_units(join_units, concat_axis, copy): - """ - Concatenate values from several join units along selected axis. - """ - if concat_axis == 0 and len(join_units) > 1: - # Concatenating join units along ax0 is handled in _merge_blocks. - raise AssertionError("Concatenating join units along axis0") - - empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units) - - to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype, - upcasted_na=upcasted_na) - for ju in join_units] - - if len(to_concat) == 1: - # Only one block, nothing to concatenate. - concat_values = to_concat[0] - if copy: - if isinstance(concat_values, np.ndarray): - # non-reindexed (=not yet copied) arrays are made into a view - # in JoinUnit.get_reindexed_values - if concat_values.base is not None: - concat_values = concat_values.copy() - else: - concat_values = concat_values.copy() - else: - concat_values = _concat._concat_compat(to_concat, axis=concat_axis) - - return concat_values - - -def get_mgr_concatenation_plan(mgr, indexers): - """ - Construct concatenation plan for given block manager and indexers. - - Parameters - ---------- - mgr : BlockManager - indexers : dict of {axis: indexer} - - Returns - ------- - plan : list of (BlockPlacement, JoinUnit) tuples - - """ - # Calculate post-reindex shape , save for item axis which will be separate - # for each block anyway. 
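# --- illustrative sketch (editorial, not part of this patch) -----------
# That first step is pure bookkeeping: start from the manager's shape and
# overwrite each reindexed axis's length with the length of its indexer:
mgr_shape = [3, 5]              # (n_items, n_rows) of a 2-D manager
indexers = {1: [0, 2, 4, -1]}   # axis 1 reindexed to 4 positions
for ax, indexer in indexers.items():
    mgr_shape[ax] = len(indexer)
mgr_shape = tuple(mgr_shape)    # -> (3, 4)
# -----------------------------------------------------------------------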
- mgr_shape = list(mgr.shape) - for ax, indexer in indexers.items(): - mgr_shape[ax] = len(indexer) - mgr_shape = tuple(mgr_shape) - - if 0 in indexers: - ax0_indexer = indexers.pop(0) - blknos = algos.take_1d(mgr._blknos, ax0_indexer, fill_value=-1) - blklocs = algos.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1) - else: - - if mgr._is_single_block: - blk = mgr.blocks[0] - return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] - - ax0_indexer = None - blknos = mgr._blknos - blklocs = mgr._blklocs - - plan = [] - for blkno, placements in _get_blkno_placements(blknos, len(mgr.blocks), - group=False): - - assert placements.is_slice_like - - join_unit_indexers = indexers.copy() - - shape = list(mgr_shape) - shape[0] = len(placements) - shape = tuple(shape) - - if blkno == -1: - unit = JoinUnit(None, shape) - else: - blk = mgr.blocks[blkno] - ax0_blk_indexer = blklocs[placements.indexer] - - unit_no_ax0_reindexing = (len(placements) == len(blk.mgr_locs) and - # Fastpath detection of join unit not - # needing to reindex its block: no ax0 - # reindexing took place and block - # placement was sequential before. - ((ax0_indexer is None and - blk.mgr_locs.is_slice_like and - blk.mgr_locs.as_slice.step == 1) or - # Slow-ish detection: all indexer locs - # are sequential (and length match is - # checked above). - (np.diff(ax0_blk_indexer) == 1).all())) - - # Omit indexer if no item reindexing is required. - if unit_no_ax0_reindexing: - join_unit_indexers.pop(0, None) - else: - join_unit_indexers[0] = ax0_blk_indexer - - unit = JoinUnit(blk, shape, join_unit_indexers) - - plan.append((placements, unit)) - - return plan - - -def combine_concat_plans(plans, concat_axis): - """ - Combine multiple concatenation plans into one. - - existing_plan is updated in-place. - """ - if len(plans) == 1: - for p in plans[0]: - yield p[0], [p[1]] - - elif concat_axis == 0: - offset = 0 - for plan in plans: - last_plc = None - - for plc, unit in plan: - yield plc.add(offset), [unit] - last_plc = plc - - if last_plc is not None: - offset += last_plc.as_slice.stop - - else: - num_ended = [0] - - def _next_or_none(seq): - retval = next(seq, None) - if retval is None: - num_ended[0] += 1 - return retval - - plans = list(map(iter, plans)) - next_items = list(map(_next_or_none, plans)) - - while num_ended[0] != len(next_items): - if num_ended[0] > 0: - raise ValueError("Plan shapes are not aligned") - - placements, units = zip(*next_items) - - lengths = list(map(len, placements)) - min_len, max_len = min(lengths), max(lengths) - - if min_len == max_len: - yield placements[0], units - next_items[:] = map(_next_or_none, plans) - else: - yielded_placement = None - yielded_units = [None] * len(next_items) - for i, (plc, unit) in enumerate(next_items): - yielded_units[i] = unit - if len(plc) > min_len: - # trim_join_unit updates unit in place, so only - # placement needs to be sliced to skip min_len. - next_items[i] = (plc[min_len:], - trim_join_unit(unit, min_len)) - else: - yielded_placement = plc - next_items[i] = _next_or_none(plans[i]) - - yield yielded_placement, yielded_units - - -def trim_join_unit(join_unit, length): - """ - Reduce join_unit's shape along item axis to length. - - Extra items that didn't fit are returned as a separate block. 
- """ - - if 0 not in join_unit.indexers: - extra_indexers = join_unit.indexers - - if join_unit.block is None: - extra_block = None - else: - extra_block = join_unit.block.getitem_block(slice(length, None)) - join_unit.block = join_unit.block.getitem_block(slice(length)) - else: - extra_block = join_unit.block - - extra_indexers = copy.copy(join_unit.indexers) - extra_indexers[0] = extra_indexers[0][length:] - join_unit.indexers[0] = join_unit.indexers[0][:length] - - extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:] - join_unit.shape = (length,) + join_unit.shape[1:] - - return JoinUnit(block=extra_block, indexers=extra_indexers, - shape=extra_shape) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py deleted file mode 100644 index 09cbc1ac3905d..0000000000000 --- a/pandas/core/internals/managers.py +++ /dev/null @@ -1,1892 +0,0 @@ -# -*- coding: utf-8 -*- -from functools import partial -import itertools -import operator - -import numpy as np - -from pandas._libs import lib, internals as libinternals - -from pandas.util._validators import validate_bool_kwarg - -from pandas.compat import range, map, zip - -import pandas.core.dtypes.generic as gt -from pandas.core.dtypes.common import ( - ensure_int64, - is_datetimelike_v_numeric, - is_numeric_v_string_like, is_extension_type, - is_extension_array_dtype, - is_scalar) -from pandas.core.dtypes.missing import isna -import pandas.core.dtypes.concat as _concat -from pandas.core.dtypes.dtypes import ( - ExtensionDtype, - PandasExtensionDtype) -from pandas.core.dtypes.cast import ( - maybe_promote, - infer_dtype_from_scalar, - find_common_type) - -import pandas.core.algorithms as algos -from pandas.core.base import PandasObject - -from pandas.core.index import Index, MultiIndex, ensure_index -from pandas.core.indexing import maybe_convert_indices - -from pandas.io.formats.printing import pprint_thing - -from .blocks import ( - form_blocks, - Block, - make_block, - _extend_blocks, _safe_reshape, _merge_blocks) - - -# TODO: flexible with index=None and/or items=None - - -class BlockManager(PandasObject): - """ - Core internal data structure to implement DataFrame, Series, Panel, etc. - - Manage a bunch of labeled 2D mixed-type ndarrays. 
Essentially it's a - lightweight blocked set of labeled data to be manipulated by the DataFrame - public API class - - Attributes - ---------- - shape - ndim - axes - values - items - - Methods - ------- - set_axis(axis, new_labels) - copy(deep=True) - - get_dtype_counts - get_ftype_counts - get_dtypes - get_ftypes - - apply(func, axes, block_filter_fn) - - get_bool_data - get_numeric_data - - get_slice(slice_like, axis) - get(label) - iget(loc) - get_scalar(label_tup) - - take(indexer, axis) - reindex_axis(new_labels, axis) - reindex_indexer(new_labels, indexer, axis) - - delete(label) - insert(loc, label, value) - set(label, value) - - Parameters - ---------- - - - Notes - ----- - This is *not* a public API class - """ - __slots__ = ['axes', 'blocks', '_ndim', '_shape', '_known_consolidated', - '_is_consolidated', '_blknos', '_blklocs'] - - def __init__(self, blocks, axes, do_integrity_check=True): - self.axes = [ensure_index(ax) for ax in axes] - self.blocks = tuple(blocks) - - for block in blocks: - if block.is_sparse: - if len(block.mgr_locs) != 1: - raise AssertionError("Sparse block refers to multiple " - "items") - else: - if self.ndim != block.ndim: - raise AssertionError( - 'Number of Block dimensions ({block}) must equal ' - 'number of axes ({self})'.format(block=block.ndim, - self=self.ndim)) - - if do_integrity_check: - self._verify_integrity() - - self._consolidate_check() - - self._rebuild_blknos_and_blklocs() - - def make_empty(self, axes=None): - """ return an empty BlockManager with the items axis of len 0 """ - if axes is None: - axes = [ensure_index([])] + [ensure_index(a) - for a in self.axes[1:]] - - # preserve dtype if possible - if self.ndim == 1: - blocks = np.array([], dtype=self.array_dtype) - else: - blocks = [] - return self.__class__(blocks, axes) - - def __nonzero__(self): - return True - - # Python3 compat - __bool__ = __nonzero__ - - @property - def shape(self): - return tuple(len(ax) for ax in self.axes) - - @property - def ndim(self): - return len(self.axes) - - def set_axis(self, axis, new_labels): - new_labels = ensure_index(new_labels) - old_len = len(self.axes[axis]) - new_len = len(new_labels) - - if new_len != old_len: - raise ValueError( - 'Length mismatch: Expected axis has {old} elements, new ' - 'values have {new} elements'.format(old=old_len, new=new_len)) - - self.axes[axis] = new_labels - - def rename_axis(self, mapper, axis, copy=True, level=None): - """ - Rename one of axes. - - Parameters - ---------- - mapper : unary callable - axis : int - copy : boolean, default True - level : int, default None - - """ - obj = self.copy(deep=copy) - obj.set_axis(axis, _transform_index(self.axes[axis], mapper, level)) - return obj - - def add_prefix(self, prefix): - f = partial('{prefix}{}'.format, prefix=prefix) - return self.rename_axis(f, axis=0) - - def add_suffix(self, suffix): - f = partial('{}{suffix}'.format, suffix=suffix) - return self.rename_axis(f, axis=0) - - @property - def _is_single_block(self): - if self.ndim == 1: - return True - - if len(self.blocks) != 1: - return False - - blk = self.blocks[0] - return (blk.mgr_locs.is_slice_like and - blk.mgr_locs.as_slice == slice(0, len(self), 1)) - - def _rebuild_blknos_and_blklocs(self): - """ - Update mgr._blknos / mgr._blklocs. 
- """ - new_blknos = np.empty(self.shape[0], dtype=np.int64) - new_blklocs = np.empty(self.shape[0], dtype=np.int64) - new_blknos.fill(-1) - new_blklocs.fill(-1) - - for blkno, blk in enumerate(self.blocks): - rl = blk.mgr_locs - new_blknos[rl.indexer] = blkno - new_blklocs[rl.indexer] = np.arange(len(rl)) - - if (new_blknos == -1).any(): - raise AssertionError("Gaps in blk ref_locs") - - self._blknos = new_blknos - self._blklocs = new_blklocs - - # make items read only for now - def _get_items(self): - return self.axes[0] - - items = property(fget=_get_items) - - def _get_counts(self, f): - """ return a dict of the counts of the function in BlockManager """ - self._consolidate_inplace() - counts = dict() - for b in self.blocks: - v = f(b) - counts[v] = counts.get(v, 0) + b.shape[0] - return counts - - def get_dtype_counts(self): - return self._get_counts(lambda b: b.dtype.name) - - def get_ftype_counts(self): - return self._get_counts(lambda b: b.ftype) - - def get_dtypes(self): - dtypes = np.array([blk.dtype for blk in self.blocks]) - return algos.take_1d(dtypes, self._blknos, allow_fill=False) - - def get_ftypes(self): - ftypes = np.array([blk.ftype for blk in self.blocks]) - return algos.take_1d(ftypes, self._blknos, allow_fill=False) - - def __getstate__(self): - block_values = [b.values for b in self.blocks] - block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] - axes_array = [ax for ax in self.axes] - - extra_state = { - '0.14.1': { - 'axes': axes_array, - 'blocks': [dict(values=b.values, mgr_locs=b.mgr_locs.indexer) - for b in self.blocks] - } - } - - # First three elements of the state are to maintain forward - # compatibility with 0.13.1. - return axes_array, block_values, block_items, extra_state - - def __setstate__(self, state): - def unpickle_block(values, mgr_locs): - # numpy < 1.7 pickle compat - if values.dtype == 'M8[us]': - values = values.astype('M8[ns]') - return make_block(values, placement=mgr_locs) - - if (isinstance(state, tuple) and len(state) >= 4 and - '0.14.1' in state[3]): - state = state[3]['0.14.1'] - self.axes = [ensure_index(ax) for ax in state['axes']] - self.blocks = tuple(unpickle_block(b['values'], b['mgr_locs']) - for b in state['blocks']) - else: - # discard anything after 3rd, support beta pickling format for a - # little while longer - ax_arrays, bvalues, bitems = state[:3] - - self.axes = [ensure_index(ax) for ax in ax_arrays] - - if len(bitems) == 1 and self.axes[0].equals(bitems[0]): - # This is a workaround for pre-0.14.1 pickles that didn't - # support unpickling multi-block frames/panels with non-unique - # columns/items, because given a manager with items ["a", "b", - # "a"] there's no way of knowing which block's "a" is where. - # - # Single-block case can be supported under the assumption that - # block items corresponded to manager items 1-to-1. 
- all_mgr_locs = [slice(0, len(bitems[0]))] - else: - all_mgr_locs = [self.axes[0].get_indexer(blk_items) - for blk_items in bitems] - - self.blocks = tuple( - unpickle_block(values, mgr_locs) - for values, mgr_locs in zip(bvalues, all_mgr_locs)) - - self._post_setstate() - - def _post_setstate(self): - self._is_consolidated = False - self._known_consolidated = False - self._rebuild_blknos_and_blklocs() - - def __len__(self): - return len(self.items) - - def __unicode__(self): - output = pprint_thing(self.__class__.__name__) - for i, ax in enumerate(self.axes): - if i == 0: - output += u'\nItems: {ax}'.format(ax=ax) - else: - output += u'\nAxis {i}: {ax}'.format(i=i, ax=ax) - - for block in self.blocks: - output += u'\n{block}'.format(block=pprint_thing(block)) - return output - - def _verify_integrity(self): - mgr_shape = self.shape - tot_items = sum(len(x.mgr_locs) for x in self.blocks) - for block in self.blocks: - if block._verify_integrity and block.shape[1:] != mgr_shape[1:]: - construction_error(tot_items, block.shape[1:], self.axes) - if len(self.items) != tot_items: - raise AssertionError('Number of manager items must equal union of ' - 'block items\n# manager items: {0}, # ' - 'tot_items: {1}'.format( - len(self.items), tot_items)) - - def apply(self, f, axes=None, filter=None, do_integrity_check=False, - consolidate=True, **kwargs): - """ - iterate over the blocks, collect and create a new block manager - - Parameters - ---------- - f : the callable or function name to operate on at the block level - axes : optional (if not supplied, use self.axes) - filter : list, if supplied, only call the block if the filter is in - the block - do_integrity_check : boolean, default False. Do the block manager - integrity check - consolidate: boolean, default True. Join together blocks having same - dtype - - Returns - ------- - Block Manager (new object) - - """ - - result_blocks = [] - - # filter kwarg is used in replace-* family of methods - if filter is not None: - filter_locs = set(self.items.get_indexer_for(filter)) - if len(filter_locs) == len(self.items): - # All items are included, as if there were no filtering - filter = None - else: - kwargs['filter'] = filter_locs - - if consolidate: - self._consolidate_inplace() - - if f == 'where': - align_copy = True - if kwargs.get('align', True): - align_keys = ['other', 'cond'] - else: - align_keys = ['cond'] - elif f == 'putmask': - align_copy = False - if kwargs.get('align', True): - align_keys = ['new', 'mask'] - else: - align_keys = ['mask'] - elif f == 'eval': - align_copy = False - align_keys = ['other'] - elif f == 'fillna': - # fillna internally does putmask, maybe it's better to do this - # at mgr, not block level? - align_copy = False - align_keys = ['value'] - else: - align_keys = [] - - # TODO(EA): may interfere with ExtensionBlock.setitem for blocks - # with a .values attribute. 
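# --- illustrative sketch (editorial, not part of this patch) -----------
# apply() is a name-based dispatcher: every BlockManager verb defined
# below (where, fillna, astype, shift, ...) reduces to calling the
# same-named method on each block and collecting whatever blocks come
# back. Its skeleton, stripped of filtering and argument alignment:
def apply_sketch(blocks, f, **kwargs):
    result_blocks = []
    for b in blocks:
        applied = getattr(b, f)(**kwargs)   # e.g. b.fillna(value=0)
        if isinstance(applied, list):       # an op may split a block
            result_blocks.extend(applied)
        else:
            result_blocks.append(applied)
    return result_blocks
# -----------------------------------------------------------------------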
- aligned_args = dict((k, kwargs[k]) - for k in align_keys - if hasattr(kwargs[k], 'values') and - not isinstance(kwargs[k], gt.ABCExtensionArray)) - - for b in self.blocks: - if filter is not None: - if not b.mgr_locs.isin(filter_locs).any(): - result_blocks.append(b) - continue - - if aligned_args: - b_items = self.items[b.mgr_locs.indexer] - - for k, obj in aligned_args.items(): - axis = getattr(obj, '_info_axis_number', 0) - kwargs[k] = obj.reindex(b_items, axis=axis, - copy=align_copy) - - kwargs['mgr'] = self - applied = getattr(b, f)(**kwargs) - result_blocks = _extend_blocks(applied, result_blocks) - - if len(result_blocks) == 0: - return self.make_empty(axes or self.axes) - bm = self.__class__(result_blocks, axes or self.axes, - do_integrity_check=do_integrity_check) - bm._consolidate_inplace() - return bm - - def reduction(self, f, axis=0, consolidate=True, transposed=False, - **kwargs): - """ - iterate over the blocks, collect and create a new block manager. - This routine is intended for reduction type operations and - will do inference on the generated blocks. - - Parameters - ---------- - f: the callable or function name to operate on at the block level - axis: reduction axis, default 0 - consolidate: boolean, default True. Join together blocks having same - dtype - transposed: boolean, default False - we are holding transposed data - - Returns - ------- - Block Manager (new object) - - """ - - if consolidate: - self._consolidate_inplace() - - axes, blocks = [], [] - for b in self.blocks: - kwargs['mgr'] = self - axe, block = getattr(b, f)(axis=axis, **kwargs) - - axes.append(axe) - blocks.append(block) - - # note that some DatetimeTZ, Categorical are always ndim==1 - ndim = {b.ndim for b in blocks} - - if 2 in ndim: - - new_axes = list(self.axes) - - # multiple blocks that are reduced - if len(blocks) > 1: - new_axes[1] = axes[0] - - # reset the placement to the original - for b, sb in zip(blocks, self.blocks): - b.mgr_locs = sb.mgr_locs - - else: - new_axes[axis] = Index(np.concatenate( - [ax.values for ax in axes])) - - if transposed: - new_axes = new_axes[::-1] - blocks = [b.make_block(b.values.T, - placement=np.arange(b.shape[1]) - ) for b in blocks] - - return self.__class__(blocks, new_axes) - - # 0 ndim - if 0 in ndim and 1 not in ndim: - values = np.array([b.values for b in blocks]) - if len(values) == 1: - return values.item() - blocks = [make_block(values, ndim=1)] - axes = Index([ax[0] for ax in axes]) - - # single block - values = _concat._concat_compat([b.values for b in blocks]) - - # compute the orderings of our original data - if len(self.blocks) > 1: - - indexer = np.empty(len(self.axes[0]), dtype=np.intp) - i = 0 - for b in self.blocks: - for j in b.mgr_locs: - indexer[j] = i - i = i + 1 - - values = values.take(indexer) - - return SingleBlockManager( - [make_block(values, - ndim=1, - placement=np.arange(len(values)))], - axes[0]) - - def isna(self, func, **kwargs): - return self.apply('apply', func=func, **kwargs) - - def where(self, **kwargs): - return self.apply('where', **kwargs) - - def eval(self, **kwargs): - return self.apply('eval', **kwargs) - - def quantile(self, **kwargs): - return self.reduction('quantile', **kwargs) - - def setitem(self, **kwargs): - return self.apply('setitem', **kwargs) - - def putmask(self, **kwargs): - return self.apply('putmask', **kwargs) - - def diff(self, **kwargs): - return self.apply('diff', **kwargs) - - def interpolate(self, **kwargs): - return self.apply('interpolate', **kwargs) - - def shift(self, **kwargs): - 
return self.apply('shift', **kwargs) - - def fillna(self, **kwargs): - return self.apply('fillna', **kwargs) - - def downcast(self, **kwargs): - return self.apply('downcast', **kwargs) - - def astype(self, dtype, **kwargs): - return self.apply('astype', dtype=dtype, **kwargs) - - def convert(self, **kwargs): - return self.apply('convert', **kwargs) - - def replace(self, **kwargs): - return self.apply('replace', **kwargs) - - def replace_list(self, src_list, dest_list, inplace=False, regex=False, - mgr=None): - """ do a list replace """ - - inplace = validate_bool_kwarg(inplace, 'inplace') - - if mgr is None: - mgr = self - - # figure out our mask a-priori to avoid repeated replacements - values = self.as_array() - - def comp(s): - if isna(s): - return isna(values) - return _maybe_compare(values, getattr(s, 'asm8', s), operator.eq) - - masks = [comp(s) for i, s in enumerate(src_list)] - - result_blocks = [] - src_len = len(src_list) - 1 - for blk in self.blocks: - - # its possible to get multiple result blocks here - # replace ALWAYS will return a list - rb = [blk if inplace else blk.copy()] - for i, (s, d) in enumerate(zip(src_list, dest_list)): - new_rb = [] - for b in rb: - if b.dtype == np.object_: - convert = i == src_len - result = b.replace(s, d, inplace=inplace, regex=regex, - mgr=mgr, convert=convert) - new_rb = _extend_blocks(result, new_rb) - else: - # get our mask for this element, sized to this - # particular block - m = masks[i][b.mgr_locs.indexer] - if m.any(): - b = b.coerce_to_target_dtype(d) - new_rb.extend(b.putmask(m, d, inplace=True)) - else: - new_rb.append(b) - rb = new_rb - result_blocks.extend(rb) - - bm = self.__class__(result_blocks, self.axes) - bm._consolidate_inplace() - return bm - - # TODO: remove once Panel is gone - def reshape_nd(self, axes, **kwargs): - """ a 2d-nd reshape operation on a BlockManager """ - return self.apply('reshape_nd', axes=axes, **kwargs) - - def is_consolidated(self): - """ - Return True if more than one block with the same dtype - """ - if not self._known_consolidated: - self._consolidate_check() - return self._is_consolidated - - def _consolidate_check(self): - ftypes = [blk.ftype for blk in self.blocks] - self._is_consolidated = len(ftypes) == len(set(ftypes)) - self._known_consolidated = True - - @property - def is_mixed_type(self): - # Warning, consolidation needs to get checked upstairs - self._consolidate_inplace() - return len(self.blocks) > 1 - - @property - def is_numeric_mixed_type(self): - # Warning, consolidation needs to get checked upstairs - self._consolidate_inplace() - return all(block.is_numeric for block in self.blocks) - - @property - def is_datelike_mixed_type(self): - # Warning, consolidation needs to get checked upstairs - self._consolidate_inplace() - return any(block.is_datelike for block in self.blocks) - - @property - def any_extension_types(self): - """Whether any of the blocks in this manager are extension blocks""" - return any(block.is_extension for block in self.blocks) - - @property - def is_view(self): - """ return a boolean if we are a single block and are a view """ - if len(self.blocks) == 1: - return self.blocks[0].is_view - - # It is technically possible to figure out which blocks are views - # e.g. [ b.values.base is not None for b in self.blocks ] - # but then we have the case of possibly some blocks being a view - # and some blocks not. setting in theory is possible on the non-view - # blocks w/o causing a SettingWithCopy raise/warn. 
But this is a bit - # complicated - - return False - - def get_bool_data(self, copy=False): - """ - Parameters - ---------- - copy : boolean, default False - Whether to copy the blocks - """ - self._consolidate_inplace() - return self.combine([b for b in self.blocks if b.is_bool], copy) - - def get_numeric_data(self, copy=False): - """ - Parameters - ---------- - copy : boolean, default False - Whether to copy the blocks - """ - self._consolidate_inplace() - return self.combine([b for b in self.blocks if b.is_numeric], copy) - - def combine(self, blocks, copy=True): - """ return a new manager with the blocks """ - if len(blocks) == 0: - return self.make_empty() - - # FIXME: optimization potential - indexer = np.sort(np.concatenate([b.mgr_locs.as_array - for b in blocks])) - inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) - - new_blocks = [] - for b in blocks: - b = b.copy(deep=copy) - b.mgr_locs = algos.take_1d(inv_indexer, b.mgr_locs.as_array, - axis=0, allow_fill=False) - new_blocks.append(b) - - axes = list(self.axes) - axes[0] = self.items.take(indexer) - - return self.__class__(new_blocks, axes, do_integrity_check=False) - - def get_slice(self, slobj, axis=0): - if axis >= self.ndim: - raise IndexError("Requested axis not found in manager") - - if axis == 0: - new_blocks = self._slice_take_blocks_ax0(slobj) - else: - slicer = [slice(None)] * (axis + 1) - slicer[axis] = slobj - slicer = tuple(slicer) - new_blocks = [blk.getitem_block(slicer) for blk in self.blocks] - - new_axes = list(self.axes) - new_axes[axis] = new_axes[axis][slobj] - - bm = self.__class__(new_blocks, new_axes, do_integrity_check=False) - bm._consolidate_inplace() - return bm - - def __contains__(self, item): - return item in self.items - - @property - def nblocks(self): - return len(self.blocks) - - def copy(self, deep=True, mgr=None): - """ - Make deep or shallow copy of BlockManager - - Parameters - ---------- - deep : boolean o rstring, default True - If False, return shallow copy (do not copy data) - If 'all', copy data and a deep copy of the index - - Returns - ------- - copy : BlockManager - """ - - # this preserves the notion of view copying of axes - if deep: - if deep == 'all': - copy = lambda ax: ax.copy(deep=True) - else: - copy = lambda ax: ax.view() - new_axes = [copy(ax) for ax in self.axes] - else: - new_axes = list(self.axes) - return self.apply('copy', axes=new_axes, deep=deep, - do_integrity_check=False) - - def as_array(self, transpose=False, items=None): - """Convert the blockmanager data into an numpy array. - - Parameters - ---------- - transpose : boolean, default False - If True, transpose the return array - items : list of strings or None - Names of block items that will be included in the returned - array. 
``None`` means that all block items will be used - - Returns - ------- - arr : ndarray - """ - if len(self.blocks) == 0: - arr = np.empty(self.shape, dtype=float) - return arr.transpose() if transpose else arr - - if items is not None: - mgr = self.reindex_axis(items, axis=0) - else: - mgr = self - - if self._is_single_block or not self.is_mixed_type: - arr = mgr.blocks[0].get_values() - else: - arr = mgr._interleave() - - return arr.transpose() if transpose else arr - - def _interleave(self): - """ - Return ndarray from blocks with specified item order - Items must be contained in the blocks - """ - dtype = _interleaved_dtype(self.blocks) - - result = np.empty(self.shape, dtype=dtype) - - if result.shape[0] == 0: - # Workaround for numpy 1.7 bug: - # - # >>> a = np.empty((0,10)) - # >>> a[slice(0,0)] - # array([], shape=(0, 10), dtype=float64) - # >>> a[[]] - # Traceback (most recent call last): - # File "", line 1, in - # IndexError: index 0 is out of bounds for axis 0 with size 0 - return result - - itemmask = np.zeros(self.shape[0]) - - for blk in self.blocks: - rl = blk.mgr_locs - result[rl.indexer] = blk.get_values(dtype) - itemmask[rl.indexer] = 1 - - if not itemmask.all(): - raise AssertionError('Some items were not contained in blocks') - - return result - - def to_dict(self, copy=True): - """ - Return a dict of str(dtype) -> BlockManager - - Parameters - ---------- - copy : boolean, default True - - Returns - ------- - values : a dict of dtype -> BlockManager - - Notes - ----- - This consolidates based on str(dtype) - """ - self._consolidate_inplace() - - bd = {} - for b in self.blocks: - bd.setdefault(str(b.dtype), []).append(b) - - return {dtype: self.combine(blocks, copy=copy) - for dtype, blocks in bd.items()} - - def xs(self, key, axis=1, copy=True, takeable=False): - if axis < 1: - raise AssertionError( - 'Can only take xs across axis >= 1, got {ax}'.format(ax=axis)) - - # take by position - if takeable: - loc = key - else: - loc = self.axes[axis].get_loc(key) - - slicer = [slice(None, None) for _ in range(self.ndim)] - slicer[axis] = loc - slicer = tuple(slicer) - - new_axes = list(self.axes) - - # could be an array indexer! 
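# --- illustrative sketch (editorial, not part of this patch) -----------
# _interleave above materializes a mixed-dtype manager by allocating one
# array in the common dtype and scattering each block's rows through its
# mgr_locs, with an item mask to verify full coverage:
import numpy as np

def interleave(blocks, shape, dtype):
    # blocks: list of (mgr_locs, 2-D values) pairs
    result = np.empty(shape, dtype=dtype)
    itemmask = np.zeros(shape[0], dtype=bool)
    for locs, values in blocks:
        result[locs] = values
        itemmask[locs] = True
    if not itemmask.all():
        raise AssertionError('Some items were not contained in blocks')
    return result

# interleave([([0, 2], np.zeros((2, 3))), ([1], np.ones((1, 3)))],
#            shape=(3, 3), dtype=float)
# -----------------------------------------------------------------------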
- if isinstance(loc, (slice, np.ndarray)): - new_axes[axis] = new_axes[axis][loc] - else: - new_axes.pop(axis) - - new_blocks = [] - if len(self.blocks) > 1: - # we must copy here as we are mixed type - for blk in self.blocks: - newb = make_block(values=blk.values[slicer], - klass=blk.__class__, - placement=blk.mgr_locs) - new_blocks.append(newb) - elif len(self.blocks) == 1: - block = self.blocks[0] - vals = block.values[slicer] - if copy: - vals = vals.copy() - new_blocks = [make_block(values=vals, - placement=block.mgr_locs, - klass=block.__class__)] - - return self.__class__(new_blocks, new_axes) - - def fast_xs(self, loc): - """ - get a cross sectional for a given location in the - items ; handle dups - - return the result, is *could* be a view in the case of a - single block - """ - if len(self.blocks) == 1: - return self.blocks[0].iget((slice(None), loc)) - - items = self.items - - # non-unique (GH4726) - if not items.is_unique: - result = self._interleave() - if self.ndim == 2: - result = result.T - return result[loc] - - # unique - dtype = _interleaved_dtype(self.blocks) - n = len(items) - result = np.empty(n, dtype=dtype) - for blk in self.blocks: - # Such assignment may incorrectly coerce NaT to None - # result[blk.mgr_locs] = blk._slice((slice(None), loc)) - for i, rl in enumerate(blk.mgr_locs): - result[rl] = blk._try_coerce_result(blk.iget((i, loc))) - - return result - - def consolidate(self): - """ - Join together blocks having same dtype - - Returns - ------- - y : BlockManager - """ - if self.is_consolidated(): - return self - - bm = self.__class__(self.blocks, self.axes) - bm._is_consolidated = False - bm._consolidate_inplace() - return bm - - def _consolidate_inplace(self): - if not self.is_consolidated(): - self.blocks = tuple(_consolidate(self.blocks)) - self._is_consolidated = True - self._known_consolidated = True - self._rebuild_blknos_and_blklocs() - - def get(self, item, fastpath=True): - """ - Return values for selected item (ndarray or BlockManager). - """ - if self.items.is_unique: - - if not isna(item): - loc = self.items.get_loc(item) - else: - indexer = np.arange(len(self.items))[isna(self.items)] - - # allow a single nan location indexer - if not is_scalar(indexer): - if len(indexer) == 1: - loc = indexer.item() - else: - raise ValueError("cannot label index with a null key") - - return self.iget(loc, fastpath=fastpath) - else: - - if isna(item): - raise TypeError("cannot label index with a null key") - - indexer = self.items.get_indexer_for([item]) - return self.reindex_indexer(new_axis=self.items[indexer], - indexer=indexer, axis=0, - allow_dups=True) - - def iget(self, i, fastpath=True): - """ - Return the data as a SingleBlockManager if fastpath=True and possible - - Otherwise return as a ndarray - """ - block = self.blocks[self._blknos[i]] - values = block.iget(self._blklocs[i]) - if not fastpath or not block._box_to_block_values or values.ndim != 1: - return values - - # fastpath shortcut for select a single-dim from a 2-dim BM - return SingleBlockManager( - [block.make_block_same_class(values, - placement=slice(0, len(values)), - ndim=1)], - self.axes[1]) - - def get_scalar(self, tup): - """ - Retrieve single item - """ - full_loc = [ax.get_loc(x) for ax, x in zip(self.axes, tup)] - blk = self.blocks[self._blknos[full_loc[0]]] - values = blk.values - - # FIXME: this may return non-upcasted types? 
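# --- illustrative sketch (editorial, not part of this patch) -----------
# fast_xs above is the transpose of _interleave: to pull one cross
# section `loc` across every block, gather element (i, loc) of each block
# into the slot that its i-th mgr_loc points at:
import numpy as np

def fast_xs_sketch(blocks, n_items, loc, dtype):
    result = np.empty(n_items, dtype=dtype)
    for locs, values in blocks:          # (mgr_locs, 2-D values) pairs
        for i, rl in enumerate(locs):
            result[rl] = values[i, loc]
    return result

# fast_xs_sketch([([0, 2], np.arange(6).reshape(2, 3)),
#                 ([1], np.ones((1, 3)))], n_items=3, loc=1, dtype=float)
# -> array([1., 1., 4.])
# -----------------------------------------------------------------------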
- if values.ndim == 1: - return values[full_loc[1]] - - full_loc[0] = self._blklocs[full_loc[0]] - return values[tuple(full_loc)] - - def delete(self, item): - """ - Delete selected item (items if non-unique) in-place. - """ - indexer = self.items.get_loc(item) - - is_deleted = np.zeros(self.shape[0], dtype=np.bool_) - is_deleted[indexer] = True - ref_loc_offset = -is_deleted.cumsum() - - is_blk_deleted = [False] * len(self.blocks) - - if isinstance(indexer, int): - affected_start = indexer - else: - affected_start = is_deleted.nonzero()[0][0] - - for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]): - blk = self.blocks[blkno] - bml = blk.mgr_locs - blk_del = is_deleted[bml.indexer].nonzero()[0] - - if len(blk_del) == len(bml): - is_blk_deleted[blkno] = True - continue - elif len(blk_del) != 0: - blk.delete(blk_del) - bml = blk.mgr_locs - - blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer]) - - # FIXME: use Index.delete as soon as it uses fastpath=True - self.axes[0] = self.items[~is_deleted] - self.blocks = tuple(b for blkno, b in enumerate(self.blocks) - if not is_blk_deleted[blkno]) - self._shape = None - self._rebuild_blknos_and_blklocs() - - def set(self, item, value, check=False): - """ - Set new item in-place. Does not consolidate. Adds new Block if not - contained in the current set of items - if check, then validate that we are not setting the same data in-place - """ - # FIXME: refactor, clearly separate broadcasting & zip-like assignment - # can prob also fix the various if tests for sparse/categorical - - # TODO(EA): Remove an is_extension_ when all extension types satisfy - # the interface - value_is_extension_type = (is_extension_type(value) or - is_extension_array_dtype(value)) - - # categorical/spares/datetimetz - if value_is_extension_type: - - def value_getitem(placement): - return value - else: - if value.ndim == self.ndim - 1: - value = _safe_reshape(value, (1,) + value.shape) - - def value_getitem(placement): - return value - else: - - def value_getitem(placement): - return value[placement.indexer] - - if value.shape[1:] != self.shape[1:]: - raise AssertionError('Shape of new values must be compatible ' - 'with manager shape') - - try: - loc = self.items.get_loc(item) - except KeyError: - # This item wasn't present, just insert at end - self.insert(len(self.items), item, value) - return - - if isinstance(loc, int): - loc = [loc] - - blknos = self._blknos[loc] - blklocs = self._blklocs[loc].copy() - - unfit_mgr_locs = [] - unfit_val_locs = [] - removed_blknos = [] - for blkno, val_locs in _get_blkno_placements(blknos, len(self.blocks), - group=True): - blk = self.blocks[blkno] - blk_locs = blklocs[val_locs.indexer] - if blk.should_store(value): - blk.set(blk_locs, value_getitem(val_locs), check=check) - else: - unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) - unfit_val_locs.append(val_locs) - - # If all block items are unfit, schedule the block for removal. 
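# --- illustrative sketch (editorial, not part of this patch) -----------
# delete() above avoids rebuilding blocks it can keep: it marks deleted
# item positions, then shifts every surviving mgr_loc left by the number
# of deletions at or before it, i.e. a cumulative sum of the deletion
# mask:
import numpy as np

n_items = 5
is_deleted = np.zeros(n_items, dtype=bool)
is_deleted[[1, 3]] = True
ref_loc_offset = -is_deleted.cumsum()              # [0, -1, -1, -2, -2]
surviving = np.flatnonzero(~is_deleted)            # [0, 2, 4]
new_locs = surviving + ref_loc_offset[surviving]   # [0, 1, 2]
# -----------------------------------------------------------------------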
-                # If all block items are unfit, schedule the block for
-                # removal.
-                if len(val_locs) == len(blk.mgr_locs):
-                    removed_blknos.append(blkno)
-                else:
-                    self._blklocs[blk.mgr_locs.indexer] = -1
-                    blk.delete(blk_locs)
-                    self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk))
-
-        if len(removed_blknos):
-            # Remove blocks & update blknos accordingly
-            is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
-            is_deleted[removed_blknos] = True
-
-            new_blknos = np.empty(self.nblocks, dtype=np.int64)
-            new_blknos.fill(-1)
-            new_blknos[~is_deleted] = np.arange(self.nblocks -
-                                                len(removed_blknos))
-            self._blknos = algos.take_1d(new_blknos, self._blknos, axis=0,
-                                         allow_fill=False)
-            self.blocks = tuple(blk for i, blk in enumerate(self.blocks)
-                                if i not in set(removed_blknos))
-
-        if unfit_val_locs:
-            unfit_mgr_locs = np.concatenate(unfit_mgr_locs)
-            unfit_count = len(unfit_mgr_locs)
-
-            new_blocks = []
-            if value_is_extension_type:
-                # This code (ab-)uses the fact that sparse blocks contain only
-                # one item.
-                new_blocks.extend(
-                    make_block(values=value.copy(), ndim=self.ndim,
-                               placement=slice(mgr_loc, mgr_loc + 1))
-                    for mgr_loc in unfit_mgr_locs)
-
-                self._blknos[unfit_mgr_locs] = (np.arange(unfit_count) +
-                                                len(self.blocks))
-                self._blklocs[unfit_mgr_locs] = 0
-
-            else:
-                # unfit_val_locs contains BlockPlacement objects
-                unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])
-
-                new_blocks.append(
-                    make_block(values=value_getitem(unfit_val_items),
-                               ndim=self.ndim, placement=unfit_mgr_locs))
-
-                self._blknos[unfit_mgr_locs] = len(self.blocks)
-                self._blklocs[unfit_mgr_locs] = np.arange(unfit_count)
-
-            self.blocks += tuple(new_blocks)
-
-            # Newly created block's dtype may already be present.
-            self._known_consolidated = False
-
-    def insert(self, loc, item, value, allow_duplicates=False):
-        """
-        Insert item at selected position.
-
-        Parameters
-        ----------
-        loc : int
-        item : hashable
-        value : array_like
-        allow_duplicates : bool
-            If False, trying to insert a non-unique item will raise a
-            ValueError
-
-        """
-        if not allow_duplicates and item in self.items:
-            # Should this be a different kind of error??
-            raise ValueError('cannot insert {}, already exists'.format(item))
-
-        if not isinstance(loc, int):
-            raise TypeError("loc must be int")
-
-        # insert to the axis; this could possibly raise a TypeError
-        new_axis = self.items.insert(loc, item)
-
-        block = make_block(values=value, ndim=self.ndim,
-                           placement=slice(loc, loc + 1))
-
-        for blkno, count in _fast_count_smallints(self._blknos[loc:]):
-            blk = self.blocks[blkno]
-            if count == len(blk.mgr_locs):
-                blk.mgr_locs = blk.mgr_locs.add(1)
-            else:
-                new_mgr_locs = blk.mgr_locs.as_array.copy()
-                new_mgr_locs[new_mgr_locs >= loc] += 1
-                blk.mgr_locs = new_mgr_locs
-
-        if loc == self._blklocs.shape[0]:
-            # np.append is a lot faster (at least in numpy 1.7.1), let's use
-            # it if we can.
-            self._blklocs = np.append(self._blklocs, 0)
-            self._blknos = np.append(self._blknos, len(self.blocks))
-        else:
-            self._blklocs = np.insert(self._blklocs, loc, 0)
-            self._blknos = np.insert(self._blknos, loc, len(self.blocks))
-
-        self.axes[0] = new_axis
-        self.blocks += (block,)
-        self._shape = None
-
-        self._known_consolidated = False
-
-        if len(self.blocks) > 100:
-            self._consolidate_inplace()
-
-    def reindex_axis(self, new_index, axis, method=None, limit=None,
-                     fill_value=None, copy=True):
-        """
-        Conform block manager to new index.
- """ - new_index = ensure_index(new_index) - new_index, indexer = self.axes[axis].reindex(new_index, method=method, - limit=limit) - - return self.reindex_indexer(new_index, indexer, axis=axis, - fill_value=fill_value, copy=copy) - - def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, - allow_dups=False, copy=True): - """ - Parameters - ---------- - new_axis : Index - indexer : ndarray of int64 or None - axis : int - fill_value : object - allow_dups : bool - - pandas-indexer with -1's only. - """ - if indexer is None: - if new_axis is self.axes[axis] and not copy: - return self - - result = self.copy(deep=copy) - result.axes = list(self.axes) - result.axes[axis] = new_axis - return result - - self._consolidate_inplace() - - # some axes don't allow reindexing with dups - if not allow_dups: - self.axes[axis]._can_reindex(indexer) - - if axis >= self.ndim: - raise IndexError("Requested axis not found in manager") - - if axis == 0: - new_blocks = self._slice_take_blocks_ax0(indexer, - fill_tuple=(fill_value,)) - else: - new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=( - fill_value if fill_value is not None else blk.fill_value,)) - for blk in self.blocks] - - new_axes = list(self.axes) - new_axes[axis] = new_axis - return self.__class__(new_blocks, new_axes) - - def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): - """ - Slice/take blocks along axis=0. - - Overloaded for SingleBlock - - Returns - ------- - new_blocks : list of Block - - """ - - allow_fill = fill_tuple is not None - - sl_type, slobj, sllen = _preprocess_slice_or_indexer( - slice_or_indexer, self.shape[0], allow_fill=allow_fill) - - if self._is_single_block: - blk = self.blocks[0] - - if sl_type in ('slice', 'mask'): - return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] - elif not allow_fill or self.ndim == 1: - if allow_fill and fill_tuple[0] is None: - _, fill_value = maybe_promote(blk.dtype) - fill_tuple = (fill_value, ) - - return [blk.take_nd(slobj, axis=0, - new_mgr_locs=slice(0, sllen), - fill_tuple=fill_tuple)] - - if sl_type in ('slice', 'mask'): - blknos = self._blknos[slobj] - blklocs = self._blklocs[slobj] - else: - blknos = algos.take_1d(self._blknos, slobj, fill_value=-1, - allow_fill=allow_fill) - blklocs = algos.take_1d(self._blklocs, slobj, fill_value=-1, - allow_fill=allow_fill) - - # When filling blknos, make sure blknos is updated before appending to - # blocks list, that way new blkno is exactly len(blocks). - # - # FIXME: mgr_groupby_blknos must return mgr_locs in ascending order, - # pytables serialization will break otherwise. - blocks = [] - for blkno, mgr_locs in _get_blkno_placements(blknos, len(self.blocks), - group=True): - if blkno == -1: - # If we've got here, fill_tuple was not None. - fill_value = fill_tuple[0] - - blocks.append(self._make_na_block(placement=mgr_locs, - fill_value=fill_value)) - else: - blk = self.blocks[blkno] - - # Otherwise, slicing along items axis is necessary. - if not blk._can_consolidate: - # A non-consolidatable block, it's easy, because there's - # only one item and each mgr loc is a copy of that single - # item. 
-                    for mgr_loc in mgr_locs:
-                        newblk = blk.copy(deep=True)
-                        newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1)
-                        blocks.append(newblk)
-
-                else:
-                    blocks.append(blk.take_nd(blklocs[mgr_locs.indexer],
-                                              axis=0, new_mgr_locs=mgr_locs,
-                                              fill_tuple=None))
-
-        return blocks
-
-    def _make_na_block(self, placement, fill_value=None):
-        # TODO: infer dtypes other than float64 from fill_value
-
-        if fill_value is None:
-            fill_value = np.nan
-        block_shape = list(self.shape)
-        block_shape[0] = len(placement)
-
-        dtype, fill_value = infer_dtype_from_scalar(fill_value)
-        block_values = np.empty(block_shape, dtype=dtype)
-        block_values.fill(fill_value)
-        return make_block(block_values, placement=placement)
-
-    def take(self, indexer, axis=1, verify=True, convert=True):
-        """
-        Take items along any axis.
-        """
-        self._consolidate_inplace()
-        indexer = (np.arange(indexer.start, indexer.stop, indexer.step,
-                             dtype='int64')
-                   if isinstance(indexer, slice)
-                   else np.asanyarray(indexer, dtype='int64'))
-
-        n = self.shape[axis]
-        if convert:
-            indexer = maybe_convert_indices(indexer, n)
-
-        if verify:
-            if ((indexer == -1) | (indexer >= n)).any():
-                raise Exception('Indices must be non-negative and less than '
-                                'the axis length')
-
-        new_labels = self.axes[axis].take(indexer)
-        return self.reindex_indexer(new_axis=new_labels, indexer=indexer,
-                                    axis=axis, allow_dups=True)
-
-    def merge(self, other, lsuffix='', rsuffix=''):
-        if not self._is_indexed_like(other):
-            raise AssertionError('Must have same axes to merge managers')
-
-        l, r = items_overlap_with_suffix(left=self.items, lsuffix=lsuffix,
-                                         right=other.items, rsuffix=rsuffix)
-        new_items = _concat_indexes([l, r])
-
-        new_blocks = [blk.copy(deep=False) for blk in self.blocks]
-
-        offset = self.shape[0]
-        for blk in other.blocks:
-            blk = blk.copy(deep=False)
-            blk.mgr_locs = blk.mgr_locs.add(offset)
-            new_blocks.append(blk)
-
-        new_axes = list(self.axes)
-        new_axes[0] = new_items
-
-        return self.__class__(_consolidate(new_blocks), new_axes)
-
-    def _is_indexed_like(self, other):
-        """
-        Check all axes except items
-        """
-        if self.ndim != other.ndim:
-            raise AssertionError(
-                'Number of dimensions must agree, got {ndim} and '
-                '{oth_ndim}'.format(ndim=self.ndim, oth_ndim=other.ndim))
-        for ax, oax in zip(self.axes[1:], other.axes[1:]):
-            if not ax.equals(oax):
-                return False
-        return True
-
-    def equals(self, other):
-        self_axes, other_axes = self.axes, other.axes
-        if len(self_axes) != len(other_axes):
-            return False
-        if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)):
-            return False
-        self._consolidate_inplace()
-        other._consolidate_inplace()
-        if len(self.blocks) != len(other.blocks):
-            return False
-
-        # canonicalize block order, using a tuple combining the type
-        # name and then mgr_locs, because there might be unconsolidated
-        # blocks (say, Categorical) which can only be distinguished by
-        # the iteration order
-        def canonicalize(block):
-            return (block.dtype.name, block.mgr_locs.as_array.tolist())
-
-        self_blocks = sorted(self.blocks, key=canonicalize)
-        other_blocks = sorted(other.blocks, key=canonicalize)
-        return all(block.equals(oblock)
-                   for block, oblock in zip(self_blocks, other_blocks))
-
-    def unstack(self, unstacker_func):
-        """Return a BlockManager with all blocks unstacked.
-
-        Parameters
-        ----------
-        unstacker_func : callable
-            A (partially-applied) ``pd.core.reshape._Unstacker`` class.
-
-        Returns
-        -------
-        unstacked : BlockManager
-        """
-        dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items)
-        new_columns = dummy.get_new_columns()
-        new_index = dummy.get_new_index()
-        new_blocks = []
-        columns_mask = []
-
-        for blk in self.blocks:
-            blocks, mask = blk._unstack(
-                partial(unstacker_func,
-                        value_columns=self.items[blk.mgr_locs.indexer]),
-                new_columns)
-
-            new_blocks.extend(blocks)
-            columns_mask.extend(mask)
-
-        new_columns = new_columns[columns_mask]
-
-        bm = BlockManager(new_blocks, [new_columns, new_index])
-        return bm
-
-
-class SingleBlockManager(BlockManager):
-    """ manage a single block with a single axis """
-
-    ndim = 1
-    _is_consolidated = True
-    _known_consolidated = True
-    __slots__ = ()
-
-    def __init__(self, block, axis, do_integrity_check=False, fastpath=False):
-
-        if isinstance(axis, list):
-            if len(axis) != 1:
-                raise ValueError("cannot create SingleBlockManager with more "
-                                 "than 1 axis")
-            axis = axis[0]
-
-        # passed from constructor, single block, single axis
-        if fastpath:
-            self.axes = [axis]
-            if isinstance(block, list):
-
-                # empty block
-                if len(block) == 0:
-                    block = [np.array([])]
-                elif len(block) != 1:
-                    raise ValueError('Cannot create SingleBlockManager with '
-                                     'more than 1 block')
-                block = block[0]
-        else:
-            self.axes = [ensure_index(axis)]
-
-            # create the block here
-            if isinstance(block, list):
-
-                # consolidate to the interleaved dtype first
-                if len(block) > 1:
-                    dtype = _interleaved_dtype(block)
-                    block = [b.astype(dtype) for b in block]
-                    block = _consolidate(block)
-
-                if len(block) != 1:
-                    raise ValueError('Cannot create SingleBlockManager with '
-                                     'more than 1 block')
-                block = block[0]
-
-            if not isinstance(block, Block):
-                block = make_block(block, placement=slice(0, len(axis)),
-                                   ndim=1)
-
-        self.blocks = [block]
-
-    def _post_setstate(self):
-        pass
-
-    @property
-    def _block(self):
-        return self.blocks[0]
-
-    @property
-    def _values(self):
-        return self._block.values
-
-    @property
-    def _blknos(self):
-        """ compat with BlockManager """
-        return None
-
-    @property
-    def _blklocs(self):
-        """ compat with BlockManager """
-        return None
-
-    def get_slice(self, slobj, axis=0):
-        if axis >= self.ndim:
-            raise IndexError("Requested axis not found in manager")
-
-        return self.__class__(self._block._slice(slobj),
-                              self.index[slobj], fastpath=True)
-
-    @property
-    def index(self):
-        return self.axes[0]
-
-    def convert(self, **kwargs):
-        """ convert the whole block as one """
-        kwargs['by_item'] = False
-        return self.apply('convert', **kwargs)
-
-    @property
-    def dtype(self):
-        return self._block.dtype
-
-    @property
-    def array_dtype(self):
-        return self._block.array_dtype
-
-    @property
-    def ftype(self):
-        return self._block.ftype
-
-    def get_dtype_counts(self):
-        return {self.dtype.name: 1}
-
-    def get_ftype_counts(self):
-        return {self.ftype: 1}
-
-    def get_dtypes(self):
-        return np.array([self._block.dtype])
-
-    def get_ftypes(self):
-        return np.array([self._block.ftype])
-
-    def external_values(self):
-        return self._block.external_values()
-
-    def internal_values(self):
-        return self._block.internal_values()
-
-    def formatting_values(self):
-        """Return the internal values used by the DataFrame/SeriesFormatter"""
-        return self._block.formatting_values()
-
-    def get_values(self):
-        """ return a dense type view """
-        return np.array(self._block.to_dense(), copy=False)
-
-    @property
-    def asobject(self):
-        """
-        return an object dtype array; datetime/timedelta-like values are
-        boxed to Timestamp/Timedelta instances.
- """ - return self._block.get_values(dtype=object) - - @property - def _can_hold_na(self): - return self._block._can_hold_na - - def is_consolidated(self): - return True - - def _consolidate_check(self): - pass - - def _consolidate_inplace(self): - pass - - def delete(self, item): - """ - Delete single item from SingleBlockManager. - - Ensures that self.blocks doesn't become empty. - """ - loc = self.items.get_loc(item) - self._block.delete(loc) - self.axes[0] = self.axes[0].delete(loc) - - def fast_xs(self, loc): - """ - fast path for getting a cross-section - return a view of the data - """ - return self._block.values[loc] - - def concat(self, to_concat, new_axis): - """ - Concatenate a list of SingleBlockManagers into a single - SingleBlockManager. - - Used for pd.concat of Series objects with axis=0. - - Parameters - ---------- - to_concat : list of SingleBlockManagers - new_axis : Index of the result - - Returns - ------- - SingleBlockManager - - """ - non_empties = [x for x in to_concat if len(x) > 0] - - # check if all series are of the same block type: - if len(non_empties) > 0: - blocks = [obj.blocks[0] for obj in non_empties] - - if all(type(b) is type(blocks[0]) for b in blocks[1:]): # noqa - new_block = blocks[0].concat_same_type(blocks) - else: - values = [x.values for x in blocks] - values = _concat._concat_compat(values) - new_block = make_block( - values, placement=slice(0, len(values), 1)) - else: - values = [x._block.values for x in to_concat] - values = _concat._concat_compat(values) - new_block = make_block( - values, placement=slice(0, len(values), 1)) - - mgr = SingleBlockManager(new_block, new_axis) - return mgr - - -# ------------------------------------------------------------------ -# Manager Helpers - Used only by BlockManager Methods (or each other) - -def _consolidate(blocks): - """ - Merge blocks having same dtype, exclude non-consolidating blocks - """ - - # sort by _can_consolidate, dtype - gkey = lambda x: x._consolidate_key - grouper = itertools.groupby(sorted(blocks, key=gkey), gkey) - - new_blocks = [] - for (_can_consolidate, dtype), group_blocks in grouper: - merged_blocks = _merge_blocks(list(group_blocks), dtype=dtype, - _can_consolidate=_can_consolidate) - new_blocks = _extend_blocks(merged_blocks, new_blocks) - return new_blocks - - -def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill): - if isinstance(slice_or_indexer, slice): - return ('slice', slice_or_indexer, - libinternals.slice_len(slice_or_indexer, length)) - elif (isinstance(slice_or_indexer, np.ndarray) and - slice_or_indexer.dtype == np.bool_): - return 'mask', slice_or_indexer, slice_or_indexer.sum() - else: - indexer = np.asanyarray(slice_or_indexer, dtype=np.int64) - if not allow_fill: - indexer = maybe_convert_indices(indexer, length) - return 'fancy', indexer, len(indexer) - - -def _maybe_compare(a, b, op): - - is_a_array = isinstance(a, np.ndarray) - is_b_array = isinstance(b, np.ndarray) - - # numpy deprecation warning to have i8 vs integer comparisons - if is_datetimelike_v_numeric(a, b): - result = False - - # numpy deprecation warning if comparing numeric vs string-like - elif is_numeric_v_string_like(a, b): - result = False - - else: - result = op(a, b) - - if is_scalar(result) and (is_a_array or is_b_array): - type_names = [type(a).__name__, type(b).__name__] - - if is_a_array: - type_names[0] = 'ndarray(dtype={dtype})'.format(dtype=a.dtype) - - if is_b_array: - type_names[1] = 'ndarray(dtype={dtype})'.format(dtype=b.dtype) - - raise TypeError( - "Cannot 
-            "Cannot compare types {a!r} and {b!r}".format(a=type_names[0],
-                                                          b=type_names[1]))
-    return result
-
-
-def _interleaved_dtype(blocks):
-    if not len(blocks):
-        return None
-
-    dtype = find_common_type([b.dtype for b in blocks])
-
-    # only numpy compat
-    if isinstance(dtype, (PandasExtensionDtype, ExtensionDtype)):
-        dtype = np.object
-
-    return dtype
-
-
-def _fast_count_smallints(arr):
-    """Count the occurrences of each value in an array of small ints,
-    returning an array of (value, count) pairs."""
-    if len(arr) == 0:
-        # Handle empty arr case separately: numpy 1.6 chokes on that.
-        return np.empty((0, 2), dtype=arr.dtype)
-    else:
-        counts = np.bincount(arr.astype(np.int_))
-        nz = counts.nonzero()[0]
-        return np.c_[nz, counts[nz]]
-
-
-# ------------------------------------------------------------------
-# Externally Used
-
-# also used in concat
-def _get_blkno_placements(blknos, blk_count, group=True):
-    """
-    Parameters
-    ----------
-    blknos : array of int64
-    blk_count : int
-    group : bool
-
-    Returns
-    -------
-    iterator
-        yield (blkno, BlockPlacement)
-    """
-    blknos = ensure_int64(blknos)
-
-    # FIXME: blk_count is unused, but it may avoid the use of dicts in cython
-    for blkno, indexer in libinternals.get_blkno_indexers(blknos, group):
-        yield blkno, libinternals.BlockPlacement(indexer)
-
-
-# also used in reshape.merge
-def items_overlap_with_suffix(left, lsuffix, right, rsuffix):
-    """
-    If two indices overlap, add suffixes to overlapping entries.
-
-    If the corresponding suffix is empty, the entry is simply converted to
-    a string.
-    """
-    to_rename = left.intersection(right)
-    if len(to_rename) == 0:
-        return left, right
-    else:
-        if not lsuffix and not rsuffix:
-            raise ValueError('columns overlap but no suffix specified: '
-                             '{rename}'.format(rename=to_rename))
-
-        def lrenamer(x):
-            if x in to_rename:
-                return '{x}{lsuffix}'.format(x=x, lsuffix=lsuffix)
-            return x
-
-        def rrenamer(x):
-            if x in to_rename:
-                return '{x}{rsuffix}'.format(x=x, rsuffix=rsuffix)
-            return x
-
-        return (_transform_index(left, lrenamer),
-                _transform_index(right, rrenamer))
-
-
-# ------------------------------------------------------------------
-# Index-Related
-
-def _transform_index(index, func, level=None):
-    """
-    Apply a function to all values found in the index.
-
-    This includes transforming multiindex entries separately.
-    Only apply the function to one level of a MultiIndex if level is
-    specified.
-    """
-    if isinstance(index, MultiIndex):
-        if level is not None:
-            items = [tuple(func(y) if i == level else y
-                           for i, y in enumerate(x)) for x in index]
-        else:
-            items = [tuple(func(y) for y in x) for x in index]
-        return MultiIndex.from_tuples(items, names=index.names)
-    else:
-        items = [func(x) for x in index]
-        return Index(items, name=index.name, tupleize_cols=False)
-
-
-def _concat_indexes(indexes):
-    return indexes[0].append(indexes[1:])
-
-
-# ------------------------------------------------------------------
-# Constructor Helpers
-
-def create_block_manager_from_blocks(blocks, axes):
-    try:
-        if len(blocks) == 1 and not isinstance(blocks[0], Block):
-            # if blocks[0] is of length 0, return empty blocks
-            if not len(blocks[0]):
-                blocks = []
-            else:
-                # It's OK if a single block is passed as values: its
-                # placement is basically "all items", but if there are many,
-                # don't bother converting, it's an error anyway.
-                blocks = [make_block(values=blocks[0],
-                                     placement=slice(0, len(axes[0])))]
-
-        mgr = BlockManager(blocks, axes)
-        mgr._consolidate_inplace()
-        return mgr
-
-    except ValueError as e:
-        blocks = [getattr(b, 'values', b) for b in blocks]
-        tot_items = sum(b.shape[0] for b in blocks)
-        construction_error(tot_items, blocks[0].shape[1:], axes, e)
-
-
-def create_block_manager_from_arrays(arrays, names, axes):
-
-    try:
-        blocks = form_blocks(arrays, names, axes)
-        mgr = BlockManager(blocks, axes)
-        mgr._consolidate_inplace()
-        return mgr
-    except ValueError as e:
-        construction_error(len(arrays), arrays[0].shape, axes, e)
-
-
-def construction_error(tot_items, block_shape, axes, e=None):
-    """ raise a helpful message about our construction """
-    passed = tuple(map(int, [tot_items] + list(block_shape)))
-    implied = tuple(map(int, [len(ax) for ax in axes]))
-    if passed == implied and e is not None:
-        raise e
-    if block_shape[0] == 0:
-        raise ValueError("Empty data passed with indices specified.")
-    raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
-        passed, implied))
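A few worked examples tie the pieces above together. First, `reindex_indexer`: a -1 in the indexer marks a position to fill, which `_slice_take_blocks_ax0` materializes as an all-NaN block via `_make_na_block`; `take` funnels into the same path with `allow_dups=True`. An illustrative sketch against the 0.23-era private API (`._data` is an implementation detail):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': [1, 2], 'b': [3., 4.]})
    mgr = df._data

    new_mgr = mgr.reindex_indexer(new_axis=pd.Index(['b', 'x', 'a']),
                                  indexer=np.array([1, -1, 0]), axis=0)
    print(pd.DataFrame(new_mgr))
    #      b   x  a
    # 0  3.0 NaN  1
    # 1  4.0 NaN  2

Note that 'a' keeps its int64 dtype: only the fill column 'x' is created as a fresh float64 NaN block.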
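Second, consolidation. `insert` appends a block without merging, so a frame can temporarily hold several blocks of the same dtype; `_consolidate_inplace` (via the module-level `_consolidate`) merges them back by their `_consolidate_key`. A sketch of that lifecycle, with the same private-API caveat:

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2]})
    df['b'] = [3, 4]                 # set() misses -> insert() appends a block
    print(len(df._data.blocks))      # 2 unconsolidated int64 blocks

    df._data._consolidate_inplace()  # merged by (_can_consolidate, dtype)
    print(len(df._data.blocks))      # 1

In practice pandas triggers this lazily (note the `len(self.blocks) > 100` heuristic in `insert` above), so user code rarely observes the intermediate state.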
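Third, `SingleBlockManager.concat` is the machinery underneath `pd.concat` of Series along axis=0: same-typed blocks take the `concat_same_type` fast path, while mixed block types fall back to `_concat._concat_compat`, which is why dtypes widen. Observable through the public API:

    import pandas as pd

    s1 = pd.Series([1, 2])
    print(pd.concat([s1, pd.Series([3, 4])]).dtype)  # int64: fast path
    print(pd.concat([s1, pd.Series([.5])]).dtype)    # float64: fallback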
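Fourth, `items_overlap_with_suffix` (re-exported for `reshape.merge`) renames only the overlapping labels; non-overlapping ones pass through untouched, and an overlap with neither suffix set raises ValueError. A quick check, assuming the import location this patch establishes:

    import pandas as pd
    from pandas.core.internals import items_overlap_with_suffix

    left = pd.Index(['key', 'value'])
    right = pd.Index(['key', 'other'])
    l, r = items_overlap_with_suffix(left, '_x', right, '_y')
    print(list(l), list(r))  # ['key_x', 'value'] ['key_y', 'other']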
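Finally, `construction_error` is where the familiar shape-mismatch message comes from: the ValueError raised while forming blocks is translated into passed-vs-implied shapes. Reproducible from the public constructor:

    import numpy as np
    import pandas as pd

    try:
        pd.DataFrame(np.zeros((4, 3)), columns=['a', 'b'])  # 3 cols, 2 labels
    except ValueError as err:
        print(err)  # Shape of passed values is (3, 4), indices imply (2, 4)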