From 8d3cb3f36e2f7b415531e3b910f490c01657ecca Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 29 Jul 2014 10:35:14 -0400 Subject: [PATCH] API/COMPAT: Index no longer an ndarray sub-class (GH5080) CLN: add searchsorted to core/base (GH6712, GH7447, GH6469) fixup tests in test_timeseries for reverse ndarray/datetimeindex comparisons fix algos / multi-index repeat (essentially this is a bug-fix) ENH: add NumericIndex and operators, related (GH7439) DOC: indexing/v0.15.0 docs TST: fixed up plotting issues COMPAT/API: use __array_priority__ to facilitate proper comparisons of DatetimeIndex with ndarrays fixup to do actual views in copy (except in reduce where it's needed) COMPAT: numpy compat with 1.6 for np.may_share_memory FIX: access values attr in JSON code to support index that's not an ndarray subclass COMPAT: numpy compat with array_priority fix CLN: remove constructor pickle compat code as not necessary COMPAT: fix pickle in sparse CLN: clean up shallow_copy/simple_new COMPAT: pickle compat remove __array_prepare__ COMPAT: tests & compat for numeric operation support only on supported indexes DOC: fixup for comments COMPAT: allow older MultiIndex pickles again CLN: combine properties from index/series for ndarray compat --- doc/source/api.rst | 2 + doc/source/indexing.rst | 8 +- doc/source/v0.15.0.txt | 23 +- pandas/compat/pickle_compat.py | 85 ++-- pandas/core/base.py | 135 +++++- pandas/core/categorical.py | 2 +- pandas/core/common.py | 7 +- pandas/core/format.py | 20 +- pandas/core/frame.py | 24 +- pandas/core/generic.py | 8 +- pandas/core/groupby.py | 22 +- pandas/core/index.py | 600 ++++++++++++++++-------- pandas/core/indexing.py | 7 +- pandas/core/internals.py | 11 +- pandas/core/ops.py | 16 +- pandas/core/series.py | 154 +++--- pandas/io/pickle.py | 3 +- pandas/io/pytables.py | 2 +- pandas/io/tests/test_pickle.py | 53 ++- pandas/io/tests/test_pytables.py | 17 +- pandas/lib.pyx | 4 +- pandas/sparse/tests/test_array.py | 4 +- pandas/sparse/tests/test_sparse.py | 26 +- pandas/src/generate_code.py | 13 +- pandas/src/generated.pyx | 25 +- pandas/src/reduce.pyx | 133 ++++-- pandas/src/ujson/python/objToJSON.c | 8 +- pandas/tests/test_algos.py | 7 +- pandas/tests/test_base.py | 36 +- pandas/tests/test_frame.py | 8 +- pandas/tests/test_graphics.py | 6 +- pandas/tests/test_groupby.py | 29 ++ pandas/tests/test_index.py | 209 +++++++-- pandas/tests/test_internals.py | 19 +- pandas/tests/test_multilevel.py | 18 +- pandas/tests/test_panel.py | 5 +- pandas/tests/test_series.py | 15 +- pandas/tests/test_tseries.py | 18 +- pandas/tools/pivot.py | 4 +- pandas/tools/plotting.py | 14 +- pandas/tools/tests/test_pivot.py | 6 +- pandas/tseries/converter.py | 14 +- pandas/tseries/index.py | 149 +++--- pandas/tseries/period.py | 113 +++-- pandas/tseries/plotting.py | 4 +- pandas/tseries/tests/test_converter.py | 4 +- pandas/tseries/tests/test_daterange.py | 9 +- pandas/tseries/tests/test_period.py | 4 +- pandas/tseries/tests/test_plotting.py | 10 +- pandas/tseries/tests/test_timeseries.py | 29 +- pandas/util/testing.py | 7 + 51 files changed, 1391 insertions(+), 758 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 88aab0ced8420..9d443254ae25a 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1104,6 +1104,8 @@ Modifying and Computations Index.order Index.reindex Index.repeat + Index.take + Index.putmask Index.set_names Index.unique Index.nunique diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 39635cb0e612f..8ec61496c538a 100644 ---
a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -52,6 +52,12 @@ indexing. should be avoided. See :ref:`Returning a View versus Copy ` +.. warning:: + + In 0.15.0 ``Index`` has internally been refactored to no longer sub-class ``ndarray`` + but instead subclass ``PandasObject``, similarly to the rest of the pandas objects. This should be + a transparent change with only very limited API implications (See the :ref:`Internal Refactoring `) + See the :ref:`cookbook` for some advanced strategies Different Choices for Indexing (``loc``, ``iloc``, and ``ix``) @@ -2175,7 +2181,7 @@ you can specify ``inplace=True`` to have the data change in place. .. versionadded:: 0.15.0 -``set_names``, ``set_levels``, and ``set_labels`` also take an optional +``set_names``, ``set_levels``, and ``set_labels`` also take an optional ``level`` argument .. ipython:: python diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 7623bf287bcd3..bb039b4484c7d 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -10,6 +10,7 @@ users upgrade to this version. - Highlights include: - The ``Categorical`` type was integrated as a first-class pandas type, see :ref:`here ` + - Internal refactoring of the ``Index`` class to no longer sub-class ``ndarray``, see :ref:`Internal Refactoring ` - :ref:`Other Enhancements ` @@ -25,6 +26,12 @@ users upgrade to this version. - :ref:`Bug Fixes ` +.. warning:: + + In 0.15.0 ``Index`` has internally been refactored to no longer sub-class ``ndarray`` + but instead subclass ``PandasObject``, similarly to the rest of the pandas objects. This change allows very easy sub-classing and creation of new index types. This should be + a transparent change with only very limited API implications (See the :ref:`Internal Refactoring `) + .. _whatsnew_0150.api: API changes ~~~~~~~~~~~ @@ -155,6 +162,18 @@ previously results in ``Exception`` or ``TypeError`` (:issue:`7812`) didx didx.tz_localize(None) +.. _whatsnew_0150.refactoring: + +Internal Refactoring +~~~~~~~~~~~~~~~~~~~~ + +In 0.15.0 ``Index`` has internally been refactored to no longer sub-class ``ndarray`` +but instead subclass ``PandasObject``, similarly to the rest of the pandas objects. This change allows very easy sub-classing and creation of new index types. This should be +a transparent change with only very limited API implications (:issue:`5080`, :issue:`7439`, :issue:`7796`) + +- you may need to unpickle pandas version < 0.15.0 pickles using ``pd.read_pickle`` rather than ``pickle.load``. See :ref:`pickle docs ` +- when plotting with a ``PeriodIndex``, the ``matplotlib`` internal axes will now be arrays of ``Period`` rather than a ``PeriodIndex`` (this is similar to how a ``DatetimeIndex`` passes arrays of ``datetimes`` now) + .. _whatsnew_0150.cat: Categoricals in Series/DataFrame @@ -278,7 +297,7 @@ Performance ~~~~~~~~~~~ - Performance improvements in ``DatetimeIndex.__iter__`` to allow faster iteration (:issue:`7683`) - +- Performance improvements in ``Period`` creation (and ``PeriodIndex`` setitem) (:issue:`5155`) @@ -386,7 +405,7 @@ Bug Fixes - Bug in ``GroupBy.filter()`` where fast path vs. slow path made the filter return a non scalar value that appeared valid but wasn't (:issue:`7870`). - Bug in ``date_range()``/``DatetimeIndex()`` when the timezone was inferred from input dates yet incorrect - times were returned when crossing DST boundaries (:issue:`7835`, :issue:`7901`). + times were returned when crossing DST boundaries (:issue:`7835`, :issue:`7901`).
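Since the refactoring notes above are the user-facing summary of this patch, here is a brief illustrative sketch (not part of the patch itself) of the documented 0.15.0 behavior; the pickle filename is hypothetical and exact reprs are elided:

    import numpy as np
    import pandas as pd

    idx = pd.Index([1, 2, 3])

    # Index no longer sub-classes ndarray; it is a PandasObject,
    # but the underlying data is still reachable as an ndarray
    isinstance(idx, np.ndarray)                   # False as of 0.15.0
    idx.values                                    # array([1, 2, 3])

    # numeric ops are now defined only for numeric index types (GH7439)
    idx * 2                                       # Int64Index([2, 4, 6])
    pd.date_range('2014-01-01', periods=3) * 2    # raises TypeError

    # pickles written by pandas < 0.15.0 should be loaded with
    # pd.read_pickle (which applies the compat shims below), not pickle.load
    old_idx = pd.read_pickle('index_pre_0.15.pkl')   # hypothetical file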
diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 03b45336833d3..e794725574119 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -5,29 +5,32 @@ import pandas import copy import pickle as pkl -from pandas import compat +from pandas import compat, Index from pandas.compat import u, string_types -from pandas.core.series import Series, TimeSeries -from pandas.sparse.series import SparseSeries, SparseTimeSeries - def load_reduce(self): stack = self.stack args = stack.pop() func = stack[-1] + if type(args[0]) is type: n = args[0].__name__ - if n == u('DeprecatedSeries') or n == u('DeprecatedTimeSeries'): - stack[-1] = object.__new__(Series) - return - elif (n == u('DeprecatedSparseSeries') or - n == u('DeprecatedSparseTimeSeries')): - stack[-1] = object.__new__(SparseSeries) - return try: - value = func(*args) - except: + stack[-1] = func(*args) + return + except Exception as e: + + # if we have a deprecated function + # try to replace and try again + + if '_reconstruct: First argument must be a sub-type of ndarray' in str(e): + try: + cls = args[0] + stack[-1] = object.__new__(cls) + return + except: + pass # try to reencode the arguments if getattr(self,'encoding',None) is not None: @@ -57,6 +60,35 @@ class Unpickler(pkl.Unpickler): Unpickler.dispatch = copy.copy(Unpickler.dispatch) Unpickler.dispatch[pkl.REDUCE[0]] = load_reduce +def load_newobj(self): + args = self.stack.pop() + cls = self.stack[-1] + + # compat + if issubclass(cls, Index): + obj = object.__new__(cls) + else: + obj = cls.__new__(cls, *args) + + self.stack[-1] = obj +Unpickler.dispatch[pkl.NEWOBJ[0]] = load_newobj + +# py3 compat +def load_newobj_ex(self): + kwargs = self.stack.pop() + args = self.stack.pop() + cls = self.stack.pop() + + # compat + if issubclass(cls, Index): + obj = object.__new__(cls) + else: + obj = cls.__new__(cls, *args, **kwargs) + self.append(obj) +try: + Unpickler.dispatch[pkl.NEWOBJ_EX[0]] = load_newobj_ex +except: + pass def load(fh, encoding=None, compat=False, is_verbose=False): """load a pickle, with a provided encoding @@ -74,11 +106,6 @@ def load(fh, encoding=None, compat=False, is_verbose=False): """ try: - if compat: - pandas.core.series.Series = DeprecatedSeries - pandas.core.series.TimeSeries = DeprecatedTimeSeries - pandas.sparse.series.SparseSeries = DeprecatedSparseSeries - pandas.sparse.series.SparseTimeSeries = DeprecatedSparseTimeSeries fh.seek(0) if encoding is not None: up = Unpickler(fh, encoding=encoding) @@ -89,25 +116,3 @@ def load(fh, encoding=None, compat=False, is_verbose=False): return up.load() except: raise - finally: - if compat: - pandas.core.series.Series = Series - pandas.core.series.Series = TimeSeries - pandas.sparse.series.SparseSeries = SparseSeries - pandas.sparse.series.SparseTimeSeries = SparseTimeSeries - - -class DeprecatedSeries(np.ndarray, Series): - pass - - -class DeprecatedTimeSeries(DeprecatedSeries): - pass - - -class DeprecatedSparseSeries(DeprecatedSeries): - pass - - -class DeprecatedSparseTimeSeries(DeprecatedSparseSeries): - pass diff --git a/pandas/core/base.py b/pandas/core/base.py index beffbfb2923db..f685edd477b8c 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -8,7 +8,7 @@ from pandas.core import common as com import pandas.core.nanops as nanops import pandas.tslib as tslib -from pandas.util.decorators import cache_readonly +from pandas.util.decorators import Appender, cache_readonly class StringMixin(object): @@ -205,6 +205,19 @@ def __unicode__(self): 
quote_strings=True) return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype) +def _unbox(func): + @Appender(func.__doc__) + def f(self, *args, **kwargs): + result = func(self.values, *args, **kwargs) + from pandas.core.index import Index + if isinstance(result, (np.ndarray, com.ABCSeries, Index)) and result.ndim == 0: + # return NumPy type + return result.dtype.type(result.item()) + else: # pragma: no cover + return result + f.__name__ = func.__name__ + return f + class IndexOpsMixin(object): """ common ops mixin to support a unified interface / docs for Series / Index """ @@ -238,6 +251,64 @@ def _wrap_access_object(self, obj): return obj + # ndarray compatibility + __array_priority__ = 1000 + + def transpose(self): + """ return the transpose, which is by definition self """ + return self + + T = property(transpose, doc="return the transpose, which is by definition self") + + @property + def shape(self): + """ return a tuple of the shape of the underlying data """ + return self._data.shape + + @property + def ndim(self): + """ return the number of dimensions of the underlying data, by definition 1 """ + return 1 + + def item(self): + """ return the first element of the underlying data as a python scalar """ + return self.values.item() + + @property + def data(self): + """ return the data pointer of the underlying data """ + return self.values.data + + @property + def itemsize(self): + """ return the size of the dtype of the item of the underlying data """ + return self.values.itemsize + + @property + def nbytes(self): + """ return the number of bytes in the underlying data """ + return self.values.nbytes + + @property + def strides(self): + """ return the strides of the underlying data """ + return self.values.strides + + @property + def size(self): + """ return the number of elements in the underlying data """ + return self.values.size + + @property + def flags(self): + """ return the ndarray.flags for the underlying data """ + return self.values.flags + + @property + def base(self): + """ return the base object if the memory of the underlying data is shared """ + return self.values.base + def max(self): """ The maximum value of the object """ return nanops.nanmax(self.values) @@ -340,6 +411,20 @@ def factorize(self, sort=False, na_sentinel=-1): from pandas.core.algorithms import factorize return factorize(self, sort=sort, na_sentinel=na_sentinel) + def searchsorted(self, key, side='left'): + """ np.ndarray searchsorted compat """ + + ### FIXME in GH7447 + #### needs coercion on the key (DatetimeIndex does already) + #### needs tests/doc-string + return self.values.searchsorted(key, side=side) + + #---------------------------------------------------------------------- + # unbox reductions + + all = _unbox(np.ndarray.all) + any = _unbox(np.ndarray.any) + # facilitate the properties on the wrapped ops def _field_accessor(name, docstring=None): op_accessor = '_{0}'.format(name) @@ -431,13 +516,17 @@ def asobject(self): def tolist(self): """ - See ndarray.tolist + return a list of the underlying data """ return list(self.asobject) def min(self, axis=None): """ - Overridden ndarray.min to return an object + return the minimum value of the Index + + See also + -------- + numpy.ndarray.min """ try: i8 = self.asi8 @@ -456,9 +545,30 @@ def min(self, axis=None): except ValueError: return self._na_value + def argmin(self, axis=None): + """ + return a ndarray of the minimum argument indexer + + See also + -------- + numpy.ndarray.argmin + """ + + ##### FIXME: need some tests (what to do
if all NaT?) + i8 = self.asi8 + if self.hasnans: + mask = i8 == tslib.iNaT + i8 = i8.copy() + i8[mask] = np.iinfo('int64').max + return i8.argmin() + def max(self, axis=None): """ - Overridden ndarray.max to return an object + return the maximum value of the Index + + See also + -------- + numpy.ndarray.max """ try: i8 = self.asi8 @@ -477,6 +587,23 @@ def max(self, axis=None): except ValueError: return self._na_value + def argmax(self, axis=None): + """ + return a ndarray of the maximum argument indexer + + See also + -------- + numpy.ndarray.argmax + """ + + #### FIXME: need some tests (what to do if all NaT?) + i8 = self.asi8 + if self.hasnans: + mask = i8 == tslib.iNaT + i8 = i8.copy() + i8[mask] = 0 + return i8.argmax() + @property def _formatter_func(self): """ diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index f9ed6c2fecc3c..c9674aea4a715 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -939,6 +939,6 @@ def _get_codes_for_values(values, levels): levels = com._ensure_object(levels) (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables) t = hash_klass(len(levels)) - t.map_locations(levels) + t.map_locations(com._values_from_object(levels)) return com._ensure_platform_int(t.lookup(values)) diff --git a/pandas/core/common.py b/pandas/core/common.py index 04c5140d6a59b..d8314977742a4 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -205,7 +205,7 @@ def _isnull_new(obj): # hack (for now) because MI registers as ndarray elif isinstance(obj, pd.MultiIndex): raise NotImplementedError("isnull is not defined for MultiIndex") - elif isinstance(obj, (ABCSeries, np.ndarray)): + elif isinstance(obj, (ABCSeries, np.ndarray, pd.Index)): return _isnull_ndarraylike(obj) elif isinstance(obj, ABCGeneric): return obj._constructor(obj._data.isnull(func=isnull)) @@ -231,7 +231,7 @@ def _isnull_old(obj): # hack (for now) because MI registers as ndarray elif isinstance(obj, pd.MultiIndex): raise NotImplementedError("isnull is not defined for MultiIndex") - elif isinstance(obj, (ABCSeries, np.ndarray)): + elif isinstance(obj, (ABCSeries, np.ndarray, pd.Index)): return _isnull_ndarraylike_old(obj) elif isinstance(obj, ABCGeneric): return obj._constructor(obj._data.isnull(func=_isnull_old)) @@ -2024,8 +2024,7 @@ def _is_bool_indexer(key): def _default_index(n): from pandas.core.index import Int64Index values = np.arange(n, dtype=np.int64) - result = values.view(Int64Index) - result.name = None + result = Int64Index(values,name=None) result.is_unique = True return result diff --git a/pandas/core/format.py b/pandas/core/format.py index be4074bdb0ae7..8f749d07296a7 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -1186,7 +1186,7 @@ def _helper_csv(self, writer, na_rep=None, cols=None, if cols is None: cols = self.columns - has_aliases = isinstance(header, (tuple, list, np.ndarray)) + has_aliases = isinstance(header, (tuple, list, np.ndarray, Index)) if has_aliases or header: if index: # should write something for index label @@ -1205,7 +1205,7 @@ def _helper_csv(self, writer, na_rep=None, cols=None, else: index_label = [index_label] elif not isinstance(index_label, - (list, tuple, np.ndarray)): + (list, tuple, np.ndarray, Index)): # given a string for a DF with Index index_label = [index_label] @@ -1327,7 +1327,7 @@ def _save_header(self): header = self.header encoded_labels = [] - has_aliases = isinstance(header, (tuple, list, np.ndarray)) + has_aliases = isinstance(header, (tuple, list, np.ndarray, Index)) if not
(has_aliases or self.header): return if has_aliases: @@ -1355,7 +1355,7 @@ def _save_header(self): index_label = [''] else: index_label = [index_label] - elif not isinstance(index_label, (list, tuple, np.ndarray)): + elif not isinstance(index_label, (list, tuple, np.ndarray, Index)): # given a string for a DF with Index index_label = [index_label] @@ -1520,7 +1520,7 @@ def _format_value(self, val): return val def _format_header_mi(self): - has_aliases = isinstance(self.header, (tuple, list, np.ndarray)) + has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index)) if not(has_aliases or self.header): return @@ -1566,7 +1566,7 @@ def _format_header_mi(self): self.rowcounter = lnum def _format_header_regular(self): - has_aliases = isinstance(self.header, (tuple, list, np.ndarray)) + has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index)) if has_aliases or self.header: coloffset = 0 @@ -1611,7 +1611,7 @@ def _format_body(self): return self._format_regular_rows() def _format_regular_rows(self): - has_aliases = isinstance(self.header, (tuple, list, np.ndarray)) + has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index)) if has_aliases or self.header: self.rowcounter += 1 @@ -1621,7 +1621,7 @@ # check aliases # if list only take first as this is not a MultiIndex if self.index_label and isinstance(self.index_label, - (list, tuple, np.ndarray)): + (list, tuple, np.ndarray, Index)): index_label = self.index_label[0] # if string good to go elif self.index_label and isinstance(self.index_label, str): @@ -1661,7 +1661,7 @@ def _format_regular_rows(self): yield ExcelCell(self.rowcounter + i, colidx + coloffset, val) def _format_hierarchical_rows(self): - has_aliases = isinstance(self.header, (tuple, list, np.ndarray)) + has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index)) if has_aliases or self.header: self.rowcounter += 1 @@ -1671,7 +1671,7 @@ def _format_hierarchical_rows(self): index_labels = self.df.index.names # check for aliases if self.index_label and isinstance(self.index_label, - (list, tuple, np.ndarray)): + (list, tuple, np.ndarray, Index)): index_labels = self.index_label # if index labels are not empty go ahead and dump diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 516d87bb25f5d..3979ae76f14c3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -220,7 +220,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, mgr = self._init_ndarray(data, index, columns, dtype=dtype, copy=copy) - elif isinstance(data, (np.ndarray, Series)): + elif isinstance(data, (np.ndarray, Series, Index)): if data.dtype.names: data_columns = list(data.dtype.names) data = dict((k, data[k]) for k in data_columns) @@ -593,7 +593,7 @@ def dot(self, other): columns=other.columns) elif isinstance(other, Series): return Series(np.dot(lvals, rvals), index=left.index) - elif isinstance(rvals, np.ndarray): + elif isinstance(rvals, (np.ndarray, Index)): result = np.dot(lvals, rvals) if result.ndim == 2: return self._constructor(result, index=left.index) @@ -1668,7 +1668,7 @@ def __getitem__(self, key): if indexer is not None: return self._getitem_slice(indexer) - if isinstance(key, (Series, np.ndarray, list)): + if isinstance(key, (Series, np.ndarray, Index, list)): # either boolean or fancy integer index return self._getitem_array(key) elif isinstance(key, DataFrame): @@ -1719,7 +1719,7 @@ def _getitem_array(self, key): def _getitem_multilevel(self, key): loc = self.columns.get_loc(key) - if
isinstance(loc, (slice, Series, np.ndarray)): + if isinstance(loc, (slice, Series, np.ndarray, Index)): new_columns = self.columns[loc] result_columns = _maybe_droplevels(new_columns, key) if self._is_mixed_type: @@ -1999,7 +1999,7 @@ def __setitem__(self, key, value): if indexer is not None: return self._setitem_slice(indexer, value) - if isinstance(key, (Series, np.ndarray, list)): + if isinstance(key, (Series, np.ndarray, list, Index)): self._setitem_array(key, value) elif isinstance(key, DataFrame): self._setitem_frame(key, value) @@ -2371,7 +2371,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, elif isinstance(col, Index): level = col names.append(col.name) - elif isinstance(col, (list, np.ndarray)): + elif isinstance(col, (list, np.ndarray, Index)): level = col names.append(None) else: @@ -2436,7 +2436,7 @@ def reset_index(self, level=None, drop=False, inplace=False, col_level=0, def _maybe_casted_values(index, labels=None): if isinstance(index, PeriodIndex): - values = index.asobject + values = index.asobject.values elif (isinstance(index, DatetimeIndex) and index.tz is not None): values = index.asobject @@ -3020,7 +3020,7 @@ def _compare_frame(self, other, func, str_rep): def _flex_compare_frame(self, other, func, str_rep, level): if not self._indexed_same(other): - self, other = self.align(other, 'outer', level=level) + self, other = self.align(other, 'outer', level=level, copy=False) return self._compare_frame_evaluate(other, func, str_rep) def combine(self, other, func, fill_value=None, overwrite=True): @@ -4622,7 +4622,7 @@ def extract_index(data): def _prep_ndarray(values, copy=True): - if not isinstance(values, (np.ndarray, Series)): + if not isinstance(values, (np.ndarray, Series, Index)): if len(values) == 0: return np.empty((0, 0), dtype=object) @@ -4685,7 +4685,7 @@ def _to_arrays(data, columns, coerce_float=False, dtype=None): return _list_of_series_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) - elif (isinstance(data, (np.ndarray, Series)) + elif (isinstance(data, (np.ndarray, Series, Index)) and data.dtype.names is not None): columns = list(data.dtype.names) @@ -4865,9 +4865,9 @@ def _homogenize(data, index, dtype=None): oindex = index.astype('O') if type(v) == dict: # fast cython method - v = lib.fast_multiget(v, oindex, default=NA) + v = lib.fast_multiget(v, oindex.values, default=NA) else: - v = lib.map_infer(oindex, v.get) + v = lib.map_infer(oindex.values, v.get) v = _sanitize_array(v, index, dtype=dtype, copy=False, raise_cast_failure=False) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index cef18c5ad3c2b..2815f05ce313b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1505,7 +1505,7 @@ def drop(self, labels, axis=0, level=None, inplace=False, **kwargs): if level is not None: if not isinstance(axis, MultiIndex): raise AssertionError('axis must be a MultiIndex') - indexer = ~lib.ismember(axis.get_level_values(level), + indexer = ~lib.ismember(axis.get_level_values(level).values, set(labels)) else: indexer = ~axis.isin(labels) @@ -2135,16 +2135,14 @@ def copy(self, deep=True): Parameters ---------- - deep : boolean, default True + deep : boolean or string, default True Make a deep copy, i.e. 
also copy data Returns ------- copy : type of caller """ - data = self._data - if deep: - data = data.copy() + data = self._data.copy(deep=deep) return self._constructor(data).__finalize__(self) def convert_objects(self, convert_dates=True, convert_numeric=False, diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 8cfa0e25b789f..212e5086ee543 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -24,7 +24,7 @@ from pandas.core.common import(_possibly_downcast_to_dtype, isnull, notnull, _DATELIKE_DTYPES, is_numeric_dtype, is_timedelta64_dtype, is_datetime64_dtype, - is_categorical_dtype) + is_categorical_dtype, _values_from_object) from pandas.core.config import option_context from pandas import _np_version_under1p7 import pandas.lib as lib @@ -453,7 +453,7 @@ def name(self): @property def _selection_list(self): - if not isinstance(self._selection, (list, tuple, Series, np.ndarray)): + if not isinstance(self._selection, (list, tuple, Series, Index, np.ndarray)): return [self._selection] return self._selection @@ -1254,7 +1254,7 @@ def indices(self): return self.groupings[0].indices else: label_list = [ping.labels for ping in self.groupings] - keys = [ping.group_index for ping in self.groupings] + keys = [_values_from_object(ping.group_index) for ping in self.groupings] return _get_indices_dict(label_list, keys) @property @@ -1552,7 +1552,7 @@ def _aggregate_series_pure_python(self, obj, func): for label, group in splitter: res = func(group) if result is None: - if (isinstance(res, (Series, np.ndarray)) or + if (isinstance(res, (Series, Index, np.ndarray)) or isinstance(res, list)): raise ValueError('Function does not reduce') result = np.empty(ngroups, dtype='O') @@ -1894,7 +1894,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.name = grouper.name # no level passed - if not isinstance(self.grouper, (Series, np.ndarray)): + if not isinstance(self.grouper, (Series, Index, np.ndarray)): self.grouper = self.index.map(self.grouper) if not (hasattr(self.grouper, "__len__") and len(self.grouper) == len(self.index)): @@ -2014,7 +2014,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True): # what are we after, exactly? 
match_axis_length = len(keys) == len(group_axis) any_callable = any(callable(g) or isinstance(g, dict) for g in keys) - any_arraylike = any(isinstance(g, (list, tuple, Series, np.ndarray)) + any_arraylike = any(isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys) try: @@ -2080,7 +2080,7 @@ def _convert_grouper(axis, grouper): return grouper.values else: return grouper.reindex(axis).values - elif isinstance(grouper, (list, Series, np.ndarray)): + elif isinstance(grouper, (list, Series, Index, np.ndarray)): if len(grouper) != len(axis): raise AssertionError('Grouper and axis must be same length') return grouper @@ -2246,7 +2246,7 @@ def _aggregate_named(self, func, *args, **kwargs): for name, group in self: group.name = name output = func(group, *args, **kwargs) - if isinstance(output, (Series, np.ndarray)): + if isinstance(output, (Series, Index, np.ndarray)): raise Exception('Must produce aggregated value') result[name] = self._try_cast(output, group) @@ -2678,7 +2678,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): v = values[0] - if isinstance(v, (np.ndarray, Series)): + if isinstance(v, (np.ndarray, Index, Series)): if isinstance(v, Series): applied_index = self._selected_obj._get_axis(self.axis) all_indexed_same = _all_indexes_same([ @@ -2984,7 +2984,7 @@ def __getitem__(self, key): if self._selection is not None: raise Exception('Column(s) %s already selected' % self._selection) - if isinstance(key, (list, tuple, Series, np.ndarray)): + if isinstance(key, (list, tuple, Series, Index, np.ndarray)): if len(self.obj.columns.intersection(key)) != len(key): bad_keys = list(set(key).difference(self.obj.columns)) raise KeyError("Columns not found: %s" @@ -3579,7 +3579,7 @@ def _intercept_cython(func): def _groupby_indices(values): - return _algos.groupby_indices(com._ensure_object(values)) + return _algos.groupby_indices(_values_from_object(com._ensure_object(values))) def numpy_groupby(data, labels, axis=0): diff --git a/pandas/core/index.py b/pandas/core/index.py index 94bc48d0f4342..c7b1c60a9ddc4 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1,6 +1,7 @@ # pylint: disable=E1101,E1103,W0232 import datetime import warnings +import operator from functools import partial from pandas.compat import range, zip, lrange, lzip, u, reduce from pandas import compat @@ -11,12 +12,12 @@ import pandas.algos as _algos import pandas.index as _index from pandas.lib import Timestamp, is_datetime_array -from pandas.core.base import FrozenList, FrozenNDArray, IndexOpsMixin -from pandas.util.decorators import cache_readonly, deprecate, Appender +from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin +from pandas.util.decorators import Appender, cache_readonly, deprecate from pandas.core.common import isnull, array_equivalent import pandas.core.common as com from pandas.core.common import (_values_from_object, is_float, is_integer, - ABCSeries) + ABCSeries, _ensure_object) from pandas.core.config import get_option # simplify @@ -44,10 +45,15 @@ def _indexOp(opname): """ def wrapper(self, other): - func = getattr(self.view(np.ndarray), opname) - result = func(other) + func = getattr(self._data.view(np.ndarray), opname) + result = func(np.asarray(other)) + + # technically we could support bool dtyped Index + # for now just return the indexing array directly + if com.is_bool_dtype(result): + return result try: - return result.view(np.ndarray) + return Index(result) except: # pragma: no cover return result return wrapper @@ 
-56,19 +62,15 @@ def wrapper(self, other): class InvalidIndexError(Exception): pass - _o_dtype = np.dtype(object) - - -def _shouldbe_timestamp(obj): - return (tslib.is_datetime_array(obj) - or tslib.is_datetime64_array(obj) - or tslib.is_timestamp_array(obj)) - _Identity = object +def _new_Index(cls, d): + """ This is called upon unpickling, rather than the default which doesn't have arguments + and breaks __new__ """ + return cls.__new__(cls, **d) -class Index(IndexOpsMixin, FrozenNDArray): +class Index(IndexOpsMixin, PandasObject): """ Immutable ndarray implementing an ordered, sliceable set. The basic object @@ -102,16 +104,21 @@ class Index(IndexOpsMixin, FrozenNDArray): _box_scalars = False + _typ = 'index' + _data = None + _id = None name = None asi8 = None _comparables = ['name'] + _attributes = ['name'] _allow_index_ops = True _allow_datetime_index_ops = False _allow_period_index_ops = False + _is_numeric_dtype = False _engine_type = _index.ObjectEngine - def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False, + def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, tupleize_cols=True, **kwargs): # no class inference! @@ -119,7 +126,7 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False, return cls._simple_new(data, name) from pandas.tseries.period import PeriodIndex - if isinstance(data, (np.ndarray, ABCSeries)): + if isinstance(data, (np.ndarray, Index, ABCSeries)): if issubclass(data.dtype.type, np.datetime64): from pandas.tseries.index import DatetimeIndex result = DatetimeIndex(data, copy=copy, name=name, **kwargs) @@ -143,7 +150,10 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False, if issubclass(data.dtype.type, np.floating): return Float64Index(data, copy=copy, dtype=dtype, name=name) - subarr = com._asarray_tuplesafe(data, dtype=object) + if com.is_bool_dtype(data): + subarr = data + else: + subarr = com._asarray_tuplesafe(data, dtype=object) # _asarray_tuplesafe does not always copy underlying data, # so need to make sure that this happens @@ -153,7 +163,7 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False, elif hasattr(data, '__array__'): return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) - elif np.isscalar(data): + elif data is None or np.isscalar(data): cls._scalar_data_error(data) else: if tupleize_cols and isinstance(data, list) and data: @@ -177,6 +187,9 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False, return Int64Index(subarr.astype('i8'), copy=copy, name=name) elif inferred in ['floating', 'mixed-integer-float']: return Float64Index(subarr, copy=copy, name=name) + elif inferred == 'boolean': + # don't support boolean explicitly ATM + pass elif inferred != 'string': if (inferred.startswith('datetime') or tslib.is_timestamp_array(subarr)): @@ -185,15 +198,16 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False, elif inferred == 'period': return PeriodIndex(subarr, name=name, **kwargs) - subarr = subarr.view(cls) - # could also have a _set_name, but I don't think it's really necessary - subarr._set_names([name]) - return subarr + return cls._simple_new(subarr, name) @classmethod - def _simple_new(cls, values, name, **kwargs): - result = values.view(cls) + def _simple_new(cls, values, name=None, **kwargs): + result = object.__new__(cls) + result._data = values result.name = name + for k, v in compat.iteritems(kwargs): + setattr(result,k,v) + result._reset_identity() return result
def is_(self, other): @@ -219,11 +233,66 @@ def _reset_identity(self): """Initializes or resets ``_id`` attribute with new object""" self._id = _Identity() - def view(self, *args, **kwargs): - result = super(Index, self).view(*args, **kwargs) - if isinstance(result, Index): - result._id = self._id - return result + # ndarray compat + def __len__(self): + """ + return the length of the Index + """ + return len(self._data) + + def __array__(self, result=None): + """ the array interface, return my values """ + return self._data.view(np.ndarray) + + def __array_wrap__(self, result, context=None): + """ + Gets called after a ufunc + """ + return self._shallow_copy(result) + + @cache_readonly + def dtype(self): + """ return the dtype object of the underlying data """ + return self._data.dtype + + @property + def values(self): + """ return the underlying data as an ndarray """ + return self._data.view(np.ndarray) + + def get_values(self): + """ return the underlying data as an ndarray """ + return self.values + + def _array_values(self): + return self._data + + # ops compat + def tolist(self): + """ + return a list of the Index values + """ + return list(self.values) + + def repeat(self, n): + """ + return a new Index of the values repeated n times + + See also + -------- + numpy.ndarray.repeat + """ + return self._shallow_copy(self.values.repeat(n)) + + def ravel(self, order='C'): + """ + return an ndarray of the flattened values of the underlying data + + See also + -------- + numpy.ndarray.ravel + """ + return self.values.ravel(order=order) # construction helpers @classmethod @@ -243,8 +312,8 @@ def _coerce_to_ndarray(cls, data): """coerces data to ndarray, raises on scalar data. Converts other iterables to list first and then to array. Does not touch ndarrays.""" - if not isinstance(data, np.ndarray): - if np.isscalar(data): + if not isinstance(data, (np.ndarray, Index)): + if data is None or np.isscalar(data): cls._scalar_data_error(data) # other iterable of some kind @@ -253,16 +322,27 @@ def _coerce_to_ndarray(cls, data): data = np.asarray(data) return data - def __array_finalize__(self, obj): - self._reset_identity() - if not isinstance(obj, type(self)): - # Only relevant if array being created from an Index instance - return + def _get_attributes_dict(self): + """ return an attributes dict for my class """ + return dict([ (k,getattr(self,k,None)) for k in self._attributes]) - self.name = getattr(obj, 'name', None) + def view(self, cls=None): + if cls is not None and not issubclass(cls, Index): + result = self._data.view(cls) + else: + result = self._shallow_copy() + if isinstance(result, Index): + result._id = self._id + return result - def _shallow_copy(self): - return self.view() + def _shallow_copy(self, values=None, **kwargs): + """ create a new Index, don't copy the data, use the same object attributes + with passed in attributes taking precedence """ + if values is None: + values = self.values + attributes = self._get_attributes_dict() + attributes.update(kwargs) + return self.__class__._simple_new(values,**attributes) def copy(self, names=None, name=None, dtype=None, deep=False): """ @@ -287,10 +367,11 @@ def copy(self, names=None, name=None, dtype=None, deep=False): raise TypeError("Can only provide one of `names` and `name`") if deep: from copy import deepcopy - new_index = np.ndarray.__deepcopy__(self, {}).view(self.__class__) + new_index = self._shallow_copy(self._data.copy()) name = name or deepcopy(self.name) else: - new_index = super(Index, self).copy() + new_index = 
self._shallow_copy() + name = self.name if name is not None: names = [name] if names: @@ -299,6 +380,19 @@ def copy(self, names=None, name=None, dtype=None, deep=False): new_index = new_index.astype(dtype) return new_index + __copy__ = copy + + def __unicode__(self): + """ + Return a string representation for this object. + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both + py2/py3. + """ + prepr = com.pprint_thing(self, escape_chars=('\t', '\r', '\n'), + quote_strings=True) + return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype) + def to_series(self, keep_tz=False): """ Create a Series with both index and values equal to the index keys @@ -343,22 +437,10 @@ def to_datetime(self, dayfirst=False): def _assert_can_do_setop(self, other): return True - def tolist(self): - """ - Overridden version of ndarray.tolist - """ - return list(self.values) - - @cache_readonly - def dtype(self): - return self.values.dtype - @property def nlevels(self): return 1 - # for compat with multindex code - def _get_names(self): return FrozenList((self.name,)) @@ -395,7 +477,7 @@ def set_names(self, names, level=None, inplace=False): >>> Index([1, 2, 3, 4]).set_names(['foo']) Int64Index([1, 2, 3, 4], dtype='int64') >>> idx = MultiIndex.from_tuples([(1, u'one'), (1, u'two'), - (2, u'one'), (2, u'two')], + (2, u'one'), (2, u'two')], names=['foo', 'bar']) >>> idx.set_names(['baz', 'quz']) MultiIndex(levels=[[1, 2], [u'one', u'two']], @@ -473,13 +555,6 @@ def _mpl_repr(self): # how to represent ourselves to matplotlib return self.values - @property - def values(self): - return np.asarray(self) - - def get_values(self): - return self.values - _na_value = np.nan """The expected NA value to use with this index.""" @@ -720,26 +795,42 @@ def is_type_compatible(self, typ): @cache_readonly def is_all_dates(self): - return is_datetime_array(self.values) + if self._data is None: + return False + return is_datetime_array(_ensure_object(self.values)) def __iter__(self): return iter(self.values) def __reduce__(self): - """Necessary for making this object picklable""" - object_state = list(np.ndarray.__reduce__(self)) - subclass_state = self.name, - object_state[2] = (object_state[2], subclass_state) - return tuple(object_state) + d = dict(data=self._data) + d.update(self._get_attributes_dict()) + return _new_Index, (self.__class__, d), None def __setstate__(self, state): """Necessary for making this object picklable""" - if len(state) == 2: - nd_state, own_state = state - np.ndarray.__setstate__(self, nd_state) - self.name = own_state[0] - else: # pragma: no cover - np.ndarray.__setstate__(self, state) + + if isinstance(state, dict): + self._data = state.pop('data') + for k, v in compat.iteritems(state): + setattr(self, k, v) + + elif isinstance(state, tuple): + + if len(state) == 2: + nd_state, own_state = state + data = np.empty(nd_state[1], dtype=nd_state[2]) + np.ndarray.__setstate__(data, nd_state) + self.name = own_state[0] + + else: # pragma: no cover + data = np.empty(state) + np.ndarray.__setstate__(data, state) + + self._data = data + else: + raise Exception("invalid pickle state") + _unpickle_compat = __setstate__ def __deepcopy__(self, memo={}): return self.copy(deep=True) @@ -755,6 +846,9 @@ def __contains__(self, key): def __hash__(self): raise TypeError("unhashable type: %r" % type(self).__name__) + def __setitem__(self, key, value): + raise TypeError("Index does not support mutable operations") + def __getitem__(self, key): """ Override numpy.ndarray's __getitem__ method to
work as desired. """ # There's no custom logic to be implemented in __getslice__, so it's # not overloaded intentionally. - __getitem__ = super(Index, self).__getitem__ + getitem = self._data.__getitem__ + promote = self._shallow_copy + if np.isscalar(key): - return __getitem__(key) + return getitem(key) if isinstance(key, slice): # This case is separated from the conditional above to avoid # pessimization of basic indexing. - return __getitem__(key) + return promote(getitem(key)) if com._is_bool_indexer(key): - return __getitem__(np.asarray(key)) + key = np.asarray(key) - result = __getitem__(key) - if result.ndim > 1: - return result.view(np.ndarray) + key = _values_from_object(key) + result = getitem(key) + if not np.isscalar(result): + return promote(result) else: return result @@ -831,12 +928,30 @@ def _ensure_compat_concat(indexes): def take(self, indexer, axis=0): """ - Analogous to ndarray.take + return a new Index of the values selected by the indexer + + See also + -------- + numpy.ndarray.take """ + indexer = com._ensure_platform_int(indexer) - taken = self.view(np.ndarray).take(indexer) - return self._simple_new(taken, name=self.name, freq=None, - tz=getattr(self, 'tz', None)) + taken = np.array(self).take(indexer) + + # by definition cannot propagate freq + return self._shallow_copy(taken, freq=None) + + def putmask(self, mask, value): + """ + return a new Index of the values set with the mask + + See also + -------- + numpy.ndarray.putmask + """ + values = self.values.copy() + np.putmask(values, mask, value) + return self._shallow_copy(values) def format(self, name=False, formatter=None, **kwargs): """ @@ -985,18 +1100,22 @@ def shift(self, periods=1, freq=None): def argsort(self, *args, **kwargs): """ - See docstring for ndarray.argsort + return an ndarray indexer of the underlying data + + See also + -------- + numpy.ndarray.argsort """ result = self.asi8 if result is None: - result = self.view(np.ndarray) + result = np.array(self) return result.argsort(*args, **kwargs) def __add__(self, other): if isinstance(other, Index): return self.union(other) else: - return Index(self.view(np.ndarray) + other) + return Index(np.array(self) + other) __iadd__ = __add__ __eq__ = _indexOp('__eq__') @@ -1048,7 +1167,7 @@ def union(self, other): if self.is_monotonic and other.is_monotonic: try: - result = self._outer_indexer(self, other.values)[0] + result = self._outer_indexer(self.values, other.values)[0] except TypeError: # incomparable objects result = list(self.values) @@ -1122,7 +1241,7 @@ def intersection(self, other): if self.is_monotonic and other.is_monotonic: try: - result = self._inner_indexer(self, other.values)[0] + result = self._inner_indexer(self.values, other.values)[0] return self._wrap_union_result(other, result) except TypeError: pass @@ -1381,7 +1500,7 @@ def _possibly_promote(self, other): return self, other def groupby(self, to_groupby): - return self._groupby(self.values, to_groupby) + return self._groupby(self.values, _values_from_object(to_groupby)) def map(self, mapper): return self._arrmap(self.values, mapper) @@ -1416,9 +1535,6 @@ def isin(self, values, level=None): self._validate_index_level(level) return lib.ismember(self._array_values(), value_set) - def _array_values(self): - return self - def _get_method(self, method): if method: method = method.lower() @@ -1778,7 +1894,7 @@ def slice_indexer(self, start=None, end=None, step=None): return slice(start_slice, end_slice, step) # loc indexers - return
Index(start_slice) & Index(end_slice) + return (Index(start_slice) & Index(end_slice)).values def slice_locs(self, start=None, end=None): """ @@ -1814,7 +1930,7 @@ def _get_slice(starting_value, offset, search_side, slice_property, # get_loc will return a boolean array for non_uniques # if we are not monotonic - if isinstance(slc, np.ndarray): + if isinstance(slc, (np.ndarray, Index)): raise KeyError("cannot perform a slice operation " "on a non-unique non-monotonic index") @@ -1853,7 +1969,7 @@ def delete(self, loc): ------- new_index : Index """ - return np.delete(self, loc) + return Index(np.delete(self._data, loc), name=self.name) def insert(self, loc, item): """ @@ -1894,8 +2010,75 @@ def drop(self, labels): raise ValueError('labels %s not contained in axis' % labels[mask]) return self.delete(indexer) + @classmethod + def _add_numeric_methods_disabled(cls): + """ add in numeric methods to disable """ + + def _make_invalid_op(opstr): + + def _invalid_op(self, other): + raise TypeError("cannot perform {opstr} with this index type: {typ}".format(opstr=opstr, + typ=type(self))) + return _invalid_op + + cls.__mul__ = cls.__rmul__ = _make_invalid_op('multiplication') + cls.__floordiv__ = cls.__rfloordiv__ = _make_invalid_op('floor division') + cls.__truediv__ = cls.__rtruediv__ = _make_invalid_op('true division') + if not compat.PY3: + cls.__div__ = cls.__rdiv__ = _make_invalid_op('division') + + @classmethod + def _add_numeric_methods(cls): + """ add in numeric methods """ + + def _make_evaluate_binop(op, opstr): + + def _evaluate_numeric_binop(self, other): + + # if we are an inheritor of numeric, but not actually numeric (e.g. DatetimeIndex/PeriodIndex) + if not self._is_numeric_dtype: + raise TypeError("cannot evaluate a numeric op {opstr} for type: {typ}".format(opstr=opstr, + typ=type(self))) + + if isinstance(other, Index): + if not other._is_numeric_dtype: + raise TypeError("cannot evaluate a numeric op {opstr} with type: {typ}".format(opstr=opstr, + typ=type(other))) + elif isinstance(other, np.ndarray) and not other.ndim: + other = other.item() + + if isinstance(other, (Index, ABCSeries, np.ndarray)): + if len(self) != len(other): + raise ValueError("cannot evaluate a numeric op with unequal lengths") + other = _values_from_object(other) + if other.dtype.kind not in ['f','i']: + raise TypeError("cannot evaluate a numeric op with a non-numeric dtype") + else: + if not (com.is_float(other) or com.is_integer(other)): + raise TypeError("can only perform ops with scalar values") + return self._shallow_copy(op(self.values, other)) + + return _evaluate_numeric_binop + + + cls.__mul__ = cls.__rmul__ = _make_evaluate_binop(operator.mul,'multiplication') + cls.__floordiv__ = cls.__rfloordiv__ = _make_evaluate_binop(operator.floordiv,'floor division') + cls.__truediv__ = cls.__rtruediv__ = _make_evaluate_binop(operator.truediv,'true division') + if not compat.PY3: + cls.__div__ = cls.__rdiv__ = _make_evaluate_binop(operator.div,'division') +Index._add_numeric_methods_disabled() + +class NumericIndex(Index): + """ + Provide numeric type operations -class Int64Index(Index): + This is an abstract class + + """ + _is_numeric_dtype = True + + +class Int64Index(NumericIndex): + """ + Immutable ndarray implementing an ordered, sliceable set.
The basic object @@ -1918,6 +2101,7 @@ class Int64Index(Index): An Index instance can **only** contain hashable objects """ + _typ = 'int64index' _groupby = _algos.groupby_int64 _arrmap = _algos.arrmap_int64 _left_indexer_unique = _algos.left_join_indexer_unique_int64 @@ -1927,12 +2111,10 @@ class Int64Index(Index): _engine_type = _index.Int64Engine - def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False): + def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs): if fastpath: - subarr = data.view(cls) - subarr.name = name - return subarr + return cls._simple_new(data, name=name) # isscalar, generators handled in coerce_to_ndarray data = cls._coerce_to_ndarray(data) @@ -1955,9 +2137,7 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False): raise TypeError('Unsafe NumPy casting to integer, you must' ' explicitly cast') - subarr = subarr.view(cls) - subarr.name = name - return subarr + return cls._simple_new(subarr, name=name) @property def inferred_type(self): @@ -1994,9 +2174,9 @@ def equals(self, other): def _wrap_joined_index(self, joined, other): name = self.name if self.name == other.name else None return Int64Index(joined, name=name) +Int64Index._add_numeric_methods() - -class Float64Index(Index): +class Float64Index(NumericIndex): """ Immutable ndarray implementing an ordered, sliceable set. The basic object @@ -2017,7 +2197,7 @@ class Float64Index(Index): A Float64Index instance can **only** contain hashable objects """ - # when this is not longer object dtype this can be changed + _typ = 'float64index' _engine_type = _index.Float64Engine _groupby = _algos.groupby_float64 _arrmap = _algos.arrmap_float64 @@ -2026,12 +2206,10 @@ class Float64Index(Index): _left_indexer_unique = _algos.left_join_indexer_unique_float64 _inner_indexer = _algos.inner_join_indexer_float64 _outer_indexer = _algos.outer_join_indexer_float64 - def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False): + def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs): if fastpath: - subarr = data.view(cls) - subarr.name = name - return subarr + return cls._simple_new(data, name) data = cls._coerce_to_ndarray(data) @@ -2051,9 +2229,7 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False): if subarr.dtype != np.float64: subarr = subarr.astype(np.float64) - subarr = subarr.view(cls) - subarr.name = name - return subarr + return cls._simple_new(subarr, name) @property def inferred_type(self): @@ -2186,6 +2362,7 @@ def isin(self, values, level=None): self._validate_index_level(level) return lib.ismember_nans(self._array_values(), value_set, isnull(list(value_set)).any()) +Float64Index._add_numeric_methods() class MultiIndex(Index): @@ -2205,8 +2382,14 @@ class MultiIndex(Index): level) names : optional sequence of objects Names for each of the index levels.
+ copy : boolean, default False + Copy the meta-data + verify_integrity : boolean, default True + Check that the levels/labels are consistent and valid """ + # initialize to zero-length tuples to make everything work + _typ = 'multiindex' _names = FrozenList() _levels = FrozenList() _labels = FrozenList() @@ -2214,7 +2397,8 @@ class MultiIndex(Index): rename = Index.set_names def __new__(cls, levels=None, labels=None, sortorder=None, names=None, - copy=False, verify_integrity=True): + copy=False, verify_integrity=True, _set_identity=True, **kwargs): + if levels is None or labels is None: raise TypeError("Must pass both levels and labels") if len(levels) != len(labels): @@ -2226,28 +2410,29 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None, name = names[0] else: name = None - return Index(levels[0], name=name, copy=True).take(labels[0]) - # v3, 0.8.0 - subarr = np.empty(0, dtype=object).view(cls) + result = object.__new__(MultiIndex) + # we've already validated levels and labels, so shortcut here - subarr._set_levels(levels, copy=copy, validate=False) - subarr._set_labels(labels, copy=copy, validate=False) + result._set_levels(levels, copy=copy, validate=False) + result._set_labels(labels, copy=copy, validate=False) if names is not None: # handles name validation - subarr._set_names(names) + result._set_names(names) if sortorder is not None: - subarr.sortorder = int(sortorder) + result.sortorder = int(sortorder) else: - subarr.sortorder = sortorder + result.sortorder = sortorder if verify_integrity: - subarr._verify_integrity() + result._verify_integrity() + if _set_identity: + result._reset_identity() - return subarr + return result def _verify_integrity(self): """Raises ValueError if length of levels and labels don't match or any @@ -2329,7 +2514,7 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): Examples -------- >>> idx = MultiIndex.from_tuples([(1, u'one'), (1, u'two'), - (2, u'one'), (2, u'two')], + (2, u'one'), (2, u'two')], names=['foo', 'bar']) >>> idx.set_levels([['a','b'], [1,2]]) MultiIndex(levels=[[u'a', u'b'], [1, 2]], @@ -2381,7 +2566,7 @@ def _get_labels(self): def _set_labels(self, labels, level=None, copy=False, validate=True, verify_integrity=False): - + if validate and level is None and len(labels) != self.nlevels: raise ValueError("Length of labels must match number of levels") if validate and level is not None and len(labels) != len(level): @@ -2427,7 +2612,7 @@ def set_labels(self, labels, level=None, inplace=False, verify_integrity=True): Examples -------- >>> idx = MultiIndex.from_tuples([(1, u'one'), (1, u'two'), - (2, u'one'), (2, u'two')], + (2, u'one'), (2, u'two')], names=['foo', 'bar']) >>> idx.set_labels([[1,0,1,0], [0,0,1,1]]) MultiIndex(levels=[[1, 2], [u'one', u'two']], @@ -2474,7 +2659,7 @@ def set_labels(self, labels, level=None, inplace=False, verify_integrity=True): labels = property(fget=_get_labels, fset=__set_labels) def copy(self, names=None, dtype=None, levels=None, labels=None, - deep=False): + deep=False, _set_identity=False): """ Make a copy of this object. Names, dtype, levels and labels can be passed and will be set on new copy. @@ -2496,39 +2681,33 @@ def copy(self, names=None, dtype=None, levels=None, labels=None, ``deep``, but if ``deep`` is passed it will attempt to deepcopy. This could be potentially expensive on large MultiIndex objects. 
""" - new_index = np.ndarray.copy(self) if deep: from copy import deepcopy levels = levels if levels is not None else deepcopy(self.levels) labels = labels if labels is not None else deepcopy(self.labels) names = names if names is not None else deepcopy(self.names) - if levels is not None: - new_index = new_index.set_levels(levels) - if labels is not None: - new_index = new_index.set_labels(labels) - if names is not None: - new_index = new_index.set_names(names) - if dtype: - new_index = new_index.astype(dtype) - return new_index + else: + levels = self.levels + labels = self.labels + names = self.names + return MultiIndex(levels=levels, + labels=labels, + names=names, + sortorder=self.sortorder, + verify_integrity=False, + _set_identity=_set_identity) + + def __array__(self, result=None): + """ the array interface, return my values """ + return self.values - def __array_finalize__(self, obj): - """ - Update custom MultiIndex attributes when a new array is created by - numpy, e.g. when calling ndarray.view() - """ - # overriden if a view - self._reset_identity() - if not isinstance(obj, type(self)): - # Only relevant if this array is being created from an Index - # instance. - return + def view(self, cls=None): + """ this is defined as a copy with the same identity """ + result = self.copy() + result._id = self._id + return result - # skip the validation on first, rest will catch the errors - self._set_levels(getattr(obj, 'levels', []), validate=False) - self._set_labels(getattr(obj, 'labels', [])) - self._set_names(getattr(obj, 'names', [])) - self.sortorder = getattr(obj, 'sortorder', None) + _shallow_copy = view def _array_values(self): # hack for various methods @@ -2628,12 +2807,7 @@ def inferred_type(self): @staticmethod def _from_elements(values, labels=None, levels=None, names=None, sortorder=None): - index = values.view(MultiIndex) - index._set_levels(levels) - index._set_labels(labels) - index._set_names(names) - index.sortorder = sortorder - return index + return MultiIndex(levels, labels, names, sortorder=sortorder) def _get_level_number(self, level): try: @@ -2663,33 +2837,28 @@ def _get_level_number(self, level): @property def values(self): - if self._is_v2: - return self.view(np.ndarray) - else: - if self._tuples is not None: - return self._tuples + if self._tuples is not None: + return self._tuples - values = [] - for lev, lab in zip(self.levels, self.labels): - taken = com.take_1d(lev.values, lab) - # Need to box timestamps, etc. - if hasattr(lev, '_box_values'): - taken = lev._box_values(taken) - values.append(taken) + values = [] + for lev, lab in zip(self.levels, self.labels): + taken = com.take_1d(lev.values, lab) + # Need to box timestamps, etc. + if hasattr(lev, '_box_values'): + taken = lev._box_values(taken) + values.append(taken) - self._tuples = lib.fast_zip(values) - return self._tuples + self._tuples = lib.fast_zip(values) + return self._tuples # fml @property def _is_v1(self): - contents = self.view(np.ndarray) - return len(contents) > 0 and not isinstance(contents[0], tuple) + return False @property def _is_v2(self): - contents = self.view(np.ndarray) - return len(contents) > 0 and isinstance(contents[0], tuple) + return False @property def _has_complex_internals(self): @@ -3000,7 +3169,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): # I think this is right? Not quite sure... 
raise TypeError('Cannot infer number of levels from empty list') - if isinstance(tuples, np.ndarray): + if isinstance(tuples, (np.ndarray, Index)): if isinstance(tuples, Index): tuples = tuples.values @@ -3075,18 +3244,25 @@ def __contains__(self, key): def __reduce__(self): """Necessary for making this object picklable""" - object_state = list(np.ndarray.__reduce__(self)) - subclass_state = ([lev.view(np.ndarray) for lev in self.levels], - [label.view(np.ndarray) for label in self.labels], - self.sortorder, list(self.names)) - object_state[2] = (object_state[2], subclass_state) - return tuple(object_state) + d = dict(levels = [lev.view(np.ndarray) for lev in self.levels], + labels = [label.view(np.ndarray) for label in self.labels], + sortorder = self.sortorder, + names = list(self.names)) + return _new_Index, (self.__class__, d), None def __setstate__(self, state): """Necessary for making this object picklable""" - nd_state, own_state = state - np.ndarray.__setstate__(self, nd_state) - levels, labels, sortorder, names = own_state + + if isinstance(state, dict): + levels = state.get('levels') + labels = state.get('labels') + sortorder = state.get('sortorder') + names = state.get('names') + + elif isinstance(state, tuple): + + nd_state, own_state = state + levels, labels, sortorder, names = own_state self._set_levels([Index(x) for x in levels], validate=False) self._set_labels(labels) @@ -3112,21 +3288,15 @@ def __getitem__(self, key): # cannot be sure whether the result will be sorted sortorder = None - result = np.empty(0, dtype=object).view(type(self)) new_labels = [lab[key] for lab in self.labels] - # an optimization - result._set_levels(self.levels, validate=False) - result._set_labels(new_labels) - result.sortorder = sortorder - result._set_names(self.names) - - return result + return MultiIndex(levels=self.levels, + labels=new_labels, + names=self.names, + sortorder=sortorder, + verify_integrity=False) def take(self, indexer, axis=None): - """ - Analogous to ndarray.take - """ indexer = com._ensure_platform_int(indexer) new_labels = [lab.take(indexer) for lab in self.labels] return MultiIndex(levels=self.levels, labels=new_labels, @@ -3167,6 +3337,13 @@ def append(self, other): def argsort(self, *args, **kwargs): return self.values.argsort() + def repeat(self, n): + return MultiIndex(levels=self.levels, + labels=[label.view(np.ndarray).repeat(n) for label in self.labels], + names=self.names, + sortorder=self.sortorder, + verify_integrity=False) + def drop(self, labels, level=None): """ Make new MultiIndex with passed list of labels deleted @@ -3185,7 +3362,7 @@ def drop(self, labels, level=None): return self._drop_from_level(labels, level) try: - if not isinstance(labels, np.ndarray): + if not isinstance(labels, (np.ndarray, Index)): labels = com._index_labels_to_array(labels) indexer = self.get_indexer(labels) mask = indexer == -1 @@ -3254,7 +3431,7 @@ def droplevel(self, level=0): mask = new_labels[0] == -1 result = new_levels[0].take(new_labels[0]) if mask.any(): - np.putmask(result, mask, np.nan) + result = result.putmask(mask, np.nan) result.name = new_names[0] return result @@ -3414,16 +3591,16 @@ def get_indexer(self, target, method=None, limit=None): if not self.is_unique or not self.is_monotonic: raise AssertionError(('Must be unique and monotonic to ' 'use forward fill getting the indexer')) - indexer = self_index._engine.get_pad_indexer(target_index, + indexer = self_index._engine.get_pad_indexer(target_index.values, limit=limit) elif method == 'backfill': if not 
self.is_unique or not self.is_monotonic: raise AssertionError(('Must be unique and monotonic to ' 'use backward fill getting the indexer')) - indexer = self_index._engine.get_backfill_indexer(target_index, + indexer = self_index._engine.get_backfill_indexer(target_index.values, limit=limit) else: - indexer = self_index._engine.get_indexer(target_index) + indexer = self_index._engine.get_indexer(target_index.values) return com._ensure_platform_int(indexer) @@ -4087,6 +4264,7 @@ def isin(self, values, level=None): return np.zeros(len(labs), dtype=np.bool_) else: return np.lib.arraysetops.in1d(labs, sought_labels) +MultiIndex._add_numeric_methods_disabled() # For utility purposes @@ -4192,6 +4370,12 @@ def _union_indexes(indexes): return result indexes, kind = _sanitize_and_check(indexes) + def _unique_indices(inds): + def conv(i): + if isinstance(i, Index): + i = i.tolist() + return i + return Index(lib.fast_unique_multiple_list([ conv(i) for i in inds ])) if kind == 'special': result = indexes[0] @@ -4206,11 +4390,11 @@ def _union_indexes(indexes): index = indexes[0] for other in indexes[1:]: if not index.equals(other): - return Index(lib.fast_unique_multiple(indexes)) + return _unique_indices(indexes) return index else: - return Index(lib.fast_unique_multiple_list(indexes)) + return _unique_indices(indexes) def _trim_front(strings): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index b02fe523df998..91008f9b22aed 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -7,7 +7,8 @@ import pandas.core.common as com from pandas.core.common import (_is_bool_indexer, is_integer_dtype, _asarray_tuplesafe, is_list_like, isnull, - ABCSeries, ABCDataFrame, ABCPanel, is_float) + ABCSeries, ABCDataFrame, ABCPanel, is_float, + _values_from_object) import pandas.lib as lib import numpy as np @@ -1086,7 +1087,7 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False): return {'key': obj} raise KeyError('%s not in index' % objarr[mask]) - return indexer + return _values_from_object(indexer) else: try: @@ -1512,7 +1513,7 @@ def _length_of_indexer(indexer, target=None): elif step < 0: step = abs(step) return (stop - start) / step - elif isinstance(indexer, (ABCSeries, np.ndarray, list)): + elif isinstance(indexer, (ABCSeries, Index, np.ndarray, list)): return len(indexer) elif not is_list_like(indexer): return 1 diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f5cb48fd94022..da36d95a3ad9e 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2617,15 +2617,22 @@ def copy(self, deep=True): Parameters ---------- - deep : boolean, default True + deep : boolean or string, default True If False, return shallow copy (do not copy data) + If 'all', copy data and a deep copy of the index Returns ------- copy : BlockManager """ + + # this preserves the notion of view copying of axes if deep: - new_axes = [ax.view() for ax in self.axes] + if deep == 'all': + copy = lambda ax: ax.copy(deep=True) + else: + copy = lambda ax: ax.view() + new_axes = [ copy(ax) for ax in self.axes] else: new_axes = list(self.axes) return self.apply('copy', axes=new_axes, deep=deep, diff --git a/pandas/core/ops.py b/pandas/core/ops.py index abe1974705243..9f29570af6f4f 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -247,7 +247,7 @@ def __init__(self, left, right, name): # need to make sure that we are aligning the data if isinstance(left, pd.Series) and isinstance(right, pd.Series): - left, right = 
left.align(right,copy=False) self.left = left self.right = right @@ -331,12 +331,12 @@ def _convert_to_array(self, values, name=None, other=None): values = np.empty(values.shape, dtype=other.dtype) values[:] = tslib.iNaT - # a datetlike + # a datelike + elif isinstance(values, pd.DatetimeIndex): + values = values.to_series() elif not (isinstance(values, (pa.Array, pd.Series)) and com.is_datetime64_dtype(values)): values = tslib.array_to_datetime(values) - elif isinstance(values, pd.DatetimeIndex): - values = values.to_series() elif inferred_type in ('timedelta', 'timedelta64'): # have a timedelta, convert to to ns here values = _possibly_cast_to_timedelta(values, coerce=coerce, dtype='timedelta64[ns]') @@ -451,11 +451,11 @@ def na_op(x, y): result = expressions.evaluate(op, str_rep, x, y, raise_on_error=True, **eval_kwargs) except TypeError: - if isinstance(y, (pa.Array, pd.Series)): + if isinstance(y, (pa.Array, pd.Series, pd.Index)): dtype = np.find_common_type([x.dtype, y.dtype], []) result = np.empty(x.size, dtype=dtype) mask = notnull(x) & notnull(y) - result[mask] = op(x[mask], y[mask]) + result[mask] = op(x[mask], _values_from_object(y[mask])) elif isinstance(x, pa.Array): result = pa.empty(len(x), dtype=x.dtype) mask = notnull(x) @@ -555,7 +555,7 @@ def wrapper(self, other): index=self.index, name=name) elif isinstance(other, pd.DataFrame): # pragma: no cover return NotImplemented - elif isinstance(other, (pa.Array, pd.Series)): + elif isinstance(other, (pa.Array, pd.Series, pd.Index)): if len(self) != len(other): raise ValueError('Lengths must match to compare') return self._constructor(na_op(self.values, np.asarray(other)), @@ -565,7 +565,7 @@ def wrapper(self, other): mask = isnull(self) values = self.get_values() - other = _index.convert_scalar(values, other) + other = _index.convert_scalar(values,_values_from_object(other)) if issubclass(values.dtype.type, np.datetime64): values = values.view('i8') diff --git a/pandas/core/series.py b/pandas/core/series.py index d1f861b7f7fd7..22284df337d97 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -70,18 +70,6 @@ def wrapper(self): return wrapper -def _unbox(func): - @Appender(func.__doc__) - def f(self, *args, **kwargs): - result = func(self.values, *args, **kwargs) - if isinstance(result, (pa.Array, Series)) and result.ndim == 0: - # return NumPy type - return result.dtype.type(result.item()) - else: # pragma: no cover - return result - f.__name__ = func.__name__ - return f - #---------------------------------------------------------------------- # Series class @@ -290,76 +278,87 @@ def _set_subtyp(self, is_all_dates): object.__setattr__(self, '_subtyp', 'series') # ndarray compatibility - def item(self): - return self._data.values.item() - - @property - def data(self): - return self._data.values.data - - @property - def strides(self): - return self._data.values.strides - - @property - def size(self): - return self._data.values.size - - @property - def flags(self): - return self._data.values.flags - @property def dtype(self): + """ return the dtype object of the underlying data """ return self._data.dtype @property def dtypes(self): - """ for compat """ + """ return the dtype object of the underlying data """ return self._data.dtype @property def ftype(self): + """ return if the data is sparse|dense """ return self._data.ftype @property def ftypes(self): - """ for compat """ + """ return if the data is sparse|dense """ return self._data.ftype @property - def shape(self): - return self._data.shape + def values(self): 
+ """ + Return Series as ndarray - @property - def ndim(self): - return 1 + Returns + ------- + arr : numpy.ndarray + """ + return self._data.values - @property - def base(self): - return self.values.base + def get_values(self): + """ same as values (but handles sparseness conversions); is a view """ + return self._data.get_values() + + # ops def ravel(self, order='C'): + """ + Return the flattened underlying data as an ndarray + + See also + -------- + numpy.ndarray.ravel + """ return self.values.ravel(order=order) def compress(self, condition, axis=0, out=None, **kwargs): - # 1-d compat with numpy - return self[condition] - - def transpose(self): - """ support for compatiblity """ - return self + """ + Return selected slices of an array along given axis as a Series - T = property(transpose) + See also + -------- + numpy.ndarray.compress + """ + return self[condition] def nonzero(self): - """ numpy like, returns same as nonzero """ + """ + return the a boolean array of the underlying data is nonzero + + See also + -------- + numpy.ndarray.nonzero + """ return self.values.nonzero() def put(self, *args, **kwargs): + """ + return a ndarray with the values put + + See also + -------- + numpy.ndarray.put + """ self.values.put(*args, **kwargs) def __len__(self): + """ + return the length of the Series + """ return len(self._data) def view(self, dtype=None): @@ -442,7 +441,7 @@ def _unpickle_series_compat(self, state): # recreate self._data = SingleBlockManager(data, index, fastpath=True) - self.index = index + self._index = index self.name = name else: @@ -549,7 +548,7 @@ def _get_with(self, key): raise # pragma: no cover - if not isinstance(key, (list, pa.Array, Series)): + if not isinstance(key, (list, pa.Array, Series, Index)): key = list(key) if isinstance(key, Index): @@ -716,7 +715,11 @@ def _set_values(self, key, value): def repeat(self, reps): """ - See ndarray.repeat + return a new Series with the values repeated reps times + + See also + -------- + numpy.ndarray.repeat """ new_index = self.index.repeat(reps) new_values = self.values.repeat(reps) @@ -725,7 +728,13 @@ def repeat(self, reps): def reshape(self, *args, **kwargs): """ - See numpy.ndarray.reshape + return an ndarray with the values shape + if the specified shape matches exactly the current shape, then + return self (for compat) + + See also + -------- + numpy.ndarray.take """ if len(args) == 1 and hasattr(args[0], '__iter__'): shape = args[0] @@ -989,12 +998,6 @@ def iteritems(self): if compat.PY3: # pragma: no cover items = iteritems - #---------------------------------------------------------------------- - # unbox reductions - - all = _unbox(pa.Array.all) - any = _unbox(pa.Array.any) - #---------------------------------------------------------------------- # Misc public methods @@ -1002,21 +1005,6 @@ def keys(self): "Alias for index" return self.index - @property - def values(self): - """ - Return Series as ndarray - - Returns - ------- - arr : numpy.ndarray - """ - return self._data.values - - def get_values(self): - """ same as values (but handles sparseness conversions); is a view """ - return self._data.get_values() - def tolist(self): """ Convert Series to a nested list """ return list(self) @@ -1191,6 +1179,7 @@ def idxmin(self, axis=None, out=None, skipna=True): See Also -------- DataFrame.idxmin + numpy.ndarray.argmin """ i = nanops.nanargmin(_values_from_object(self), skipna=skipna) if i == -1: @@ -1217,6 +1206,7 @@ def idxmax(self, axis=None, out=None, skipna=True): See Also -------- DataFrame.idxmax + 
numpy.ndarray.argmax """ i = nanops.nanargmax(_values_from_object(self), skipna=skipna) if i == -1: @@ -1334,7 +1324,7 @@ def cov(self, other, min_periods=None): Normalized by N-1 (unbiased estimator). """ - this, other = self.align(other, join='inner') + this, other = self.align(other, join='inner', copy=False) if len(this) == 0: return pa.NA return nanops.nancov(this.values, other.values, @@ -1460,7 +1450,7 @@ def _binop(self, other, func, level=None, fill_value=None): this = self if not self.index.equals(other.index): - this, other = self.align(other, level=level, join='outer') + this, other = self.align(other, level=level, join='outer', copy=False) new_index = this.index this_vals = this.values @@ -1599,6 +1589,9 @@ def argsort(self, axis=0, kind='quicksort', order=None): ------- argsorted : Series, with -1 indicated where nan values are present + See also + -------- + numpy.ndarray.argsort """ values = self.values mask = isnull(values) @@ -2072,8 +2065,7 @@ def reindex_axis(self, labels, axis=0, **kwargs): def take(self, indices, axis=0, convert=True, is_copy=False): """ - Analogous to ndarray.take, return Series corresponding to requested - indices + return Series corresponding to requested indices Parameters ---------- @@ -2083,6 +2075,10 @@ def take(self, indices, axis=0, convert=True, is_copy=False): Returns ------- taken : Series + + See also + -------- + numpy.ndarray.take """ # check/convert indicies here if convert: @@ -2483,7 +2479,7 @@ def _try_cast(arr, take_fast_path): return subarr # GH #846 - if isinstance(data, (pa.Array, Series)): + if isinstance(data, (pa.Array, Index, Series)): subarr = np.array(data, copy=False) if dtype is not None: diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index e80bfec9c8dba..52a9ef0370e9e 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -1,6 +1,5 @@ from pandas.compat import cPickle as pkl, pickle_compat as pc, PY3 - def to_pickle(obj, path): """ Pickle (serialize) object to input file path @@ -45,7 +44,7 @@ def try_read(path, encoding=None): try: with open(path, 'rb') as fh: return pkl.load(fh) - except: + except (Exception) as e: # reg/patched pickle try: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b95c1ed0b77e9..78e7c43de678f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3564,7 +3564,7 @@ def read(self, where=None, columns=None, **kwargs): # need a better algorithm tuple_index = long_index._tuple_index - unique_tuples = lib.fast_unique(tuple_index) + unique_tuples = lib.fast_unique(tuple_index.values) unique_tuples = _asarray_tuplesafe(unique_tuples) indexer = match(unique_tuples, tuple_index) diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index 07d576ac1c8ae..aea7fb42b7d36 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -18,11 +18,6 @@ from pandas.util.misc import is_little_endian import pandas -def _read_pickle(vf, encoding=None, compat=False): - from pandas.compat import pickle_compat as pc - with open(vf,'rb') as fh: - pc.load(fh, encoding=encoding, compat=compat) - class TestPickle(tm.TestCase): _multiprocess_can_split_ = True @@ -97,16 +92,54 @@ def test_read_pickles_0_14_0(self): self.read_pickles('0.14.0') def test_round_trip_current(self): - for typ, dv in self.data.items(): + try: + import cPickle as c_pickle + def c_pickler(obj,path): + with open(path,'wb') as fh: + c_pickle.dump(obj,fh,protocol=-1) + + def c_unpickler(path): + with open(path,'rb') as fh: + fh.seek(0) + return c_pickle.load(fh) + 
except: + c_pickler = None + c_unpickler = None + + import pickle as python_pickle + + def python_pickler(obj,path): + with open(path,'wb') as fh: + python_pickle.dump(obj,fh,protocol=-1) + + def python_unpickler(path): + with open(path,'rb') as fh: + fh.seek(0) + return python_pickle.load(fh) + + for typ, dv in self.data.items(): for dt, expected in dv.items(): - with tm.ensure_clean(self.path) as path: + for writer in [pd.to_pickle, c_pickler, python_pickler ]: + if writer is None: + continue + + with tm.ensure_clean(self.path) as path: + + # test writing with each pickler + writer(expected,path) + + # test reading with each unpickler + result = pd.read_pickle(path) + self.compare_element(typ, result, expected) - pd.to_pickle(expected,path) + if c_unpickler is not None: + result = c_unpickler(path) + self.compare_element(typ, result, expected) - result = pd.read_pickle(path) - self.compare_element(typ, result, expected) + result = python_unpickler(path) + self.compare_element(typ, result, expected) def _validate_timeseries(self, pickled, current): # GH 7748 diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 024415409cdca..4f76f72b8eb66 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -857,11 +857,16 @@ def check(format,index): assert_frame_equal(df,store['df']) for index in [ tm.makeFloatIndex, tm.makeStringIndex, tm.makeIntIndex, - tm.makeDateIndex, tm.makePeriodIndex ]: + tm.makeDateIndex ]: check('table',index) check('fixed',index) + # period index currently broken for table + # see GH7796 FIXME + check('fixed',tm.makePeriodIndex) + #check('table',tm.makePeriodIndex) + # unicode index = tm.makeUnicodeIndex if compat.PY3: @@ -2285,7 +2290,7 @@ def test_remove_where(self): # deleted number (entire table) n = store.remove('wp', []) - assert(n == 120) + self.assertTrue(n == 120) # non - empty where _maybe_remove(store, 'wp') @@ -2379,7 +2384,8 @@ def test_remove_crit(self): crit4 = Term('major_axis=date4') store.put('wp3', wp, format='t') n = store.remove('wp3', where=[crit4]) - assert(n == 36) + self.assertTrue(n == 36) + result = store.select('wp3') expected = wp.reindex(major_axis=wp.major_axis - date4) assert_panel_equal(result, expected) @@ -2392,11 +2398,10 @@ def test_remove_crit(self): crit1 = Term('major_axis>date') crit2 = Term("minor_axis=['A', 'D']") n = store.remove('wp', where=[crit1]) - - assert(n == 56) + self.assertTrue(n == 56) n = store.remove('wp', where=[crit2]) - assert(n == 32) + self.assertTrue(n == 32) result = store['wp'] expected = wp.truncate(after=date).reindex(minor=['B', 'C']) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 373320393bff2..7ffc59f6ab50d 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -813,7 +813,7 @@ def clean_index_list(list obj): for i in range(n): v = obj[i] - if not (PyList_Check(v) or np.PyArray_Check(v)): + if not (PyList_Check(v) or np.PyArray_Check(v) or hasattr(v,'_data')): all_arrays = 0 break @@ -823,7 +823,7 @@ def clean_index_list(list obj): converted = np.empty(n, dtype=object) for i in range(n): v = obj[i] - if PyList_Check(v) or np.PyArray_Check(v): + if PyList_Check(v) or np.PyArray_Check(v) or hasattr(v,'_data'): converted[i] = tuple(v) else: converted[i] = v diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index a12d1dfe70513..5227bb23ad616 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -4,7 +4,6 @@ import numpy as np import operator -import pickle from 
pandas.core.series import Series from pandas.core.common import notnull @@ -169,8 +168,7 @@ def _check_inplace_op(op): def test_pickle(self): def _check_roundtrip(obj): - pickled = pickle.dumps(obj) - unpickled = pickle.loads(pickled) + unpickled = self.round_trip_pickle(obj) assert_sp_array_equal(unpickled, obj) _check_roundtrip(self.arr) diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index 475b8f93c10ef..105f661f08b10 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -21,7 +21,7 @@ import pandas.core.datetools as datetools from pandas.core.common import isnull import pandas.util.testing as tm -from pandas.compat import range, lrange, cPickle as pickle, StringIO, lrange +from pandas.compat import range, lrange, StringIO, lrange from pandas import compat import pandas.sparse.frame as spf @@ -315,8 +315,7 @@ def test_kind(self): def test_pickle(self): def _test_roundtrip(series): - pickled = pickle.dumps(series, protocol=pickle.HIGHEST_PROTOCOL) - unpickled = pickle.loads(pickled) + unpickled = self.round_trip_pickle(series) assert_sp_series_equal(series, unpickled) assert_series_equal(series.to_dense(), unpickled.to_dense()) @@ -793,7 +792,10 @@ def test_copy(self): cp = self.frame.copy() tm.assert_isinstance(cp, SparseDataFrame) assert_sp_frame_equal(cp, self.frame) - self.assertTrue(cp.index.is_(self.frame.index)) + + # as of v0.15.0 + # this is now identical (but not is_a ) + self.assertTrue(cp.index.identical(self.frame.index)) def test_constructor(self): for col, series in compat.iteritems(self.frame): @@ -918,9 +920,8 @@ def test_array_interface(self): def test_pickle(self): def _test_roundtrip(frame): - pickled = pickle.dumps(frame, protocol=pickle.HIGHEST_PROTOCOL) - unpickled = pickle.loads(pickled) - assert_sp_frame_equal(frame, unpickled) + result = self.round_trip_pickle(frame) + assert_sp_frame_equal(frame, result) _test_roundtrip(SparseDataFrame()) self._check_all(_test_roundtrip) @@ -1608,12 +1609,11 @@ def test_from_dict(self): def test_pickle(self): def _test_roundtrip(panel): - pickled = pickle.dumps(panel, protocol=pickle.HIGHEST_PROTOCOL) - unpickled = pickle.loads(pickled) - tm.assert_isinstance(unpickled.items, Index) - tm.assert_isinstance(unpickled.major_axis, Index) - tm.assert_isinstance(unpickled.minor_axis, Index) - assert_sp_panel_equal(panel, unpickled) + result = self.round_trip_pickle(panel) + tm.assert_isinstance(result.items, Index) + tm.assert_isinstance(result.major_axis, Index) + tm.assert_isinstance(result.minor_axis, Index) + assert_sp_panel_equal(panel, result) _test_roundtrip(self.panel) diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index 842be5a1645bf..f7aede92d635d 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py @@ -55,6 +55,17 @@ else: return np.array(arr, dtype=np.int_) +cpdef ensure_object(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_OBJECT: + return arr + else: + return arr.astype(np.object_) + elif hasattr(arr,'asobject'): + return arr.asobject + else: + return np.array(arr, dtype=np.object_) + """ @@ -2189,7 +2200,7 @@ def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, ('int32', 'INT32', 'int32'), ('int64', 'INT64', 'int64'), # ('platform_int', 'INT', 'int_'), - ('object', 'OBJECT', 'object_'), + #('object', 'OBJECT', 'object_'), ] def generate_ensure_dtypes(): diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index 97a34582d2ef2..50eefa5e783cf 100644 --- 
a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -49,6 +49,17 @@ cpdef ensure_platform_int(object arr): else: return np.array(arr, dtype=np.int_) +cpdef ensure_object(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_OBJECT: + return arr + else: + return arr.astype(np.object_) + elif hasattr(arr,'asobject'): + return arr.asobject + else: + return np.array(arr, dtype=np.object_) + cpdef ensure_float64(object arr): @@ -111,16 +122,6 @@ cpdef ensure_int64(object arr): return np.array(arr, dtype=np.int64) -cpdef ensure_object(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_OBJECT: - return arr - else: - return arr.astype(np.object_) - else: - return np.array(arr, dtype=np.object_) - - @cython.wraparound(False) @cython.boundscheck(False) cpdef map_indices_float64(ndarray[float64_t] index): @@ -5932,7 +5933,7 @@ def group_mean_bin_float64(ndarray[float64_t, ndim=2] out, for i in range(ngroups): for j in range(K): count = nobs[i, j] - if nobs[i, j] == 0: + if count == 0: out[i, j] = nan else: out[i, j] = sumx[i, j] / count @@ -5985,7 +5986,7 @@ def group_mean_bin_float32(ndarray[float32_t, ndim=2] out, for i in range(ngroups): for j in range(K): count = nobs[i, j] - if nobs[i, j] == 0: + if count == 0: out[i, j] = nan else: out[i, j] = sumx[i, j] / count diff --git a/pandas/src/reduce.pyx b/pandas/src/reduce.pyx index a22e7e636d7e4..add9a03642bed 100644 --- a/pandas/src/reduce.pyx +++ b/pandas/src/reduce.pyx @@ -13,7 +13,7 @@ cdef class Reducer: ''' cdef: Py_ssize_t increment, chunksize, nresults - object arr, dummy, f, labels, typ, index + object arr, dummy, f, labels, typ, ityp, index def __init__(self, object arr, object f, axis=1, dummy=None, labels=None): @@ -37,38 +37,34 @@ cdef class Reducer: self.f = f self.arr = arr - self.typ = None self.labels = labels - self.dummy, index = self._check_dummy(dummy=dummy) - - self.labels = labels - self.index = index + self.dummy, self.typ, self.index, self.ityp = self._check_dummy(dummy=dummy) def _check_dummy(self, dummy=None): - cdef object index + cdef object index=None, typ=None, ityp=None if dummy is None: dummy = np.empty(self.chunksize, dtype=self.arr.dtype) - index = None # our ref is stolen later since we are creating this array # in cython, so increment first Py_INCREF(dummy) + else: + # we passed a series-like if hasattr(dummy,'values'): - self.typ = type(dummy) + typ = type(dummy) index = getattr(dummy,'index',None) dummy = dummy.values if dummy.dtype != self.arr.dtype: raise ValueError('Dummy array must be same dtype') if len(dummy) != self.chunksize: - raise ValueError('Dummy array must be length %d' % - self.chunksize) + raise ValueError('Dummy array must be length %d' % self.chunksize) - return dummy, index + return dummy, typ, index, ityp def get_result(self): cdef: @@ -76,21 +72,23 @@ cdef class Reducer: ndarray arr, result, chunk Py_ssize_t i, incr flatiter it + bint has_labels object res, name, labels, index - object cached_typ = None + object cached_typ=None arr = self.arr chunk = self.dummy dummy_buf = chunk.data chunk.data = arr.data labels = self.labels - index = self.index + has_labels = labels is not None + has_index = self.index is not None incr = self.increment try: for i in range(self.nresults): - if labels is not None: + if has_labels: name = util.get_value_at(labels, i) else: name = None @@ -102,9 +100,9 @@ cdef class Reducer: if self.typ is not None: # recreate with the index if supplied - if index is not None: + if has_index: - cached_typ = self.typ(chunk, 
index=index, name=name) + cached_typ = self.typ(chunk, index=self.index, name=name) else: @@ -113,6 +111,10 @@ cdef class Reducer: # use the cached_typ if possible if cached_typ is not None: + + if has_index: + object.__setattr__(cached_typ, 'index', self.index) + object.__setattr__(cached_typ._data._block, 'values', chunk) object.__setattr__(cached_typ, 'name', name) res = self.f(cached_typ) @@ -121,7 +123,6 @@ cdef class Reducer: if hasattr(res,'values'): res = res.values - if i == 0: result = self._get_result_array(res) it = PyArray_IterNew(result) @@ -163,7 +164,7 @@ cdef class SeriesBinGrouper: bint passed_dummy cdef public: - object arr, index, dummy_arr, dummy_index, values, f, bins, typ, name + object arr, index, dummy_arr, dummy_index, values, f, bins, typ, ityp, name def __init__(self, object series, object f, object bins, object dummy): n = len(series) @@ -175,8 +176,9 @@ cdef class SeriesBinGrouper: if not values.flags.c_contiguous: values = values.copy('C') self.arr = values - self.index = series.index self.typ = type(series) + self.ityp = type(series.index) + self.index = series.index.values self.name = getattr(series,'name',None) self.dummy_arr, self.dummy_index = self._check_dummy(dummy) @@ -189,6 +191,8 @@ cdef class SeriesBinGrouper: self.ngroups = len(bins) + 1 def _check_dummy(self, dummy=None): + # both values and index must be an ndarray! + if dummy is None: values = np.empty(0, dtype=self.arr.dtype) index = None @@ -198,7 +202,9 @@ cdef class SeriesBinGrouper: raise ValueError('Dummy array must be same dtype') if not values.flags.contiguous: values = values.copy() - index = dummy.index + index = dummy.index.values + if not index.flags.contiguous: + index = index.copy() return values, index @@ -210,8 +216,7 @@ cdef class SeriesBinGrouper: object res bint initialized = 0 Slider vslider, islider - object gin, typ, name - object cached_typ = None + object name, cached_typ=None, cached_ityp=None counts = np.zeros(self.ngroups, dtype=np.int64) @@ -230,8 +235,6 @@ cdef class SeriesBinGrouper: vslider = Slider(self.arr, self.dummy_arr) islider = Slider(self.index, self.dummy_index) - gin = self.dummy_index._engine - try: for i in range(self.ngroups): group_size = counts[i] @@ -240,13 +243,17 @@ cdef class SeriesBinGrouper: vslider.set_length(group_size) if cached_typ is None: - cached_typ = self.typ(vslider.buf, index=islider.buf, + cached_ityp = self.ityp(islider.buf) + cached_typ = self.typ(vslider.buf, index=cached_ityp, name=name) else: + object.__setattr__(cached_ityp, '_data', islider.buf) + cached_ityp._engine.clear_mapping() object.__setattr__(cached_typ._data._block, 'values', vslider.buf) - object.__setattr__(cached_typ, '_index', islider.buf) + object.__setattr__(cached_typ, '_index', cached_ityp) object.__setattr__(cached_typ, 'name', name) + cached_ityp._engine.clear_mapping() res = self.f(cached_typ) res = _extract_result(res) if not initialized: @@ -258,7 +265,6 @@ cdef class SeriesBinGrouper: islider.advance(group_size) vslider.advance(group_size) - gin.clear_mapping() except: raise finally: @@ -292,7 +298,7 @@ cdef class SeriesGrouper: bint passed_dummy cdef public: - object arr, index, dummy_arr, dummy_index, f, labels, values, typ, name + object arr, index, dummy_arr, dummy_index, f, labels, values, typ, ityp, name def __init__(self, object series, object f, object labels, Py_ssize_t ngroups, object dummy): @@ -305,8 +311,9 @@ cdef class SeriesGrouper: if not values.flags.c_contiguous: values = values.copy('C') self.arr = values - self.index = series.index 
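# The pattern repeated in these grouper hunks: the index is no longer itself
# an ndarray, so the cython Slider is handed series.index.values, the Index
# subclass is remembered in ityp, and a single cached index object is rebuilt
# once and then mutated per group, clearing its hash-table engine each time.
# A rough pure-Python sketch of that swap (the underscore attributes are
# internal, not public API; shown only to illustrate the mechanics):
import numpy as np
import pandas as pd

s = pd.Series([1., 2., 3., 4.],
              index=np.array(['a', 'a', 'b', 'b'], dtype=object))
raw = s.index.values                               # what the Slider slides over
cached_ityp = type(s.index)(raw[:2])               # first group's index
object.__setattr__(cached_ityp, '_data', raw[2:])  # slide to the next group
cached_ityp._engine.clear_mapping()                # drop the now-stale lookup table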
self.typ = type(series) + self.ityp = type(series.index) + self.index = series.index.values self.name = getattr(series,'name',None) self.dummy_arr, self.dummy_index = self._check_dummy(dummy) @@ -314,6 +321,8 @@ cdef class SeriesGrouper: self.ngroups = ngroups def _check_dummy(self, dummy=None): + # both values and index must be an ndarray! + if dummy is None: values = np.empty(0, dtype=self.arr.dtype) index = None @@ -323,7 +332,9 @@ cdef class SeriesGrouper: raise ValueError('Dummy array must be same dtype') if not values.flags.contiguous: values = values.copy() - index = dummy.index + index = dummy.index.values + if not index.flags.contiguous: + index = index.copy() return values, index @@ -335,8 +346,7 @@ cdef class SeriesGrouper: object res bint initialized = 0 Slider vslider, islider - object gin, typ, name - object cached_typ = None + object name, cached_typ=None, cached_ityp=None labels = self.labels counts = np.zeros(self.ngroups, dtype=np.int64) @@ -347,8 +357,6 @@ cdef class SeriesGrouper: vslider = Slider(self.arr, self.dummy_arr) islider = Slider(self.index, self.dummy_index) - gin = self.dummy_index._engine - try: for i in range(n): group_size += 1 @@ -366,13 +374,17 @@ cdef class SeriesGrouper: vslider.set_length(group_size) if cached_typ is None: - cached_typ = self.typ(vslider.buf, index=islider.buf, + cached_ityp = self.ityp(islider.buf) + cached_typ = self.typ(vslider.buf, index=cached_ityp, name=name) else: + object.__setattr__(cached_ityp, '_data', islider.buf) + cached_ityp._engine.clear_mapping() object.__setattr__(cached_typ._data._block, 'values', vslider.buf) - object.__setattr__(cached_typ, '_index', islider.buf) + object.__setattr__(cached_typ, '_index', cached_ityp) object.__setattr__(cached_typ, 'name', name) + cached_ityp._engine.clear_mapping() res = self.f(cached_typ) res = _extract_result(res) if not initialized: @@ -386,8 +398,6 @@ cdef class SeriesGrouper: group_size = 0 - gin.clear_mapping() - except: raise finally: @@ -434,6 +444,7 @@ cdef class Slider: def __init__(self, object values, object buf): assert(values.ndim == 1) + if not values.flags.contiguous: values = values.copy() @@ -463,11 +474,11 @@ cdef class Slider: self.buf.shape[0] = length cpdef reset(self): + self.buf.shape[0] = self.orig_len self.buf.data = self.orig_data self.buf.strides[0] = self.orig_stride - class InvalidApply(Exception): pass @@ -488,7 +499,7 @@ def apply_frame_axis0(object frame, object f, object names, # Need to infer if our low-level mucking is going to cause a segfault if n > 0: - chunk = frame[starts[0]:ends[0]] + chunk = frame.iloc[starts[0]:ends[0]] shape_before = chunk.shape try: result = f(chunk) @@ -497,17 +508,16 @@ def apply_frame_axis0(object frame, object f, object names, except: raise InvalidApply('Let this error raise above us') + slider = BlockSlider(frame) mutated = False item_cache = slider.dummy._item_cache - gin = slider.dummy.index._engine # f7u12 try: for i in range(n): slider.move(starts[i], ends[i]) item_cache.clear() # ugh - gin.clear_mapping() object.__setattr__(slider.dummy, 'name', names[i]) piece = f(slider.dummy) @@ -515,11 +525,12 @@ def apply_frame_axis0(object frame, object f, object names, # I'm paying the price for index-sharing, ugh try: if piece.index is slider.dummy.index: - piece = piece.copy() + piece = piece.copy(deep='all') else: mutated = True except AttributeError: pass + results.append(piece) finally: slider.reset() @@ -532,7 +543,7 @@ cdef class BlockSlider: ''' cdef public: - object frame, dummy + object frame, dummy, index 
int nblocks Slider idx_slider list blocks @@ -543,6 +554,7 @@ def __init__(self, frame): self.frame = frame self.dummy = frame[:0] + self.index = self.dummy.index self.blocks = [b.values for b in self.dummy._data.blocks] @@ -550,7 +562,7 @@ util.set_array_not_contiguous(x) self.nblocks = len(self.blocks) - self.idx_slider = Slider(self.frame.index, self.dummy.index) + self.idx_slider = Slider(self.frame.index.values, self.dummy.index.values) self.base_ptrs = malloc(sizeof(char*) * len(self.blocks)) for i, block in enumerate(self.blocks): @@ -562,6 +574,7 @@ cpdef move(self, int start, int end): cdef: ndarray arr + object index # move blocks for i in range(self.nblocks): @@ -571,13 +584,16 @@ arr.data = self.base_ptrs[i] + arr.strides[1] * start arr.shape[1] = end - start + # move and set the index self.idx_slider.move(start, end) + object.__setattr__(self.index,'_data',self.idx_slider.buf) + self.index._engine.clear_mapping() cdef reset(self): cdef: ndarray arr - # move blocks + # reset blocks for i in range(self.nblocks): arr = self.blocks[i] @@ -585,12 +601,25 @@ arr.data = self.base_ptrs[i] arr.shape[1] = 0 - self.idx_slider.reset() - - def reduce(arr, f, axis=0, dummy=None, labels=None): - if labels._has_complex_internals: - raise Exception('Cannot use shortcut') + """ + + Parameters + ---------- + arr : NDFrame object + f : function + axis : integer axis + dummy : type of reduced output (series) + labels : Index or None + """ + + if labels is not None: + if labels._has_complex_internals: + raise Exception('Cannot use shortcut') + + # pass as an ndarray + if hasattr(labels,'values'): + labels = labels.values reducer = Reducer(arr, f, axis=axis, dummy=dummy, labels=labels) return reducer.get_result() diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c index f6cb5b9803e25..c1e9f8edcf423 100644 --- a/pandas/src/ujson/python/objToJSON.c +++ b/pandas/src/ujson/python/objToJSON.c @@ -1537,7 +1537,7 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) PRINTMARK(); tc->type = JT_OBJECT; pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "index"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(PyObject_GetAttrString(obj, "index"), "values"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); if (!pc->columnLabels) { goto INVALID; @@ -1614,7 +1614,7 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) PRINTMARK(); tc->type = JT_ARRAY; pc->columnLabelsLen = PyArray_DIM(pc->newObj, 1); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "columns"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(PyObject_GetAttrString(obj, "columns"), "values"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); if (!pc->columnLabels) { goto INVALID; @@ -1632,7 +1632,7 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) goto INVALID; } pc->columnLabelsLen = PyArray_DIM(pc->newObj, 1); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "columns"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(PyObject_GetAttrString(obj, "columns"), 
"values"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); if (!pc->columnLabels) { NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); @@ -1645,7 +1645,7 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) PRINTMARK(); tc->type = JT_OBJECT; pc->rowLabelsLen = PyArray_DIM(pc->newObj, 1); - pc->rowLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "columns"), (JSONObjectEncoder*) enc, pc->rowLabelsLen); + pc->rowLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(PyObject_GetAttrString(obj, "columns"), "values"), (JSONObjectEncoder*) enc, pc->rowLabelsLen); if (!pc->rowLabels) { goto INVALID; diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6353ad53a88ef..fe070cff2e0ea 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -187,16 +187,17 @@ def test_object_refcount_bug(self): len(algos.unique(lst)) def test_on_index_object(self): + mindex = pd.MultiIndex.from_arrays([np.arange(5).repeat(5), np.tile(np.arange(5), 5)]) + expected = mindex.values + expected.sort() + mindex = mindex.repeat(2) result = pd.unique(mindex) result.sort() - expected = mindex.values - expected.sort() - tm.assert_almost_equal(result, expected) class TestValueCounts(tm.TestCase): diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 9acb1804a7ef0..90a36228e816a 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -197,6 +197,32 @@ def setUp(self): self.is_valid_objs = [ o for o in self.objs if o._allow_index_ops ] self.not_valid_objs = [ o for o in self.objs if not o._allow_index_ops ] + def test_ndarray_compat_properties(self): + + for o in self.objs: + + # check that we work + for p in ['shape','dtype','base','flags','T', + 'strides','itemsize','nbytes']: + self.assertIsNotNone(getattr(o,p,None)) + + # if we have a datetimelike dtype then needs a view to work + # but the user is responsible for that + try: + self.assertIsNotNone(o.data) + except (ValueError): + pass + + # len > 1 + self.assertRaises(ValueError, lambda : o.item()) + + self.assertTrue(o.ndim == 1) + + self.assertTrue(o.size == len(o)) + + self.assertTrue(Index([1]).item() == 1) + self.assertTrue(Series([1]).item() == 1) + def test_ops(self): tm._skip_if_not_numpy17_friendly() for op in ['max','min']: @@ -243,11 +269,13 @@ def test_value_counts_unique_nunique(self): # create repeated values, 'n'th element is repeated by n+1 times if isinstance(o, PeriodIndex): # freq must be specified because repeat makes freq ambiguous + expected_index = o[::-1] o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq) else: + expected_index = values[::-1] o = klass(np.repeat(values, range(1, len(o) + 1))) - expected_s = Series(range(10, 0, -1), index=values[::-1], dtype='int64') + expected_s = Series(range(10, 0, -1), index=expected_index, dtype='int64') tm.assert_series_equal(o.value_counts(), expected_s) result = o.unique() @@ -278,12 +306,14 @@ def test_value_counts_unique_nunique(self): # create repeated values, 'n'th element is repeated by n+1 times if isinstance(o, PeriodIndex): + expected_index = o o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq) else: + expected_index = values o = klass(np.repeat(values, range(1, len(o) + 1))) - expected_s_na = Series(list(range(10, 2, -1)) +[3], index=values[9:0:-1], dtype='int64') - expected_s = Series(list(range(10, 2, -1)), index=values[9:1:-1], dtype='int64') + expected_s_na = Series(list(range(10, 2, -1)) +[3], index=expected_index[9:0:-1], dtype='int64') + 
expected_s = Series(list(range(10, 2, -1)), index=expected_index[9:1:-1], dtype='int64') tm.assert_series_equal(o.value_counts(dropna=False), expected_s_na) tm.assert_series_equal(o.value_counts(), expected_s) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 88a86da27daf9..6a31f573951cd 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -16,7 +16,7 @@ from pandas.compat import( map, zip, range, long, lrange, lmap, lzip, - OrderedDict, cPickle as pickle, u, StringIO + OrderedDict, u, StringIO ) from pandas import compat @@ -3620,6 +3620,7 @@ def test_constructor_with_datetimes(self): df = DataFrame() df['a'] = i assert_frame_equal(df, expected) + df = DataFrame( {'a' : i } ) assert_frame_equal(df, expected) @@ -3925,14 +3926,14 @@ def test_array_interface(self): assert_frame_equal(result, self.frame.apply(np.sqrt)) def test_pickle(self): - unpickled = pickle.loads(pickle.dumps(self.mixed_frame)) + unpickled = self.round_trip_pickle(self.mixed_frame) assert_frame_equal(self.mixed_frame, unpickled) # buglet self.mixed_frame._data.ndim # empty - unpickled = pickle.loads(pickle.dumps(self.empty)) + unpickled = self.round_trip_pickle(self.empty) repr(unpickled) def test_to_dict(self): @@ -12578,6 +12579,7 @@ def test_empty_nonzero(self): self.assertTrue(df.T.empty) def test_any_all(self): + self._check_bool_op('any', np.any, has_skipna=True, has_bool_only=True) self._check_bool_op('all', np.all, has_skipna=True, has_bool_only=True) diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 5d9b43e48e3c1..8dbcb8c542fb3 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -1000,7 +1000,8 @@ def test_xcompat(self): pd.plot_params['x_compat'] = False ax = df.plot() lines = ax.get_lines() - tm.assert_isinstance(lines[0].get_xdata(), PeriodIndex) + self.assertNotIsInstance(lines[0].get_xdata(), PeriodIndex) + self.assertIsInstance(PeriodIndex(lines[0].get_xdata()), PeriodIndex) tm.close() # useful if you're plotting a bunch together @@ -1012,7 +1013,8 @@ def test_xcompat(self): tm.close() ax = df.plot() lines = ax.get_lines() - tm.assert_isinstance(lines[0].get_xdata(), PeriodIndex) + self.assertNotIsInstance(lines[0].get_xdata(), PeriodIndex) + self.assertIsInstance(PeriodIndex(lines[0].get_xdata()), PeriodIndex) def test_unsorted_index(self): df = DataFrame({'y': np.arange(100)}, diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index f958d5481ad33..8e9503b4fe1a3 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2152,6 +2152,10 @@ def test_non_cython_api(self): result = g.idxmax() assert_frame_equal(result,expected) + def test_cython_api2(self): + + # this takes the fast apply path + # cumsum (GH5614) df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=['A', 'B', 'C']) expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C']) @@ -2425,6 +2429,31 @@ def convert_force_pure(x): self.assertEqual(result.dtype, np.object_) tm.assert_isinstance(result[0], Decimal) + def test_fast_apply(self): + # make sure that fast apply is correctly called + # rather than raising any kind of error + # otherwise the python path will be called + # which slows things down + N = 1000 + labels = np.random.randint(0, 2000, size=N) + labels2 = np.random.randint(0, 3, size=N) + df = DataFrame({'key': labels, + 'key2': labels2, + 'value1': np.random.randn(N), + 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)}) + def f(g): + return 1 + + g 
= df.groupby(['key', 'key2']) + + grouper = g.grouper + + splitter = grouper._get_splitter(g._selected_obj, axis=g.axis) + group_keys = grouper._get_group_keys() + + values, mutated = splitter.fast_apply(f, group_keys) + self.assertFalse(mutated) + def test_apply_with_mixed_dtype(self): # GH3480, apply with mixed dtype on axis=1 breaks in 0.11 df = DataFrame({'foo1' : ['one', 'two', 'two', 'three', 'one', 'two'], diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index c32c7ddc55ced..5affdbe1c99aa 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -3,7 +3,6 @@ from datetime import datetime, timedelta from pandas.compat import range, lrange, lzip, u, zip import operator -import pickle import re import nose import warnings @@ -12,9 +11,12 @@ import numpy as np from numpy.testing import assert_array_equal +from pandas import period_range, date_range + from pandas.core.index import (Index, Float64Index, Int64Index, MultiIndex, - InvalidIndexError) + InvalidIndexError, NumericIndex) from pandas.tseries.index import DatetimeIndex +from pandas.tseries.period import PeriodIndex from pandas.core.series import Series from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp, assert_copy) @@ -32,7 +34,48 @@ from pandas import _np_version_under1p7 -class TestIndex(tm.TestCase): +class Base(object): + """ base class for index sub-class tests """ + _holder = None + + def verify_pickle(self,index): + unpickled = self.round_trip_pickle(index) + self.assertTrue(index.equals(unpickled)) + + def test_pickle_compat_construction(self): + # this is testing for pickle compat + if self._holder is None: + return + + # need an object to create with + self.assertRaises(TypeError, self._holder) + + def test_numeric_compat(self): + + idx = self.create_index() + tm.assertRaisesRegexp(TypeError, + "cannot perform multiplication", + lambda : idx * 1) + tm.assertRaisesRegexp(TypeError, + "cannot perform multiplication", + lambda : 1 * idx) + + div_err = "cannot perform true division" if compat.PY3 else "cannot perform division" + tm.assertRaisesRegexp(TypeError, + div_err, + lambda : idx / 1) + tm.assertRaisesRegexp(TypeError, + div_err, + lambda : 1 / idx) + tm.assertRaisesRegexp(TypeError, + "cannot perform floor division", + lambda : idx // 1) + tm.assertRaisesRegexp(TypeError, + "cannot perform floor division", + lambda : 1 // idx) + +class TestIndex(Base, tm.TestCase): + _holder = Index _multiprocess_can_split_ = True def setUp(self): @@ -49,6 +92,9 @@ def setUp(self): for name, ind in self.indices.items(): setattr(self, name, ind) + def create_index(self): + return Index(list('abcde')) + def test_wrong_number_names(self): def testit(ind): ind.names = ["apple", "banana", "carrot"] @@ -123,7 +169,7 @@ def test_constructor(self): # casting arr = np.array(self.strIndex) - index = arr.view(Index) + index = Index(arr) tm.assert_contains_all(arr, index) self.assert_numpy_array_equal(self.strIndex, index) @@ -181,13 +227,12 @@ def __array__(self, dtype=None): for array in [np.arange(5), np.array(['a', 'b', 'c']), - pd.date_range('2000-01-01', periods=3).values]: + date_range('2000-01-01', periods=3).values]: expected = pd.Index(array) result = pd.Index(ArrayLike(array)) self.assertTrue(result.equals(expected)) def test_index_ctor_infer_periodindex(self): - from pandas import period_range, PeriodIndex xp = period_range('2012-1-1', freq='M', periods=3) rs = Index(xp) assert_array_equal(rs, xp) @@ -312,8 +357,9 @@ def test_is_(self): self.assertFalse(ind.is_(ind[:])) 
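# Together with test_ndarray_compat_properties above, Base.test_numeric_compat
# pins down the new compatibility surface: the ndarray-style attributes are now
# plain properties delegating to the underlying values, while arithmetic on an
# index with no numeric meaning (see _add_numeric_methods_disabled() earlier in
# this diff) raises a TypeError up front instead of falling through to raw
# ndarray behavior. A small sketch, assuming a 0.15.0-era build:
import numpy as np
import pandas as pd

idx = pd.Index(list('abcde'))
assert not isinstance(idx, np.ndarray)        # the headline change
assert idx.shape == (5,) and idx.ndim == 1 and idx.size == 5
assert pd.Index([1]).item() == 1              # item() works for length-1 only
np.asarray(idx)                               # __array__ keeps numpy interop
try:
    idx * 1
except TypeError as err:
    print(err)  # the tests pin only the prefix: 'cannot perform multiplication'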
self.assertFalse(ind.is_(ind.view(np.ndarray).view(Index))) self.assertFalse(ind.is_(np.array(range(10)))) + # quasi-implementation dependent - self.assertTrue(ind.is_(ind.view().base)) + self.assertTrue(ind.is_(ind.view())) ind2 = ind.view() ind2.name = 'bob' self.assertTrue(ind.is_(ind2)) @@ -366,8 +412,7 @@ def _check(op): arr_result = op(arr, element) index_result = op(index, element) - tm.assert_isinstance(index_result, np.ndarray) - self.assertNotIsInstance(index_result, Index) + self.assertIsInstance(index_result, np.ndarray) self.assert_numpy_array_equal(arr_result, index_result) _check(operator.eq) @@ -617,6 +662,7 @@ def test_symmetric_diff(self): idx2 = Index([0, 1, np.nan]) result = idx1.sym_diff(idx2) # expected = Index([0.0, np.nan, 2.0, 3.0, np.nan]) + nans = pd.isnull(result) self.assertEqual(nans.sum(), 2) self.assertEqual((~nans).sum(), 3) @@ -639,21 +685,11 @@ def test_symmetric_diff(self): idx1 - 1 def test_pickle(self): - def testit(index): - pickled = pickle.dumps(index) - unpickled = pickle.loads(pickled) - - tm.assert_isinstance(unpickled, Index) - self.assert_numpy_array_equal(unpickled, index) - self.assertEqual(unpickled.name, index.name) - - # tm.assert_dict_equal(unpickled.indexMap, index.indexMap) - testit(self.strIndex) + self.verify_pickle(self.strIndex) self.strIndex.name = 'foo' - testit(self.strIndex) - - testit(self.dateIndex) + self.verify_pickle(self.strIndex) + self.verify_pickle(self.dateIndex) def test_is_numeric(self): self.assertFalse(self.dateIndex.is_numeric()) @@ -902,9 +938,7 @@ def test_boolean_cmp(self): idx = Index(values) res = (idx == values) - self.assertTrue(res.all()) - self.assertEqual(res.dtype, 'bool') - self.assertNotIsInstance(res, Index) + self.assert_numpy_array_equal(res,np.array([True,True,True,True],dtype=bool)) def test_get_level_values(self): result = self.strIndex.get_level_values(0) @@ -951,13 +985,64 @@ def test_nan_first_take_datetime(self): tm.assert_index_equal(res, exp) -class TestFloat64Index(tm.TestCase): +class Numeric(Base): + + def test_numeric_compat(self): + + idx = self._holder(np.arange(5,dtype='int64')) + didx = self._holder(np.arange(5,dtype='int64')**2 + ) + result = idx * 1 + tm.assert_index_equal(result, idx) + + result = 1 * idx + tm.assert_index_equal(result, idx) + + result = idx * idx + tm.assert_index_equal(result, didx) + + result = idx / 1 + tm.assert_index_equal(result, idx) + + result = idx // 1 + tm.assert_index_equal(result, idx) + + result = idx * np.array(5,dtype='int64') + tm.assert_index_equal(result, self._holder(np.arange(5,dtype='int64')*5)) + + result = idx * np.arange(5,dtype='int64') + tm.assert_index_equal(result, didx) + + result = idx * Series(np.arange(5,dtype='int64')) + tm.assert_index_equal(result, didx) + + result = idx * Series(np.arange(5,dtype='float64')+0.1) + tm.assert_index_equal(result, + Float64Index(np.arange(5,dtype='float64')*(np.arange(5,dtype='float64')+0.1))) + + + # invalid + self.assertRaises(TypeError, lambda : idx * date_range('20130101',periods=5)) + self.assertRaises(ValueError, lambda : idx * self._holder(np.arange(3))) + self.assertRaises(ValueError, lambda : idx * np.array([1,2])) + + def test_ufunc_compat(self): + idx = self._holder(np.arange(5,dtype='int64')) + result = np.sin(idx) + expected = Float64Index(np.sin(np.arange(5,dtype='int64'))) + tm.assert_index_equal(result, expected) + +class TestFloat64Index(Numeric, tm.TestCase): + _holder = Float64Index _multiprocess_can_split_ = True def setUp(self): self.mixed = Float64Index([1.5, 2, 3, 4, 5]) 
self.float = Float64Index(np.arange(5) * 2.5) + def create_index(self): + return Float64Index(np.arange(5,dtype='float64')) + def test_hash_error(self): with tm.assertRaisesRegexp(TypeError, "unhashable type: %r" % @@ -1095,12 +1180,16 @@ def test_astype_from_object(self): tm.assert_index_equal(result, expected) -class TestInt64Index(tm.TestCase): +class TestInt64Index(Numeric, tm.TestCase): + _holder = Int64Index _multiprocess_can_split_ = True def setUp(self): self.index = Int64Index(np.arange(0, 20, 2)) + def create_index(self): + return Int64Index(np.arange(5,dtype='int64')) + def test_too_many_names(self): def testit(): self.index.names = ["roger", "harold"] @@ -1519,8 +1608,38 @@ def test_slice_keep_name(self): idx = Int64Index([1, 2], name='asdf') self.assertEqual(idx.name, idx[1:].name) +class TestDatetimeIndex(Base, tm.TestCase): + _holder = DatetimeIndex + _multiprocess_can_split_ = True + + def create_index(self): + return date_range('20130101',periods=5) + + def test_pickle_compat_construction(self): + pass + + def test_numeric_compat(self): + super(TestDatetimeIndex, self).test_numeric_compat() + + if not (_np_version_under1p7 or compat.PY3_2): + for f in [lambda : np.timedelta64(1, 'D').astype('m8[ns]') * pd.date_range('2000-01-01', periods=3), + lambda : pd.date_range('2000-01-01', periods=3) * np.timedelta64(1, 'D').astype('m8[ns]') ]: + tm.assertRaisesRegexp(TypeError, + "cannot perform multiplication with this index type", + f) -class TestMultiIndex(tm.TestCase): +class TestPeriodIndex(Base, tm.TestCase): + _holder = PeriodIndex + _multiprocess_can_split_ = True + + def create_index(self): + return period_range('20130101',periods=5,freq='D') + + def test_pickle_compat_construction(self): + pass + +class TestMultiIndex(Base, tm.TestCase): + _holder = MultiIndex _multiprocess_can_split_ = True def setUp(self): @@ -1534,6 +1653,9 @@ def setUp(self): labels=[major_labels, minor_labels], names=self.index_names, verify_integrity=False) + def create_index(self): + return self.index + def test_hash_error(self): with tm.assertRaisesRegexp(TypeError, "unhashable type: %r" % @@ -1574,6 +1696,7 @@ def test_set_names_and_rename(self): def test_set_levels(self): + # side note - you probably wouldn't want to use levels and labels # directly like this - but it is possible. 
levels, labels = self.index.levels, self.index.labels @@ -1966,6 +2089,7 @@ def check_level_names(self, index, names): self.assertEqual([level.name for level in index.levels], list(names)) def test_changing_names(self): + # names should be applied to levels level_names = [level.name for level in self.index.levels] self.check_level_names(self.index, self.index.names) @@ -2015,6 +2139,7 @@ def test_from_arrays(self): self.assertTrue(result.levels[1].equals(Index(['a','b']))) def test_from_product(self): + first = ['foo', 'bar', 'buz'] second = ['a', 'b', 'c'] names = ['first', 'second'] @@ -2029,7 +2154,7 @@ def test_from_product(self): self.assertEqual(result.names, names) def test_from_product_datetimeindex(self): - dt_index = pd.date_range('2000-01-01', periods=2) + dt_index = date_range('2000-01-01', periods=2) mi = pd.MultiIndex.from_product([[1, 2], dt_index]) etalon = pd.lib.list_to_object_array([(1, pd.Timestamp('2000-01-01')), (1, pd.Timestamp('2000-01-02')), @@ -2108,23 +2233,12 @@ def test_iter(self): ('baz', 'two'), ('qux', 'one'), ('qux', 'two')] self.assertEqual(result, expected) - def test_pickle(self): - pickled = pickle.dumps(self.index) - unpickled = pickle.loads(pickled) - self.assertTrue(self.index.equals(unpickled)) - def test_legacy_pickle(self): if compat.PY3: - raise nose.SkipTest("doesn't work on Python 3") - - def curpath(): - pth, _ = os.path.split(os.path.abspath(__file__)) - return pth + raise nose.SkipTest("testing for legacy pickles not supported on py3") - ppath = os.path.join(curpath(), 'data/multiindex_v1.pickle') - obj = pickle.load(open(ppath, 'r')) - - self.assertTrue(obj._is_v1) + path = tm.get_data_path('multiindex_v1.pickle') + obj = pd.read_pickle(path) obj2 = MultiIndex.from_tuples(obj.values) self.assertTrue(obj.equals(obj2)) @@ -2140,11 +2254,10 @@ def curpath(): assert_almost_equal(exp, exp2) def test_legacy_v2_unpickle(self): - # 0.7.3 -> 0.8.0 format manage - pth, _ = os.path.split(os.path.abspath(__file__)) - filepath = os.path.join(pth, 'data', 'mindex_073.pickle') - obj = pd.read_pickle(filepath) + # 0.7.3 -> 0.8.0 format manage + path = tm.get_data_path('mindex_073.pickle') + obj = pd.read_pickle(path) obj2 = MultiIndex.from_tuples(obj.values) self.assertTrue(obj.equals(obj2)) @@ -2562,6 +2675,7 @@ def test_identical(self): self.assertTrue(mi.equals(mi4)) def test_is_(self): + mi = MultiIndex.from_tuples(lzip(range(10), range(10))) self.assertTrue(mi.is_(mi)) self.assertTrue(mi.is_(mi.view())) @@ -2571,6 +2685,7 @@ def test_is_(self): mi2.names = ["A", "B"] self.assertTrue(mi2.is_(mi)) self.assertTrue(mi.is_(mi2)) + self.assertTrue(mi.is_(mi.set_names(["C", "D"]))) mi2 = mi.view() mi2.set_names(["E", "F"], inplace=True) diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 36dbced6eda8c..a523df4cc2461 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -9,7 +9,7 @@ from pandas.core.internals import * import pandas.core.internals as internals import pandas.util.testing as tm - +import pandas as pd from pandas.util.testing import ( assert_almost_equal, assert_frame_equal, randn) from pandas.compat import zip, u @@ -182,12 +182,9 @@ def test_constructor(self): self.assertEqual(int32block.dtype, np.int32) def test_pickle(self): - import pickle def _check(blk): - pickled = pickle.dumps(blk) - unpickled = pickle.loads(pickled) - assert_block_equal(blk, unpickled) + assert_block_equal(self.round_trip_pickle(blk), blk) _check(self.fblock) _check(self.cblock) @@ -341,12 +338,8 @@ def 
test_contains(self): self.assertNotIn('baz', self.mgr) def test_pickle(self): - import pickle - - pickled = pickle.dumps(self.mgr) - mgr2 = pickle.loads(pickled) - # same result + mgr2 = self.round_trip_pickle(self.mgr) assert_frame_equal(DataFrame(self.mgr), DataFrame(mgr2)) # share ref_items @@ -361,13 +354,13 @@ def test_pickle(self): self.assertFalse(mgr2._known_consolidated) def test_non_unique_pickle(self): - import pickle + mgr = create_mgr('a,a,a:f8') - mgr2 = pickle.loads(pickle.dumps(mgr)) + mgr2 = self.round_trip_pickle(mgr) assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) mgr = create_mgr('a: f8; a: i8') - mgr2 = pickle.loads(pickle.dumps(mgr)) + mgr2 = self.round_trip_pickle(mgr) assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) def test_get_scalar(self): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 6d0d7aaf37b02..ed078ae5749de 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -14,7 +14,7 @@ assertRaisesRegexp) import pandas.core.common as com import pandas.util.testing as tm -from pandas.compat import (range, lrange, StringIO, lzip, u, cPickle, +from pandas.compat import (range, lrange, StringIO, lzip, u, product as cart_product, zip) import pandas as pd @@ -181,8 +181,7 @@ def _check_op(opname): def test_pickle(self): def _test_roundtrip(frame): - pickled = cPickle.dumps(frame) - unpickled = cPickle.loads(pickled) + unpickled = self.round_trip_pickle(frame) assert_frame_equal(frame, unpickled) _test_roundtrip(self.frame) @@ -445,6 +444,7 @@ def test_xs(self): ] df = DataFrame(acc, columns=['a1','a2','cnt']).set_index(['a1','a2']) expected = DataFrame({ 'cnt' : [24,26,25,26] }, index=Index(['xbcde',np.nan,'zbcde','ybcde'],name='a2')) + result = df.xs('z',level='a1') assert_frame_equal(result, expected) @@ -2106,13 +2106,13 @@ def test_reset_index_datetime(self): idx1 = pd.date_range('1/1/2011', periods=5, freq='D', tz=tz, name='idx1') idx2 = pd.Index(range(5), name='idx2',dtype='int64') idx = pd.MultiIndex.from_arrays([idx1, idx2]) - df = pd.DataFrame({'a': np.arange(5,dtype='int64'), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx) + df = pd.DataFrame({'a': np.arange(5,dtype='int64'), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx) expected = pd.DataFrame({'idx1': [datetime.datetime(2011, 1, 1), datetime.datetime(2011, 1, 2), datetime.datetime(2011, 1, 3), datetime.datetime(2011, 1, 4), - datetime.datetime(2011, 1, 5)], + datetime.datetime(2011, 1, 5)], 'idx2': np.arange(5,dtype='int64'), 'a': np.arange(5,dtype='int64'), 'b': ['A', 'B', 'C', 'D', 'E']}, columns=['idx1', 'idx2', 'a', 'b']) @@ -2122,19 +2122,19 @@ def test_reset_index_datetime(self): idx3 = pd.date_range('1/1/2012', periods=5, freq='MS', tz='Europe/Paris', name='idx3') idx = pd.MultiIndex.from_arrays([idx1, idx2, idx3]) - df = pd.DataFrame({'a': np.arange(5,dtype='int64'), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx) + df = pd.DataFrame({'a': np.arange(5,dtype='int64'), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx) expected = pd.DataFrame({'idx1': [datetime.datetime(2011, 1, 1), datetime.datetime(2011, 1, 2), datetime.datetime(2011, 1, 3), datetime.datetime(2011, 1, 4), - datetime.datetime(2011, 1, 5)], + datetime.datetime(2011, 1, 5)], 'idx2': np.arange(5,dtype='int64'), 'idx3': [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 2, 1), datetime.datetime(2012, 3, 1), datetime.datetime(2012, 4, 1), - datetime.datetime(2012, 5, 1)], + datetime.datetime(2012, 5, 1)], 'a': np.arange(5,dtype='int64'), 'b': ['A', 'B', 'C', 'D', 'E']}, 
columns=['idx1', 'idx2', 'idx3', 'a', 'b']) expected['idx1'] = expected['idx1'].apply(lambda d: pd.Timestamp(d, tz=tz)) @@ -2148,7 +2148,7 @@ def test_reset_index_datetime(self): expected = pd.DataFrame({'level_0': 'a a a b b b'.split(), 'level_1': [datetime.datetime(2013, 1, 1), datetime.datetime(2013, 1, 2), - datetime.datetime(2013, 1, 3)] * 2, + datetime.datetime(2013, 1, 3)] * 2, 'a': np.arange(6, dtype='int64')}, columns=['level_0', 'level_1', 'a']) expected['level_1'] = expected['level_1'].apply(lambda d: pd.Timestamp(d, offset='D', tz=tz)) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index f8798e794d22c..fb1f1c1693fdd 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -12,7 +12,7 @@ from pandas.core.series import remove_na import pandas.core.common as com from pandas import compat -from pandas.compat import range, lrange, StringIO, cPickle, OrderedDict +from pandas.compat import range, lrange, StringIO, OrderedDict from pandas.util.testing import (assert_panel_equal, assert_frame_equal, @@ -31,8 +31,7 @@ class PanelTests(object): panel = None def test_pickle(self): - pickled = cPickle.dumps(self.panel) - unpickled = cPickle.loads(pickled) + unpickled = self.round_trip_pickle(self.panel) assert_frame_equal(unpickled['ItemA'], self.panel['ItemA']) def test_cumsum(self): diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index fcd4b89377176..01e9e15585fc0 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -27,7 +27,7 @@ import pandas.core.datetools as datetools import pandas.core.nanops as nanops -from pandas.compat import StringIO, lrange, range, zip, u, OrderedDict, long +from pandas.compat import StringIO, lrange, range, zip, u, OrderedDict, long, PY3_2 from pandas import compat from pandas.util.testing import (assert_series_equal, assert_almost_equal, @@ -61,6 +61,7 @@ def test_copy_index_name_checking(self): self.ts.index.name = None self.assertIsNone(self.ts.index.name) self.assertIs(self.ts, self.ts) + cp = self.ts.copy() cp.index.name = 'foo' com.pprint_thing(self.ts.index.name) @@ -1867,7 +1868,7 @@ def test_timeseries_periodindex(self): from pandas import period_range prng = period_range('1/1/2011', '1/1/2012', freq='M') ts = Series(np.random.randn(len(prng)), prng) - new_ts = pickle.loads(pickle.dumps(ts)) + new_ts = self.round_trip_pickle(ts) self.assertEqual(new_ts.index.freq, 'M') def test_iter(self): @@ -5232,9 +5233,15 @@ def test_align_sameindex(self): # self.assertIsNot(b.index, self.ts.index) def test_reindex(self): + identity = self.series.reindex(self.series.index) - self.assertTrue(np.may_share_memory(self.series.index, identity.index)) + + # older numpy versions / python 3.2 call __array_interface__ which we don't define + if not _np_version_under1p7 and not PY3_2: + self.assertTrue(np.may_share_memory(self.series.index, identity.index)) + self.assertTrue(identity.index.is_(self.series.index)) + self.assertTrue(identity.index.identical(self.series.index)) subIndex = self.series.index[10:20] subSeries = self.series.reindex(subIndex) @@ -6083,7 +6090,7 @@ def test_unique_data_ownership(self): # it works! 
#1807 Series(Series(["a", "c", "b"]).unique()).sort() - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) + diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py index d5f7a536f9fe8..5c26fce2b111e 100644 --- a/pandas/tests/test_tseries.py +++ b/pandas/tests/test_tseries.py @@ -32,7 +32,7 @@ def test_backfill(self): old = Index([1, 5, 10]) new = Index(lrange(12)) - filler = algos.backfill_int64(old, new) + filler = algos.backfill_int64(old.values, new.values) expect_filler = [0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1] self.assert_numpy_array_equal(filler, expect_filler) @@ -40,7 +40,7 @@ def test_backfill(self): # corner case old = Index([1, 4]) new = Index(lrange(5, 10)) - filler = algos.backfill_int64(old, new) + filler = algos.backfill_int64(old.values, new.values) expect_filler = [-1, -1, -1, -1, -1] self.assert_numpy_array_equal(filler, expect_filler) @@ -49,7 +49,7 @@ def test_pad(self): old = Index([1, 5, 10]) new = Index(lrange(12)) - filler = algos.pad_int64(old, new) + filler = algos.pad_int64(old.values, new.values) expect_filler = [-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2] self.assert_numpy_array_equal(filler, expect_filler) @@ -57,7 +57,7 @@ def test_pad(self): # corner case old = Index([5, 10]) new = Index(lrange(5)) - filler = algos.pad_int64(old, new) + filler = algos.pad_int64(old.values, new.values) expect_filler = [-1, -1, -1, -1, -1] self.assert_numpy_array_equal(filler, expect_filler) @@ -165,7 +165,7 @@ def test_left_join_indexer2(): idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) - res, lidx, ridx = algos.left_join_indexer_int64(idx2, idx) + res, lidx, ridx = algos.left_join_indexer_int64(idx2.values, idx.values) exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) assert_almost_equal(res, exp_res) @@ -181,7 +181,7 @@ def test_outer_join_indexer2(): idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) - res, lidx, ridx = algos.outer_join_indexer_int64(idx2, idx) + res, lidx, ridx = algos.outer_join_indexer_int64(idx2.values, idx.values) exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) assert_almost_equal(res, exp_res) @@ -197,7 +197,7 @@ def test_inner_join_indexer2(): idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) - res, lidx, ridx = algos.inner_join_indexer_int64(idx2, idx) + res, lidx, ridx = algos.inner_join_indexer_int64(idx2.values, idx.values) exp_res = np.array([1, 1, 2, 5], dtype=np.int64) assert_almost_equal(res, exp_res) @@ -690,6 +690,10 @@ def test_int_index(self): expected = arr.sum(1) assert_almost_equal(result, expected) + result = lib.reduce(arr, np.sum, axis=1, + dummy=dummy, labels=Index(np.arange(100))) + assert_almost_equal(result, expected) + class TestTsUtil(tm.TestCase): def test_min_valid(self): diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index ada13d6f4bccb..83df908d8033f 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -3,7 +3,7 @@ import warnings from pandas import Series, DataFrame -from pandas.core.index import MultiIndex +from pandas.core.index import MultiIndex, Index from pandas.core.groupby import Grouper from pandas.tools.merge import concat from pandas.tools.util import cartesian_product @@ -307,7 +307,7 @@ def _all_key(): def _convert_by(by): if by is None: by = [] - elif (np.isscalar(by) or isinstance(by, (np.ndarray, Series, Grouper)) + elif (np.isscalar(by) or isinstance(by, (np.ndarray, Index, Series, Grouper)) or hasattr(by, '__call__')): by = [by] else: diff --git a/pandas/tools/plotting.py 
b/pandas/tools/plotting.py index 8f79f14cd551a..5d85b68234f96 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -12,7 +12,7 @@ from pandas.util.decorators import cache_readonly, deprecate_kwarg import pandas.core.common as com from pandas.core.generic import _shared_docs, _shared_doc_kwargs -from pandas.core.index import MultiIndex +from pandas.core.index import Index, MultiIndex from pandas.core.series import Series, remove_na from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex, Period @@ -821,7 +821,7 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=True, for kw, err in zip(['xerr', 'yerr'], [xerr, yerr]): self.errors[kw] = self._parse_errorbars(kw, err) - if not isinstance(secondary_y, (bool, tuple, list, np.ndarray)): + if not isinstance(secondary_y, (bool, tuple, list, np.ndarray, Index)): secondary_y = [secondary_y] self.secondary_y = secondary_y @@ -872,7 +872,7 @@ def _iter_data(self, data=None, keep_index=False): data = self.data from pandas.core.frame import DataFrame - if isinstance(data, (Series, np.ndarray)): + if isinstance(data, (Series, np.ndarray, Index)): if keep_index is True: yield self.label, data else: @@ -1223,7 +1223,7 @@ def on_right(self, i): return self.secondary_y if (isinstance(self.data, DataFrame) and - isinstance(self.secondary_y, (tuple, list, np.ndarray))): + isinstance(self.secondary_y, (tuple, list, np.ndarray, Index))): return self.data.columns[i] in self.secondary_y def _get_style(self, i, col_name): @@ -2485,7 +2485,7 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, return axes if column is not None: - if not isinstance(column, (list, np.ndarray)): + if not isinstance(column, (list, np.ndarray, Index)): column = [column] data = data[column] data = data._get_numeric_data() @@ -2962,7 +2962,7 @@ def _subplots(nrows=1, ncols=1, naxes=None, sharex=False, sharey=False, squeeze= axarr[i] = ax if nplots > 1: - + if sharex and nrows > 1: for ax in axarr[:naxes][:-ncols]: # only bottom row for label in ax.get_xticklabels(): @@ -3015,7 +3015,7 @@ def _subplots(nrows=1, ncols=1, naxes=None, sharex=False, sharey=False, squeeze= def _flatten(axes): if not com.is_list_like(axes): axes = [axes] - elif isinstance(axes, np.ndarray): + elif isinstance(axes, (np.ndarray, Index)): axes = axes.ravel() return axes diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index a16df00351d76..7e52c8c333dbf 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -517,10 +517,10 @@ def test_pivot_datetime_tz(self): exp_col3 = pd.DatetimeIndex(['2013-01-01 15:00:00', '2013-02-01 15:00:00'] * 4, tz='Asia/Tokyo', name='dt2') exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3]) - expected = DataFrame(np.array([[0, 3, 1, 2, 0, 3, 1, 2], + expected = DataFrame(np.array([[0, 3, 1, 2, 0, 3, 1, 2], [1, 4, 2, 1, 1, 4, 2, 1], - [2, 5, 1, 2, 2, 5, 1, 2]], dtype='int64'), - index=exp_idx, + [2, 5, 1, 2, 2, 5, 1, 2]], dtype='int64'), + index=exp_idx, columns=exp_col) result = pivot_table(df, index=['dt1'], columns=['dt2'], values=['value1', 'value2'], diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py index 80ac97ee60617..b014e718d5411 100644 --- a/pandas/tseries/converter.py +++ b/pandas/tseries/converter.py @@ -59,7 +59,7 @@ def convert(value, unit, axis): return time2num(value) if isinstance(value, Index): return value.map(time2num) - if isinstance(value, (list, tuple, np.ndarray)): + if 
isinstance(value, (list, tuple, np.ndarray, Index)): return [time2num(x) for x in value] return value @@ -116,8 +116,8 @@ def convert(values, units, axis): return values.asfreq(axis.freq).values if isinstance(values, Index): return values.map(lambda x: get_datevalue(x, axis.freq)) - if isinstance(values, (list, tuple, np.ndarray)): - return [get_datevalue(x, axis.freq) for x in values] + if isinstance(values, (list, tuple, np.ndarray, Index)): + return PeriodIndex(values, freq=axis.freq).values return values @@ -127,7 +127,7 @@ def get_datevalue(date, freq): elif isinstance(date, (str, datetime, pydt.date, pydt.time)): return Period(date, freq).ordinal elif (com.is_integer(date) or com.is_float(date) or - (isinstance(date, np.ndarray) and (date.size == 1))): + (isinstance(date, (np.ndarray, Index)) and (date.size == 1))): return date elif date is None: return None @@ -145,7 +145,7 @@ def _dt_to_float_ordinal(dt): preserving hours, minutes, seconds and microseconds. Return value is a :func:`float`. """ - if isinstance(dt, (np.ndarray, Series)) and com.is_datetime64_ns_dtype(dt): + if isinstance(dt, (np.ndarray, Index, Series)) and com.is_datetime64_ns_dtype(dt): base = dates.epoch2num(dt.asi8 / 1.0E9) else: base = dates.date2num(dt) @@ -171,7 +171,9 @@ def try_parse(values): return values elif isinstance(values, compat.string_types): return try_parse(values) - elif isinstance(values, (list, tuple, np.ndarray)): + elif isinstance(values, (list, tuple, np.ndarray, Index)): + if isinstance(values, Index): + values = values.values if not isinstance(values, np.ndarray): values = com._asarray_tuplesafe(values) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index fb87e1b570985..3ada26a7e5779 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -59,10 +59,10 @@ def f(self): def _join_i8_wrapper(joinf, with_indexers=True): @staticmethod def wrapper(left, right): - if isinstance(left, (np.ndarray, ABCSeries)): - left = left.view('i8', type=np.ndarray) - if isinstance(right, (np.ndarray, ABCSeries)): - right = right.view('i8', type=np.ndarray) + if isinstance(left, (np.ndarray, Index, ABCSeries)): + left = left.view('i8') + if isinstance(right, (np.ndarray, Index, ABCSeries)): + right = right.view('i8') results = joinf(left, right) if with_indexers: join_index, left_indexer, right_indexer = results @@ -86,9 +86,10 @@ def wrapper(self, other): else: if isinstance(other, list): other = DatetimeIndex(other) - elif not isinstance(other, (np.ndarray, ABCSeries)): + elif not isinstance(other, (np.ndarray, Index, ABCSeries)): other = _ensure_datetime64(other) result = func(other) + result = _values_from_object(result) if isinstance(other, Index): o_mask = other.values.view('i8') == tslib.iNaT @@ -101,7 +102,11 @@ def wrapper(self, other): mask = self.asi8 == tslib.iNaT if mask.any(): result[mask] = nat_result - return result.view(np.ndarray) + + # support of bool dtype indexers + if com.is_bool_dtype(result): + return result + return Index(result) return wrapper @@ -143,8 +148,9 @@ class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index): name : object Name to be stored in the index """ - _join_precedence = 10 + _typ = 'datetimeindex' + _join_precedence = 10 _inner_indexer = _join_i8_wrapper(_algos.inner_join_indexer_int64) _outer_indexer = _join_i8_wrapper(_algos.outer_join_indexer_int64) _left_indexer = _join_i8_wrapper(_algos.left_join_indexer_int64) @@ -167,17 +173,19 @@ class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index): tz = None offset = None _comparables = 
['name','freqstr','tz'] + _attributes = ['name','freq','tz'] _allow_datetime_index_ops = True + _is_numeric_dtype = False def __new__(cls, data=None, freq=None, start=None, end=None, periods=None, copy=False, name=None, tz=None, verify_integrity=True, normalize=False, - closed=None, **kwds): + closed=None, **kwargs): - dayfirst = kwds.pop('dayfirst', None) - yearfirst = kwds.pop('yearfirst', None) - infer_dst = kwds.pop('infer_dst', False) + dayfirst = kwargs.pop('dayfirst', None) + yearfirst = kwargs.pop('yearfirst', None) + infer_dst = kwargs.pop('infer_dst', False) freq_infer = False if not isinstance(freq, DateOffset): @@ -205,7 +213,7 @@ def __new__(cls, data=None, tz=tz, normalize=normalize, closed=closed, infer_dst=infer_dst) - if not isinstance(data, (np.ndarray, ABCSeries)): + if not isinstance(data, (np.ndarray, Index, ABCSeries)): if np.isscalar(data): raise ValueError('DatetimeIndex() must be called with a ' 'collection of some kind, %s was passed' @@ -262,7 +270,7 @@ def __new__(cls, data=None, else: subarr = data.view(_NS_DTYPE) else: - if isinstance(data, ABCSeries): + if isinstance(data, (ABCSeries, Index)): values = data.values else: values = data @@ -302,10 +310,7 @@ def __new__(cls, data=None, subarr = subarr.view(_NS_DTYPE) - subarr = subarr.view(cls) - subarr.name = name - subarr.offset = freq - subarr.tz = tz + subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz) if verify_integrity and len(subarr) > 0: if freq is not None and not freq_infer: @@ -442,10 +447,7 @@ def _generate(cls, start, end, periods, name, offset, infer_dst=infer_dst) index = index.view(_NS_DTYPE) - index = index.view(cls) - index.name = name - index.offset = offset - index.tz = tz + index = cls._simple_new(index, name=name, freq=offset, tz=tz) if not left_closed: index = index[1:] @@ -474,15 +476,18 @@ def _local_timestamps(self): return result.take(reverse) @classmethod - def _simple_new(cls, values, name, freq=None, tz=None): + def _simple_new(cls, values, name=None, freq=None, tz=None): + if not getattr(values,'dtype',None): + values = np.array(values,copy=False) if values.dtype != _NS_DTYPE: values = com._ensure_int64(values).view(_NS_DTYPE) - result = values.view(cls) + result = object.__new__(cls) + result._data = values result.name = name result.offset = freq result.tz = tslib.maybe_get_tz(tz) - + result._reset_identity() return result @property @@ -517,7 +522,7 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None, arr = tools.to_datetime(list(xdr), box=False) - cachedRange = arr.view(DatetimeIndex) + cachedRange = DatetimeIndex._simple_new(arr) cachedRange.offset = offset cachedRange.tz = None cachedRange.name = None @@ -575,29 +580,37 @@ def _formatter_func(self): formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) return lambda x: formatter(x, tz=self.tz) - def __reduce__(self): - """Necessary for making this object picklable""" - object_state = list(np.ndarray.__reduce__(self)) - subclass_state = self.name, self.offset, self.tz - object_state[2] = (object_state[2], subclass_state) - return tuple(object_state) - def __setstate__(self, state): """Necessary for making this object picklable""" - if len(state) == 2: - nd_state, own_state = state - self.name = own_state[0] - self.offset = own_state[1] - self.tz = own_state[2] - np.ndarray.__setstate__(self, nd_state) - - # provide numpy < 1.7 compat - if nd_state[2] == 'M8[us]': - new_state = np.ndarray.__reduce__(self.values.astype('M8[ns]')) - np.ndarray.__setstate__(self, new_state[2]) + if 
isinstance(state, dict): + super(DatetimeIndex, self).__setstate__(state) - else: # pragma: no cover - np.ndarray.__setstate__(self, state) + elif isinstance(state, tuple): + + # < 0.15 compat + if len(state) == 2: + nd_state, own_state = state + data = np.empty(nd_state[1], dtype=nd_state[2]) + np.ndarray.__setstate__(data, nd_state) + + self.name = own_state[0] + self.offset = own_state[1] + self.tz = own_state[2] + + # provide numpy < 1.7 compat + if nd_state[2] == 'M8[us]': + new_state = np.ndarray.__reduce__(data.astype('M8[ns]')) + np.ndarray.__setstate__(data, new_state[2]) + + else: # pragma: no cover + data = np.empty(state) + np.ndarray.__setstate__(data, state) + + self._data = data + + else: + raise Exception("invalid pickle state") + _unpickle_compat = __setstate__ def _add_delta(self, delta): if isinstance(delta, (Tick, timedelta)): @@ -662,7 +675,7 @@ def to_datetime(self, dayfirst=False): return self.copy() def groupby(self, f): - objs = self.asobject + objs = self.asobject.values return _algos.groupby_object(objs, f) def summary(self, name=None): @@ -982,7 +995,7 @@ def _wrap_joined_index(self, joined, other): if (isinstance(other, DatetimeIndex) and self.offset == other.offset and self._can_fast_union(other)): - joined = self._view_like(joined) + joined = self._shallow_copy(joined) joined.name = name return joined else: @@ -1044,7 +1057,7 @@ def _fast_union(self, other): loc = right.searchsorted(left_end, side='right') right_chunk = right.values[loc:] dates = com._concat_compat((left.values, right_chunk)) - return self._view_like(dates) + return self._shallow_copy(dates) else: return left else: @@ -1140,7 +1153,7 @@ def intersection(self, other): else: lslice = slice(*left.slice_locs(start, end)) left_chunk = left.values[lslice] - return self._view_like(left_chunk) + return self._shallow_copy(left_chunk) def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True): @@ -1357,10 +1370,9 @@ def slice_locs(self, start=None, end=None): return Index.slice_locs(self, start, end) def __getitem__(self, key): - """Override numpy.ndarray's __getitem__ method to work as desired""" - arr_idx = self.view(np.ndarray) + getitem = self._data.__getitem__ if np.isscalar(key): - val = arr_idx[key] + val = getitem(key) return Timestamp(val, offset=self.offset, tz=self.tz) else: if com._is_bool_indexer(key): @@ -1377,7 +1389,7 @@ def __getitem__(self, key): else: new_offset = self.offset - result = arr_idx[key] + result = getitem(key) if result.ndim > 1: return result @@ -1388,18 +1400,20 @@ def map(self, f): try: result = f(self) - if not isinstance(result, np.ndarray): + if not isinstance(result, (np.ndarray, Index)): raise TypeError return result except Exception: - return _algos.arrmap_object(self.asobject, f) + return _algos.arrmap_object(self.asobject.values, f) # alias to offset - @property - def freq(self): - """ return the frequency object if its set, otherwise None """ return self.offset + def _get_freq(self): return self.offset + def _set_freq(self, value): self.offset = value + freq = property(fget=_get_freq, fset=_set_freq, doc="get/set the frequency of the Index") @cache_readonly def inferred_freq(self): try: @@ -1443,14 +1457,14 @@ def _time(self): """ # can't call self.map() which tries to treat func as ufunc # and causes recursion warnings on python 2.6 - return _algos.arrmap_object(self.asobject, lambda x: x.time()) + return _algos.arrmap_object(self.asobject.values, lambda x: x.time()) @property def _date(self): """ Returns numpy array of 
datetime.date. The date part of the Timestamps. """ - return _algos.arrmap_object(self.asobject, lambda x: x.date()) + return _algos.arrmap_object(self.asobject.values, lambda x: x.date()) def normalize(self): @@ -1466,7 +1480,7 @@ def normalize(self): tz=self.tz) def searchsorted(self, key, side='left'): - if isinstance(key, np.ndarray): + if isinstance(key, (np.ndarray, Index)): key = np.array(key, dtype=_NS_DTYPE, copy=False) else: key = _to_m8(key, tz=self.tz) @@ -1609,13 +1623,6 @@ def delete(self, loc): new_dates = tslib.tz_convert(new_dates, 'UTC', self.tz) return DatetimeIndex(new_dates, name=self.name, freq=freq, tz=self.tz) - def _view_like(self, ndarray): - result = ndarray.view(type(self)) - result.offset = self.offset - result.tz = self.tz - result.name = self.name - return result - def tz_convert(self, tz): """ Convert tz-aware DatetimeIndex from one time zone to another (using pytz/dateutil) @@ -1639,7 +1646,7 @@ def tz_convert(self, tz): 'tz_localize to localize') # No conversion since timestamps are all UTC to begin with - return self._simple_new(self.values, self.name, self.offset, tz) + return self._shallow_copy(tz=tz) def tz_localize(self, tz, infer_dst=False): """ @@ -1669,7 +1676,7 @@ def tz_localize(self, tz, infer_dst=False): # Convert to UTC new_dates = tslib.tz_localize_to_utc(self.asi8, tz, infer_dst=infer_dst) new_dates = new_dates.view(_NS_DTYPE) - return self._simple_new(new_dates, self.name, self.offset, tz) + return self._shallow_copy(new_dates, tz=tz) def indexer_at_time(self, time, asof=False): """ @@ -1782,7 +1789,7 @@ def to_julian_date(self): self.microsecond/3600.0/1e+6 + self.nanosecond/3600.0/1e+9 )/24.0) - +DatetimeIndex._add_numeric_methods_disabled() def _generate_regular_range(start, end, periods, offset): if isinstance(offset, Tick): diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 7f865fd9aefa8..ddd1ee34f0798 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -60,12 +60,22 @@ class Period(PandasObject): minute : int, default 0 second : int, default 0 """ + _typ = 'periodindex' __slots__ = ['freq', 'ordinal'] _comparables = ['name','freqstr'] + @classmethod + def _from_ordinal(cls, ordinal, freq): + """ fast creation from an ordinal and freq that are already validated! """ + self = object.__new__(cls) + self.ordinal = ordinal + self.freq = freq + return self + def __init__(self, value=None, freq=None, ordinal=None, year=None, month=1, quarter=None, day=1, hour=0, minute=0, second=0): + # freq points to a tuple (base, mult); base is one of the defined # periods such as A, Q, etc. 
Every five minutes would be, e.g., # ('T', 5) but may be passed in as a string like '5T' @@ -563,6 +573,8 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): """ _box_scalars = True _allow_period_index_ops = True + _attributes = ['name','freq'] + _is_numeric_dtype = False __eq__ = _period_index_cmp('__eq__') __ne__ = _period_index_cmp('__ne__', nat_result=True) @@ -572,9 +584,7 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): __ge__ = _period_index_cmp('__ge__') def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, - periods=None, copy=False, name=None, year=None, month=None, - quarter=None, day=None, hour=None, minute=None, second=None, - tz=None): + periods=None, copy=False, name=None, tz=None, **kwargs): freq = frequencies.get_standard_freq(freq) @@ -589,32 +599,24 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, if ordinal is not None: data = np.asarray(ordinal, dtype=np.int64) else: - fields = [year, month, quarter, day, hour, minute, second] data, freq = cls._generate_range(start, end, periods, - freq, fields) + freq, kwargs) else: ordinal, freq = cls._from_arraylike(data, freq, tz) data = np.array(ordinal, dtype=np.int64, copy=False) - subarr = data.view(cls) - subarr.name = name - subarr.freq = freq - - return subarr + return cls._simple_new(data, name=name, freq=freq) @classmethod def _generate_range(cls, start, end, periods, freq, fields): - field_count = com._count_not_none(*fields) + field_count = len(fields) if com._count_not_none(start, end) > 0: if field_count > 0: raise ValueError('Can either instantiate from fields ' 'or endpoints, but not both') subarr, freq = _get_ordinal_range(start, end, periods, freq) elif field_count > 0: - y, mth, q, d, h, minute, s = fields - subarr, freq = _range_from_fields(year=y, month=mth, quarter=q, - day=d, hour=h, minute=minute, - second=s, freq=freq) + subarr, freq = _range_from_fields(freq=freq, **fields) else: raise ValueError('Not enough parameters to construct ' 'Period range') @@ -623,7 +625,8 @@ def _generate_range(cls, start, end, periods, freq, fields): @classmethod def _from_arraylike(cls, data, freq, tz): - if not isinstance(data, np.ndarray): + + if not isinstance(data, (np.ndarray, PeriodIndex, DatetimeIndex, Int64Index)): if np.isscalar(data) or isinstance(data, Period): raise ValueError('PeriodIndex() must be called with a ' 'collection of some kind, %s was passed' @@ -681,10 +684,12 @@ def _from_arraylike(cls, data, freq, tz): return data, freq @classmethod - def _simple_new(cls, values, name, freq=None, **kwargs): - result = values.view(cls) + def _simple_new(cls, values, name=None, freq=None, **kwargs): + result = object.__new__(cls) + result._data = values result.name = name result.freq = freq + result._reset_identity() return result @property @@ -704,7 +709,7 @@ def __contains__(self, key): @property def _box_func(self): - return lambda x: Period(ordinal=x, freq=self.freq) + return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq) def asof_locs(self, where, mask): """ @@ -800,17 +805,15 @@ def to_datetime(self, dayfirst=False): def map(self, f): try: result = f(self) - if not isinstance(result, np.ndarray): + if not isinstance(result, (np.ndarray, Index)): raise TypeError return result except Exception: - return _algos.arrmap_object(self.asobject, f) + return _algos.arrmap_object(self.asobject.values, f) def _get_object_array(self): freq = self.freq - boxfunc = lambda x: Period(ordinal=x, freq=freq) - boxer = np.frompyfunc(boxfunc, 1, 1) - return 
boxer(self.values) + return np.array([ Period._from_ordinal(ordinal=x, freq=freq) for x in self.values], copy=False) def _mpl_repr(self): # how to represent ourselves to matplotlib @@ -823,6 +826,13 @@ def equals(self, other): if self.is_(other): return True + if (not hasattr(other, 'inferred_type') or + other.inferred_type != 'int64'): + try: + other = PeriodIndex(other) + except: + return False + return np.array_equal(self.asi8, other.asi8) def to_timestamp(self, freq=None, how='start'): @@ -1042,21 +1052,19 @@ def _wrap_union_result(self, other, result): def _apply_meta(self, rawarr): if not isinstance(rawarr, PeriodIndex): - rawarr = rawarr.view(PeriodIndex) - rawarr.freq = self.freq + rawarr = PeriodIndex(rawarr, freq=self.freq) return rawarr def __getitem__(self, key): - """Override numpy.ndarray's __getitem__ method to work as desired""" - arr_idx = self.view(np.ndarray) + getitem = self._data.__getitem__ if np.isscalar(key): - val = arr_idx[key] + val = getitem(key) return Period(ordinal=val, freq=self.freq) else: if com._is_bool_indexer(key): key = np.asarray(key) - result = arr_idx[key] + result = getitem(key) if result.ndim > 1: # MPL kludge # values = np.asarray(list(values), dtype=object) @@ -1129,7 +1137,7 @@ def append(self, other): if isinstance(to_concat[0], PeriodIndex): if len(set([x.freq for x in to_concat])) > 1: # box - to_concat = [x.asobject for x in to_concat] + to_concat = [x.asobject.values for x in to_concat] else: cat_values = np.concatenate([x.values for x in to_concat]) return PeriodIndex(cat_values, freq=self.freq, name=name) @@ -1138,26 +1146,35 @@ def append(self, other): for x in to_concat] return Index(com._concat_compat(to_concat), name=name) - def __reduce__(self): - """Necessary for making this object picklable""" - object_state = list(np.ndarray.__reduce__(self)) - subclass_state = (self.name, self.freq) - object_state[2] = (object_state[2], subclass_state) - return tuple(object_state) - def __setstate__(self, state): """Necessary for making this object picklable""" - if len(state) == 2: - nd_state, own_state = state - np.ndarray.__setstate__(self, nd_state) - self.name = own_state[0] - try: # backcompat - self.freq = own_state[1] - except: - pass - else: # pragma: no cover - np.ndarray.__setstate__(self, state) + if isinstance(state, dict): + super(PeriodIndex, self).__setstate__(state) + + elif isinstance(state, tuple): + + # < 0.15 compat + if len(state) == 2: + nd_state, own_state = state + data = np.empty(nd_state[1], dtype=nd_state[2]) + np.ndarray.__setstate__(data, nd_state) + + try: # backcompat + self.freq = own_state[1] + except: + pass + + else: # pragma: no cover + data = np.empty(state) + np.ndarray.__setstate__(data, state) + + self._data = data + + else: + raise Exception("invalid pickle state") + _unpickle_compat = __setstate__ +PeriodIndex._add_numeric_methods_disabled() def _get_ordinal_range(start, end, periods, freq): if com._count_not_none(start, end, periods) < 2: diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py index b95553f87ec6b..899d2bfdc9c76 100644 --- a/pandas/tseries/plotting.py +++ b/pandas/tseries/plotting.py @@ -61,7 +61,7 @@ def tsplot(series, plotf, **kwargs): if not hasattr(ax, '_plot_data'): ax._plot_data = [] ax._plot_data.append((series, plotf, kwargs)) - lines = plotf(ax, series.index, series.values, **kwargs) + lines = plotf(ax, series.index._mpl_repr(), series.values, **kwargs) # set date formatter, locators and rescale limits format_dateaxis(ax, ax.freq) @@ -152,7 +152,7 @@ def 
_replot_ax(ax, freq, kwargs): idx = series.index.asfreq(freq, how='S') series.index = idx ax._plot_data.append(series) - lines.append(plotf(ax, series.index, series.values, **kwds)[0]) + lines.append(plotf(ax, series.index._mpl_repr(), series.values, **kwds)[0]) labels.append(com.pprint_thing(series.name)) return lines, labels diff --git a/pandas/tseries/tests/test_converter.py b/pandas/tseries/tests/test_converter.py index 902b9cb549e32..a1b873e1c0bea 100644 --- a/pandas/tseries/tests/test_converter.py +++ b/pandas/tseries/tests/test_converter.py @@ -84,8 +84,8 @@ def _assert_less(ts1, ts2): if not val1 < val2: raise AssertionError('{0} is not less than {1}.'.format(val1, val2)) - # Matplotlib's time representation using floats cannot distinguish intervals smaller - # than ~10 microsecond in the common range of years. + # Matplotlib's time representation using floats cannot distinguish intervals smaller + # than ~10 microsecond in the common range of years. ts = Timestamp('2012-1-1') _assert_less(ts, ts + Second()) _assert_less(ts, ts + Milli()) diff --git a/pandas/tseries/tests/test_daterange.py b/pandas/tseries/tests/test_daterange.py index 7b0bfa98690e2..b109f6585092a 100644 --- a/pandas/tseries/tests/test_daterange.py +++ b/pandas/tseries/tests/test_daterange.py @@ -1,6 +1,5 @@ from datetime import datetime from pandas.compat import range -import pickle import nose import sys import numpy as np @@ -168,9 +167,7 @@ def test_shift(self): self.assertEqual(shifted[0], rng[0] + datetools.bday) def test_pickle_unpickle(self): - pickled = pickle.dumps(self.rng) - unpickled = pickle.loads(pickled) - + unpickled = self.round_trip_pickle(self.rng) self.assertIsNotNone(unpickled.offset) def test_union(self): @@ -561,9 +558,7 @@ def test_shift(self): self.assertEqual(shifted[0], rng[0] + datetools.cday) def test_pickle_unpickle(self): - pickled = pickle.dumps(self.rng) - unpickled = pickle.loads(pickled) - + unpickled = self.round_trip_pickle(self.rng) self.assertIsNotNone(unpickled.offset) def test_union(self): diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index b9d4dd80438ef..b7abedbafa7b0 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -2052,7 +2052,7 @@ def test_range_slice_outofbounds(self): for idx in [didx, pidx]: df = DataFrame(dict(units=[100 + i for i in range(10)]), index=idx) - empty = DataFrame(index=DatetimeIndex([], freq='D'), columns=['units']) + empty = DataFrame(index=idx.__class__([], freq='D'), columns=['units']) tm.assert_frame_equal(df['2013/09/01':'2013/09/30'], empty) tm.assert_frame_equal(df['2013/09/30':'2013/10/02'], df.iloc[:2]) @@ -2408,7 +2408,7 @@ def test_pickle_freq(self): # GH2891 import pickle prng = period_range('1/1/2011', '1/1/2012', freq='M') - new_prng = pickle.loads(pickle.dumps(prng)) + new_prng = self.round_trip_pickle(prng) self.assertEqual(new_prng.freq,'M') def test_slice_keep_name(self): diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index b52dca76f2c77..6b34ae0eb9384 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -298,7 +298,7 @@ def test_dataframe(self): bts = DataFrame({'a': tm.makeTimeSeries()}) ax = bts.plot() idx = ax.get_lines()[0].get_xdata() - assert_array_equal(bts.index.to_period(), idx) + assert_array_equal(bts.index.to_period(), PeriodIndex(idx)) @slow def test_axis_limits(self): @@ -605,8 +605,8 @@ def test_mixed_freq_regular_first(self): ax = s1.plot() ax2 
= s2.plot(style='g') lines = ax2.get_lines() - idx1 = lines[0].get_xdata() - idx2 = lines[1].get_xdata() + idx1 = PeriodIndex(lines[0].get_xdata()) + idx2 = PeriodIndex(lines[1].get_xdata()) self.assertTrue(idx1.equals(s1.index.to_period('B'))) self.assertTrue(idx2.equals(s2.index.to_period('B'))) left, right = ax2.get_xlim() @@ -881,9 +881,9 @@ def test_secondary_upsample(self): low.plot() ax = high.plot(secondary_y=True) for l in ax.get_lines(): - self.assertEqual(l.get_xdata().freq, 'D') + self.assertEqual(PeriodIndex(l.get_xdata()).freq, 'D') for l in ax.right_ax.get_lines(): - self.assertEqual(l.get_xdata().freq, 'D') + self.assertEqual(PeriodIndex(l.get_xdata()).freq, 'D') @slow def test_secondary_legend(self): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 6dbf095189d36..9487949adf23a 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -27,7 +27,7 @@ import pandas.index as _index -from pandas.compat import range, long, StringIO, lrange, lmap, zip, product +from pandas.compat import range, long, StringIO, lrange, lmap, zip, product, PY3_2 from numpy.random import rand from numpy.testing import assert_array_equal from pandas.util.testing import assert_frame_equal @@ -871,11 +871,11 @@ def test_string_na_nat_conversion(self): result2 = to_datetime(strings) tm.assert_isinstance(result2, DatetimeIndex) - assert_almost_equal(result, result2) + self.assert_numpy_array_equal(result, result2) malformed = np.array(['1/100/2000', np.nan], dtype=object) result = to_datetime(malformed) - assert_almost_equal(result, malformed) + self.assert_numpy_array_equal(result, malformed) self.assertRaises(ValueError, to_datetime, malformed, errors='raise') @@ -2058,18 +2058,15 @@ def test_period_resample_with_local_timezone_dateutil(self): def test_pickle(self): #GH4606 - from pandas.compat import cPickle - import pickle - for pick in [pickle, cPickle]: - p = pick.loads(pick.dumps(NaT)) - self.assertTrue(p is NaT) + p = self.round_trip_pickle(NaT) + self.assertTrue(p is NaT) - idx = pd.to_datetime(['2013-01-01', NaT, '2014-01-06']) - idx_p = pick.loads(pick.dumps(idx)) - self.assertTrue(idx_p[0] == idx[0]) - self.assertTrue(idx_p[1] is NaT) - self.assertTrue(idx_p[2] == idx[2]) + idx = pd.to_datetime(['2013-01-01', NaT, '2014-01-06']) + idx_p = self.round_trip_pickle(idx) + self.assertTrue(idx_p[0] == idx[0]) + self.assertTrue(idx_p[1] is NaT) + self.assertTrue(idx_p[2] == idx[2]) def _simple_ts(start, end, freq='D'): @@ -2212,6 +2209,9 @@ def test_comparisons_coverage(self): self.assert_numpy_array_equal(result, exp) def test_comparisons_nat(self): + if PY3_2: + raise nose.SkipTest('nat comparisons on 3.2 broken') + fidx1 = pd.Index([1.0, np.nan, 3.0, np.nan, 5.0, 7.0]) fidx2 = pd.Index([2.0, 3.0, np.nan, np.nan, 6.0, 7.0]) @@ -2233,9 +2233,11 @@ def test_comparisons_nat(self): # Check pd.NaT is handled the same as np.nan for idx1, idx2 in cases: + result = idx1 < idx2 expected = np.array([True, False, False, False, True, False]) self.assert_numpy_array_equal(result, expected) + result = idx2 > idx1 expected = np.array([True, False, False, False, True, False]) self.assert_numpy_array_equal(result, expected) result = idx1 <= idx2 expected = np.array([True, False, False, False, True, True]) self.assert_numpy_array_equal(result, expected) + result = idx2 >= idx1 expected = np.array([True, False, False, False, True, True]) 
self.assert_numpy_array_equal(result, expected) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 13f432d5cea2a..42048ec9877fa 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -98,6 +98,13 @@ def assert_numpy_array_equal(self, np_array, assert_equal): return raise AssertionError('{0} is not equal to {1}.'.format(np_array, assert_equal)) + def round_trip_pickle(self, obj, path=None): + if path is None: + path = u('__%s__.pickle' % rands(10)) + with ensure_clean(path) as path: + pd.to_pickle(obj, path) + return pd.read_pickle(path) + def assert_numpy_array_equivalent(self, np_array, assert_equal): """Checks that 'np_array' is equivalent to 'assert_equal'