From cb96f776dd4b1d1d243046cd74b64c9a84232aa3 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 24 Feb 2013 22:46:00 -0500 Subject: [PATCH 1/7] ENH: add .loc attribute to provide location-based indexing TST: added multi-index tests DOC: changed loc -> iloc added more docs ENH: added integer lists as indexers to iloc ENH: raise correctly on out-of-bounds slicing support negative indexing in iloc and icol CLN: move all indexings (ix/iloc) to PandasObject in generic.py (except _SeriesIndexer in series.py) add name parameter to Indexer creation, makes indexers independent of their external names --- RELEASE.rst | 3 ++ doc/source/indexing.rst | 80 ++++++++++++++++++++++++++++ doc/source/v0.11.0.txt | 59 ++++++++++++++++++++- pandas/core/frame.py | 20 +++---- pandas/core/generic.py | 28 ++++++++-- pandas/core/indexing.py | 80 ++++++++++++++++++++++++++-- pandas/core/internals.py | 7 ++- pandas/core/panel.py | 14 +---- pandas/core/series.py | 20 +++++-- pandas/sparse/frame.py | 7 ++- pandas/tests/test_frame.py | 103 +++++++++++++++++++++++++++++++++++- pandas/tests/test_series.py | 34 ++++++++++++ 12 files changed, 414 insertions(+), 41 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index cf3fd598a8186..32cbddf6bfd74 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -35,6 +35,7 @@ pandas 0.11.0 Yahoo! finance (GH2795_) - Add ``squeeze`` function to reduce dimensionality of 1-len objects - Support slicing with time objects (GH2681_) + - Add ``.iloc`` attribute, to support location-based indexing, analagous to ``.ix`` **Improvements to existing features** @@ -127,6 +128,7 @@ pandas 0.11.0 - Bug on in-place putmasking on an ``integer`` series that needs to be converted to ``float`` (GH2746_) - Bug in argsort of ``datetime64[ns]`` Series with ``NaT`` (GH2967_) - Bug in idxmin/idxmax of ``datetime64[ns]`` Series with ``NaT`` (GH2982__) + - ``icol`` with negative indicies was return ``nan`` (see GH2922_) .. _GH622: https://github.com/pydata/pandas/issues/622 .. 
_GH797: https://github.com/pydata/pandas/issues/797 @@ -145,6 +147,7 @@ pandas 0.11.0 .. _GH2849: https://github.com/pydata/pandas/issues/2849 .. _GH2898: https://github.com/pydata/pandas/issues/2898 .. _GH2909: https://github.com/pydata/pandas/issues/2909 +.. _GH2922: https://github.com/pydata/pandas/issues/2922 .. _GH2931: https://github.com/pydata/pandas/issues/2931 .. _GH2973: https://github.com/pydata/pandas/issues/2973 .. _GH2967: https://github.com/pydata/pandas/issues/2967 diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 8c18d9f69bee3..1ddb4a8282fb0 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -177,6 +177,86 @@ largely as a convenience since it is such a common operation. df[:3] df[::-1] +Location Based Indexing +~~~~~~~~~~~~~~~~~~~~~~~ + +Pandas provides a suite of methods in order to get **purely integer based indexing**. +The semantics follow closely python and numpy slicing. These are ``0-based`` indexing. +When slicing, the start bounds is *included*, while the upper bound is *excluded*. +Invalid selections will raise with an ``IndexError``. Trying to use a non-integer, +even a **valid** label will raise a ``ValueError``. Integers, lists of integers, and slices +are allowed indexers. + +The ``.iloc`` attribute is the primary access method . + +.. ipython:: python + + s1 = Series(np.random.randn(5),index=range(0,10,2)) + s1 + s1.iloc[:3] + s1.iloc[3] + +Note that setting works as well: + +.. ipython:: python + + s1.iloc[:3] = 0 + s1 + +With a DataFrame + +.. ipython:: python + + df1 = DataFrame(np.random.randn(6,4),index=range(0,12,2),columns=range(0,8,2)) + df1 + + # integer access + df1.iloc[5,2] + + # slices + df1.iloc[:3] + df1.iloc[1:5,2:4] + + # integer lists + df1.iloc[[1,3,5],[1,3]] + +For slicing rows explicitly. + +.. ipython:: python + + # this is equivalent to ``df1.iloc[1:3,:]`` + df1.irow(range(1,3)) + +For slicing columns explicitly. + +.. 
ipython:: python + + # this is equivalent to ``df1.iloc[:,1:3]`` + df1.icol(range(1,3)) + +For getting a value explicitly. + +.. ipython:: python + + # this is equivalent to ``df1.iloc[1,1]`` + df1.iget_value(1,1) + +There is one significant departure from standard python/numpy slicing semantics. +python/numpy allow slicing past the end of an array without an associated error. + +.. ipython:: python + + # these are allowed in python/numpy. + x = list('abcdef') + x[4:10] + x[8:10] + +:: + + >>> df.iloc[:,3:6] + IndexError: out-of-bounds on slice (end) + + Boolean indexing ~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt index cc3b39dd22e34..76f7ffae746b4 100644 --- a/doc/source/v0.11.0.txt +++ b/doc/source/v0.11.0.txt @@ -11,11 +11,68 @@ to. API changes ~~~~~~~~~~~ -Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste. +Location Based Indexing +~~~~~~~~~~~~~~~~~~~~~~~ + +Pandas provides a suite of methods in order to get **purely integer based indexing**. +The semantics follow closely python and numpy slicing. These are ``0-based`` indexing. +When slicing, the start bound is *included*, while the upper bound is *excluded*. +Invalid selections will raise with an ``IndexError``. Trying to use a non-integer, +even a **valid** label will raise a ``ValueError``. Integers, lists of integers, and slices +are allowed indexers. + +The ``.iloc`` attribute is the primary access method. + +.. ipython:: python + + s1 = Series(np.random.randn(5),index=range(0,10,2)) + s1 + s1.iloc[:3] + s1.iloc[3] + +Note that setting works as well: + +.. ipython:: python + + s1.iloc[:3] = 0 + s1 + +With a DataFrame + +.. 
ipython:: python + + df1 = DataFrame(np.random.randn(8,4),index=range(0,16,2),columns=range(0,8,2)) + + # integer access + df1.iloc[5,2] + + # slices + df1.iloc[:3] + df1.iloc[1:5,2:4] + + # integer lists + df1.iloc[[1,3,5],[1,3]] + +There is one significant departure from standard python/numpy slicing semantics. +python/numpy allow slicing past the end of an array without an associated error. + +.. ipython:: python + + # these are allowed in python/numpy. + x = list('abcdef') + x[4:10] + x[8:10] + +:: + + >>> df.iloc[:,3:6] + IndexError: out-of-bounds on slice (end) Dtype Specification ~~~~~~~~~~~~~~~~~~~ +Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste. + .. ipython:: python df1 = DataFrame(randn(8, 1), columns = ['A'], dtype = 'float32') diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c0449faf40368..b3770cb790f5b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -568,16 +568,6 @@ def axes(self): def _constructor(self): return DataFrame - # Fancy indexing - _ix = None - - @property - def ix(self): - if self._ix is None: - self._ix = _NDFrameIndexer(self) - - return self._ix - @property def shape(self): return (len(self.index), len(self.columns)) @@ -1947,6 +1937,12 @@ def icol(self, i): else: label = self.columns[i] if isinstance(label, Index): + + # if we have negative indices, translate to positive here + # (take doesn't deal properly with these) + l = len(self.columns) + i = [ v if v >= 0 else l+v for v in i ] + return self.take(i, axis=1) values = self._data.iget(i) @@ -2054,13 +2050,13 @@ def _getitem_frame(self, key): raise ValueError('Must pass DataFrame with boolean values only') return self.where(key) - def _slice(self, slobj, axis=0): + def 
_slice(self, slobj, axis=0, raise_on_error=False): if axis == 0: mgr_axis = 1 else: mgr_axis = 0 - new_data = self._data.get_slice(slobj, axis=mgr_axis) + new_data = self._data.get_slice(slobj, axis=mgr_axis, raise_on_error=raise_on_error) return self._constructor(new_data) def _box_item_values(self, key, values): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index afe7f8775b1e9..511fb8976c411 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3,6 +3,7 @@ import numpy as np from pandas.core.index import MultiIndex +from pandas.core.indexing import _NDFrameIndexer, _NDFrameLocIndexer from pandas.tseries.index import DatetimeIndex import pandas.core.common as com import pandas.lib as lib @@ -70,6 +71,29 @@ def abs(self): """ return np.abs(self) + + #---------------------------------------------------------------------- + # integer indexing + _iloc = None + + @property + def iloc(self): + if self._iloc is None: + self._iloc = _NDFrameLocIndexer(self, 'iloc') + + return self._iloc + + #---------------------------------------------------------------------- + # Fancy indexing + _ix = None + + @property + def ix(self): + if self._ix is None: + self._ix = _NDFrameIndexer(self, 'ix') + + return self._ix + def get(self, key, default=None): """ Get item from object for given key (DataFrame column, Panel slice, @@ -396,10 +420,6 @@ def sort_index(self, axis=0, ascending=True): new_axis = labels.take(sort_index) return self.reindex(**{axis_name: new_axis}) - @property - def ix(self): - raise NotImplementedError - def reindex(self, *args, **kwds): raise NotImplementedError diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 8f812252134a1..0d80f54f6fe9c 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -17,9 +17,10 @@ class IndexingError(Exception): class _NDFrameIndexer(object): - def __init__(self, obj): + def __init__(self, obj, name): self.obj = obj self.ndim = obj.ndim + self.name = name def 
__iter__(self): raise NotImplementedError('ix is not iterable') @@ -50,8 +51,8 @@ def _get_label(self, label, axis=0): def _get_loc(self, key, axis=0): return self.obj._ixs(key, axis=axis) - def _slice(self, obj, axis=0): - return self.obj._slice(obj, axis=axis) + def _slice(self, obj, axis=0, raise_on_error=False): + return self.obj._slice(obj, axis=axis, raise_on_error=raise_on_error) def __setitem__(self, key, value): # kludgetastic @@ -221,7 +222,7 @@ def _getitem_tuple(self, tup): if _is_null_slice(key): continue - retval = retval.ix._getitem_axis(key, axis=i) + retval = getattr(retval,self.name)._getitem_axis(key, axis=i) return retval @@ -325,7 +326,7 @@ def _getitem_lowerdim(self, tup): if len(new_key) == 1: new_key, = new_key - return section.ix[new_key] + return getattr(section,self.name)[new_key] raise IndexingError('not applicable') @@ -593,6 +594,64 @@ def _get_slice_axis(self, slice_obj, axis=0): else: return self.obj.take(indexer, axis=axis) +class _NDFrameLocIndexer(_NDFrameIndexer): + """ purely location based indexing """ + + def __getitem__(self, key): + if type(key) is tuple: + + for i, k in enumerate(key): + if i >= self.obj.ndim: + raise ValueError('Too many indexers') + if not (isinstance(k, slice) or com.is_integer(k) or _is_list_like(k)): + raise ValueError("Location based indexing can only have slice or integer indexers") + + return self._getitem_tuple(key) + else: + return self._getitem_axis(key, axis=0) + + def _getitem_tuple(self, tup): + + retval = self.obj + for i, key in enumerate(tup): + if _is_null_slice(key): + continue + + retval = getattr(retval,self.name)._getitem_axis(key, axis=i) + + return retval + + def _get_slice_axis(self, slice_obj, axis=0): + obj = self.obj + + if not _need_slice(slice_obj): + return obj + + if isinstance(slice_obj, slice): + return self._slice(slice_obj, axis=axis, raise_on_error=True) + else: + return self.obj.take(slice_obj, axis=axis) + + def _getitem_axis(self, key, axis=0): + + if isinstance(key, 
slice): + return self._get_slice_axis(key, axis=axis) + + # a single integer + else: + + if not (com.is_integer(key) or _is_list_like(key)): + raise ValueError("Cannot index by location index with a non-integer key") + + return self._get_loc(key,axis=axis) + + def _convert_to_indexer(self, obj, axis=0): + """ much simpler as we only have to deal with slice/integer """ + if isinstance(obj, slice) or com.is_integer(obj): + return obj + + raise ValueError("Can only index by location with a slice or integer key") + # 32-bit floating point machine epsilon _eps = np.finfo('f4').eps @@ -737,6 +796,17 @@ def _need_slice(obj): (obj.step is not None and obj.step != 1)) +def _check_slice_bounds(slobj, values): + l = len(values) + start = slobj.start + if start is not None: + if start < -l or start > l-1: + raise IndexError("out-of-bounds on slice (start)") + stop = slobj.stop + if stop is not None: + if stop < -l-1 or stop > l: + raise IndexError("out-of-bounds on slice (end)") + def _maybe_droplevels(index, key): # drop levels if isinstance(key, tuple): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 159393be38b07..5bf918aff6367 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -5,6 +5,7 @@ import numpy as np from pandas.core.index import Index, _ensure_index, _handle_legacy_indexes +from pandas.core.indexing import _check_slice_bounds import pandas.core.common as com import pandas.lib as lib import pandas.tslib as tslib @@ -1034,8 +1035,12 @@ def get_bool_data(self, copy=False, as_blocks=False): return self.get_numeric_data(copy=copy, type_list=(BoolBlock,), as_blocks=as_blocks) - def get_slice(self, slobj, axis=0): + def get_slice(self, slobj, axis=0, raise_on_error=False): new_axes = list(self.axes) + + if raise_on_error: + _check_slice_bounds(slobj, new_axes[axis]) + new_axes[axis] = new_axes[axis][slobj] if axis == 0: diff --git a/pandas/core/panel.py b/pandas/core/panel.py index b418995ce3085..16d5f09aadc9c 100644 --- 
a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -540,16 +540,6 @@ def _get_plane_axes(self, axis): return index, columns - # Fancy indexing - _ix = None - - @property - def ix(self): - if self._ix is None: - self._ix = _NDFrameIndexer(self) - - return self._ix - def _wrap_array(self, arr, axes, copy=False): d = self._construct_axes_dict_from(self, axes, copy=copy) return self._constructor(arr, **d) @@ -679,8 +669,8 @@ def __getattr__(self, name): raise AttributeError("'%s' object has no attribute '%s'" % (type(self).__name__, name)) - def _slice(self, slobj, axis=0): - new_data = self._data.get_slice(slobj, axis=axis) + def _slice(self, slobj, axis=0, raise_on_error=False): + new_data = self._data.get_slice(slobj, axis=axis, raise_on_error=raise_on_error) return self._constructor(new_data) def __setitem__(self, key, value): diff --git a/pandas/core/series.py b/pandas/core/series.py index b349dd65ff82d..788752020dc01 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -20,7 +20,7 @@ _infer_dtype_from_scalar, is_list_like) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index, _handle_legacy_indexes) -from pandas.core.indexing import _SeriesIndexer, _check_bool_indexer +from pandas.core.indexing import _SeriesIndexer, _check_bool_indexer, _check_slice_bounds from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex, Period from pandas.util import py3compat @@ -547,15 +547,27 @@ def __setstate__(self, state): self.index = _handle_legacy_indexes([index])[0] self.name = name - _ix = None - + # indexers @property def ix(self): if self._ix is None: - self._ix = _SeriesIndexer(self) + self._ix = _SeriesIndexer(self, 'ix') return self._ix + def _ixs(self, i, axis=0): + return self.values[i] + + @property + def _is_mixed_type(self): + return False + + def _slice(self, slobj, axis=0, raise_on_error=False): + if raise_on_error: + _check_slice_bounds(slobj, self.values) + + return 
self._constructor(self.values[slobj], index=self.index[slobj]) + def __getitem__(self, key): try: return self.index.get_value(self, key) diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index bf978c322dbd2..f142b36534e22 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -10,6 +10,7 @@ from pandas.core.common import _pickle_array, _unpickle_array, _try_sort from pandas.core.index import Index, MultiIndex, _ensure_index +from pandas.core.indexing import _check_slice_bounds from pandas.core.series import Series from pandas.core.frame import (DataFrame, extract_index, _prep_ndarray, _default_index) @@ -416,11 +417,15 @@ def set_value(self, index, col, value): return dense.to_sparse(kind=self.default_kind, fill_value=self.default_fill_value) - def _slice(self, slobj, axis=0): + def _slice(self, slobj, axis=0, raise_on_error=False): if axis == 0: + if raise_on_error: + _check_slice_bounds(slobj, self.index) new_index = self.index[slobj] new_columns = self.columns else: + if raise_on_error: + _check_slice_bounds(slobj, self.columns) new_index = self.index new_columns = self.columns[slobj] diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 304072acc664e..3968371f8b1c8 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1451,6 +1451,108 @@ def test_single_element_ix_dont_upcast(self): result = self.frame.ix[self.frame.index[5], 'E'] self.assert_(com.is_integer(result)) + def test_iloc_getitem(self): + df = DataFrame(np.random.randn(10, 4), index=range(0, 20, 2), columns=range(0,8,2)) + + result = df.iloc[2] + exp = df.ix[4] + assert_series_equal(result, exp) + + result = df.iloc[2,2] + exp = df.ix[4,4] + self.assert_(result == exp) + + # slice + result = df.iloc[4:8] + expected = df.ix[8:14] + assert_frame_equal(result, expected) + + result = df.iloc[:,2:3] + expected = df.ix[:,4:5] + assert_frame_equal(result, expected) + + # list of integers + result = df.iloc[[0,1,3]] + expected = 
df.ix[[0,2,6]] + assert_frame_equal(result, expected) + + result = df.iloc[[0,1,3],[0,1]] + expected = df.ix[[0,2,6],[0,2]] + assert_frame_equal(result, expected) + + # neg indicies + result = df.iloc[[-1,1,3],[-1,1]] + expected = df.ix[[18,2,6],[6,2]] + assert_frame_equal(result, expected) + + # dups indicies + result = df.iloc[[-1,-1,1,3],[-1,1]] + expected = df.ix[[18,18,2,6],[6,2]] + assert_frame_equal(result, expected) + + # with index-like + s = Series(index=range(1,5)) + result = df.iloc[s.index] + expected = df.ix[[2,4,6,8]] + assert_frame_equal(result, expected) + + # out-of-bounds slice + self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(None),slice(1,5,None)])) + self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(None),slice(-5,3,None)])) + self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(1,11,None)])) + self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(-11,3,None)])) + + # try with labelled frame + df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), columns=list('ABCD')) + + result = df.iloc[1,1] + exp = df.ix['b','B'] + self.assert_(result == exp) + + result = df.iloc[:,2:3] + expected = df.ix[:,['C']] + assert_frame_equal(result, expected) + + # negative indexing + result = df.iloc[-1,-1] + exp = df.ix['j','D'] + self.assert_(result == exp) + + # out-of-bounds exception + self.assertRaises(IndexError, df.iloc.__getitem__, tuple([10,5])) + + # trying to use a label + self.assertRaises(ValueError, df.iloc.__getitem__, tuple(['j','D'])) + + def test_iloc_setitem(self): + df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), columns=list('ABCD')) + + df.iloc[1,1] = 1 + result = df.iloc[1,1] + self.assert_(result == 1) + + df.iloc[:,2:3] = 0 + expected = df.iloc[:,2:3] + result = df.iloc[:,2:3] + assert_frame_equal(result, expected) + + def test_iloc_multiindex(self): + df = DataFrame(np.random.randn(3, 3), + columns=[[2,2,4],[6,8,10]], + index=[[4,4,8],[8,10,12]]) + + rs = 
df.iloc[2] + xp = df.irow(2) + assert_series_equal(rs, xp) + + rs = df.iloc[:,2] + xp = df.icol(2) + assert_series_equal(rs, xp) + + rs = df.iloc[2,2] + xp = df.values[2,2] + self.assert_(rs == xp) + def test_irow(self): df = DataFrame(np.random.randn(10, 4), index=range(0, 20, 2)) @@ -6305,7 +6407,6 @@ def _check_set(df, cond, check_dtypes = True): econd = cond.reindex_like(df).fillna(True) expected = dfi.mask(~econd) - #import pdb; pdb.set_trace() dfi.where(cond, np.nan, inplace=True) assert_frame_equal(dfi, expected) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index ee288fda120d3..eab956fb5feb2 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1000,6 +1000,40 @@ def test_basic_setitem_with_labels(self): self.assertRaises(Exception, s.__setitem__, inds_notfound, 0) self.assertRaises(Exception, s.__setitem__, arr_inds_notfound, 0) + def test_iloc_getitem(self): + s = Series(np.random.randn(10), index=list('abcdefghij')) + + result = s.iloc[1] + exp = s.ix['b'] + self.assert_(result == exp) + + result = s.iloc[2:4] + expected = s.ix['c':'d'] + assert_series_equal(result, expected) + + # negative indexing + result = s.iloc[-1] + exp = s.ix['j'] + self.assert_(result == exp) + + # out-of-bounds exception + self.assertRaises(IndexError, s.iloc.__getitem__, tuple([12])) + + # trying to use a label + self.assertRaises(ValueError, s.iloc.__getitem__, tuple(['j'])) + + def test_iloc_setitem(self): + s = Series(np.random.randn(10), index=range(0,20,2)) + + s.iloc[1] = 1 + result = s.iloc[1] + self.assert_(result == 1) + + s.iloc[:4] = 0 + expected = s.iloc[:4] + result = s.iloc[:4] + assert_series_equal(result, expected) + def test_ix_getitem(self): inds = self.series.index[[3, 4, 7]] assert_series_equal(self.series.ix[inds], self.series.reindex(inds)) From 02ed7915a7ee0013dff4185c4be7c6faa353f8df Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 27 Feb 2013 17:05:18 -0500 Subject: [PATCH 2/7] TST: new test suite for 
indexing --- pandas/core/generic.py | 43 ++-- pandas/core/indexing.py | 101 +++++++++- pandas/core/series.py | 6 +- pandas/tests/test_indexing.py | 370 ++++++++++++++++++++++++++++++++++ 4 files changed, 485 insertions(+), 35 deletions(-) create mode 100644 pandas/tests/test_indexing.py diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 511fb8976c411..c25e686afacbf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3,7 +3,7 @@ import numpy as np from pandas.core.index import MultiIndex -from pandas.core.indexing import _NDFrameIndexer, _NDFrameLocIndexer +import pandas.core.indexing as indexing from pandas.tseries.index import DatetimeIndex import pandas.core.common as com import pandas.lib as lib @@ -60,6 +60,21 @@ def _get_axis(self, axis): name = self._get_axis_name(axis) return getattr(self, name) + #---------------------------------------------------------------------- + # Indexers + @classmethod + def _create_indexer(cls, name, indexer): + """ create an indexer like _name in the class """ + iname = '_%s' % name + setattr(cls,iname,None) + + def _indexer(self): + if getattr(self,iname,None) is None: + setattr(self,iname,indexer(self, name)) + return getattr(self,iname) + + setattr(cls,name,property(_indexer)) + def abs(self): """ Return an object with absolute value taken. 
Only applicable to objects @@ -71,29 +86,6 @@ def abs(self): """ return np.abs(self) - - #---------------------------------------------------------------------- - # integer indexing - _iloc = None - - @property - def iloc(self): - if self._iloc is None: - self._iloc = _NDFrameLocIndexer(self, 'iloc') - - return self._iloc - - #---------------------------------------------------------------------- - # Fancy indexing - _ix = None - - @property - def ix(self): - if self._ix is None: - self._ix = _NDFrameIndexer(self, 'ix') - - return self._ix - def get(self, key, default=None): """ Get item from object for given key (DataFrame column, Panel slice, @@ -486,6 +478,9 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, np.putmask(rs.values, mask, np.nan) return rs +# install the indexerse +for _name, _indexer in indexing.get_indexers_list(): + PandasObject._create_indexer(_name,_indexer) class NDFrame(PandasObject): """ diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0d80f54f6fe9c..f2d4f299d11ff 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -7,6 +7,17 @@ import numpy as np +# the supported indexers +def get_indexers_list(): + + return [ + ('ix' ,_NDFrameIndexer), + ('iloc',_iLocIndexer ), + ('loc' ,_LocIndexer ), + ('at' ,_AtIndexer ), + ('iat' ,_iAtIndexer ), + ] + # "null slice" _NS = slice(None, None) @@ -594,8 +605,11 @@ def _get_slice_axis(self, slice_obj, axis=0): else: return self.obj.take(indexer, axis=axis) -class _NDFrameLocIndexer(_NDFrameIndexer): - """ purely location based indexing """ +class _LocationIndexer(_NDFrameIndexer): + _valid_types = None + + def _has_valid_type(self, k): + raise NotImplemented def __getitem__(self, key): if type(key) is tuple: @@ -603,8 +617,8 @@ def __getitem__(self, key): for i, k in enumerate(key): if i >= self.obj.ndim: raise ValueError('Too many indexers') - if not (isinstance(k, slice) or com.is_integer(k) or _is_list_like(k)): - raise ValueError("Location 
based indexing can only have slice or integer indexers") + if not self._has_valid_type(k): + raise ValueError("Location based indexing can only have [%s] types" % self._valid_types) return self._getitem_tuple(key) else: @@ -621,6 +635,33 @@ def _getitem_tuple(self, tup): return retval + def _get_slice_axis(self, slice_obj, axis=0): + raise NotImplementedError # abstract hook: subclasses must override + + def _getitem_axis(self, key, axis=0): + raise NotImplementedError # abstract hook: subclasses must override + + def _convert_to_indexer(self, obj, axis=0): + """ much simpler as we only have to deal with our valid types """ + if self._has_valid_type(obj): + return obj + + raise ValueError("Can only index by location with a [%s]" % self._valid_types) + +class _LocIndexer(_LocationIndexer): + """ purely label based location based indexing """ + _valid_types = None + + def _has_valid_type(self, k): + return True + +class _iLocIndexer(_LocationIndexer): + """ purely integer based location based indexing """ + _valid_types = "integer, integer slice, listlike of integers, boolean array" + + def _has_valid_type(self, k): + return isinstance(k, slice) or com.is_integer(k) or _is_list_like(k) or com._is_bool_indexer(k) + def _get_slice_axis(self, slice_obj, axis=0): obj = self.obj @@ -637,7 +678,14 @@ def _getitem_axis(self, key, axis=0): if isinstance(key, slice): return self._get_slice_axis(key, axis=axis) - # a single integer + elif com._is_bool_indexer(key): + + labels = self.obj._get_axis(axis) + key = _check_bool_indexer(labels, key) + inds, = key.nonzero() + return self.obj.take(inds, axis=axis) + + # a single integer or a list of integers else: if not (com.is_integer(key) or _is_list_like(key)): @@ -645,12 +693,45 @@ def _getitem_axis(self, key, axis=0): return self._get_loc(key,axis=axis) - def _convert_to_indexer(self, obj, axis=0): - """ much simpler as we only have to deal with slice/integer """ - if isinstance(obj, slice) or com.is_integer(obj): - return obj - raise ValueError("Can only index by location with a slice or integer key") +class 
_ScalarAccessIndexer(_NDFrameIndexer): + """ access scalars quickly """ + + def _convert_key(self, key): + return list(key) + + def __getitem__(self, key): + if not isinstance(key, tuple): + raise ValueError('Invalid call for scalar access (getting)!') + if len(key) != self.obj.ndim: + raise ValueError('Not enough indexers for scalar access (getting)!') + key = self._convert_key(key) + return self.obj.get_value(*key) + + def __setitem__(self, key, value): + if not isinstance(key, tuple): + raise ValueError('Invalid call for scalar access (setting)!') + if len(key) != self.obj.ndim: + raise ValueError('Not enough indexers for scalar access (setting)!') + key = self._convert_key(key) + key.append(value) + self.obj.set_value(*key) + +class _AtIndexer(_ScalarAccessIndexer): + """ label based scalar accessor """ + pass + +class _iAtIndexer(_ScalarAccessIndexer): + """ integer based scalar accessor """ + + def _convert_key(self, key): + """ require integer args (and convert to label arguments) """ + ckey = [] + for a, i in zip(self.obj.axes,key): + if not com.is_integer(i): + raise ValueError("iAt based indexing can only have integer indexers") + ckey.append(a[i]) + return ckey # 32-bit floating point machine epsilon _eps = np.finfo('f4').eps diff --git a/pandas/core/series.py b/pandas/core/series.py index 788752020dc01..a3a0cfea97f9d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -548,6 +548,10 @@ def __setstate__(self, state): self.name = name # indexers + @property + def axes(self): + return [ self.index ] + @property def ix(self): if self._ix is None: @@ -556,7 +560,7 @@ def ix(self): return self._ix def _ixs(self, i, axis=0): - return self.values[i] + return self[self.index[i]] @property def _is_mixed_type(self): diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py new file mode 100644 index 0000000000000..acb720cfca580 --- /dev/null +++ b/pandas/tests/test_indexing.py @@ -0,0 +1,370 @@ +# pylint: disable-msg=W0612,E1101 
+import unittest +import nose +import itertools + +from numpy import random, nan +from numpy.random import randn +import numpy as np +from numpy.testing import assert_array_equal + +import pandas as pan +import pandas.core.common as com +from pandas.core.api import (DataFrame, Index, Series, Panel, notnull, isnull, + MultiIndex, DatetimeIndex, Timestamp) +from pandas.util.testing import (assert_almost_equal, assert_series_equal, + assert_frame_equal, assert_panel_equal) +from pandas.util import py3compat + +import pandas.util.testing as tm +import pandas.lib as lib + +from numpy.testing.decorators import slow + +_verbose = True + +#------------------------------------------------------------------------------- +# Indexing test cases + + +def _generate_indices(f, values=False): + """ generate the indicies + if values is True , use the axis values + is False, use the range + """ + + axes = f.axes + if values: + axes = [ range(len(a)) for a in axes ] + + return itertools.product(*axes) + +def _get_value(f, i, values=False): + """ return the value for the location i """ + + # check agains values + if values: + return f.values[i] + + # this is equiv of f[col][row]..... 
+ #v = f + #for a in reversed(i): + # v = v.__getitem__(a) + #return v + return f.ix[i] + +def _get_result(obj, method, key, axis): + """ return the result for this obj with this key and this axis """ + + if isinstance(key, dict): + key = key[axis] + + # in case we actually want 0 index slicing + try: + xp = getattr(obj, method).__getitem__(_axify(obj,key,axis)) + except: + xp = getattr(obj, method).__getitem__(key) + + return xp + +def _axify(obj, key, axis): + # create a tuple accessor + if axis is not None: + axes = [ slice(None) ] * obj.ndim + axes[axis] = key + return tuple(axes) + return key # axis is None: hand the key back unchanged + + +class TestIndexing(unittest.TestCase): + + _multiprocess_can_split_ = True + + _objs = set(['series','frame','panel']) + _typs = set(['ints','labels','mixed','ts','floats','empty']) + + def setUp(self): + import warnings + warnings.filterwarnings(action='ignore', category=FutureWarning) + + from pandas import date_range + + self.series_ints = Series(np.random.rand(4), index=range(0,8,2)) + self.frame_ints = DataFrame(np.random.randn(4, 4), index=range(0, 8, 2), columns=range(0,12,3)) + self.panel_ints = Panel(np.random.rand(4,4,4), items=range(0,8,2),major_axis=range(0,12,3),minor_axis=range(0,16,4)) + + self.series_labels = Series(np.random.randn(4), index=list('abcd')) + self.frame_labels = DataFrame(np.random.randn(4, 4), index=list('abcd'), columns=list('ABCD')) + self.panel_labels = Panel(np.random.randn(4,4,4), items=list('ZYXW'), major_axis=list('abcd'), minor_axis=list('ABCD')) + + self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8]) + self.frame_mixed = DataFrame(np.random.randn(4, 4), columns=[2, 4, 'null', 8]) + self.panel_mixed = Panel(np.random.randn(4,4,4), items=[2,4,'null',8]) + + self.series_ts = Series(np.random.randn(4), index=date_range('20130101', periods=4)) + self.frame_ts = DataFrame(np.random.randn(4, 4), columns=date_range('20130101', periods=4)) + self.panel_ts = Panel(np.random.randn(4, 4, 4), items=date_range('20130101', 
periods=4)) + + #self.series_floats = Series(np.random.randn(4), index=[1.00, 2.00, 3.00, 4.00]) + #self.frame_floats = DataFrame(np.random.randn(4, 4), columns=[1.00, 2.00, 3.00, 4.00]) + #self.panel_floats = Panel(np.random.rand(4,4,4), items = [1.00,2.00,3.00,4.00]) + + self.frame_empty = DataFrame({}) + self.series_empty = Series({}) + self.panel_empty = Panel({}) + + # form agglomerates + for o in self._objs: + + d = dict() + for t in self._typs: + d[t] = getattr(self,'%s_%s' % (o,t),None) + + setattr(self,o,d) + + def check_values(self, f, func, values = False): + + if f is None: return + axes = f.axes + indicies = itertools.product(*axes) + + for i in indicies: + result = getattr(f,func)[i] + + # check agains values + if values: + expected = f.values[i] + else: + expected = f + for a in reversed(i): + expected = expected.__getitem__(a) + + assert_almost_equal(result, expected) + + + def check_result(self, name, method1, key1, method2, key2, typs = None, objs = None, axis = None, empty = 'fail', fails = None): + + + def _eq(t, o, a, obj, k1, k2): + """ compare equal for these 2 keys """ + + if a is not None and a > obj.ndim-1: + return + + def _print(result, show = True,error = None): + if error is not None: + error = str(error) + v = "%-12.12s [%-10.10s]: [typ->%-8.8s,obj->%-8.8s,key1->(%-4.4s) %-20.20s,key2->(%-4.4s) %-20.20s,axis->%s] %s" % (name,result,t,o,method1,key1,method2,key2,a,error or '') + if show: + print(v) + + try: + + rs = getattr(obj, method1).__getitem__(_axify(obj,k1,a)) + + try: + xp = _get_result(obj,method2,k2,a) + except: + result = 'no comp' + _print(result) + return + + try: + if np.isscalar(rs) and np.isscalar(xp): + self.assert_(rs == xp) + elif xp.ndim == 1: + assert_series_equal(rs,xp) + elif xp.ndim == 2: + assert_frame_equal(rs,xp) + elif xp.ndim == 3: + assert_panel_equal(rs,xp) + result = 'ok' + except (AssertionError): + result = 'fail' + + # reverse the checks + if fails is True: + if result == 'fail': + result = 'ok 
(fail)' + + if not result.startswith('ok'): + raise AssertionError(_print(result)) + + if _verbose: + _print(result) + + except (AssertionError): + raise + except (TypeError): + raise AssertionError(_print('type error')) + except (Exception), detail: + + # if we are in fails, the ok, otherwise raise it + if isinstance(fails,(tuple,list)): + if tuple(t,o) in fails or tuple(o,t) in fails: + return + + # empty fails are ok + if empty == 'fail' and t == 'empty': + return + + result = 'error' + raise AssertionError(_print(result, error = detail)) + + if typs is None: + typs = self._typs + + if objs is None: + objs = self._objs + + axes = [] + if axis is not None: + if not isinstance(axis,(tuple,list)): + axes = [ axis ] + else: + axes = list(axis) + else: + axes = [ 0, 1, 2] + + # check + for o in objs: + if o not in self._objs: + continue + + d = getattr(self,o) + for a in axes: + for t in typs: + if t not in self._typs: + continue + + obj = d[t] + if obj is not None: + obj = obj.copy() + + k2 = key2 + + if name == 'list int' and o == 'panel': + import pdb; pdb.set_trace() + + _eq(t, o, a, obj, key1, k2) + + def test_at_and_iat_get(self): + + def _check(f, func, values = False): + + if f is not None: + indicies = _generate_indices(f, values) + for i in indicies: + result = getattr(f,func)[i] + expected = _get_value(f,i,values) + assert_almost_equal(result, expected) + + for o in self._objs: + + d = getattr(self,o) + + # iat + _check(d['ints'],'iat', values=True) + for f in [d['labels'],d['ts'],d['floats']]: + if f is not None: + self.assertRaises(ValueError, self.check_values, f, 'iat') + + # at + _check(d['ints'], 'at') + _check(d['labels'],'at') + _check(d['ts'], 'at') + _check(d['floats'],'at') + + def test_at_and_iat_set(self): + + def _check(f, func, values = False): + + if f is not None: + indicies = _generate_indices(f, values) + for i in indicies: + getattr(f,func)[i] = 1 + expected = _get_value(f,i,values) + assert_almost_equal(expected, 1) + + for t in 
self._objs: + + d = getattr(self,t) + + _check(d['ints'],'iat',values=True) + for f in [d['labels'],d['ts'],d['floats']]: + if f is not None: + self.assertRaises(ValueError, _check, f, 'iat') + + # at + _check(d['ints'], 'at') + _check(d['labels'],'at') + _check(d['ts'], 'at') + _check(d['floats'],'at') + + def test_iloc_getitem(self): + + # integer + self.check_result('integer', 'iloc', 2, 'ix', { 0 : 4, 1: 6, 2: 8 }, typs = ['ints']) + self.check_result('integer', 'iloc', 2, 'ix', { 0 : 4, 1: 6, 2: 8 }, typs = ['labels','mixed','ts','floats','empty'], fails = True) + + # neg integer + self.check_result('neg int', 'iloc', -1, 'ix', { 0 : 6, 1: 9, 2: 12 }, typs = ['ints']) + self.check_result('neg int', 'iloc', -1, 'ix', { 0 : 6, 1: 9, 2: 12 }, typs = ['labels','mixed','ts','floats','empty'], fails = True) + + # list of ints + self.check_result('list int', 'iloc', [0,1,3], 'ix', { 0 : [0,2,6], 1 : [0,3,9], 2: [0,4,12] }, typs = ['ints']) + self.check_result('list int', 'iloc', [0,1,3], 'ix', { 0 : [0,2,6], 1 : [0,3,9], 2: [0,4,12] }, typs = ['labels','mixed','ts','floats','empty'], fails = True) + self.check_result('list int (dups)', 'iloc', [0,1,1,3], 'ix', { 0 : [0,2,2,6], 1 : [0,3,3,9], 2: [0,4,4,12] }, typs = ['ints']) + + # series like + s = Series(index=range(1,4)) + self.check_result('array like', 'iloc', s.index, 'ix', [2,4,6], typs = ['ints']) + + # boolean indexers + b = [True,False,True,False,] + self.check_result('bool', 'iloc', b, 'ix', b, typs = ['ints']) + self.check_result('bool', 'iloc', b, 'ix', b, typs = ['labels','mixed','ts','floats','empty'], fails = True) + + # slices + self.check_result('slice', 'iloc', slice(1,3), 'ix', slice(2,4,2), typs = ['ints']) + self.check_result('slice', 'iloc', slice(1,3), 'ix', slice(2,4,2), typs = ['labels','mixed','ts','floats','empty'], fails = True) + + # out-of-bounds slice + self.assertRaises(IndexError, self.frame_ints.iloc.__getitem__, tuple([slice(None),slice(1,5,None)])) + self.assertRaises(IndexError, 
self.frame_ints.iloc.__getitem__, tuple([slice(None),slice(-5,3,None)])) + self.assertRaises(IndexError, self.frame_ints.iloc.__getitem__, tuple([slice(1,5,None)])) + self.assertRaises(IndexError, self.frame_ints.iloc.__getitem__, tuple([slice(-5,3,None)])) + + def test_iloc_setitem(self): + df = self.frame_ints + + df.iloc[1,1] = 1 + result = df.iloc[1,1] + self.assert_(result == 1) + + df.iloc[:,2:3] = 0 + expected = df.iloc[:,2:3] + result = df.iloc[:,2:3] + assert_frame_equal(result, expected) + + def test_iloc_multiindex(self): + df = DataFrame(np.random.randn(3, 3), + columns=[[2,2,4],[6,8,10]], + index=[[4,4,8],[8,10,12]]) + + rs = df.iloc[2] + xp = df.irow(2) + assert_series_equal(rs, xp) + + rs = df.iloc[:,2] + xp = df.icol(2) + assert_series_equal(rs, xp) + + rs = df.iloc[2,2] + xp = df.values[2,2] + self.assert_(rs == xp) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) From 7cc64d65ce5c4d15c82d14e95bf644b92dd9bf2c Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 28 Feb 2013 00:15:39 -0500 Subject: [PATCH 3/7] ENH: added loc/at/iat indexers ....almost done --- doc/source/conf.py | 1 + doc/source/indexing.rst | 85 +++++++++++++--- pandas/core/indexing.py | 123 +++++++++++++++++------ pandas/core/panel.py | 7 +- pandas/core/series.py | 57 ++++++----- pandas/tests/test_indexing.py | 182 ++++++++++++++++++++++++++++------ 6 files changed, 347 insertions(+), 108 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 76093d83b32e7..43b7df6af805b 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -17,6 +17,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
# sys.path.append(os.path.abspath('.')) +sys.path.insert(0,'/home/jreback/pandas') sys.path.insert(0, os.path.abspath('../sphinxext')) sys.path.extend([ diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 1ddb4a8282fb0..11708f68d9719 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -53,7 +53,7 @@ indexing functionality: .. ipython:: python - dates = np.asarray(date_range('1/1/2000', periods=8)) + dates = date_range('1/1/2000', periods=8) df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) df panel = Panel({'one' : df, 'two' : df - df.mean()}) @@ -81,21 +81,29 @@ Fast scalar value getting and setting Since indexing with ``[]`` must handle a lot of cases (single-label access, slicing, boolean indexing, etc.), it has a bit of overhead in order to figure out what you're asking for. If you only want to access a scalar value, the -fastest way is to use the ``get_value`` method, which is implemented on all of -the data structures: +fastest way is to use the ``at`` and ``iat`` methods, which are implemented on all of +the data structures. + +Similary to ``loc`` and ``at`` provides **label** based lookups, while, ``iat`` provides +**integer** based lookups analagously to ``iloc`` .. ipython:: python - s.get_value(dates[5]) - df.get_value(dates[5], 'A') + type(dates[5]) + dates[5] + s.at[dates[5]] + s.iat[5] + df.at[dates[5], 'A'] + df.iat[3, 0] -There is an analogous ``set_value`` method which has the additional capability +You can also set using these same indexers. These have the additional capability of enlarging an object. This method *always* returns a reference to the object it modified, which in the case of enlargement, will be a **new object**: .. ipython:: python - df.set_value(dates[5], 'E', 7) + df.at[dates[5], 'E'] = 7 + df.iat[3, 0] = 7 Additional Column Access ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -177,8 +185,57 @@ largely as a convenience since it is such a common operation. 
df[:3] df[::-1] -Location Based Indexing -~~~~~~~~~~~~~~~~~~~~~~~ +Label Based Indexing +~~~~~~~~~~~~~~~~~~~~ + +Pandas provides a suite of methods in order to get **purely label based indexing**. +This is a strict inclusion based protocol. **ALL** of the labels for which you ask, +must be in the index or a KeyError will be raised! When slicing, the start bounds is +*included*, **AND** the upper bound is *included*. Invalid selections will raise with +an ``KeyError``. Integers are valid indicies (and labels), but standard semantics +**DO NOT APPLY**, labels must be **INCLUSIVE**. Valid selection include: *label, listlike +of labels, boolean indexers, and label slices* + +The ``.loc`` attribute is the primary access method. + +.. ipython:: python + + s1 = Series(np.random.randn(6),index=list('abcdef')) + s1 + s1.loc['c':] + s1.loc['b'] + +Note that setting works as well: + +.. ipython:: python + + s1.loc['c':] = 0 + s1 + +With a DataFrame + +.. ipython:: python + + df1 = DataFrame(np.random.randn(6,4),index=list('abcdef'),columns=range(4)) + df1 + df1.loc[['a','b','d'],:] + + # slices (this is an ok integer slice because it encompasses all of the labels) + df1.loc['d':,1:3] + + # boolean + df1.loc[:,[True,True,False,False]] + +For getting a value explicity. + +.. ipython:: python + + # this is equivalent to ``df1.at['a',1]`` + df1.loc['a',1] + + +Integer Based Indexing +~~~~~~~~~~~~~~~~~~~~~~ Pandas provides a suite of methods in order to get **purely integer based indexing**. The semantics follow closely python and numpy slicing. These are ``0-based`` indexing. @@ -224,22 +281,20 @@ For slicing rows explicitly. .. ipython:: python - # this is equivalent to ``df1.iloc[1:3,:]`` - df1.irow(range(1,3)) + df1.iloc[1:3,:] For slicing columns explicitly. .. ipython:: python - # this is equivalent to ``df1.iloc[:,1:3]`` - df1.icol(range(1,3)) + df1.iloc[:,1:3] For getting a value explicity. .. 
ipython:: python - # this is equivalent to ``df1.iloc[1,1]`` - df1.iget_value(1,1) + # this is equivalent to ``df1.iat[1,1]`` + df1.iloc[1,1] There is one signficant departure from standard python/numpy slicing semantics. python/numpy allow slicing past the end of an array without an associated error. diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index f2d4f299d11ff..e21405b775166 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,7 +1,7 @@ # pylint: disable=W0223 from pandas.core.common import _asarray_tuplesafe -from pandas.core.index import Index, MultiIndex +from pandas.core.index import Index, MultiIndex, _ensure_index import pandas.core.common as com import pandas.lib as lib @@ -607,9 +607,10 @@ def _get_slice_axis(self, slice_obj, axis=0): class _LocationIndexer(_NDFrameIndexer): _valid_types = None + _exception = Exception - def _has_valid_type(self, k): - raise NotImplemented + def _has_valid_type(self, k, axis): + raise NotImplementedError() def __getitem__(self, key): if type(key) is tuple: @@ -617,50 +618,107 @@ def __getitem__(self, key): for i, k in enumerate(key): if i >= self.obj.ndim: raise ValueError('Too many indexers') - if not self._has_valid_type(k): + if not self._has_valid_type(k,i): raise ValueError("Location based indexing can only have [%s] types" % self._valid_types) return self._getitem_tuple(key) else: return self._getitem_axis(key, axis=0) - def _getitem_tuple(self, tup): + def _getitem_axis(self, key, axis=0): + raise NotImplementedError() - retval = self.obj - for i, key in enumerate(tup): - if _is_null_slice(key): - continue + def _getbool_axis(self, key, axis=0): + labels = self.obj._get_axis(axis) + key = _check_bool_indexer(labels, key) + inds, = key.nonzero() + try: + return self.obj.take(inds, axis=axis) + except (Exception), detail: + raise self._exception(detail) - retval = getattr(retval,self.name)._getitem_axis(key, axis=i) +class _LocIndexer(_LocationIndexer): + """ purely label 
based location based indexing """ + _valid_types = "labels (MUST BE INCLUSIVE), slices of labels, slices of integers if the index is integers, boolean" + _exception = KeyError - return retval + def _has_valid_type(self, key, axis): + ax = self.obj._get_axis(axis) - def _get_slice_axis(self, slice_obj, axis=0): - raise NotImplemented + # valid for a label where all labels are in the index + # slice of lables (where start-end in labels) + # slice of integers (only if in the lables) + # boolean - def _getitem_axis(self, key, axis=0): - raise NotImplemented + if isinstance(key, slice): - def _convert_to_indexer(self, obj, axis=0): - """ much simpler as we only have to deal with our valid types """ - if self._has_valid_type(obj): - return obj + if key.start is not None and key.start not in ax: + raise KeyError + if key.stop is not None and key.stop-1 not in ax: + raise KeyError - raise ValueError("Can only index by location with a [%s]" % self._valid_types) + elif com._is_bool_indexer(key): + return True -class _LocIndexer(_LocationIndexer): - """ purely label based location based indexing """ - _valid_types = None + elif _is_list_like(key): + + # require all elements in the index + idx = _ensure_index(key) + if not idx.isin(ax).all(): + raise KeyError + + return True + + else: + + # if its empty we want a KeyError here + if not len(ax): + raise KeyError + + if not key in ax: + raise KeyError - def _has_valid_type(self, k): return True + def _getitem_axis(self, key, axis=0): + labels = self.obj._get_axis(axis) + + if isinstance(key, slice): + return self._get_slice_axis(key, axis=axis) + elif com._is_bool_indexer(key): + return self._getbool_axis(key, axis=axis) + elif _is_list_like(key) and not (isinstance(key, tuple) and + isinstance(labels, MultiIndex)): + + if hasattr(key, 'ndim') and key.ndim > 1: + raise ValueError('Cannot index with multidimensional key') + + return self._getitem_iterable(key, axis=axis) + else: + indexer = labels.get_loc(key) + return 
self._get_loc(indexer, axis=axis) + + def _get_loc(self, key, axis=0): + return self.obj._ixs(key, axis=axis) + class _iLocIndexer(_LocationIndexer): """ purely integer based location based indexing """ _valid_types = "integer, integer slice, listlike of integers, boolean array" + _exception = IndexError - def _has_valid_type(self, k): - return isinstance(k, slice) or com.is_integer(k) or _is_list_like(k) or com._is_bool_indexer(k) + def _has_valid_type(self, key, axis): + return isinstance(key, slice) or com.is_integer(key) or com._is_bool_indexer(key) or _is_list_like(key) + + def _getitem_tuple(self, tup): + + retval = self.obj + for i, key in enumerate(tup): + if _is_null_slice(key): + continue + + retval = getattr(retval,self.name)._getitem_axis(key, axis=i) + + return retval def _get_slice_axis(self, slice_obj, axis=0): obj = self.obj @@ -679,11 +737,7 @@ def _getitem_axis(self, key, axis=0): return self._get_slice_axis(key, axis=axis) elif com._is_bool_indexer(key): - - labels = self.obj._get_axis(axis) - key = _check_bool_indexer(labels, key) - inds, = key.nonzero() - return self.obj.take(inds, axis=axis) + return self._getbool_axis(key, axis=axis) # a single integer or a list of integers else: @@ -693,6 +747,13 @@ def _getitem_axis(self, key, axis=0): return self._get_loc(key,axis=axis) + def _convert_to_indexer(self, obj, axis=0): + """ much simpler as we only have to deal with our valid types """ + if self._has_valid_type(obj,axis): + return obj + + raise ValueError("Can only index by location with a [%s]" % self._valid_types) + class _ScalarAccessIndexer(_NDFrameIndexer): """ access scalars quickly """ diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 16d5f09aadc9c..dd1aeed70513b 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -12,7 +12,7 @@ from pandas.core.categorical import Factor from pandas.core.index import (Index, MultiIndex, _ensure_index, _get_combined_index) -from pandas.core.indexing import _NDFrameIndexer, 
_maybe_droplevels +from pandas.core.indexing import _maybe_droplevels, _is_list_like from pandas.core.internals import BlockManager, make_block, form_blocks from pandas.core.series import Series from pandas.core.frame import DataFrame @@ -1069,6 +1069,11 @@ def _ixs(self, i, axis=0): # for compatibility with .ix indexing # Won't work with hierarchical indexing yet key = self._get_axis(axis)[i] + + # xs cannot handle a non-scalar key, so just reindex here + if _is_list_like(key): + return self.reindex(**{ self._get_axis_name(axis) : key }) + return self.xs(key, axis=axis) def groupby(self, function, axis='major'): diff --git a/pandas/core/series.py b/pandas/core/series.py index a3a0cfea97f9d..74f96aff083dd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -560,7 +560,31 @@ def ix(self): return self._ix def _ixs(self, i, axis=0): - return self[self.index[i]] + """ + Return the i-th value or values in the Series by location + + Parameters + ---------- + i : int, slice, or sequence of integers + + Returns + ------- + value : scalar (int) or Series (slice, sequence) + """ + try: + return _index.get_value_at(self, i) + except IndexError: + raise + except: + if isinstance(i, slice): + return self[i] + else: + label = self.index[i] + if isinstance(label, Index): + return self.reindex(label) + else: + return _index.get_value_at(self, i) + @property def _is_mixed_type(self): @@ -924,34 +948,9 @@ def get(self, label, default=None): except KeyError: return default - def iget_value(self, i): - """ - Return the i-th value or values in the Series by location - - Parameters - ---------- - i : int, slice, or sequence of integers - - Returns - ------- - value : scalar (int) or Series (slice, sequence) - """ - try: - return _index.get_value_at(self, i) - except IndexError: - raise - except: - if isinstance(i, slice): - return self[i] - else: - label = self.index[i] - if isinstance(label, Index): - return self.reindex(label) - else: - return _index.get_value_at(self, 
i) - - iget = iget_value - irow = iget_value + iget_value = _ixs + iget = _ixs + irow = _ixs def get_value(self, label): """ diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index acb720cfca580..105a12dd0dba6 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -59,6 +59,12 @@ def _get_result(obj, method, key, axis): if isinstance(key, dict): key = key[axis] + # use an artifical conversion to map the key as integers to the labels + # so ix can work for comparisions + if method == 'indexer': + method = 'ix' + key = obj._get_axis(axis)[key] + # in case we actually want 0 index slicing try: xp = getattr(obj, method).__getitem__(_axify(obj,key,axis)) @@ -95,14 +101,14 @@ def setUp(self): self.series_labels = Series(np.random.randn(4), index=list('abcd')) self.frame_labels = DataFrame(np.random.randn(4, 4), index=list('abcd'), columns=list('ABCD')) - self.panel_labels = Panel(np.random.randn(4,4,4), items=list('ZYXW'), major_axis=list('abcd'), minor_axis=list('ABCD')) + self.panel_labels = Panel(np.random.randn(4,4,4), items=list('abcd'), major_axis=list('ABCD'), minor_axis=list('ZYXW')) self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8]) - self.frame_mixed = DataFrame(np.random.randn(4, 4), columns=[2, 4, 'null', 8]) + self.frame_mixed = DataFrame(np.random.randn(4, 4), index=[2, 4, 'null', 8]) self.panel_mixed = Panel(np.random.randn(4,4,4), items=[2,4,'null',8]) self.series_ts = Series(np.random.randn(4), index=date_range('20130101', periods=4)) - self.frame_ts = DataFrame(np.random.randn(4, 4), columns=date_range('20130101', periods=4)) + self.frame_ts = DataFrame(np.random.randn(4, 4), index=date_range('20130101', periods=4)) self.panel_ts = Panel(np.random.randn(4, 4, 4), items=date_range('20130101', periods=4)) #self.series_floats = Series(np.random.randn(4), index=[1.00, 2.00, 3.00, 4.00]) @@ -142,7 +148,7 @@ def check_values(self, f, func, values = False): assert_almost_equal(result, 
expected) - def check_result(self, name, method1, key1, method2, key2, typs = None, objs = None, axis = None, empty = 'fail', fails = None): + def check_result(self, name, method1, key1, method2, key2, typs = None, objs = None, axes = None, fails = None): def _eq(t, o, a, obj, k1, k2): @@ -154,12 +160,16 @@ def _eq(t, o, a, obj, k1, k2): def _print(result, show = True,error = None): if error is not None: error = str(error) - v = "%-12.12s [%-10.10s]: [typ->%-8.8s,obj->%-8.8s,key1->(%-4.4s) %-20.20s,key2->(%-4.4s) %-20.20s,axis->%s] %s" % (name,result,t,o,method1,key1,method2,key2,a,error or '') + v = "%-16.16s [%-16.16s]: [typ->%-8.8s,obj->%-8.8s,key1->(%-4.4s),key2->(%-4.4s),axis->%s] %s" % (name,result,t,o,method1,method2,a,error or '') if show: print(v) try: + ### good debug location ### + #if name == 'bool' and t == 'empty' and o == 'series' and method1 == 'loc': + # import pdb; pdb.set_trace() + rs = getattr(obj, method1).__getitem__(_axify(obj,k1,a)) try: @@ -200,15 +210,13 @@ def _print(result, show = True,error = None): except (Exception), detail: # if we are in fails, the ok, otherwise raise it - if isinstance(fails,(tuple,list)): - if tuple(t,o) in fails or tuple(o,t) in fails: + if fails is not None: + if fails == type(detail): + result = 'ok (%s)' % type(detail).__name__ + _print(result) return - # empty fails are ok - if empty == 'fail' and t == 'empty': - return - - result = 'error' + result = type(detail).__name__ raise AssertionError(_print(result, error = detail)) if typs is None: @@ -217,12 +225,11 @@ def _print(result, show = True,error = None): if objs is None: objs = self._objs - axes = [] - if axis is not None: - if not isinstance(axis,(tuple,list)): - axes = [ axis ] + if axes is not None: + if not isinstance(axes,(tuple,list)): + axes = [ axes ] else: - axes = list(axis) + axes = list(axes) else: axes = [ 0, 1, 2] @@ -242,10 +249,6 @@ def _print(result, show = True,error = None): obj = obj.copy() k2 = key2 - - if name == 'list int' and o == 
'panel': - import pdb; pdb.set_trace() - _eq(t, o, a, obj, key1, k2) def test_at_and_iat_get(self): @@ -301,33 +304,49 @@ def _check(f, func, values = False): _check(d['ts'], 'at') _check(d['floats'],'at') - def test_iloc_getitem(self): + def test_iloc_getitem_int(self): # integer self.check_result('integer', 'iloc', 2, 'ix', { 0 : 4, 1: 6, 2: 8 }, typs = ['ints']) - self.check_result('integer', 'iloc', 2, 'ix', { 0 : 4, 1: 6, 2: 8 }, typs = ['labels','mixed','ts','floats','empty'], fails = True) + self.check_result('integer', 'iloc', 2, 'indexer', 2, typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) + def test_iloc_getitem_neg_int(self): + # neg integer self.check_result('neg int', 'iloc', -1, 'ix', { 0 : 6, 1: 9, 2: 12 }, typs = ['ints']) - self.check_result('neg int', 'iloc', -1, 'ix', { 0 : 6, 1: 9, 2: 12 }, typs = ['labels','mixed','ts','floats','empty'], fails = True) + self.check_result('neg int', 'iloc', -1, 'indexer', -1, typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) + + def test_iloc_getitem_list_int(self): # list of ints - self.check_result('list int', 'iloc', [0,1,3], 'ix', { 0 : [0,2,6], 1 : [0,3,9], 2: [0,4,12] }, typs = ['ints']) - self.check_result('list int', 'iloc', [0,1,3], 'ix', { 0 : [0,2,6], 1 : [0,3,9], 2: [0,4,12] }, typs = ['labels','mixed','ts','floats','empty'], fails = True) - self.check_result('list int (dups)', 'iloc', [0,1,1,3], 'ix', { 0 : [0,2,2,6], 1 : [0,3,3,9], 2: [0,4,4,12] }, typs = ['ints']) + self.check_result('list int', 'iloc', [0,1,2], 'ix', { 0 : [0,2,4], 1 : [0,3,6], 2: [0,4,8] }, typs = ['ints']) + self.check_result('list int', 'iloc', [0,1,2], 'indexer', [0,1,2], typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) + + def test_iloc_getitem_dups(self): + + # no dups in panel (bug?) 
+ self.check_result('list int (dups)', 'iloc', [0,1,1,3], 'ix', { 0 : [0,2,2,6], 1 : [0,3,3,9] }, objs = ['series','frame'], typs = ['ints']) + + def test_iloc_getitem_array(self): - # series like + # array like s = Series(index=range(1,4)) - self.check_result('array like', 'iloc', s.index, 'ix', [2,4,6], typs = ['ints']) + self.check_result('array like', 'iloc', s.index, 'ix', { 0 : [2,4,6], 1 : [3,6,9], 2: [4,8,12] }, typs = ['ints']) + + def test_iloc_getitem_bool(self): # boolean indexers b = [True,False,True,False,] self.check_result('bool', 'iloc', b, 'ix', b, typs = ['ints']) - self.check_result('bool', 'iloc', b, 'ix', b, typs = ['labels','mixed','ts','floats','empty'], fails = True) + self.check_result('bool', 'iloc', b, 'ix', b, typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) + + def test_iloc_getitem_slice(self): # slices - self.check_result('slice', 'iloc', slice(1,3), 'ix', slice(2,4,2), typs = ['ints']) - self.check_result('slice', 'iloc', slice(1,3), 'ix', slice(2,4,2), typs = ['labels','mixed','ts','floats','empty'], fails = True) + self.check_result('slice', 'iloc', slice(1,3), 'ix', { 0 : [2,4], 1: [3,6], 2: [4,8] }, typs = ['ints']) + self.check_result('slice', 'iloc', slice(1,3), 'indexer', slice(1,3), typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) + + def test_iloc_getitem_out_of_bounds(self): # out-of-bounds slice self.assertRaises(IndexError, self.frame_ints.iloc.__getitem__, tuple([slice(None),slice(1,5,None)])) @@ -364,6 +383,105 @@ def test_iloc_multiindex(self): xp = df.values[2,2] self.assert_(rs == xp) + def test_loc_getitem_int(self): + + # int label + self.check_result('int label', 'loc', 2, 'ix', 2, typs = ['ints'], axes = 0) + self.check_result('int label', 'loc', 3, 'ix', 3, typs = ['ints'], axes = 1) + self.check_result('int label', 'loc', 4, 'ix', 4, typs = ['ints'], axes = 2) + self.check_result('int label', 'loc', 2, 'ix', 2, typs = ['label'], fails = KeyError) + + def 
test_loc_getitem_label(self): + + # label + self.check_result('label', 'loc', 'c', 'ix', 'c', typs = ['labels'], axes=0) + self.check_result('label', 'loc', 'null', 'ix', 'null', typs = ['mixed'] , axes=0) + self.check_result('label', 'loc', 8, 'ix', 8, typs = ['mixed'] , axes=0) + self.check_result('label', 'loc', Timestamp('20130102'), 'ix', 1, typs = ['ts'], axes=0) + self.check_result('label', 'loc', 'c', 'ix', 'c', typs = ['empty'], fails = KeyError) + + def test_loc_getitem_label_out_of_range(self): + + # out of range label + self.check_result('label range', 'loc', 'f', 'ix', 'f', typs = ['ints','labels','mixed','ts','floats'], fails=KeyError) + + def test_loc_getitem_label_list(self): + + # list of labels + self.check_result('list lbl', 'loc', [0,2,4], 'ix', [0,2,4], typs = ['ints'], axes=0) + self.check_result('list lbl', 'loc', [3,6,9], 'ix', [3,6,9], typs = ['ints'], axes=1) + self.check_result('list lbl', 'loc', [4,8,12], 'ix', [4,8,12], typs = ['ints'], axes=2) + self.check_result('list lbl', 'loc', ['a','b','d'], 'ix', ['a','b','d'], typs = ['labels'], axes=0) + self.check_result('list lbl', 'loc', ['A','B','C'], 'ix', ['A','B','C'], typs = ['labels'], axes=1) + self.check_result('list lbl', 'loc', ['Z','Y','W'], 'ix', ['Z','Y','W'], typs = ['labels'], axes=2) + self.check_result('list lbl', 'loc', [2,8,'null'], 'ix', [2,8,'null'], typs = ['mixed'], axes=0) + self.check_result('list lbl', 'loc', [Timestamp('20130102'),Timestamp('20130103')], 'ix', + [Timestamp('20130102'),Timestamp('20130103')], typs = ['ts'], axes=0) + + # fails + self.check_result('list lbl', 'loc', [0,1,2], 'indexer', [0,1,2], typs = ['empty'], fails = KeyError) + self.check_result('list lbl', 'loc', [0,2,3], 'ix', [0,2,3], typs = ['ints'], axes=0, fails = KeyError) + self.check_result('list lbl', 'loc', [3,6,7], 'ix', [3,6,9], typs = ['ints'], axes=1, fails = KeyError) + self.check_result('list lbl', 'loc', [4,8,10], 'ix', [4,8,12], typs = ['ints'], axes=2, fails = KeyError) + + # 
array like + self.check_result('array like', 'loc', Series(index=[0,2,4]).index, 'ix', [0,2,4], typs = ['ints'], axes=0) + self.check_result('array like', 'loc', Series(index=[3,6,9]).index, 'ix', [3,6,9], typs = ['ints'], axes=1) + self.check_result('array like', 'loc', Series(index=[4,8,12]).index, 'ix', [4,8,12], typs = ['ints'], axes=2) + + def test_loc_getitem_bool(self): + + # boolean indexers + b = [True,False,True,False] + self.check_result('bool', 'loc', b, 'ix', b, typs = ['ints','labels','mixed','ts','floats']) + self.check_result('bool', 'loc', b, 'ix', b, typs = ['empty'], fails = KeyError) + + def test_loc_getitem_int_slice(self): + + # int slices in int + self.check_result('int slice1', 'loc', slice(1,3), 'ix', { 0 : [2,4], 1: [3,6], 2: [4,8] }, typs = ['ints'], fails=KeyError) + + # ok + self.check_result('int slice2', 'loc', slice(2,5), 'ix', [2,4], typs = ['ints'], axes = 0) + self.check_result('int slice2', 'loc', slice(3,7), 'ix', [3,6], typs = ['ints'], axes = 1) + self.check_result('int slice2', 'loc', slice(4,9), 'ix', [4,8], typs = ['ints'], axes = 2) + + def test_loc_getitem_label_slice(self): + + # label slices (with ints) + self.check_result('lab slice', 'loc', slice(1,3), 'ix', slice(1,3), typs = ['labels','mixed','ts','floats','empty'], fails=KeyError) + + # real label slices + self.check_result('lab slice', 'loc', slice('a','c'), 'ix', slice('a','c'), typs = ['labels'], axes=0) + self.check_result('lab slice', 'loc', slice('A','C'), 'ix', slice('A','C'), typs = ['labels'], axes=1) + self.check_result('lab slice', 'loc', slice('W','Z'), 'ix', slice('W','Z'), typs = ['labels'], axes=2) + + self.check_result('ts slice', 'loc', slice('20130102','20130104'), 'ix', slice('20130102','20130104'), typs = ['ts'], axes=0) + self.check_result('ts slice', 'loc', slice('20130102','20130104'), 'ix', slice('20130102','20130104'), typs = ['ts'], axes=1) + self.check_result('ts slice', 'loc', slice('20130102','20130104'), 'ix', 
slice('20130102','20130104'), typs = ['ts'], axes=2) + + self.check_result('ts slice', 'loc', slice(2,6), 'ix', slice(2,6), typs = ['mixed'], axes=0) + self.check_result('ts slice', 'loc', slice(2,6), 'ix', slice(2,6), typs = ['mixed'], axes=1) + self.check_result('ts slice', 'loc', slice(2,6), 'ix', slice(2,6), typs = ['mixed'], axes=2) + + def test_loc_setitem(self): + df = self.frame_labels + + import pdb; pdb.set_trace() + result = df.iloc[0,0] + + df.loc['a','A'] = 1 + result = df.loc['a','A'] + self.assert_(result == 1) + + result = df.iloc[0,0] + self.assert_(result == 1) + + df.loc[:,'B':'D'] = 0 + expected = df.loc[:,'B':'D'] + result = df.ix[:,2:3] + assert_frame_equal(result, expected) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 28c3d9a631bd6c87184947b78be8e875cdc4a9e1 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 28 Feb 2013 10:03:52 -0500 Subject: [PATCH 4/7] DOC/TST: revised indexing section in docs updated whatsnew all tests work DOC: changes suggested by Jan Schulz revised whatsnew to include mostly references to new indexing --- RELEASE.rst | 9 +- doc/source/conf.py | 1 - doc/source/dsintro.rst | 12 +- doc/source/indexing.rst | 284 +++++++++++++++++++++------------- doc/source/v0.10.1.txt | 1 + doc/source/v0.11.0.txt | 81 +++++----- pandas/core/frame.py | 125 ++++++--------- pandas/core/indexing.py | 40 +++-- pandas/tests/test_frame.py | 102 ------------ pandas/tests/test_indexing.py | 176 +++++++++++++++++++-- pandas/tests/test_series.py | 34 ---- 11 files changed, 470 insertions(+), 395 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 32cbddf6bfd74..e4e7087772d3a 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -35,7 +35,12 @@ pandas 0.11.0 Yahoo! 
finance (GH2795_) - Add ``squeeze`` function to reduce dimensionality of 1-len objects - Support slicing with time objects (GH2681_) - - Add ``.iloc`` attribute, to support location-based indexing, analagous to ``.ix`` + - Added ``.iloc`` attribute, to support strict integer based indexing, analogous to ``.ix`` (GH2922_) + - Added ``.loc`` attribute, to support strict label based indexing, analogous to ``.ix`` + - Added ``.iat`` attribute, to support fast scalar access via integers (replaces ``iget_value/iset_value``) + - Added ``.at`` attribute, to support fast scalar access via labels (replaces ``get_value/set_value``) + - Moved functionality from ``irow,icol,iget_value/iset_value`` to ``.iloc`` indexer + (via ``_ixs`` methods in each object) **Improvements to existing features** @@ -52,6 +57,7 @@ pandas 0.11.0 - ``describe_option()`` now reports the default and current value of options. - Add ``format`` option to ``pandas.to_datetime`` with faster conversion of strings that can be parsed with datetime.strptime + - Add ``axes`` property to ``Series`` for compatibility **API Changes** @@ -129,6 +135,7 @@ pandas 0.11.0 - Bug in argsort of ``datetime64[ns]`` Series with ``NaT`` (GH2967_) - Bug in idxmin/idxmax of ``datetime64[ns]`` Series with ``NaT`` (GH2982__) - ``icol`` with negative indicies was return ``nan`` (see GH2922_) + - Bug in ``icol`` where negative indices produced incorrect return values (see GH2922_) .. _GH622: https://github.com/pydata/pandas/issues/622 .. _GH797: https://github.com/pydata/pandas/issues/797 diff --git a/doc/source/conf.py b/doc/source/conf.py index 43b7df6af805b..76093d83b32e7 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -17,7 +17,6 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here.
# sys.path.append(os.path.abspath('.')) -sys.path.insert(0,'/home/jreback/pandas') sys.path.insert(0, os.path.abspath('../sphinxext')) sys.path.extend([ diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 45fabb551d993..83f2de01300c9 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -437,8 +437,8 @@ The basics of indexing are as follows: :widths: 30, 20, 10 Select column, ``df[col]``, Series - Select row by label, ``df.xs(label)`` or ``df.ix[label]``, Series - Select row by location (int), ``df.ix[loc]``, Series + Select row by label, ``df.loc[label]``, Series + Select row by integer location, ``df.iloc[loc]``, Series Slice rows, ``df[5:10]``, DataFrame Select rows by boolean vector, ``df[bool_vec]``, DataFrame @@ -447,8 +447,8 @@ DataFrame: .. ipython:: python - df.xs('b') - df.ix[2] + df.loc['b'] + df.iloc[2] For a more exhaustive treatment of more sophisticated label-based indexing and slicing, see the :ref:`section on indexing `. We will address the @@ -475,7 +475,7 @@ row-wise. For example: .. ipython:: python - df - df.ix[0] + df - df.iloc[0] In the special case of working with time series data, if the Series is a TimeSeries (which it will be automatically if the index contains datetime @@ -592,7 +592,7 @@ DataFrame in tabular form, though it won't always fit the console width: .. ipython:: python - print baseball.ix[-20:, :12].to_string() + print baseball.iloc[-20:, :12].to_string() New since 0.10.0, wide DataFrames will now be printed across multiple rows by default: diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 11708f68d9719..75fa4803fe1f6 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -32,6 +32,78 @@ attention in this area. Expect more work to be invested higher-dimensional data structures (including Panel) in the future, especially in label-based advanced indexing.
+Choice +------ + +Starting in 0.11.0, object selection has had a number of user-requested additions in +order to support more explicit location based indexing. Pandas now supports +three types of multi-axis indexing. + + - ``.loc`` is strictly label based, will raise ``KeyError`` when the items are not found, + allowed inputs are: + + - A single label, e.g. ``5`` or ``'a'`` + + (note that ``5`` is interpreted as a *label* of the index, not as an integer position) + - A list or array of labels ``['a', 'b', 'c']`` + - A slice object with labels ``'a':'f'`` + + (note that contrary to usual python slices, **both** the start and the stop are included!) + - A boolean array + + See more at :ref:`Label indexing <indexing.label>` + + - ``.iloc`` is strictly integer position based (from 0 to length-1 of the axis), will + raise ``IndexError`` when the requested indices are out of bounds. Allowed inputs are: + + - An integer e.g. ``5`` + - A list or array of integers ``[4, 3, 0]`` + - A slice object with ints ``1:7`` + - A boolean array + + See more at :ref:`Integer indexing <indexing.integer>` + + - ``.ix`` supports mixed integer and label based access. It is primarily label based, but + will fall back to integer positional access. ``.ix`` is the most general and will support + any of the inputs to ``.loc`` and ``.iloc``, as well as support for floating point label schemes. + As integer slices with ``.ix`` behave differently depending on whether the slice + is interpreted as integer location based or label based, it's usually better to be + explicit and use ``.iloc`` (integer location) or ``.loc`` (label location). + + See more at :ref:`Advanced indexing <indexing.advanced>` + +Getting values from an object with multi-axes uses the following notation (using ``.loc`` as an +example, but applies to ``.iloc`` and ``.ix`` as well). Any of the axes accessors may be the null +slice ``:``. Axes left out of the specification are assumed to be ``:``. +(e.g. ``p.loc['a']`` is equiv to ``p.loc['a',:,:]``) + +..
csv-table:: + :header: "Object Type", "Indexers" + :widths: 30, 50 + :delim: ; + + Series; ``s.loc[indexer]`` + DataFrame; ``df.loc[row_indexer,column_indexer]`` + Panel; ``p.loc[item_indexer,major_indexer,minor_indexer]`` + +Indexing Deprecations +~~~~~~~~~~~~~~~~~~~~~ + +Starting in version 0.11.0, the following methods may be deprecated in future versions. + + - ``irow`` + - ``icol`` + - ``iget_value`` + +See the section :ref:`Integer indexing <indexing.integer>` for substitutes. + +.. _indexing.xs: + +Cross-sectional slices on non-hierarchical indices are now easily +performed using ``.loc`` and/or ``.iloc``. The methods ``xs`` (for DataFrame), +``minor_xs`` and ``major_xs`` (for Panel), exist primarily for backward +compatibility. + .. _indexing.basics: Basics @@ -72,39 +144,6 @@ Thus, as per above, we have the most basic indexing using ``[]``: s[dates[5]] panel['two'] - -.. _indexing.basics.get_value: - -Fast scalar value getting and setting -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Since indexing with ``[]`` must handle a lot of cases (single-label access, -slicing, boolean indexing, etc.), it has a bit of overhead in order to figure -out what you're asking for. If you only want to access a scalar value, the -fastest way is to use the ``at`` and ``iat`` methods, which are implemented on all of -the data structures. - -Similary to ``loc`` and ``at`` provides **label** based lookups, while, ``iat`` provides -**integer** based lookups analagously to ``iloc`` - -.. ipython:: python - - type(dates[5]) - dates[5] - s.at[dates[5]] - s.iat[5] - df.at[dates[5], 'A'] - df.iat[3, 0] - -You can also set using these same indexers. These have the additional capability -of enlarging an object. This method *always* returns a reference to the object -it modified, which in the case of enlargement, will be a **new object**: - -.. ipython:: python - - df.at[dates[5], 'E'] = 7 - df.iat[3, 0] = 7 - Additional Column Access ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -134,30 +173,12 @@ raised.
Multiple columns can also be set in this manner: You may find this useful for applying a transform (in-place) to a subset of the columns. -Data slices on other axes -~~~~~~~~~~~~~~~~~~~~~~~~~ - -It's certainly possible to retrieve data slices along the other axes of a -DataFrame or Panel. We tend to refer to these slices as -*cross-sections*. DataFrame has the ``xs`` function for retrieving rows as -Series and Panel has the analogous ``major_xs`` and ``minor_xs`` functions for -retrieving slices as DataFrames for a given ``major_axis`` or ``minor_axis`` -label, respectively. - -.. ipython:: python - - date = dates[5] - df.xs(date) - panel.major_xs(date) - panel.minor_xs('A') - - Slicing ranges ~~~~~~~~~~~~~~ The most robust and consistent way of slicing ranges along arbitrary axes is -described in the :ref:`Advanced indexing ` section detailing -the ``.ix`` method. For now, we explain the semantics of slicing using the +described in the :ref:`Integer indexing ` section detailing +the ``.iloc`` method. For now, we explain the semantics of slicing using the ``[]`` operator. With Series, the syntax works exactly as with an ndarray, returning a slice of @@ -185,19 +206,31 @@ largely as a convenience since it is such a common operation. df[:3] df[::-1] +.. _indexing.label: + Label Based Indexing ~~~~~~~~~~~~~~~~~~~~ -Pandas provides a suite of methods in order to get **purely label based indexing**. +Pandas provides a suite of methods in order to have **purely label based indexing**. This is a strict inclusion based protocol. **ALL** of the labels for which you ask, -must be in the index or a KeyError will be raised! When slicing, the start bounds is -*included*, **AND** the upper bound is *included*. Invalid selections will raise with -an ``KeyError``. Integers are valid indicies (and labels), but standard semantics -**DO NOT APPLY**, labels must be **INCLUSIVE**. 
Valid selection include: *label, listlike -of labels, boolean indexers, and label slices* +must be in the index or a ``KeyError`` will be raised! + +When slicing, the start bound is *included*, **AND** the stop bound is *included*. +Integers are valid labels, but they refer to the label *and not the position*. The ``.loc`` attribute is the primary access method. +The following are valid inputs: + + - A single label, e.g. ``5`` or ``'a'`` + + (note that ``5`` is interpreted as a *label* of the index, not as an integer position) + - A list or array of labels ``['a', 'b', 'c']`` + - A slice object with labels ``'a':'f'`` + + (note that contrary to usual python slices, **both** the start and the stop are included!) + - A boolean array + .. ipython:: python s1 = Series(np.random.randn(6),index=list('abcdef')) @@ -216,36 +249,50 @@ With a DataFrame .. ipython:: python - df1 = DataFrame(np.random.randn(6,4),index=list('abcdef'),columns=range(4)) + df1 = DataFrame(np.random.randn(6,4),index=list('abcdef'),columns=list('ABCD')) df1 df1.loc[['a','b','d'],:] - # slices (this is an ok integer slice because it encompasses all of the labels) - df1.loc['d':,1:3] + # slices + df1.loc['d':,'A':'C'] + +For getting a cross section using a label (equiv to deprecated ``df.xs('a')``) + +.. ipython:: python + + df1.loc['a'] # boolean - df1.loc[:,[True,True,False,False]] + df1.loc['a']>0 + df1.loc[:,df1.loc['a']>0] -For getting a value explicity. +For getting a value explicitly (equiv to deprecated ``df.get_value('a','A')``) .. ipython:: python - # this is equivalent to ``df1.at['a',1]`` - df1.loc['a',1] + # this is also equivalent to ``df1.at['a','A']`` + df1.loc['a','A'] +.. _indexing.integer: Integer Based Indexing ~~~~~~~~~~~~~~~~~~~~~~ Pandas provides a suite of methods in order to get **purely integer based indexing**. The semantics follow closely python and numpy slicing. These are ``0-based`` indexing. + When slicing, the start bounds is *included*, while the upper bound is *excluded*.
-Invalid selections will raise with an ``IndexError``. Trying to use a non-integer, -even a **valid** label will raise a ``ValueError``. Integers, lists of integers, and slices -are allowed indexers. +Trying to use a non-integer, even a **valid** label, will raise an ``IndexError``. The ``.iloc`` attribute is the primary access method . +The following are valid inputs: + + - An integer e.g. ``5`` + - A list or array of integers ``[4, 3, 0]`` + - A slice object with ints ``1:7`` + - A boolean array + .. ipython:: python s1 = Series(np.random.randn(5),index=range(0,10,2)) @@ -277,25 +324,34 @@ With a DataFrame # integer lists df1.iloc[[1,3,5],[1,3]] -For slicing rows explicitly. + # boolean + df1.iloc[:,df1.iloc[0]>0] + +For slicing rows explicitly (equiv to deprecated ``df.irow(slice(1,3))``). .. ipython:: python df1.iloc[1:3,:] -For slicing columns explicitly. +For slicing columns explicitly (equiv to deprecated ``df.icol(slice(1,3))``). .. ipython:: python df1.iloc[:,1:3] -For getting a value explicity. +For getting a value explicitly (equiv to deprecated ``df.iget_value(1,1)``) .. ipython:: python - # this is equivalent to ``df1.iat[1,1]`` + # this is also equivalent to ``df1.iat[1,1]`` df1.iloc[1,1] +For getting a cross section using an integer position (equiv to deprecated ``df.irow(1)``) + +.. ipython:: python + + df1.iloc[1] + There is one signficant departure from standard python/numpy slicing semantics. python/numpy allow slicing past the end of an array without an associated error. @@ -306,11 +362,41 @@ python/numpy allow slicing past the end of an array without an associated error. x[4:10] x[8:10] +Pandas will detect this and raise ``IndexError``, rather than return an empty structure. + :: >>> df.iloc[:,3:6] IndexError: out-of-bounds on slice (end) +..
_indexing.basics.get_value: + +Fast scalar value getting and setting +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Since indexing with ``[]`` must handle a lot of cases (single-label access, +slicing, boolean indexing, etc.), it has a bit of overhead in order to figure +out what you're asking for. If you only want to access a scalar value, the +fastest way is to use the ``at`` and ``iat`` methods, which are implemented on all of +the data structures. + +Similarly to ``loc``, ``at`` provides **label** based scalar lookups, while ``iat`` provides +**integer** based lookups analogously to ``iloc``. + +.. ipython:: python + + s.iat[5] + df.at[dates[5], 'A'] + df.iat[3, 0] + +You can also set using these same indexers. These have the additional capability +of enlarging an object. This method *always* returns a reference to the object +it modified, which in the case of enlargement, will be a **new object**: + +.. ipython:: python + + df.at[dates[5], 'E'] = 7 + df.iat[3, 0] = 7 Boolean indexing ~~~~~~~~~~~~~~~~ @@ -363,8 +449,8 @@ more complex criteria: df2[criterion & (df2['b'] == 'x')] -Note, with the :ref:`advanced indexing ` ``ix`` method, you -may select along more than one axis using boolean vectors combined with other +Note, with the choice methods :ref:`Label indexing <indexing.label>`, :ref:`Integer indexing <indexing.integer>`, +and :ref:`Advanced indexing <indexing.advanced>` you may select along more than one axis using boolean vectors combined with other indexing expressions. Where and Masking @@ -548,20 +634,21 @@ default value. .. _indexing.advanced: -Advanced indexing with labels ------------------------------ +Advanced Indexing with ``.ix`` +------------------------------ + +.. note:: -We have avoided excessively overloading the ``[]`` / ``__getitem__`` operator -to keep the basic functionality of the pandas objects straightforward and -simple.
However, there are often times when you may wish get a subset (or -analogously set a subset) of the data in a way that is not straightforward -using the combination of ``reindex`` and ``[]``. Complicated setting operations -are actually quite difficult because ``reindex`` usually returns a copy. + The recent addition of ``.loc`` and ``.iloc`` has enabled users to be quite + explicit about indexing choices. ``.ix`` allows great flexibility to specify + indexing locations by *label* and/or *integer position*. Pandas will attempt + to use any passed *integer* as *label* locations first (like what ``.loc`` + would do), then fall back on *positional* indexing (like what ``.iloc`` would do). -By *advanced* indexing we are referring to a special ``.ix`` attribute on -pandas objects which enable you to do getting/setting operations on a -DataFrame, for example, with matrix/ndarray-like semantics. Thus you can -combine the following kinds of indexing: +The syntax of using ``.ix`` is identical to ``.loc``, in :ref:`Label indexing <indexing.label>`, +and ``.iloc`` in :ref:`Integer indexing <indexing.integer>`. + +The ``.ix`` attribute takes the following inputs: - An integer or single label, e.g. ``5`` or ``'a'`` - A list or array of labels ``['a', 'b', 'c']`` or integers ``[4, 3, 0]`` @@ -664,27 +751,6 @@ numpy array. For instance, dflookup.lookup(xrange(0,10,2), ['B','C','A','B','D']) -Advanced indexing with integer labels -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Label-based indexing with integer axis labels is a thorny topic. It has been -discussed heavily on mailing lists and among various members of the scientific -Python community. In pandas, our general viewpoint is that labels matter more -than integer locations. Therefore, with an integer axis index *only* -label-based indexing is possible with the standard tools like ``.ix``. The -following code will generate exceptions: - -..
code-block:: python - - s = Series(range(5)) - s[-1] - df = DataFrame(np.random.randn(5, 4)) - df - df.ix[-2:] - -This deliberate decision was made to prevent ambiguities and subtle bugs (many -users reported finding bugs when the API change was made to stop "falling back" -on position-based indexing). - Setting values in mixed-type DataFrame ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -953,8 +1019,6 @@ but as you use it you may uncover corner cases or unintuitive behavior. If you do find something like this, do not hesitate to report the issue or ask on the mailing list. -.. _indexing.xs: - Cross-section with hierarchical index ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt index 4c7369c27cc30..e8435df7b2b0c 100644 --- a/doc/source/v0.10.1.txt +++ b/doc/source/v0.10.1.txt @@ -232,4 +232,5 @@ on GitHub for a complete list. .. _GH2626: https://github.com/pydata/pandas/issues/2626 .. _GH2613: https://github.com/pydata/pandas/issues/2613 .. _GH2602: https://github.com/pydata/pandas/issues/2602 +.. _GH2687: https://github.com/pydata/pandas/issues/2687 .. _GH2563: https://github.com/pydata/pandas/issues/2563 diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt index 76f7ffae746b4..8ea49d1074310 100644 --- a/doc/source/v0.11.0.txt +++ b/doc/source/v0.11.0.txt @@ -11,65 +11,66 @@ to. API changes ~~~~~~~~~~~ -Location Based Indexing -~~~~~~~~~~~~~~~~~~~~~~~ +Indexing Choice +~~~~~~~~~~~~~~~ -Pandas provides a suite of methods in order to get **purely integer based indexing**. -The semantics follow closely python and numpy slicing. These are ``0-based`` indexing. -When slicing, the start bounds is *included*, while the upper bound is *excluded*. -Invalid selections will raise with an ``IndexError``. Trying to use a non-integer, -even a **valid** label will raise a ``ValueError``. Integers, lists of integers, and slices -are allowed indexers. 
+Starting in 0.11.0, object selection has had a number of user-requested additions in +order to support more explicit location based indexing. Pandas now supports +three types of multi-axis indexing. -The ``.iloc`` attribute is the primary access method . + - ``.loc`` is strictly label based, will raise ``KeyError`` when the items are not found, + allowed inputs are: -.. ipython:: python + - A single label, e.g. ``5`` or ``'a'`` - s1 = Series(np.random.randn(5),index=range(0,10,2)) - s1 - s1.iloc[:3] - s1.iloc[3] + (note that ``5`` is interpreted as a *label* of the index, not as an integer position) + - A list or array of labels ``['a', 'b', 'c']`` + - A slice object with labels ``'a':'f'`` -Note that setting works as well: + (note that contrary to usual python slices, **both** the start and the stop are included!) + - A boolean array -.. ipython:: python + See more at :ref:`Label indexing <indexing.label>` - s1.iloc[:3] = 0 - s1 + - ``.iloc`` is strictly integer position based (from 0 to length-1 of the axis), will + raise ``IndexError`` when the requested indices are out of bounds. Allowed inputs are: -With a DataFrame + - An integer e.g. ``5`` + - A list or array of integers ``[4, 3, 0]`` + - A slice object with ints ``1:7`` + - A boolean array -.. ipython:: python + See more at :ref:`Integer indexing <indexing.integer>` - df1 = DataFrame(np.random.randn(8,4),index=range(0,16,2),columns=range(0,8,2)) + - ``.ix`` supports mixed integer and label based access. It is primarily label based, but + will fall back to integer positional access. ``.ix`` is the most general and will support + any of the inputs to ``.loc`` and ``.iloc``, as well as support for floating point label schemes. + As integer slices with ``.ix`` behave differently depending on whether the slice + is interpreted as integer location based or label based, it's usually better to be + explicit and use ``.iloc`` (integer location) or ``.loc`` (label location).
- # integer access - df1.iloc[5,2] + See more at :ref:`Advanced indexing <indexing.advanced>` - # slices - df1.iloc[:3] - df1.iloc[1:5,2:4] - # integer lists - df1.iloc[[1,3,5],[1,3]] +Indexing Deprecations +~~~~~~~~~~~~~~~~~~~~~ -There is one signficant departure from standard python/numpy slicing semantics. -python/numpy allow slicing past the end of an array without an associated error. +Starting in version 0.11.0, the following methods may be deprecated in future versions. -.. ipython:: python + - ``irow`` + - ``icol`` + - ``iget_value`` - # these are allowed in python/numpy. - x = list('abcdef') - x[4:10] - x[8:10] +See the section :ref:`Integer indexing <indexing.integer>` for substitutes. -:: +Cross-sectional slices on non-hierarchical indices are now easily +performed using ``.loc`` and/or ``.iloc``. The methods ``xs`` (for DataFrame), +``minor_xs`` and ``major_xs`` (for Panel), exist primarily for backward +compatibility. - >>> df.iloc[:,3:6] - IndexError: out-of-bounds on slice (end) -Dtype Specification -~~~~~~~~~~~~~~~~~~~ +Dtypes +~~~~~~ Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste.
diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b3770cb790f5b..3a7da212d6a3f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1884,94 +1884,71 @@ def set_value(self, index, col, value): return result.set_value(index, col, value) def irow(self, i, copy=False): - """ - Retrieve the i-th row or rows of the DataFrame by location - - Parameters - ---------- - i : int, slice, or sequence of integers + return self._ixs(i,axis=0) - Notes - ----- - If slice passed, the resulting data will be a view + def icol(self, i): + return self._ixs(i,axis=1) - Returns - ------- - row : Series (int) or DataFrame (slice, sequence) + def _ixs(self, i, axis=0, copy=False): + """ + i : int, slice, or sequence of integers + axis : int """ - if isinstance(i, slice): - return self[i] - else: - label = self.index[i] - if isinstance(label, Index): - return self.reindex(label) - else: - try: - new_values = self._data.fast_2d_xs(i, copy=copy) - except: - new_values = self._data.fast_2d_xs(i, copy=True) - return Series(new_values, index=self.columns, - name=self.index[i]) - def icol(self, i): - """ - Retrieve the i-th column or columns of the DataFrame by location + # irow + if axis == 0: - Parameters - ---------- - i : int, slice, or sequence of integers + """ + Notes + ----- + If slice passed, the resulting data will be a view + """ - Notes - ----- - If slice passed, the resulting data will be a view + if isinstance(i, slice): + return self[i] + else: + label = self.index[i] + if isinstance(label, Index): + return self.reindex(label) + else: + try: + new_values = self._data.fast_2d_xs(i, copy=copy) + except: + new_values = self._data.fast_2d_xs(i, copy=True) + return Series(new_values, index=self.columns, + name=self.index[i]) - Returns - ------- - column : Series (int) or DataFrame (slice, sequence) - """ - label = self.columns[i] - if isinstance(i, slice): - # need to return view - lab_slice = slice(label[0], label[-1]) - return self.ix[:, lab_slice] + # icol 
else: - label = self.columns[i] - if isinstance(label, Index): - # if we have negative indicies, translate to postive here - # (take doesen't deal properly with these) - l = len(self.columns) - i = [ v if v >= 0 else l+v for v in i ] + """ + Notes + ----- + If slice passed, the resulting data will be a view + """ - return self.take(i, axis=1) + label = self.columns[i] + if isinstance(i, slice): + # need to return view + lab_slice = slice(label[0], label[-1]) + return self.ix[:, lab_slice] + else: + label = self.columns[i] + if isinstance(label, Index): - values = self._data.iget(i) - return self._col_klass.from_array(values, index=self.index, - name=label) + # if we have negative indicies, translate to postive here + # (take doesen't deal properly with these) + l = len(self.columns) + i = [ v if v >= 0 else l+v for v in i ] + + return self.take(i, axis=1) - def _ixs(self, i, axis=0): - if axis == 0: - return self.irow(i) - else: - return self.icol(i) + values = self._data.iget(i) + return self._col_klass.from_array(values, index=self.index, + name=label) def iget_value(self, i, j): - """ - Return scalar value stored at row i and column j, where i and j are - integers - - Parameters - ---------- - i : int - j : int - - Returns - ------- - value : scalar value - """ - row = self.index[i] - col = self.columns[j] - return self.get_value(row, col) + return self.iat[i,j] def __getitem__(self, key): if isinstance(key, slice): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e21405b775166..0b4474f7a465c 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -320,8 +320,12 @@ def _getitem_lowerdim(self, tup): if _is_label_like(key) or isinstance(key, tuple): section = self._getitem_axis(key, axis=i) + # we have yielded a scalar ? 
+ if not _is_list_like(section): + return section + # might have been a MultiIndex - if section.ndim == self.ndim: + elif section.ndim == self.ndim: new_key = tup[:i] + (_NS,) + tup[i + 1:] # new_key = tup[:i] + tup[i+1:] else: @@ -639,7 +643,7 @@ def _getbool_axis(self, key, axis=0): class _LocIndexer(_LocationIndexer): """ purely label based location based indexing """ - _valid_types = "labels (MUST BE INCLUSIVE), slices of labels, slices of integers if the index is integers, boolean" + _valid_types = "labels (MUST BE INCLUSIVE), slices of labels (BOTH endpoints included! Can be slices of integers if the index is integers), listlike of labels, boolean" _exception = KeyError def _has_valid_type(self, key, axis): @@ -652,10 +656,15 @@ def _has_valid_type(self, key, axis): if isinstance(key, slice): - if key.start is not None and key.start not in ax: - raise KeyError - if key.stop is not None and key.stop-1 not in ax: - raise KeyError + if key.start is not None: + if key.start not in ax: + raise KeyError("start bound [%s] is not the [%s]" % (key.start,self.obj._get_axis_name(axis))) + if key.stop is not None: + stop = key.stop + if com.is_integer(stop): + stop -= 1 + if stop not in ax: + raise KeyError("stop bound [%s] is not in the [%s]" % (stop,self.obj._get_axis_name(axis))) elif com._is_bool_indexer(key): return True @@ -665,7 +674,7 @@ def _has_valid_type(self, key, axis): # require all elements in the index idx = _ensure_index(key) if not idx.isin(ax).all(): - raise KeyError + raise KeyError("[%s] are not in ALL in the [%s]" % (key,self.obj._get_axis_name(axis))) return True @@ -673,10 +682,10 @@ def _has_valid_type(self, key, axis): # if its empty we want a KeyError here if not len(ax): - raise KeyError + raise KeyError("The [%s] axis is empty" % self.obj._get_axis_name(axis)) if not key in ax: - raise KeyError + raise KeyError("the label [%s] is not in the [%s]" % (key,self.obj._get_axis_name(axis))) return True @@ -684,6 +693,9 @@ def _getitem_axis(self, 
key, axis=0): labels = self.obj._get_axis(axis) if isinstance(key, slice): + ltype = labels.inferred_type + if ltype == 'mixed-integer-float' or ltype == 'mixed-integer': + raise ValueError('cannot slice with a non-single type label array') return self._get_slice_axis(key, axis=axis) elif com._is_bool_indexer(key): return self._getbool_axis(key, axis=axis) @@ -703,7 +715,7 @@ def _get_loc(self, key, axis=0): class _iLocIndexer(_LocationIndexer): """ purely integer based location based indexing """ - _valid_types = "integer, integer slice, listlike of integers, boolean array" + _valid_types = "integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array" _exception = IndexError def _has_valid_type(self, key, axis): @@ -763,7 +775,13 @@ def _convert_key(self, key): def __getitem__(self, key): if not isinstance(key, tuple): - raise ValueError('Invalid call for scalar access (getting)!') + + # we could have a convertible item here (e.g. Timestamp) + if not _is_list_like(key): + key = tuple([ key ]) + else: + raise ValueError('Invalid call for scalar access (getting)!') + if len(key) != self.obj.ndim: raise ValueError('Not enough indexers for scalar access (getting)!') key = self._convert_key(key) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 3968371f8b1c8..d8dd2e8c6f0d0 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1451,108 +1451,6 @@ def test_single_element_ix_dont_upcast(self): result = self.frame.ix[self.frame.index[5], 'E'] self.assert_(com.is_integer(result)) - def test_iloc_getitem(self): - df = DataFrame(np.random.randn(10, 4), index=range(0, 20, 2), columns=range(0,8,2)) - - result = df.iloc[2] - exp = df.ix[4] - assert_series_equal(result, exp) - - result = df.iloc[2,2] - exp = df.ix[4,4] - self.assert_(result == exp) - - # slice - result = df.iloc[4:8] - expected = df.ix[8:14] - assert_frame_equal(result, expected) - - result = df.iloc[:,2:3] - expected 
= df.ix[:,4:5] - assert_frame_equal(result, expected) - - # list of integers - result = df.iloc[[0,1,3]] - expected = df.ix[[0,2,6]] - assert_frame_equal(result, expected) - - result = df.iloc[[0,1,3],[0,1]] - expected = df.ix[[0,2,6],[0,2]] - assert_frame_equal(result, expected) - - # neg indicies - result = df.iloc[[-1,1,3],[-1,1]] - expected = df.ix[[18,2,6],[6,2]] - assert_frame_equal(result, expected) - - # dups indicies - result = df.iloc[[-1,-1,1,3],[-1,1]] - expected = df.ix[[18,18,2,6],[6,2]] - assert_frame_equal(result, expected) - - # with index-like - s = Series(index=range(1,5)) - result = df.iloc[s.index] - expected = df.ix[[2,4,6,8]] - assert_frame_equal(result, expected) - - # out-of-bounds slice - self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(None),slice(1,5,None)])) - self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(None),slice(-5,3,None)])) - self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(1,11,None)])) - self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(-11,3,None)])) - - # try with labelled frame - df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), columns=list('ABCD')) - - result = df.iloc[1,1] - exp = df.ix['b','B'] - self.assert_(result == exp) - - result = df.iloc[:,2:3] - expected = df.ix[:,['C']] - assert_frame_equal(result, expected) - - # negative indexing - result = df.iloc[-1,-1] - exp = df.ix['j','D'] - self.assert_(result == exp) - - # out-of-bounds exception - self.assertRaises(IndexError, df.iloc.__getitem__, tuple([10,5])) - - # trying to use a label - self.assertRaises(ValueError, df.iloc.__getitem__, tuple(['j','D'])) - - def test_iloc_setitem(self): - df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), columns=list('ABCD')) - - df.iloc[1,1] = 1 - result = df.iloc[1,1] - self.assert_(result == 1) - - df.iloc[:,2:3] = 0 - expected = df.iloc[:,2:3] - result = df.iloc[:,2:3] - assert_frame_equal(result, expected) - - def 
test_iloc_multiindex(self): - df = DataFrame(np.random.randn(3, 3), - columns=[[2,2,4],[6,8,10]], - index=[[4,4,8],[8,10,12]]) - - rs = df.iloc[2] - xp = df.irow(2) - assert_series_equal(rs, xp) - - rs = df.iloc[:,2] - xp = df.icol(2) - assert_series_equal(rs, xp) - - rs = df.iloc[2,2] - xp = df.values[2,2] - self.assert_(rs == xp) - def test_irow(self): df = DataFrame(np.random.randn(10, 4), index=range(0, 20, 2)) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 105a12dd0dba6..c219c6fef196f 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -18,10 +18,10 @@ import pandas.util.testing as tm import pandas.lib as lib - +from pandas import date_range from numpy.testing.decorators import slow -_verbose = True +_verbose = False #------------------------------------------------------------------------------- # Indexing test cases @@ -93,8 +93,6 @@ def setUp(self): import warnings warnings.filterwarnings(action='ignore', category=FutureWarning) - from pandas import date_range - self.series_ints = Series(np.random.rand(4), index=range(0,8,2)) self.frame_ints = DataFrame(np.random.randn(4, 4), index=range(0, 8, 2), columns=range(0,12,3)) self.panel_ints = Panel(np.random.rand(4,4,4), items=range(0,8,2),major_axis=range(0,12,3),minor_axis=range(0,16,4)) @@ -157,11 +155,11 @@ def _eq(t, o, a, obj, k1, k2): if a is not None and a > obj.ndim-1: return - def _print(result, show = True,error = None): + def _print(result, error = None): if error is not None: error = str(error) v = "%-16.16s [%-16.16s]: [typ->%-8.8s,obj->%-8.8s,key1->(%-4.4s),key2->(%-4.4s),axis->%s] %s" % (name,result,t,o,method1,method2,a,error or '') - if show: + if _verbose: print(v) try: @@ -200,8 +198,7 @@ def _print(result, show = True,error = None): if not result.startswith('ok'): raise AssertionError(_print(result)) - if _verbose: - _print(result) + _print(result) except (AssertionError): raise @@ -303,7 +300,21 @@ def _check(f, func, values = 
False): _check(d['labels'],'at') _check(d['ts'], 'at') _check(d['floats'],'at') + + def test_at_timestamp(self): + # as timestamp is not a tuple! + dates = date_range('1/1/2000', periods=8) + df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) + s = df['A'] + + result = s.at[dates[5]] + xp = s.values[5] + self.assert_(result == xp) + + def test_iat_invalid_args(self): + pass + def test_iloc_getitem_int(self): # integer @@ -457,17 +468,33 @@ def test_loc_getitem_label_slice(self): self.check_result('lab slice', 'loc', slice('W','Z'), 'ix', slice('W','Z'), typs = ['labels'], axes=2) self.check_result('ts slice', 'loc', slice('20130102','20130104'), 'ix', slice('20130102','20130104'), typs = ['ts'], axes=0) - self.check_result('ts slice', 'loc', slice('20130102','20130104'), 'ix', slice('20130102','20130104'), typs = ['ts'], axes=1) - self.check_result('ts slice', 'loc', slice('20130102','20130104'), 'ix', slice('20130102','20130104'), typs = ['ts'], axes=2) + self.check_result('ts slice', 'loc', slice('20130102','20130104'), 'ix', slice('20130102','20130104'), typs = ['ts'], axes=1, fails=KeyError) + self.check_result('ts slice', 'loc', slice('20130102','20130104'), 'ix', slice('20130102','20130104'), typs = ['ts'], axes=2, fails=KeyError) + + self.check_result('mixed slice', 'loc', slice(2,8), 'ix', slice(2,8), typs = ['mixed'], axes=0, fails=KeyError) + self.check_result('mixed slice', 'loc', slice(2,8), 'ix', slice(2,8), typs = ['mixed'], axes=1, fails=KeyError) + self.check_result('mixed slice', 'loc', slice(2,8), 'ix', slice(2,8), typs = ['mixed'], axes=2, fails=KeyError) - self.check_result('ts slice', 'loc', slice(2,6), 'ix', slice(2,6), typs = ['mixed'], axes=0) - self.check_result('ts slice', 'loc', slice(2,6), 'ix', slice(2,6), typs = ['mixed'], axes=1) - self.check_result('ts slice', 'loc', slice(2,6), 'ix', slice(2,6), typs = ['mixed'], axes=2) + # you would think this would work, but we don't have an ordering, so fail + 
self.check_result('mixed slice', 'loc', slice(2,5,2), 'ix', slice(2,4,2), typs = ['mixed'], axes=0, fails=ValueError) - def test_loc_setitem(self): + def test_loc_general(self): + + # GH 2922 (these are fails) + df = DataFrame(np.random.rand(4,4),columns=['A','B','C','D']) + self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(0,2),slice(0,2)])) + + df = DataFrame(np.random.rand(4,4),columns=['A','B','C','D'], index=['A','B','C','D']) + self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(0,2),df.columns[0:2]])) + + # want this to work + result = df.loc[:,"A":"B"].iloc[0:2,:] + self.assert_((result.columns == ['A','B']).all() == True) + self.assert_((result.index == ['A','B']).all() == True) + + def test_loc_setitem_frame(self): df = self.frame_labels - import pdb; pdb.set_trace() result = df.iloc[0,0] df.loc['a','A'] = 1 @@ -479,9 +506,126 @@ def test_loc_setitem(self): df.loc[:,'B':'D'] = 0 expected = df.loc[:,'B':'D'] - result = df.ix[:,2:3] + result = df.ix[:,1:] + assert_frame_equal(result, expected) + + def test_iloc_getitem_frame(self): + """ originally from test_frame.py""" + df = DataFrame(np.random.randn(10, 4), index=range(0, 20, 2), columns=range(0,8,2)) + + result = df.iloc[2] + exp = df.ix[4] + assert_series_equal(result, exp) + + result = df.iloc[2,2] + exp = df.ix[4,4] + self.assert_(result == exp) + + # slice + result = df.iloc[4:8] + expected = df.ix[8:14] assert_frame_equal(result, expected) + result = df.iloc[:,2:3] + expected = df.ix[:,4:5] + assert_frame_equal(result, expected) + + # list of integers + result = df.iloc[[0,1,3]] + expected = df.ix[[0,2,6]] + assert_frame_equal(result, expected) + + result = df.iloc[[0,1,3],[0,1]] + expected = df.ix[[0,2,6],[0,2]] + assert_frame_equal(result, expected) + + # neg indicies + result = df.iloc[[-1,1,3],[-1,1]] + expected = df.ix[[18,2,6],[6,2]] + assert_frame_equal(result, expected) + + # dups indicies + result = df.iloc[[-1,-1,1,3],[-1,1]] + expected = df.ix[[18,18,2,6],[6,2]] + 
assert_frame_equal(result, expected) + + # with index-like + s = Series(index=range(1,5)) + result = df.iloc[s.index] + expected = df.ix[[2,4,6,8]] + assert_frame_equal(result, expected) + + # out-of-bounds slice + self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(None),slice(1,5,None)])) + self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(None),slice(-5,3,None)])) + self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(1,11,None)])) + self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(-11,3,None)])) + + # try with labelled frame + df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), columns=list('ABCD')) + + result = df.iloc[1,1] + exp = df.ix['b','B'] + self.assert_(result == exp) + + result = df.iloc[:,2:3] + expected = df.ix[:,['C']] + assert_frame_equal(result, expected) + + # negative indexing + result = df.iloc[-1,-1] + exp = df.ix['j','D'] + self.assert_(result == exp) + + # out-of-bounds exception + self.assertRaises(IndexError, df.iloc.__getitem__, tuple([10,5])) + + # trying to use a label + self.assertRaises(ValueError, df.iloc.__getitem__, tuple(['j','D'])) + + def test_iloc_setitem_series(self): + """ originally from test_series.py """ + df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), columns=list('ABCD')) + + df.iloc[1,1] = 1 + result = df.iloc[1,1] + self.assert_(result == 1) + + df.iloc[:,2:3] = 0 + expected = df.iloc[:,2:3] + result = df.iloc[:,2:3] + assert_frame_equal(result, expected) + + def test_iloc_setitem_series(self): + s = Series(np.random.randn(10), index=range(0,20,2)) + + s.iloc[1] = 1 + result = s.iloc[1] + self.assert_(result == 1) + + s.iloc[:4] = 0 + expected = s.iloc[:4] + result = s.iloc[:4] + assert_series_equal(result, expected) + + def test_iloc_multiindex(self): + df = DataFrame(np.random.randn(3, 3), + columns=[[2,2,4],[6,8,10]], + index=[[4,4,8],[8,10,12]]) + + rs = df.iloc[2] + xp = df.irow(2) + assert_series_equal(rs, xp) + + rs = 
df.iloc[:,2] + xp = df.icol(2) + assert_series_equal(rs, xp) + + rs = df.iloc[2,2] + xp = df.values[2,2] + self.assert_(rs == xp) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index eab956fb5feb2..ee288fda120d3 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1000,40 +1000,6 @@ def test_basic_setitem_with_labels(self): self.assertRaises(Exception, s.__setitem__, inds_notfound, 0) self.assertRaises(Exception, s.__setitem__, arr_inds_notfound, 0) - def test_iloc_getitem(self): - s = Series(np.random.randn(10), index=list('abcdefghij')) - - result = s.iloc[1] - exp = s.ix['b'] - self.assert_(result == exp) - - result = s.iloc[2:4] - expected = s.ix['c':'d'] - assert_series_equal(result, expected) - - # negative indexing - result = s.iloc[-1] - exp = s.ix['j'] - self.assert_(result == exp) - - # out-of-bounds exception - self.assertRaises(IndexError, s.iloc.__getitem__, tuple([12])) - - # trying to use a label - self.assertRaises(ValueError, s.iloc.__getitem__, tuple(['j'])) - - def test_iloc_setitem(self): - s = Series(np.random.randn(10), index=range(0,20,2)) - - s.iloc[1] = 1 - result = s.iloc[1] - self.assert_(result == 1) - - s.iloc[:4] = 0 - expected = s.iloc[:4] - result = s.iloc[:4] - assert_series_equal(result, expected) - def test_ix_getitem(self): inds = self.series.index[[3, 4, 7]] assert_series_equal(self.series.ix[inds], self.series.reindex(inds)) From fbf197738d0e2d90d69da9a9c636c5d53a5fc4f4 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 2 Mar 2013 16:41:31 -0500 Subject: [PATCH 5/7] DOC: added 10min newbie intro to pandas changes in indexing suggested by Jan Schulz, and nehalecky DOC: added plotting,reshaping, more examples in setting to 10min.rst DOC: more doc updates, added more examples in selection added join to 10min DOC: release notes and whatsnew updates for 10min --- 
RELEASE.rst | 1 + doc/source/10min.rst | 540 ++++++++++++++++++++++++++++++++++++++++ doc/source/index.rst | 1 + doc/source/indexing.rst | 112 +++++---- doc/source/io.rst | 10 +- doc/source/v0.10.0.txt | 2 +- doc/source/v0.11.0.txt | 45 ++-- pandas/core/indexing.py | 2 +- 8 files changed, 650 insertions(+), 63 deletions(-) create mode 100644 doc/source/10min.rst diff --git a/RELEASE.rst b/RELEASE.rst index e4e7087772d3a..3ed6d71ce9273 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -29,6 +29,7 @@ pandas 0.11.0 **New features** + - New documentation section, ``10 Minutes to Pandas`` - Allow mixed dtypes (e.g ``float32/float64/int32/int16/int8``) to coexist in DataFrames and propogate in operations - Add function to pandas.io.data for retrieving stock index components from diff --git a/doc/source/10min.rst b/doc/source/10min.rst new file mode 100644 index 0000000000000..d73ff600e1450 --- /dev/null +++ b/doc/source/10min.rst @@ -0,0 +1,540 @@ +.. _10min: + +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import numpy as np + import random + np.random.seed(123456) + from pandas import * + import pandas as pd + randn = np.random.randn + randint = np.random.randint + np.set_printoptions(precision=4, suppress=True) + + #### portions of this were borrowed from the + #### Pandas cheatsheet + #### created during the PyData Workshop-Sprint 2012 + #### Hannah Chen, Henry Chow, Eric Cox, Robert Mauriello + + +******************** +10 Minutes to Pandas +******************** + +This is a short introduction to pandas, geared mainly for new users. + +Customarily, we import as follows + +.. ipython:: python + + import pandas as pd + import numpy as np + +Object Creation +--------------- + +See the :ref:`Data Structure Intro section ` + +Creating a ``Series`` by passing a list of values, letting pandas create a default +integer index + +.. 
ipython:: python + + s = pd.Series([1,3,5,np.nan,6,8]) + s + +Creating a ``DataFrame`` by passing a numpy array, with a datetime index and labeled columns. + +.. ipython:: python + + dates = pd.date_range('20130101',periods=6) + dates + df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD')) + df + +Creating a ``DataFrame`` by passing a dict of objects that can be converted to series-like. + +.. ipython:: python + + df2 = pd.DataFrame({ 'A' : 1., + 'B' : pd.Timestamp('20130102'), + 'C' : pd.Series(1,index=range(4),dtype='float32'), + 'D' : np.array([3] * 4,dtype='int32'), + 'E' : 'foo' }) + df2 + +Having specific dtypes + +.. ipython:: python + + df2.dtypes + +Viewing Data +------------ + +See the :ref:`Basics section ` + +See the top & bottom rows of the frame + +.. ipython:: python + + df.head() + df.tail() + +Display the index,columns, and the underlying numpy data + +.. ipython:: python + + df.index + df.columns + df.values + +Describe shows a quick statistic summary of your data + +.. ipython:: python + + df.describe() + +Selection +--------- + +See the :ref:`Indexing section ` + + +Getting +~~~~~~~ + +Selecting a single column, which yields a ``Series`` + +.. ipython:: python + + df['A'] + +Selecting via ``[]``, which slices the rows. + +.. ipython:: python + + df[0:3] + df['20130102':'20130104'] + +Selection by Label +~~~~~~~~~~~~~~~~~~ + +For getting a cross section using a label + +.. ipython:: python + + df.loc[dates[0]] + +Selecting on a multi-axis by label + +.. ipython:: python + + df.loc[:,['A','B']] + +Showing label slicing, both endpoints are *included* + +.. ipython:: python + + df.loc['20130102':'20130104',['A','B']] + +Reduction in the dimensions of the returned object + +.. ipython:: python + + df.loc['20130102',['A','B']] + +For getting a scalar value + +.. ipython:: python + + df.loc[dates[0],'A'] + +For getting fast access to a scalar (equiv to the prior method) + +.. 
ipython:: python + + df.at[dates[0],'A'] + +Selection by Position +~~~~~~~~~~~~~~~~~~~~~ + +Select via the position of the passed integers + +.. ipython:: python + + # this is a cross-section of the object + df.iloc[3] + +By integer slices, acting similar to numpy/python + +.. ipython:: python + + df.iloc[3:5,0:2] + +By lists of integer position locations, similar to the numpy/python style + +.. ipython:: python + + df.iloc[[1,2,4],[0,2]] + +For slicing rows explicitly + +.. ipython:: python + + df.iloc[1:3,:] + +For slicing columns explicitly + +.. ipython:: python + + df.iloc[:,1:3] + +For getting a value explicitly + +.. ipython:: python + + df.iloc[1,1] + +For getting fast access to a scalar (equiv to the prior method) + +.. ipython:: python + + df.iat[1,1] + +There is one significant departure from standard python/numpy slicing semantics. +python/numpy allow slicing past the end of an array without an associated error. + +.. ipython:: python + + # these are allowed in python/numpy. + x = list('abcdef') + x[4:10] + x[8:10] + +Pandas will detect this and raise ``IndexError``, rather than return an empty structure. + +:: + + >>> df.iloc[:,3:6] + IndexError: out-of-bounds on slice (end) + +Boolean Indexing +~~~~~~~~~~~~~~~~ + +Using a single column's values to select data. + +.. ipython:: python + + df[df.A > 0] + +A ``where`` operation. + +.. ipython:: python + + df[df > 0] + + +Setting +~~~~~~~ + +Setting a new column automatically aligns the data +by the indexes + +.. ipython:: python + + s1 = pd.Series([1,2,3,4,5,6],index=date_range('20130102',periods=6)) + s1 + df['F'] = s1 + +Setting values by label + +.. ipython:: python + + df.at[dates[0],'A'] = 0 + +Setting values by position + +.. ipython:: python + + df.iat[0,1] = 0 + +Setting by assigning with a numpy array + +.. ipython:: python + + df.loc[:,'D'] = np.array([5] * len(df)) + df + +Missing Data +------------ + +Pandas primarily uses the value ``np.nan`` to represent missing data.
It +is by default not included in computations. See the :ref:`Missing Data section ` + +Reindexing allows you to change/add/delete the index on a specified axis. This +returns a copy of the data. + +.. ipython:: python + + df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + ['E']) + df1.loc[dates[0]:dates[1],'E'] = 1 + df1 + +To drop any rows that have missing data. + +.. ipython:: python + + df1.dropna(how='any') + +Filling missing data + +.. ipython:: python + + df1.fillna(value=5) + + +Operations +---------- + +See the :ref:`Basic section on Binary Ops ` + +Stats +~~~~~ + +Performing a descriptive statistic + +.. ipython:: python + + df.mean() + +Same operation on the other axis + +.. ipython:: python + + df.mean(1) + +Operations on missing data, exclude the data + +.. ipython:: python + + df1.mean() + +Apply +~~~~~ + +Applying functions to the data + +.. ipython:: python + + df.apply(np.cumsum) + df.apply(lambda x: x.max() - x.min()) + +Merge +----- + +Concat +~~~~~~ + +Pandas provides various facilities for easily combining together Series, +DataFrame, and Panel objects with various kinds of set logic for the indexes +and relational algebra functionality in the case of join / merge-type +operations. + +See the :ref:`Merging section ` + +Concatenating pandas objects together + +.. ipython:: python + + df = pd.DataFrame(np.random.randn(10, 4)) + df + + # break it into pieces + pieces = [df[:3], df[3:7], df[7:]] + + concat(pieces) + +Join +~~~~ + +SQL style merges. See the :ref:`Database style joining ` + +.. ipython:: python + + left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]}) + right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]}) + left + right + merge(left, right, on='key') + +Append +~~~~~~ + +Append rows to a dataframe. See the :ref:`Appending ` + +.. 
ipython:: python + + df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D']) + df + s = df.iloc[3] + df.append(s, ignore_index=True) + df + + +Grouping +-------- + +By "group by" we are referring to a process involving one or more of the following +steps + + - **Splitting** the data into groups based on some criteria + - **Applying** a function to each group independently + - **Combining** the results into a data structure + +See the :ref:`Grouping section ` + +.. ipython:: python + + df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B' : ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C' : randn(8), 'D' : randn(8)}) + df + +Grouping and then applying a function ``sum`` to the resulting groups. + +.. ipython:: python + + df.groupby('A').sum() + +Grouping by multiple columns forms a hierarchical index, to which we then apply the function. + +.. ipython:: python + + df.groupby(['A','B']).sum() + +Reshaping +--------- + +See the section on :ref:`Hierarchical Indexing ` and +see the section on :ref:`Reshaping `. + +.. ipython:: python + + tuples = zip(*[['bar', 'bar', 'baz', 'baz', + 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', + 'one', 'two', 'one', 'two']]) + index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) + df = pd.DataFrame(randn(8, 2), index=index, columns=['A', 'B']) + df2 = df[:4] + df2 + +The ``stack`` function "compresses" a level in the DataFrame's columns. + +.. ipython:: python + + stacked = df2.stack() + stacked + +With a "stacked" DataFrame or Series (having a ``MultiIndex`` as the +``index``), the inverse operation of ``stack`` is ``unstack``, which by default +unstacks the **last level**: + +..
ipython:: python + + stacked.unstack() + stacked.unstack(1) + stacked.unstack(0) + +Time Series +----------- + +Pandas has simple, powerful, and efficient functionality for +performing resampling operations during frequency conversion (e.g., converting +secondly data into 5-minutely data). This is extremely common in, but not +limited to, financial applications. See the :ref:`Time Series section ` + +.. ipython:: python + + rng = pd.date_range('1/1/2012', periods=100, freq='S') + ts = pd.Series(randint(0, 500, len(rng)), index=rng) + ts.resample('5Min', how='sum') + +Plotting +-------- + +.. ipython:: python + :suppress: + + import matplotlib.pyplot as plt + plt.close('all') + +.. ipython:: python + + ts = pd.Series(randn(1000), index=pd.date_range('1/1/2000', periods=1000)) + ts = ts.cumsum() + + @savefig series_plot_basic.png width=4.5in + ts.plot() + +On DataFrame, ``plot`` is a convenience to plot all of the columns with labels: + +.. ipython:: python + + df = pd.DataFrame(randn(1000, 4), index=ts.index, + columns=['A', 'B', 'C', 'D']) + df = df.cumsum() + + @savefig frame_plot_basic.png width=4.5in + plt.figure(); df.plot(); plt.legend(loc='best') + +Getting Data In/Out +------------------- + +CSV +~~~ + +:ref:`Writing to a csv file ` + +.. ipython:: python + + df.to_csv('foo.csv') + +:ref:`Reading from a csv file ` + +.. ipython:: python + + pd.read_csv('foo.csv') + +HDF5 +~~~~ + +Reading and writing to :ref:`HDFStores ` + +Writing to a HDF5 Store + +.. ipython:: python + + store = pd.HDFStore('foo.h5') + store['df'] = df + +Reading from a HDF5 Store + +.. ipython:: python + + store['df'] + +.. ipython:: python + :suppress: + :okexcept: + + store.close() + os.remove('foo.h5') + os.remove('foo.csv') + diff --git a/doc/source/index.rst b/doc/source/index.rst index bc51f1b13f36e..d59cb6d7a816b 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -112,6 +112,7 @@ See the package overview for more detail about what's in the library. 
install faq overview + 10min dsintro basics indexing diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 75fa4803fe1f6..02aa00b7eaca6 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -14,9 +14,9 @@ randint = np.random.randint np.set_printoptions(precision=4, suppress=True) -*************************** -Indexing and selecting data -*************************** +************** +Selecting Data +************** The axis labeling information in pandas objects serves many purposes: @@ -44,14 +44,14 @@ three types of multi-axis indexing. - A single label, e.g. ``5`` or ``'a'`` - (note that ``5`` when used as a *label* of an integer based index) + (note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index) - A list or array of labels ``['a', 'b', 'c']`` - A slice object with labels ``'a':'f'`` (note that contrary to usual python slices, **both** the start and the stop are included!) - A boolean array - See more at :ref:`Label indexing ` + See more at :ref:`Selection by Label ` - ``.iloc`` is strictly integer position based (from 0 to length-1 of the axis), will raise ``IndexError`` when the requested indicies are out of bounds. Allowed inputs are: @@ -61,16 +61,19 @@ three types of multi-axis indexing. - A slice object with ints ``1:7`` - A boolean array - See more at :ref:`Integer indexing ` + See more at :ref:`Selection by Position ` - ``.ix`` supports mixed integer and label based access. It is primarily label based, but will fallback to integer positional access. ``.ix`` is the most general and will support - any of the inputsx to ``.loc`` and ``.iloc``, as well as support for floating point label schemes. + any of the inputs to ``.loc`` and ``.iloc``, as well as support for floating point label schemes. 
+ As integer slices with ``.ix`` behave differently depending on whether the slice is interpreted as position based or label based, it's usually better to be explicit and use ``.iloc`` (integer location) or ``.loc`` (label location). - See more at :ref:`Advanced indexing ` + ``.ix`` is especially useful when dealing with mixed positional and label based hierarchical indexes. + + See more at :ref:`Advanced Indexing ` and :ref:`Advanced Hierarchical ` Getting values from object with multi-axes uses the following notation (using ``.loc`` as an example, but applies to ``.iloc`` and ``.ix`` as well) Any of the axes accessors may be the null @@ -86,23 +89,28 @@ slice ``:``. Axes left out of the specification are assumed to be ``:``. DataFrame; ``df.loc[row_indexer,column_indexer]`` Panel; ``p.loc[item_indexer,major_indexer,minor_indexer]`` -Indexing Deprecations -~~~~~~~~~~~~~~~~~~~~~ +Deprecations +~~~~~~~~~~~~ -Starting in version 0.11.0, the methods may be deprecated in future versions. +Starting in version 0.11.0, these methods may be deprecated in future versions. - ``irow`` - ``icol`` - ``iget_value`` -See the section :ref:`Integer indexing ` for substitutes. +See the section :ref:`Selection by Position ` for substitutes. .. _indexing.xs: -Cross-sectional slices on non-hierarchical indices are now easily -performed using ``.loc`` and/or ``.loc``. The methods ``xs`` (for DataFrame), -``minor_xs`` and ``major_xs`` (for Panel), exist primarily for backward -compatibility. +Cross-sectional slices on non-hierarchical indices are now easily performed using +``.loc`` and/or ``.iloc``. The methods: + + - ``xs`` (for DataFrame), + - ``minor_xs`` and ``major_xs`` (for Panel) + +now exist primarily for backward compatibility. + +See the section at :ref:`Selection by Label ` for substitutes. ..
_indexing.basics: @@ -114,11 +122,14 @@ As mentioned when introducing the data structures in the :ref:`last section for those familiar with implementing class behavior in Python) is selecting out lower-dimensional slices. Thus, - - **Series**: ``series[label]`` returns a scalar value - - **DataFrame**: ``frame[colname]`` returns a Series corresponding to the - passed column name - - **Panel**: ``panel[itemname]`` returns a DataFrame corresponding to the - passed item name +.. csv-table:: + :header: "Object Type", "Selection", "Return Value Type" + :widths: 30, 30, 60 + :delim: ; + + Series; ``series[label]``; scalar value + DataFrame; ``frame[colname]``; ``Series`` corresponding to colname + Panel; ``panel[itemname]``; ``DataFrame`` corresponding to the itemname Here we construct a simple time series data set to use for illustrating the indexing functionality: @@ -144,21 +155,22 @@ Thus, as per above, we have the most basic indexing using ``[]``: s[dates[5]] panel['two'] -Additional Column Access -~~~~~~~~~~~~~~~~~~~~~~~~ +Attribute Access +~~~~~~~~~~~~~~~~ .. _indexing.columns.multiple: .. _indexing.df_cols: -You may access a column on a dataframe directly as an attribute: +You may access a column on a ``DataFrame``, and an item on a ``Panel`` directly as an attribute: .. ipython:: python df.A + panel.one If you are using the IPython environment, you may also use tab-completion to -see the accessible columns of a DataFrame. +see these accessible attributes. You can pass a list of columns to ``[]`` to select columns in that order: If a column is not contained in the DataFrame, an exception will be @@ -177,7 +189,7 @@ Slicing ranges ~~~~~~~~~~~~~~ The most robust and consistent way of slicing ranges along arbitrary axes is -described in the :ref:`Integer indexing ` section detailing +described in the :ref:`Selection by Position ` section detailing the ``.iloc`` method. For now, we explain the semantics of slicing using the ``[]`` operator.
@@ -208,8 +220,8 @@ largely as a convenience since it is such a common operation. .. _indexing.label: -Label Based Indexing -~~~~~~~~~~~~~~~~~~~~ +Selection By Label +~~~~~~~~~~~~~~~~~~ Pandas provides a suite of methods in order to have **purely label based indexing**. This is a strict inclusion based protocol. **ALL** of the labels for which you ask, @@ -224,7 +236,7 @@ The following are valid inputs: - A single label, e.g. ``5`` or ``'a'`` - (note that ``5`` when used as a *label* of an integer based index) + (note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index) - A list or array of labels ``['a', 'b', 'c']`` - A slice object with labels ``'a':'f'`` @@ -253,7 +265,10 @@ With a DataFrame df1 df1.loc[['a','b','d'],:] - # slices +Accessing via label slices + +.. ipython:: python + df1.loc['d':,'A':'C'] For getting a cross section using a label (equiv to deprecated ``df.xs('a')``) @@ -262,7 +277,10 @@ For getting a cross section using a label (equiv to deprecated ``df.xs('a')``) df1.loc['a'] - # boolean +For getting values with a boolean array + +.. ipython:: python + df1.loc['a']>0 df1.loc[:,df1.loc['a']>0] @@ -275,8 +293,8 @@ For getting a value explicity (equiv to deprecated ``df.get_value('a','A')``) .. _indexing.integer: -Integer Based Indexing -~~~~~~~~~~~~~~~~~~~~~~ +Selection By Position +~~~~~~~~~~~~~~~~~~~~~ Pandas provides a suite of methods in order to get **purely integer based indexing**. The semantics follow closely python and numpy slicing. These are ``0-based`` indexing. @@ -314,17 +332,23 @@ With a DataFrame df1 = DataFrame(np.random.randn(6,4),index=range(0,12,2),columns=range(0,8,2)) df1 - # integer access - df1.iloc[5,2] +Select via integer slicing + +.. ipython:: python - # slices df1.iloc[:3] df1.iloc[1:5,2:4] - # integer lists +Select via integer list + +.. ipython:: python + df1.iloc[[1,3,5],[1,3]] - # boolean +Select via boolean array + +.. 
ipython:: python + df1.iloc[:,df1.iloc[0]>0] For slicing rows explicitly (equiv to deprecated ``df.irow(slice(1,3))``). @@ -339,7 +363,7 @@ For slicing columns explicitly (equiv to deprecated ``df.icol(slice(1,3))``). df1.iloc[:,1:3] -For getting a value explicity (equiv to deprecated ``df.get_value(1,1)``) +For getting a scalar via integer position (equiv to deprecated ``df.get_value(1,1)``) .. ipython:: python @@ -449,8 +473,8 @@ more complex criteria: df2[criterion & (df2['b'] == 'x')] -Note, with the choice methods :ref:`Label indexing `, :ref:`Integer indexing `, -and :ref:`Advanced indexing ` may select along more than one axis using boolean vectors combined with other +Note, with the choice methods :ref:`Selection by Label `, :ref:`Selection by Position `, +and :ref:`Advanced Indexing ` may select along more than one axis using boolean vectors combined with other indexing expressions. Where and Masking @@ -645,8 +669,8 @@ Advanced Indexing with ``.ix`` to use any passed *integer* as *label* locations first (like what ``.loc`` would do, then to fall back on *positional* indexing, like what ``.iloc`` would do). -The syntax of using ``.ix`` is identical to ``.loc``, in :ref:`Label indexing `, -and ``.iloc`` in :ref:`Integer indexing `. +The syntax of using ``.ix`` is identical to ``.loc``, in :ref:`Selection by Label `, +and ``.iloc`` in :ref:`Selection by Position `. The ``.ix`` attribute takes the following inputs: @@ -980,6 +1004,8 @@ of tuples: s.reindex(index[:3]) s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')]) +.. _indexing.advanced_hierarchical: + Advanced indexing with hierarchical index ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/io.rst b/doc/source/io.rst index 86d590965f141..e2b66d3d50b9b 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -970,7 +970,7 @@ one can use the ExcelWriter class, as in the following example: df2.to_excel(writer, sheet_name='sheet2') writer.save() -.. _io-hdf5: +.. 
_io.hdf5: HDF5 (PyTables) --------------- @@ -1058,6 +1058,7 @@ These stores are **not** appendable once written (though you can simply remove them and rewrite). Nor are they **queryable**; they must be retrieved in their entirety. +.. _io.hdf5-table: Storing in Table format ~~~~~~~~~~~~~~~~~~~~~~~ @@ -1091,6 +1092,8 @@ supported. # the type of stored data store.root.df._v_attrs.pandas_type +.. _io.hdf5-keys: + Hierarchical Keys ~~~~~~~~~~~~~~~~~ @@ -1115,6 +1118,8 @@ everying in the sub-store and BELOW, so be *careful*. store.remove('food') store +.. _io.hdf5-types: + Storing Mixed Types in a Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1170,6 +1175,8 @@ storing/selecting from homogeneous index DataFrames. store.select('df_mi', Term('foo=bar')) +.. _io.hdf5-query: + Querying a Table ~~~~~~~~~~~~~~~~ @@ -1372,6 +1379,7 @@ table (optional) to let it have the remaining columns. The argument store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'], selector = 'df1_mt') +.. _io.hdf5-delete: Delete from a Table ~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.10.0.txt b/doc/source/v0.10.0.txt index c220d2cbba81d..0c5497868efe2 100644 --- a/doc/source/v0.10.0.txt +++ b/doc/source/v0.10.0.txt @@ -217,7 +217,7 @@ The width of each line can be changed via 'line_width' (80 by default): Updated PyTables Support ~~~~~~~~~~~~~~~~~~~~~~~~ -:ref:`Docs ` for PyTables ``Table`` format & several enhancements to the api. Here is a taste of what to expect. +:ref:`Docs ` for PyTables ``Table`` format & several enhancements to the api. Here is a taste of what to expect. .. ipython:: python :suppress: diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt index 8ea49d1074310..f4c9d13c0d23e 100644 --- a/doc/source/v0.11.0.txt +++ b/doc/source/v0.11.0.txt @@ -4,15 +4,19 @@ v0.11.0 (March ??, 2013) ------------------------ This is a major release from 0.10.1 and includes many new features and -enhancements along with a large number of bug fixes. 
There are also a number of -important API changes that long-time pandas users should pay close attention -to. +enhancements along with a large number of bug fixes. The methods of Selecting +Data have had quite a number of additions, and Dtype support is now full-fledged. +There are also a number of important API changes that long-time pandas users should +pay close attention to. + +There is a new section in the documentation, :ref:`10 Minutes to Pandas <10min>`, +primarily geared to new users. API changes ~~~~~~~~~~~ -Indexing Choice -~~~~~~~~~~~~~~~ +Selection Choices +~~~~~~~~~~~~~~~~~ Starting in 0.11.0, object selection has had a number of user-requested additions in order to support more explicit location based indexing. Pandas now supports @@ -23,14 +27,14 @@ three types of multi-axis indexing. - A single label, e.g. ``5`` or ``'a'`` - (note that ``5`` when used as a *label* of an integer based index) + (note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index) - A list or array of labels ``['a', 'b', 'c']`` - A slice object with labels ``'a':'f'`` (note that contrary to usual python slices, **both** the start and the stop are included!) - A boolean array - See more at :ref:`Label indexing ` + See more at :ref:`Selection by Label ` - ``.iloc`` is strictly integer position based (from 0 to length-1 of the axis), will raise ``IndexError`` when the requested indicies are out of bounds. Allowed inputs are: @@ -40,20 +44,23 @@ three types of multi-axis indexing. - A slice object with ints ``1:7`` - A boolean array - See more at :ref:`Integer indexing ` + See more at :ref:`Selection by Position ` - ``.ix`` supports mixed integer and label based access. It is primarily label based, but will fallback to integer positional access. ``.ix`` is the most general and will support - any of the inputsx to ``.loc`` and ``.iloc``, as well as support for floating point label schemes. 
+ any of the inputs to ``.loc`` and ``.iloc``, as well as support for floating point label schemes. + As using integer slices with ``.ix`` have different behavior depending on whether the slice is interpreted as integer location based or label position based, it's usually better to be explicit and use ``.iloc`` (integer location) or ``.loc`` (label location). - See more at :ref:`Advanced indexing ` + ``.ix`` is especially useful when dealing with mixed positional/label based hierarchical indexes. + + See more at :ref:`Advanced Indexing ` and :ref:`Advanced Hierarchical ` -Indexing Deprecations -~~~~~~~~~~~~~~~~~~~~~ +Selection Deprecations +~~~~~~~~~~~~~~~~~~~~~~ Starting in version 0.11.0, the methods may be deprecated in future versions. @@ -61,13 +68,17 @@ Starting in version 0.11.0, the methods may be deprecated in future versions. - ``icol`` - ``iget_value`` -See the section :ref:`Integer indexing ` for substitutes. +See the section :ref:`Selection by Position ` for substitutes. + +Cross-sectional slices on non-hierarchical indices are now easily performed using +``.loc`` and/or ``.loc``. The methods: + + - ``xs`` (for DataFrame), + - ``minor_xs`` and ``major_xs`` (for Panel) -Cross-sectional slices on non-hierarchical indices are now easily -performed using ``.loc`` and/or ``.loc``. The methods ``xs`` (for DataFrame), -``minor_xs`` and ``major_xs`` (for Panel), exist primarily for backward -compatibility +now exist primarily for backward compatibility. +See the section :ref:`Selection by Label ` for substitutes. Dtypes ~~~~~~ diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0b4474f7a465c..b1da18709cf20 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -643,7 +643,7 @@ def _getbool_axis(self, key, axis=0): class _LocIndexer(_LocationIndexer): """ purely label based location based indexing """ - _valid_types = "labels (MUST BE INCLUSIVE), slices of labels (BOTH endpoints included! 
Can be slices of integers if the index is integers), listlike of labels, boolean" + _valid_types = "labels (MUST BE IN THE INDEX), slices of labels (BOTH endpoints included! Can be slices of integers if the index is integers), listlike of labels, boolean" _exception = KeyError def _has_valid_type(self, key, axis): From 643e1cbd42c76633c67982872404618cbfa453c8 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 3 Mar 2013 13:14:15 -0500 Subject: [PATCH 6/7] DOC: revamped dtypes section in basics.rst fixed removal of foo temp files in 10min DOC: added to time series in 10min.rst --- doc/source/10min.rst | 47 +++++++++++++++++++++++++-- doc/source/basics.rst | 73 ++++++++++++++++++++++++++++++++++-------- doc/source/dsintro.rst | 2 +- 3 files changed, 105 insertions(+), 17 deletions(-) diff --git a/doc/source/10min.rst b/doc/source/10min.rst index d73ff600e1450..50159d53a1fd7 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -7,6 +7,7 @@ import numpy as np import random + import os np.random.seed(123456) from pandas import * import pandas as pd @@ -466,6 +467,45 @@ limited to, financial applications. See the :ref:`Time Series section ` diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 83f2de01300c9..d5eb863580b6b 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -447,7 +447,7 @@ DataFrame: .. 
ipython:: python - df.loc('b') + df.loc['b'] df.iloc[2] For a more exhaustive treatment of more sophisticated label-based indexing and From 41793eaf9553972fdbb31c108c5afa6282bf0858 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 3 Mar 2013 17:20:46 -0500 Subject: [PATCH 7/7] DOC: added sorting examples to 10min BUG: fixed multi-index selection via loc, back to using some of ix code (but still do validation if not mi) ENH: add xs to Series for compatiblity, create _xs functions in all objects DOC: added several sub-sections to 10min fixed some references in basics.rst --- RELEASE.rst | 2 +- doc/source/10min.rst | 118 ++++++++++++++++++++++++++++++++-- doc/source/basics.rst | 12 ++-- doc/source/io.rst | 2 + pandas/core/frame.py | 2 + pandas/core/indexing.py | 32 +++++---- pandas/core/panel.py | 2 + pandas/core/series.py | 3 + pandas/tests/test_indexing.py | 64 +++++++++++++++--- 9 files changed, 201 insertions(+), 36 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 3ed6d71ce9273..78e946006e1fb 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -59,6 +59,7 @@ pandas 0.11.0 - Add ``format`` option to ``pandas.to_datetime`` with faster conversion of strings that can be parsed with datetime.strptime - Add ``axes`` property to ``Series`` for compatibility + - Add ``xs`` function to ``Series`` for compatibility **API Changes** @@ -135,7 +136,6 @@ pandas 0.11.0 - Bug on in-place putmasking on an ``integer`` series that needs to be converted to ``float`` (GH2746_) - Bug in argsort of ``datetime64[ns]`` Series with ``NaT`` (GH2967_) - Bug in idxmin/idxmax of ``datetime64[ns]`` Series with ``NaT`` (GH2982__) - - ``icol`` with negative indicies was return ``nan`` (see GH2922_) - Bug in ``icol`` with negative indicies was incorrect producing incorrect return values (see GH2922_) .. 
_GH622: https://github.com/pydata/pandas/issues/622 diff --git a/doc/source/10min.rst b/doc/source/10min.rst index 50159d53a1fd7..a6945eed1387c 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -67,7 +67,7 @@ Creating a ``DataFrame`` by passing a dict of objects that can be converted to s 'E' : 'foo' }) df2 -Having specific dtypes +Having specific :ref:`dtypes ` .. ipython:: python @@ -83,7 +83,7 @@ See the top & bottom rows of the frame .. ipython:: python df.head() - df.tail() + df.tail(3) Display the index,columns, and the underlying numpy data @@ -99,6 +99,24 @@ Describe shows a quick statistic summary of your data df.describe() +Transposing your data + +.. ipython:: python + + df.T + +Sorting by an axis + +.. ipython:: python + + df.sort_index(axis=1, ascending=False) + +Sorting by values + +.. ipython:: python + + df.sort(columns='B') + Selection --------- @@ -112,6 +130,7 @@ Selecting a single column, which yields a ``Series`` .. ipython:: python + # equivalently ``df.A`` df['A'] Selecting via ``[]``, which slices the rows. @@ -167,7 +186,6 @@ Select via the position of the passed integers .. ipython:: python - # this is a cross-section of the object df.iloc[3] By integer slices, acting similar to numpy/python @@ -220,7 +238,7 @@ Pandas will detect this and raise ``IndexError``, rather than return an empty st :: - >>> df.iloc[:,3:6] + >>> df.iloc[:,8:10] IndexError: out-of-bounds on slice (end) Boolean Indexing @@ -232,7 +250,7 @@ Using a single column's values to select data. df[df.A > 0] -A ``where`` operation. +A ``where`` operation for getting. .. ipython:: python @@ -270,6 +288,14 @@ Setting by assigning with a numpy array df.loc[:,'D'] = np.array([5] * len(df)) df +A ``where`` operation with setting. + +.. ipython:: python + + df2 = df.copy() + df2[df2 > 0] = -df2 + df2 + Missing Data ------------ @@ -297,6 +323,12 @@ Filling missing data df1.fillna(value=5) +To get the boolean mask where values are ``nan`` + +.. 
ipython:: python + + pd.isnull(df1) + Operations ---------- @@ -306,6 +338,8 @@ See the :ref:`Basic section on Binary Ops ` Stats ~~~~~ +Operations in general *exclude* missing data. + Performing a descriptive statistic .. ipython:: python @@ -318,11 +352,15 @@ Same operation on the other axis df.mean(1) -Operations on missing data, exclude the data +Operating with objects that have different dimensionality and need alignment. +In addition, pandas automatically broadcasts along the specified dimension. .. ipython:: python - df1.mean() + s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2) + s + df.sub(s,axis='index') + Apply ~~~~~ @@ -334,6 +372,27 @@ Applying functions to the data df.apply(np.cumsum) df.apply(lambda x: x.max() - x.min()) +Histogramming +~~~~~~~~~~~~~ + +See more at :ref:`Histogramming and Discretization ` + +.. ipython:: python + + s = Series(np.random.randint(0,7,size=10)) + s + s.value_counts() + +String Methods +~~~~~~~~~~~~~~ + +See more at :ref:`Vectorized String Methods ` + +.. ipython:: python + + s = Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) + s.str.lower() + Merge ----- @@ -425,6 +484,9 @@ Reshaping See the section on :ref:`Hierarchical Indexing ` and see the section on :ref:`Reshaping `). +Stack +~~~~~ + .. ipython:: python tuples = zip(*[['bar', 'bar', 'baz', 'baz', @@ -453,6 +515,26 @@ unstacks the **last level**: stacked.unstack(1) stacked.unstack(0) +Pivot Tables +~~~~~~~~~~~~ +See the section on :ref:`Pivot Tables `). + +.. ipython:: python + + df = DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3, + 'B' : ['A', 'B', 'C'] * 4, + 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2, + 'D' : np.random.randn(12), + 'E' : np.random.randn(12)}) + df + +We can produce pivot tables from this data very easily: + +.. 
ipython:: python + + pivot_table(df, values='D', rows=['A', 'B'], cols=['C']) + + Time Series ----------- @@ -581,3 +663,25 @@ Reading from a HDF5 Store store.close() os.remove('foo.h5') +Excel +~~~~~ + +Reading and writing to :ref:`MS Excel ` + +Writing to an excel file + +.. ipython:: python + + df.to_excel('foo.xlsx', sheet_name='sheet1') + +Reading from an excel file + +.. ipython:: python + + xls = ExcelFile('foo.xlsx') + xls.parse('sheet1', index_col=None, na_values=['NA']) + +.. ipython:: python + :suppress: + + os.remove('foo.xlsx') diff --git a/doc/source/basics.rst b/doc/source/basics.rst index aa199d48e0dd3..d32cbf7dcb8d1 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -9,9 +9,9 @@ randn = np.random.randn np.set_printoptions(precision=4, suppress=True) -***************************** -Essential Basic Functionality -***************************** +============================== + Essential Basic Functionality +============================== Here we discuss a lot of the essential functionality common to the pandas data structures. Here's how to create some of the objects used in the examples from @@ -374,6 +374,8 @@ value, ``idxmin`` and ``idxmax`` return the first matching index: df3 df3['A'].idxmin() +.. _basics.discretization: + Value counts (histogramming) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -976,11 +978,11 @@ To be clear, no pandas methods have the side effect of modifying your data; almost all methods return new objects, leaving the original object untouched. If data is modified, it is because you did so explicitly. +.. _basics.dtypes: + dtypes ------ -.. _basics.dtypes: - The main types stored in pandas objects are ``float``, ``int``, ``bool``, ``datetime64[ns]``, ``timedelta[ns]``, and ``object``. In addition these dtypes have item sizes, e.g. ``int64`` and ``int32``. A convenient ``dtypes`` attribute for DataFrames returns a Series with the data type of each column. 
diff --git a/doc/source/io.rst b/doc/source/io.rst index e2b66d3d50b9b..914506fb0d3cd 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -906,6 +906,8 @@ And then import the data directly to a DataFrame by calling: clipdf +.. _io.excel: + Excel files ----------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3a7da212d6a3f..faac974ae9ddb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2343,6 +2343,8 @@ def xs(self, key, axis=0, level=None, copy=True): result.index = new_index return result + _xs = xs + def lookup(self, row_labels, col_labels): """ Label-based "fancy indexing" function for DataFrame. Given equal-length diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index b1da18709cf20..b86518e8947ef 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -55,9 +55,9 @@ def _get_label(self, label, axis=0): raise IndexingError('no slices here') try: - return self.obj.xs(label, axis=axis, copy=False) + return self.obj._xs(label, axis=axis, copy=False) except Exception: - return self.obj.xs(label, axis=axis, copy=True) + return self.obj._xs(label, axis=axis, copy=True) def _get_loc(self, key, axis=0): return self.obj._ixs(key, axis=axis) @@ -86,6 +86,9 @@ def __setitem__(self, key, value): self._setitem_with_indexer(indexer, value) + def _has_valid_tuple(self, key): + pass + def _convert_tuple(self, key): keyidx = [] for i, k in enumerate(key): @@ -224,6 +227,9 @@ def _getitem_tuple(self, tup): if self._multi_take_opportunity(tup): return self._multi_take(tup) + # no multi-index, so validate all of the indexers + self._has_valid_tuple(tup) + # no shortcut needed retval = self.obj for i, key in enumerate(tup): @@ -616,15 +622,16 @@ class _LocationIndexer(_NDFrameIndexer): def _has_valid_type(self, k, axis): raise NotImplementedError() + def _has_valid_tuple(self, key): + """ check the key for valid keys across my indexer """ + for i, k in enumerate(key): + if i >= self.obj.ndim: + raise ValueError('Too 
many indexers') + if not self._has_valid_type(k,i): + raise ValueError("Location based indexing can only have [%s] types" % self._valid_types) + def __getitem__(self, key): if type(key) is tuple: - - for i, k in enumerate(key): - if i >= self.obj.ndim: - raise ValueError('Too many indexers') - if not self._has_valid_type(k,i): - raise ValueError("Location based indexing can only have [%s] types" % self._valid_types) - return self._getitem_tuple(key) else: return self._getitem_axis(key, axis=0) @@ -707,11 +714,7 @@ def _getitem_axis(self, key, axis=0): return self._getitem_iterable(key, axis=axis) else: - indexer = labels.get_loc(key) - return self._get_loc(indexer, axis=axis) - - def _get_loc(self, key, axis=0): - return self.obj._ixs(key, axis=axis) + return self._get_label(key, axis=axis) class _iLocIndexer(_LocationIndexer): """ purely integer based location based indexing """ @@ -723,6 +726,7 @@ def _has_valid_type(self, key, axis): def _getitem_tuple(self, tup): + self._has_valid_tuple(tup) retval = self.obj for i, key in enumerate(tup): if _is_null_slice(key): diff --git a/pandas/core/panel.py b/pandas/core/panel.py index dd1aeed70513b..9f91d8add1eac 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1065,6 +1065,8 @@ def xs(self, key, axis=1, copy=True): new_data = self._data.xs(key, axis=axis_number, copy=copy) return self._constructor_sliced(new_data) + _xs = xs + def _ixs(self, i, axis=0): # for compatibility with .ix indexing # Won't work with hierarchical indexing yet diff --git a/pandas/core/series.py b/pandas/core/series.py index 74f96aff083dd..27480d9e489be 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -559,6 +559,9 @@ def ix(self): return self._ix + def _xs(self, key, axis=0, level=None, copy=True): + return self.__getitem__(key) + def _ixs(self, i, axis=0): """ Return the i-th value or values in the Series by location diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 
c219c6fef196f..e48d8dbdcb498 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -609,22 +609,68 @@ def test_iloc_setitem_series(self): assert_series_equal(result, expected) def test_iloc_multiindex(self): - df = DataFrame(np.random.randn(3, 3), - columns=[[2,2,4],[6,8,10]], - index=[[4,4,8],[8,10,12]]) + mi_labels = DataFrame(np.random.randn(4, 3), columns=[['i', 'i', 'j'], + ['A', 'A', 'B']], + index=[['i', 'i', 'j', 'k'], ['X', 'X', 'Y','Y']]) - rs = df.iloc[2] - xp = df.irow(2) + mi_int = DataFrame(np.random.randn(3, 3), + columns=[[2,2,4],[6,8,10]], + index=[[4,4,8],[8,10,12]]) + + + # the first row + rs = mi_int.iloc[0] + xp = mi_int.ix[4].ix[8] assert_series_equal(rs, xp) - rs = df.iloc[:,2] - xp = df.icol(2) + # 2nd (last) columns + rs = mi_int.iloc[:,2] + xp = mi_int.ix[:,2] assert_series_equal(rs, xp) - rs = df.iloc[2,2] - xp = df.values[2,2] + # corner column + rs = mi_int.iloc[2,2] + xp = mi_int.ix[:,2].ix[2] + self.assert_(rs == xp) + + # this is basically regular indexing + rs = mi_labels.iloc[2,2] + xp = mi_labels.ix['j'].ix[:,'j'].ix[0,0] self.assert_(rs == xp) + def test_loc_multiindex(self): + + mi_labels = DataFrame(np.random.randn(3, 3), columns=[['i', 'i', 'j'], + ['A', 'A', 'B']], + index=[['i', 'i', 'j'], ['X', 'X', 'Y']]) + + mi_int = DataFrame(np.random.randn(3, 3), + columns=[[2,2,4],[6,8,10]], + index=[[4,4,8],[8,10,12]]) + + # the first row + rs = mi_labels.loc['i'] + xp = mi_labels.ix['i'] + assert_frame_equal(rs, xp) + + # 2nd (last) columns + rs = mi_labels.loc[:,'j'] + xp = mi_labels.ix[:,'j'] + assert_frame_equal(rs, xp) + + # corner column + rs = mi_labels.loc['j'].loc[:,'j'] + xp = mi_labels.ix['j'].ix[:,'j'] + assert_frame_equal(rs,xp) + + # with a tuple + rs = mi_labels.loc[('i','X')] + xp = mi_labels.ix[('i','X')] + assert_frame_equal(rs,xp) + + rs = mi_int.loc[4] + xp = mi_int.ix[4] + assert_frame_equal(rs,xp) if __name__ == '__main__': import nose