diff --git a/RELEASE.rst b/RELEASE.rst index cf3fd598a8186..78e946006e1fb 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -29,12 +29,19 @@ pandas 0.11.0 **New features** + - New documentation section, ``10 Minutes to Pandas`` - Allow mixed dtypes (e.g ``float32/float64/int32/int16/int8``) to coexist in DataFrames and propogate in operations - Add function to pandas.io.data for retrieving stock index components from Yahoo! finance (GH2795_) - Add ``squeeze`` function to reduce dimensionality of 1-len objects - Support slicing with time objects (GH2681_) + - Added ``.iloc`` attribute, to support strict integer based indexing, analogous to ``.ix`` (GH2922_) + - Added ``.loc`` attribute, to support strict label based indexing, analogous to ``.ix`` + - Added ``.iat`` attribute, to support fast scalar access via integers (replaces ``iget_value/iset_value``) + - Added ``.at`` attribute, to support fast scalar access via labels (replaces ``get_value/set_value``) + - Moved functionality from ``irow,icol,iget_value/iset_value`` to ``.iloc`` indexer + (via ``_ixs`` methods in each object) **Improvements to existing features** @@ -51,6 +58,8 @@ pandas 0.11.0 - ``describe_option()`` now reports the default and current value of options. - Add ``format`` option to ``pandas.to_datetime`` with faster conversion of strings that can be parsed with datetime.strptime + - Add ``axes`` property to ``Series`` for compatibility + - Add ``xs`` function to ``Series`` for compatibility **API Changes** @@ -127,6 +136,7 @@ pandas 0.11.0 - Bug on in-place putmasking on an ``integer`` series that needs to be converted to ``float`` (GH2746_) - Bug in argsort of ``datetime64[ns]`` Series with ``NaT`` (GH2967_) - Bug in idxmin/idxmax of ``datetime64[ns]`` Series with ``NaT`` (GH2982__) + - Bug in ``icol`` with negative indices producing incorrect return values (see GH2922_) .. _GH622: https://github.com/pydata/pandas/issues/622 .. _GH797: https://github.com/pydata/pandas/issues/797 @@ -145,6 +155,7 @@ pandas 0.11.0 .. _GH2849: https://github.com/pydata/pandas/issues/2849 .. _GH2898: https://github.com/pydata/pandas/issues/2898 .. _GH2909: https://github.com/pydata/pandas/issues/2909 +.. _GH2922: https://github.com/pydata/pandas/issues/2922 .. _GH2931: https://github.com/pydata/pandas/issues/2931 .. _GH2973: https://github.com/pydata/pandas/issues/2973 .. _GH2967: https://github.com/pydata/pandas/issues/2967 diff --git a/doc/source/10min.rst b/doc/source/10min.rst new file mode 100644 index 0000000000000..a6945eed1387c --- /dev/null +++ b/doc/source/10min.rst @@ -0,0 +1,687 @@ +.. _10min: + +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import numpy as np + import random + import os + np.random.seed(123456) + from pandas import * + import pandas as pd + randn = np.random.randn + randint = np.random.randint + np.set_printoptions(precision=4, suppress=True) + + #### portions of this were borrowed from the + #### Pandas cheatsheet + #### created during the PyData Workshop-Sprint 2012 + #### Hannah Chen, Henry Chow, Eric Cox, Robert Mauriello + + +******************** +10 Minutes to Pandas +******************** + +This is a short introduction to pandas, geared mainly for new users. + +Customarily, we import as follows + +..
ipython:: python + + import pandas as pd + import numpy as np + +Object Creation +--------------- + +See the :ref:`Data Structure Intro section ` + +Creating a ``Series`` by passing a list of values, letting pandas create a default +integer index + +.. ipython:: python + + s = pd.Series([1,3,5,np.nan,6,8]) + s + +Creating a ``DataFrame`` by passing a numpy array, with a datetime index and labeled columns. + +.. ipython:: python + + dates = pd.date_range('20130101',periods=6) + dates + df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD')) + df + +Creating a ``DataFrame`` by passing a dict of objects that can be converted to series-like. + +.. ipython:: python + + df2 = pd.DataFrame({ 'A' : 1., + 'B' : pd.Timestamp('20130102'), + 'C' : pd.Series(1,index=range(4),dtype='float32'), + 'D' : np.array([3] * 4,dtype='int32'), + 'E' : 'foo' }) + df2 + +Having specific :ref:`dtypes ` + +.. ipython:: python + + df2.dtypes + +Viewing Data +------------ + +See the :ref:`Basics section ` + +See the top & bottom rows of the frame + +.. ipython:: python + + df.head() + df.tail(3) + +Display the index,columns, and the underlying numpy data + +.. ipython:: python + + df.index + df.columns + df.values + +Describe shows a quick statistic summary of your data + +.. ipython:: python + + df.describe() + +Transposing your data + +.. ipython:: python + + df.T + +Sorting by an axis + +.. ipython:: python + + df.sort_index(axis=1, ascending=False) + +Sorting by values + +.. ipython:: python + + df.sort(columns='B') + +Selection +--------- + +See the :ref:`Indexing section ` + + +Getting +~~~~~~~ + +Selecting a single column, which yields a ``Series`` + +.. ipython:: python + + # equivalently ``df.A`` + df['A'] + +Selecting via ``[]``, which slices the rows. + +.. ipython:: python + + df[0:3] + df['20130102':'20130104'] + +Selection by Label +~~~~~~~~~~~~~~~~~~ + +For getting a cross section using a label + +.. ipython:: python + + df.loc[dates[0]] + +Selecting on a multi-axis by label + +.. ipython:: python + + df.loc[:,['A','B']] + +Showing label slicing, both endpoints are *included* + +.. ipython:: python + + df.loc['20130102':'20130104',['A','B']] + +Reduction in the dimensions of the returned object + +.. ipython:: python + + df.loc['20130102',['A','B']] + +For getting a scalar value + +.. ipython:: python + + df.loc[dates[0],'A'] + +For getting fast access to a scalar (equiv to the prior method) + +.. ipython:: python + + df.at[dates[0],'A'] + +Selection by Position +~~~~~~~~~~~~~~~~~~~~~ + +Select via the position of the passed integers + +.. ipython:: python + + df.iloc[3] + +By integer slices, acting similar to numpy/python + +.. ipython:: python + + df.iloc[3:5,0:2] + +By lists of integer position locations, similar to the numpy/python style + +.. ipython:: python + + df.iloc[[1,2,4],[0,2]] + +For slicing rows explicitly + +.. ipython:: python + + df.iloc[1:3,:] + +For slicing columns explicitly + +.. ipython:: python + + df.iloc[:,1:3] + +For getting a value explicity + +.. ipython:: python + + df.iloc[1,1] + +For getting fast access to a scalar (equiv to the prior method) + +.. ipython:: python + + df.iat[1,1] + +There is one signficant departure from standard python/numpy slicing semantics. +python/numpy allow slicing past the end of an array without an associated error. + +.. ipython:: python + + # these are allowed in python/numpy. + x = list('abcdef') + x[4:10] + x[8:10] + +Pandas will detect this and raise ``IndexError``, rather than return an empty structure. 
+ +:: + + >>> df.iloc[:,8:10] + IndexError: out-of-bounds on slice (end) + +Boolean Indexing +~~~~~~~~~~~~~~~~ + +Using a single column's values to select data. + +.. ipython:: python + + df[df.A > 0] + +A ``where`` operation for getting. + +.. ipython:: python + + df[df > 0] + + +Setting +~~~~~~~ + +Setting a new column automatically aligns the data +by the indexes + +.. ipython:: python + + s1 = pd.Series([1,2,3,4,5,6],index=date_range('20130102',periods=6)) + s1 + df['F'] = s1 + +Setting values by label + +.. ipython:: python + + df.at[dates[0],'A'] = 0 + +Setting values by position + +.. ipython:: python + + df.iat[0,1] = 0 + +Setting by assigning with a numpy array + +.. ipython:: python + + df.loc[:,'D'] = np.array([5] * len(df)) + df + +A ``where`` operation with setting. + +.. ipython:: python + + df2 = df.copy() + df2[df2 > 0] = -df2 + df2 + +Missing Data +------------ + +Pandas primarily uses the value ``np.nan`` to represent missing data. It +is by default not included in computations. See the :ref:`Missing Data section ` + +Reindexing allows you to change/add/delete the index on a specified axis. This +returns a copy of the data. + +.. ipython:: python + + df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + ['E']) + df1.loc[dates[0]:dates[1],'E'] = 1 + df1 + +To drop any rows that have missing data. + +.. ipython:: python + + df1.dropna(how='any') + +Filling missing data + +.. ipython:: python + + df1.fillna(value=5) + +To get the boolean mask where values are ``nan`` + +.. ipython:: python + + pd.isnull(df1) + + +Operations +---------- + +See the :ref:`Basic section on Binary Ops ` + +Stats +~~~~~ + +Operations in general *exclude* missing data. + +Performing a descriptive statistic + +.. ipython:: python + + df.mean() + +Same operation on the other axis + +.. ipython:: python + + df.mean(1) + +Operating with objects that have different dimensionality and need alignment. +In addition, pandas automatically broadcasts along the specified dimension. + +.. ipython:: python + + s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2) + s + df.sub(s,axis='index') + + +Apply +~~~~~ + +Applying functions to the data + +.. ipython:: python + + df.apply(np.cumsum) + df.apply(lambda x: x.max() - x.min()) + +Histogramming +~~~~~~~~~~~~~ + +See more at :ref:`Histogramming and Discretization ` + +.. ipython:: python + + s = Series(np.random.randint(0,7,size=10)) + s + s.value_counts() + +String Methods +~~~~~~~~~~~~~~ + +See more at :ref:`Vectorized String Methods ` + +.. ipython:: python + + s = Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) + s.str.lower() + +Merge +----- + +Concat +~~~~~~ + +Pandas provides various facilities for easily combining together Series, +DataFrame, and Panel objects with various kinds of set logic for the indexes +and relational algebra functionality in the case of join / merge-type +operations. + +See the :ref:`Merging section ` + +Concatenating pandas objects together + +.. ipython:: python + + df = pd.DataFrame(np.random.randn(10, 4)) + df + + # break it into pieces + pieces = [df[:3], df[3:7], df[7:]] + + concat(pieces) + +Join +~~~~ + +SQL style merges. See the :ref:`Database style joining ` + +.. ipython:: python + + left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]}) + right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]}) + left + right + merge(left, right, on='key') + +Append +~~~~~~ + +Append rows to a dataframe. See the :ref:`Appending ` + +.. 
ipython:: python + + df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D']) + df + s = df.iloc[3] + df.append(s, ignore_index=True) + df + + +Grouping +-------- + +By "group by" we are referring to a process involving one or more of the following +steps + + - **Splitting** the data into groups based on some criteria + - **Applying** a function to each group independently + - **Combining** the results into a data structure + +See the :ref:`Grouping section ` + +.. ipython:: python + + df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B' : ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C' : randn(8), 'D' : randn(8)}) + df + +Grouping and then applying a function ``sum`` to the resulting groups. + +.. ipython:: python + + df.groupby('A').sum() + +Grouping by multiple columns forms a hierarchical index, which we then apply the function. + +.. ipython:: python + + df.groupby(['A','B']).sum() + +Reshaping +--------- + +See the section on :ref:`Hierarchical Indexing ` and +see the section on :ref:`Reshaping `). + +Stack +~~~~~ + +.. ipython:: python + + tuples = zip(*[['bar', 'bar', 'baz', 'baz', + 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', + 'one', 'two', 'one', 'two']]) + index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) + df = pd.DataFrame(randn(8, 2), index=index, columns=['A', 'B']) + df2 = df[:4] + df2 + +The ``stack`` function "compresses" a level in the DataFrame's columns. to + +.. ipython:: python + + stacked = df2.stack() + stacked + +With a "stacked" DataFrame or Series (having a ``MultiIndex`` as the +``index``), the inverse operation of ``stack`` is ``unstack``, which by default +unstacks the **last level**: + +.. ipython:: python + + stacked.unstack() + stacked.unstack(1) + stacked.unstack(0) + +Pivot Tables +~~~~~~~~~~~~ +See the section on :ref:`Pivot Tables `). + +.. ipython:: python + + df = DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3, + 'B' : ['A', 'B', 'C'] * 4, + 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2, + 'D' : np.random.randn(12), + 'E' : np.random.randn(12)}) + df + +We can produce pivot tables from this data very easily: + +.. ipython:: python + + pivot_table(df, values='D', rows=['A', 'B'], cols=['C']) + + +Time Series +----------- + +Pandas has simple, powerful, and efficient functionality for +performing resampling operations during frequency conversion (e.g., converting +secondly data into 5-minutely data). This is extremely common in, but not +limited to, financial applications. See the :ref:`Time Series section ` + +.. ipython:: python + + rng = pd.date_range('1/1/2012', periods=100, freq='S') + ts = pd.Series(randint(0, 500, len(rng)), index=rng) + ts.resample('5Min', how='sum') + +Time zone representation + +.. ipython:: python + + rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D') + ts = pd.Series(randn(len(rng)), rng) + ts_utc = ts.tz_localize('UTC') + ts_utc + +Convert to another time zone + +.. ipython:: python + + ts_utc.tz_convert('US/Eastern') + +Converting between time span representations + +.. ipython:: python + + rng = pd.date_range('1/1/2012', periods=5, freq='M') + ts = pd.Series(randn(len(rng)), index=rng) + ts + ps = ts.to_period() + ps + ps.to_timestamp() + +Converting between period and timestamp enables some convenient arithmetic +functions to be used. In the following example, we convert a quarterly +frequency with year ending in November to 9am of the end of the month following +the quarter end: + +.. 
ipython:: python + + prng = period_range('1990Q1', '2000Q4', freq='Q-NOV') + ts = Series(randn(len(prng)), prng) + ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9 + ts.head() + + +Plotting +-------- + +.. ipython:: python + :suppress: + + import matplotlib.pyplot as plt + plt.close('all') + +.. ipython:: python + + ts = pd.Series(randn(1000), index=pd.date_range('1/1/2000', periods=1000)) + ts = ts.cumsum() + + @savefig series_plot_basic.png width=4.5in + ts.plot() + +On DataFrame, ``plot`` is a convenience to plot all of the columns with labels: + +.. ipython:: python + + df = pd.DataFrame(randn(1000, 4), index=ts.index, + columns=['A', 'B', 'C', 'D']) + df = df.cumsum() + + @savefig frame_plot_basic.png width=4.5in + plt.figure(); df.plot(); plt.legend(loc='best') + +Getting Data In/Out +------------------- + +CSV +~~~ + +:ref:`Writing to a csv file ` + +.. ipython:: python + + df.to_csv('foo.csv') + +:ref:`Reading from a csv file ` + +.. ipython:: python + + pd.read_csv('foo.csv') + +.. ipython:: python + :suppress: + + os.remove('foo.csv') + +HDF5 +~~~~ + +Reading and writing to :ref:`HDFStores ` + +Writing to a HDF5 Store + +.. ipython:: python + + store = pd.HDFStore('foo.h5') + store['df'] = df + +Reading from a HDF5 Store + +.. ipython:: python + + store['df'] + +.. ipython:: python + :suppress: + + store.close() + os.remove('foo.h5') + +Excel +~~~~~ + +Reading and writing to :ref:`MS Excel ` + +Writing to an excel file + +.. ipython:: python + + df.to_excel('foo.xlsx', sheet_name='sheet1') + +Reading from an excel file + +.. ipython:: python + + xls = ExcelFile('foo.xlsx') + xls.parse('sheet1', index_col=None, na_values=['NA']) + +.. ipython:: python + :suppress: + + os.remove('foo.xlsx') diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 05025e4f9479a..d32cbf7dcb8d1 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -9,9 +9,9 @@ randn = np.random.randn np.set_printoptions(precision=4, suppress=True) -***************************** -Essential Basic Functionality -***************************** +============================== + Essential Basic Functionality +============================== Here we discuss a lot of the essential functionality common to the pandas data structures. Here's how to create some of the objects used in the examples from @@ -374,6 +374,8 @@ value, ``idxmin`` and ``idxmax`` return the first matching index: df3 df3['A'].idxmin() +.. _basics.discretization: + Value counts (histogramming) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -976,14 +978,14 @@ To be clear, no pandas methods have the side effect of modifying your data; almost all methods return new objects, leaving the original object untouched. If data is modified, it is because you did so explicitly. -DTypes ------- - .. _basics.dtypes: -The main types stored in pandas objects are float, int, boolean, datetime64[ns], -and object. A convenient ``dtypes`` attribute for DataFrames returns a Series with -the data type of each column. +dtypes +------ + +The main types stored in pandas objects are ``float``, ``int``, ``bool``, ``datetime64[ns]``, ``timedelta[ns]``, +and ``object``. In addition these dtypes have item sizes, e.g. ``int64`` and ``int32``. A convenient ``dtypes`` +attribute for DataFrames returns a Series with the data type of each column. .. ipython:: python @@ -992,11 +994,26 @@ the data type of each column. F = False, G = Series([1]*3,dtype='int8'))) dft + dft.dtypes + +On a ``Series`` use the ``dtype`` method. + +.. 
ipython:: python + + dft['A'].dtype -If a DataFrame contains columns of multiple dtypes, the dtype of the column -will be chosen to accommodate all of the data types (dtype=object is the most +If a pandas object contains data multiple dtypes *IN A SINGLE COLUMN*, the dtype of the +column will be chosen to accommodate all of the data types (``object`` is the most general). +.. ipython:: python + + # these ints are coerced to floats + Series([1, 2, 3, 4, 5, 6.]) + + # string data forces an ``object`` dtype + Series([1, 2, 3, 6., 'foo']) + The related method ``get_dtype_counts`` will return the number of columns of each type: @@ -1019,15 +1036,42 @@ or a passed ``Series``, then it will be preserved in DataFrame operations. Furth df2 df2.dtypes - # here you get some upcasting +defaults +~~~~~~~~ + +By default integer types are ``int64`` and float types are ``float64``, *REGARDLESS* of platform (32-bit or 64-bit). + +The following will all result in ``int64`` dtypes. + +.. ipython:: python + + DataFrame([1,2],columns=['a']).dtypes + DataFrame({'a' : [1,2] }).dtypes + DataFrame({'a' : 1 }, index=range(2)).dtypes + +Numpy, however will choose *platform-dependent* types when creating arrays. +Thus, ``DataFrame(np.array([1,2]))`` **WILL** result in ``int32`` on 32-bit platform. + + +upcasting +~~~~~~~~~ + +Types can potentially be *upcasted* when combined with other types, meaning they are promoted from the current type (say ``int`` to ``float``) + +.. ipython:: python + df3 = df1.reindex_like(df2).fillna(value=0.0) + df2 df3 df3.dtypes - # this is lower-common-denomicator upcasting (meaning you get the dtype which can accomodate all of the types) +The ``values`` attribute on a DataFrame return the *lower-common-denominator* of the dtypes, meaning the dtype that can accomodate **ALL** of the types in the resulting homogenous dtyped numpy array. This can +force some *upcasting*. + +.. ipython:: python + df3.values.dtype -Astype +astype ~~~~~~ .. _basics.cast: @@ -1044,7 +1088,7 @@ then the more *general* one will be used as the result of the operation. # conversion of dtypes df3.astype('float32').dtypes -Object Conversion +object conversion ~~~~~~~~~~~~~~~~~ To force conversion of specific types of number conversion, pass ``convert_numeric = True``. @@ -1067,16 +1111,19 @@ the objects in a Series are of the same type, the Series will have that dtype. df3['E'] = df3['E'].astype('int32') df3.dtypes - # forcing date coercion +This is a *forced coercion* on datelike types. This might be useful if you are reading in data which is mostly dates, but occasionally has non-dates intermixed and you want to make those values ``nan``. + +.. ipython:: python + s = Series([datetime(2001,1,1,0,0), 'foo', 1.0, 1, Timestamp('20010104'), '20010105'],dtype='O') s s.convert_objects(convert_dates='coerce') -Upcasting Gotchas -~~~~~~~~~~~~~~~~~ +gotchas +~~~~~~~ -Performing indexing operations on ``integer`` type data can easily upcast the data to ``floating``. +Performing selection operations on ``integer`` type data can easily upcast the data to ``floating``. 
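As a minimal illustrative sketch of this upcasting (an illustration alongside the patch, not part of it; it assumes the documentation's usual setup where ``Series`` is already imported, and exact reprs depend on the pandas version), introducing a missing label into an ``int64`` Series forces ``NaN`` into the result and upcasts it to ``float64``:

.. ipython:: python

   # start with an int64 Series
   s = Series([1, 2, 3], index=list('abc'))
   s.dtype

   # reindexing to include a label that is absent introduces NaN,
   # so the result is upcast to float64
   s.reindex(list('abcd')).dtype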
The dtype of the input data will be preserved in cases where ``nans`` are not introduced (starting in 0.11.0) See also :ref:`integer na gotchas ` diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 45fabb551d993..d5eb863580b6b 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -437,8 +437,8 @@ The basics of indexing are as follows: :widths: 30, 20, 10 Select column, ``df[col]``, Series - Select row by label, ``df.xs(label)`` or ``df.ix[label]``, Series - Select row by location (int), ``df.ix[loc]``, Series + Select row by label, ``df.loc[label]``, Series + Select row by integer location, ``df.iloc[loc]``, Series Slice rows, ``df[5:10]``, DataFrame Select rows by boolean vector, ``df[bool_vec]``, DataFrame @@ -447,8 +447,8 @@ DataFrame: .. ipython:: python - df.xs('b') - df.ix[2] + df.loc['b'] + df.iloc[2] For a more exhaustive treatment of more sophisticated label-based indexing and slicing, see the :ref:`section on indexing `. We will address the @@ -475,7 +475,7 @@ row-wise. For example: .. ipython:: python - df - df.ix[0] + df - df.iloc[0] In the special case of working with time series data, if the Series is a TimeSeries (which it will be automatically if the index contains datetime @@ -592,7 +592,7 @@ DataFrame in tabular form, though it won't always fit the console width: .. ipython:: python - print baseball.ix[-20:, :12].to_string() + print baseball.iloc[-20:, :12].to_string() New since 0.10.0, wide DataFrames will now be printed across multiple rows by default: diff --git a/doc/source/index.rst b/doc/source/index.rst index bc51f1b13f36e..d59cb6d7a816b 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -112,6 +112,7 @@ See the package overview for more detail about what's in the library. install faq overview + 10min dsintro basics indexing diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 8c18d9f69bee3..02aa00b7eaca6 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -14,9 +14,9 @@ randint = np.random.randint np.set_printoptions(precision=4, suppress=True) -*************************** -Indexing and selecting data -*************************** +************** +Selecting Data +************** The axis labeling information in pandas objects serves many purposes: @@ -32,6 +32,86 @@ attention in this area. Expect more work to be invested higher-dimensional data structures (including Panel) in the future, especially in label-based advanced indexing. +Choice +------ + +Starting in 0.11.0, object selection has had a number of user-requested additions in +order to support more explicit location based indexing. Pandas now supports +three types of multi-axis indexing. + + - ``.loc`` is strictly label based, will raise ``KeyError`` when the items are not found, + allowed inputs are: + + - A single label, e.g. ``5`` or ``'a'`` + + (note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index) + - A list or array of labels ``['a', 'b', 'c']`` + - A slice object with labels ``'a':'f'`` + + (note that contrary to usual python slices, **both** the start and the stop are included!) + - A boolean array + + See more at :ref:`Selection by Label ` + + - ``.iloc`` is strictly integer position based (from 0 to length-1 of the axis), will + raise ``IndexError`` when the requested indicies are out of bounds. Allowed inputs are: + + - An integer e.g. 
``5`` + - A list or array of integers ``[4, 3, 0]`` + - A slice object with ints ``1:7`` + - A boolean array + + See more at :ref:`Selection by Position ` + + - ``.ix`` supports mixed integer and label based access. It is primarily label based, but + will fallback to integer positional access. ``.ix`` is the most general and will support + any of the inputs to ``.loc`` and ``.iloc``, as well as support for floating point label schemes. + + As using integer slices with ``.ix`` have different behavior depending on whether the slice + is interpreted as integer location based or label position based, it's usually better to be + explicit and use ``.iloc`` (integer location) or ``.loc`` (label location). + + ``.ix`` is especially useful when dealing with mixed positional and label based hierarchial indexes. + + See more at :ref:`Advanced Indexing ` and :ref:`Advanced Hierarchical ` + +Getting values from object with multi-axes uses the following notation (using ``.loc`` as an +example, but applies to ``.iloc`` and ``.ix`` as well) Any of the axes accessors may be the null +slice ``:``. Axes left out of the specification are assumed to be ``:``. +(e.g. ``p.loc['a']`` is equiv to ``p.loc['a',:,:]``) + +.. csv-table:: + :header: "Object Type", "Indexers" + :widths: 30, 50 + :delim: ; + + Series; ``s.loc[indexer]`` + DataFrame; ``df.loc[row_indexer,column_indexer]`` + Panel; ``p.loc[item_indexer,major_indexer,minor_indexer]`` + +Deprecations +~~~~~~~~~~~~ + +Starting in version 0.11.0, these methods may be deprecated in future versions. + + - ``irow`` + - ``icol`` + - ``iget_value`` + +See the section :ref:`Selection by Position ` for substitutes. + +.. _indexing.xs: + +Cross-sectional slices on non-hierarchical indices are now easily performed using +``.loc`` and/or ``.loc``. The methods: + + - ``xs`` (for DataFrame), + - ``minor_xs`` and ``major_xs`` (for Panel) + +now exist primarily for backward compatibility. + +See the section at :ref:`Selection by Label ` for substitutes. + .. _indexing.basics: Basics @@ -42,18 +122,21 @@ As mentioned when introducing the data structures in the :ref:`last section for those familiar with implementing class behavior in Python) is selecting out lower-dimensional slices. Thus, - - **Series**: ``series[label]`` returns a scalar value - - **DataFrame**: ``frame[colname]`` returns a Series corresponding to the - passed column name - - **Panel**: ``panel[itemname]`` returns a DataFrame corresponding to the - passed item name +.. csv-table:: + :header: "Object Type", "Selection", "Return Value Type" + :widths: 30, 30, 60 + :delim: ; + + Series; ``series[label]``; scalar value + DataFrame; ``frame[colname]``; ``Series`` corresponding to colname + Panel; ``panel[itemname]``; ``DataFrame`` corresponing to the itemname Here we construct a simple time series data set to use for illustrating the indexing functionality: .. ipython:: python - dates = np.asarray(date_range('1/1/2000', periods=8)) + dates = date_range('1/1/2000', periods=8) df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) df panel = Panel({'one' : df, 'two' : df - df.mean()}) @@ -72,46 +155,22 @@ Thus, as per above, we have the most basic indexing using ``[]``: s[dates[5]] panel['two'] - -.. _indexing.basics.get_value: - -Fast scalar value getting and setting -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Since indexing with ``[]`` must handle a lot of cases (single-label access, -slicing, boolean indexing, etc.), it has a bit of overhead in order to figure -out what you're asking for. 
If you only want to access a scalar value, the -fastest way is to use the ``get_value`` method, which is implemented on all of -the data structures: - -.. ipython:: python - - s.get_value(dates[5]) - df.get_value(dates[5], 'A') - -There is an analogous ``set_value`` method which has the additional capability -of enlarging an object. This method *always* returns a reference to the object -it modified, which in the case of enlargement, will be a **new object**: - -.. ipython:: python - - df.set_value(dates[5], 'E', 7) - -Additional Column Access -~~~~~~~~~~~~~~~~~~~~~~~~ +Attribute Access +~~~~~~~~~~~~~~~~ .. _indexing.columns.multiple: .. _indexing.df_cols: -You may access a column on a dataframe directly as an attribute: +You may access a column on a ``DataFrame``, and a item on a ``Panel`` directly as an attribute: .. ipython:: python df.A + panel.one If you are using the IPython environment, you may also use tab-completion to -see the accessible columns of a DataFrame. +see these accessable attributes. You can pass a list of columns to ``[]`` to select columns in that order: If a column is not contained in the DataFrame, an exception will be @@ -126,30 +185,12 @@ raised. Multiple columns can also be set in this manner: You may find this useful for applying a transform (in-place) to a subset of the columns. -Data slices on other axes -~~~~~~~~~~~~~~~~~~~~~~~~~ - -It's certainly possible to retrieve data slices along the other axes of a -DataFrame or Panel. We tend to refer to these slices as -*cross-sections*. DataFrame has the ``xs`` function for retrieving rows as -Series and Panel has the analogous ``major_xs`` and ``minor_xs`` functions for -retrieving slices as DataFrames for a given ``major_axis`` or ``minor_axis`` -label, respectively. - -.. ipython:: python - - date = dates[5] - df.xs(date) - panel.major_xs(date) - panel.minor_xs('A') - - Slicing ranges ~~~~~~~~~~~~~~ The most robust and consistent way of slicing ranges along arbitrary axes is -described in the :ref:`Advanced indexing ` section detailing -the ``.ix`` method. For now, we explain the semantics of slicing using the +described in the :ref:`Selection by Position ` section detailing +the ``.iloc`` method. For now, we explain the semantics of slicing using the ``[]`` operator. With Series, the syntax works exactly as with an ndarray, returning a slice of @@ -177,6 +218,210 @@ largely as a convenience since it is such a common operation. df[:3] df[::-1] +.. _indexing.label: + +Selection By Label +~~~~~~~~~~~~~~~~~~ + +Pandas provides a suite of methods in order to have **purely label based indexing**. +This is a strict inclusion based protocol. **ALL** of the labels for which you ask, +must be in the index or a ``KeyError`` will be raised! + +When slicing, the start bound is *included*, **AND** the stop bound is *included*. +Integers are valid labels, but they refer to the label *and not the position*. + +The ``.loc`` attribute is the primary access method. + +The following are valid inputs: + + - A single label, e.g. ``5`` or ``'a'`` + + (note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index) + - A list or array of labels ``['a', 'b', 'c']`` + - A slice object with labels ``'a':'f'`` + + (note that contrary to usual python slices, **both** the start and the stop are included!) + - A boolean array + +.. ipython:: python + + s1 = Series(np.random.randn(6),index=list('abcdef')) + s1 + s1.loc['c':] + s1.loc['b'] + +Note that setting works as well: + +.. 
ipython:: python + + s1.loc['c':] = 0 + s1 + +With a DataFrame + +.. ipython:: python + + df1 = DataFrame(np.random.randn(6,4),index=list('abcdef'),columns=list('ABCD')) + df1 + df1.loc[['a','b','d'],:] + +Accessing via label slices + +.. ipython:: python + + df1.loc['d':,'A':'C'] + +For getting a cross section using a label (equiv to deprecated ``df.xs('a')``) + +.. ipython:: python + + df1.loc['a'] + +For getting values with a boolean array + +.. ipython:: python + + df1.loc['a']>0 + df1.loc[:,df1.loc['a']>0] + +For getting a value explicity (equiv to deprecated ``df.get_value('a','A')``) + +.. ipython:: python + + # this is also equivalent to ``df1.at['a','A']`` + df1.loc['a','A'] + +.. _indexing.integer: + +Selection By Position +~~~~~~~~~~~~~~~~~~~~~ + +Pandas provides a suite of methods in order to get **purely integer based indexing**. +The semantics follow closely python and numpy slicing. These are ``0-based`` indexing. + +When slicing, the start bounds is *included*, while the upper bound is *excluded*. +Trying to use a non-integer, even a **valid** label will raise a ``IndexError``. + +The ``.iloc`` attribute is the primary access method . + +The following are valid inputs: + + - An integer e.g. ``5`` + - A list or array of integers ``[4, 3, 0]`` + - A slice object with ints ``1:7`` + - A boolean array + +.. ipython:: python + + s1 = Series(np.random.randn(5),index=range(0,10,2)) + s1 + s1.iloc[:3] + s1.iloc[3] + +Note that setting works as well: + +.. ipython:: python + + s1.iloc[:3] = 0 + s1 + +With a DataFrame + +.. ipython:: python + + df1 = DataFrame(np.random.randn(6,4),index=range(0,12,2),columns=range(0,8,2)) + df1 + +Select via integer slicing + +.. ipython:: python + + df1.iloc[:3] + df1.iloc[1:5,2:4] + +Select via integer list + +.. ipython:: python + + df1.iloc[[1,3,5],[1,3]] + +Select via boolean array + +.. ipython:: python + + df1.iloc[:,df1.iloc[0]>0] + +For slicing rows explicitly (equiv to deprecated ``df.irow(slice(1,3))``). + +.. ipython:: python + + df1.iloc[1:3,:] + +For slicing columns explicitly (equiv to deprecated ``df.icol(slice(1,3))``). + +.. ipython:: python + + df1.iloc[:,1:3] + +For getting a scalar via integer position (equiv to deprecated ``df.get_value(1,1)``) + +.. ipython:: python + + # this is also equivalent to ``df1.iat[1,1]`` + df1.iloc[1,1] + +For getting a cross section using an integer position (equiv to deprecated ``df.xs(1)``) + +.. ipython:: python + + df1.iloc[1] + +There is one signficant departure from standard python/numpy slicing semantics. +python/numpy allow slicing past the end of an array without an associated error. + +.. ipython:: python + + # these are allowed in python/numpy. + x = list('abcdef') + x[4:10] + x[8:10] + +Pandas will detect this and raise ``IndexError``, rather than return an empty structure. + +:: + + >>> df.iloc[:,3:6] + IndexError: out-of-bounds on slice (end) + +.. _indexing.basics.get_value: + +Fast scalar value getting and setting +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Since indexing with ``[]`` must handle a lot of cases (single-label access, +slicing, boolean indexing, etc.), it has a bit of overhead in order to figure +out what you're asking for. If you only want to access a scalar value, the +fastest way is to use the ``at`` and ``iat`` methods, which are implemented on all of +the data structures. + +Similary to ``loc``, ``at`` provides **label** based scalar lookups, while, ``iat`` provides +**integer** based lookups analagously to ``iloc`` + +.. 
ipython:: python + + s.iat[5] + df.at[dates[5], 'A'] + df.iat[3, 0] + +You can also set using these same indexers. These have the additional capability +of enlarging an object. This method *always* returns a reference to the object +it modified, which in the case of enlargement, will be a **new object**: + +.. ipython:: python + + df.at[dates[5], 'E'] = 7 + df.iat[3, 0] = 7 + Boolean indexing ~~~~~~~~~~~~~~~~ @@ -228,8 +473,8 @@ more complex criteria: df2[criterion & (df2['b'] == 'x')] -Note, with the :ref:`advanced indexing ` ``ix`` method, you -may select along more than one axis using boolean vectors combined with other +Note, with the choice methods :ref:`Selection by Label `, :ref:`Selection by Position `, +and :ref:`Advanced Indexing ` may select along more than one axis using boolean vectors combined with other indexing expressions. Where and Masking @@ -413,20 +658,21 @@ default value. .. _indexing.advanced: -Advanced indexing with labels ------------------------------ +Advanced Indexing with ``.ix`` +------------------------------ + +.. note:: + + The recent addition of ``.loc`` and ``.iloc`` have enabled users to be quite + explicit about indexing choices. ``.ix`` allows a great flexibility to specify + indexing locations by *label* an/or *integer position*. Pandas will attempt + to use any passed *integer* as *label* locations first (like what ``.loc`` + would do, then to fall back on *positional* indexing, like what ``.iloc`` would do). -We have avoided excessively overloading the ``[]`` / ``__getitem__`` operator -to keep the basic functionality of the pandas objects straightforward and -simple. However, there are often times when you may wish get a subset (or -analogously set a subset) of the data in a way that is not straightforward -using the combination of ``reindex`` and ``[]``. Complicated setting operations -are actually quite difficult because ``reindex`` usually returns a copy. +The syntax of using ``.ix`` is identical to ``.loc``, in :ref:`Selection by Label `, +and ``.iloc`` in :ref:`Selection by Position `. -By *advanced* indexing we are referring to a special ``.ix`` attribute on -pandas objects which enable you to do getting/setting operations on a -DataFrame, for example, with matrix/ndarray-like semantics. Thus you can -combine the following kinds of indexing: +The ``.ix`` attribute takes the following inputs: - An integer or single label, e.g. ``5`` or ``'a'`` - A list or array of labels ``['a', 'b', 'c']`` or integers ``[4, 3, 0]`` @@ -529,27 +775,6 @@ numpy array. For instance, dflookup.lookup(xrange(0,10,2), ['B','C','A','B','D']) -Advanced indexing with integer labels -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Label-based indexing with integer axis labels is a thorny topic. It has been -discussed heavily on mailing lists and among various members of the scientific -Python community. In pandas, our general viewpoint is that labels matter more -than integer locations. Therefore, with an integer axis index *only* -label-based indexing is possible with the standard tools like ``.ix``. The -following code will generate exceptions: - -.. code-block:: python - - s = Series(range(5)) - s[-1] - df = DataFrame(np.random.randn(5, 4)) - df - df.ix[-2:] - -This deliberate decision was made to prevent ambiguities and subtle bugs (many -users reported finding bugs when the API change was made to stop "falling back" -on position-based indexing). 
- Setting values in mixed-type DataFrame ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -779,6 +1004,8 @@ of tuples: s.reindex(index[:3]) s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')]) +.. _indexing.advanced_hierarchical: + Advanced indexing with hierarchical index ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -818,8 +1045,6 @@ but as you use it you may uncover corner cases or unintuitive behavior. If you do find something like this, do not hesitate to report the issue or ask on the mailing list. -.. _indexing.xs: - Cross-section with hierarchical index ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/io.rst b/doc/source/io.rst index 86d590965f141..914506fb0d3cd 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -906,6 +906,8 @@ And then import the data directly to a DataFrame by calling: clipdf +.. _io.excel: + Excel files ----------- @@ -970,7 +972,7 @@ one can use the ExcelWriter class, as in the following example: df2.to_excel(writer, sheet_name='sheet2') writer.save() -.. _io-hdf5: +.. _io.hdf5: HDF5 (PyTables) --------------- @@ -1058,6 +1060,7 @@ These stores are **not** appendable once written (though you can simply remove them and rewrite). Nor are they **queryable**; they must be retrieved in their entirety. +.. _io.hdf5-table: Storing in Table format ~~~~~~~~~~~~~~~~~~~~~~~ @@ -1091,6 +1094,8 @@ supported. # the type of stored data store.root.df._v_attrs.pandas_type +.. _io.hdf5-keys: + Hierarchical Keys ~~~~~~~~~~~~~~~~~ @@ -1115,6 +1120,8 @@ everying in the sub-store and BELOW, so be *careful*. store.remove('food') store +.. _io.hdf5-types: + Storing Mixed Types in a Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1170,6 +1177,8 @@ storing/selecting from homogeneous index DataFrames. store.select('df_mi', Term('foo=bar')) +.. _io.hdf5-query: + Querying a Table ~~~~~~~~~~~~~~~~ @@ -1372,6 +1381,7 @@ table (optional) to let it have the remaining columns. The argument store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'], selector = 'df1_mt') +.. _io.hdf5-delete: Delete from a Table ~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.10.0.txt b/doc/source/v0.10.0.txt index c220d2cbba81d..0c5497868efe2 100644 --- a/doc/source/v0.10.0.txt +++ b/doc/source/v0.10.0.txt @@ -217,7 +217,7 @@ The width of each line can be changed via 'line_width' (80 by default): Updated PyTables Support ~~~~~~~~~~~~~~~~~~~~~~~~ -:ref:`Docs ` for PyTables ``Table`` format & several enhancements to the api. Here is a taste of what to expect. +:ref:`Docs ` for PyTables ``Table`` format & several enhancements to the api. Here is a taste of what to expect. .. ipython:: python :suppress: diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt index 4c7369c27cc30..e8435df7b2b0c 100644 --- a/doc/source/v0.10.1.txt +++ b/doc/source/v0.10.1.txt @@ -232,4 +232,5 @@ on GitHub for a complete list. .. _GH2626: https://github.com/pydata/pandas/issues/2626 .. _GH2613: https://github.com/pydata/pandas/issues/2613 .. _GH2602: https://github.com/pydata/pandas/issues/2602 +.. _GH2687: https://github.com/pydata/pandas/issues/2687 .. 
_GH2563: https://github.com/pydata/pandas/issues/2563 diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt index cc3b39dd22e34..f4c9d13c0d23e 100644 --- a/doc/source/v0.11.0.txt +++ b/doc/source/v0.11.0.txt @@ -4,17 +4,86 @@ v0.11.0 (March ??, 2013) ------------------------ This is a major release from 0.10.1 and includes many new features and -enhancements along with a large number of bug fixes. There are also a number of -important API changes that long-time pandas users should pay close attention -to. +enhancements along with a large number of bug fixes. The methods of Selecting +Data have had quite a number of additions, and Dtype support is now full-fledged. +There are also a number of important API changes that long-time pandas users should +pay close attention to. + +There is a new section in the documentation, :ref:`10 Minutes to Pandas <10min>`, +primarily geared to new users. API changes ~~~~~~~~~~~ -Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste. +Selection Choices +~~~~~~~~~~~~~~~~~ + +Starting in 0.11.0, object selection has had a number of user-requested additions in +order to support more explicit location based indexing. Pandas now supports +three types of multi-axis indexing. + + - ``.loc`` is strictly label based, will raise ``KeyError`` when the items are not found, + allowed inputs are: + + - A single label, e.g. ``5`` or ``'a'`` + + (note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index) + - A list or array of labels ``['a', 'b', 'c']`` + - A slice object with labels ``'a':'f'`` + + (note that contrary to usual python slices, **both** the start and the stop are included!) + - A boolean array + + See more at :ref:`Selection by Label <indexing.label>` + + - ``.iloc`` is strictly integer position based (from 0 to length-1 of the axis), will + raise ``IndexError`` when the requested indices are out of bounds. Allowed inputs are: + + - An integer e.g. ``5`` + - A list or array of integers ``[4, 3, 0]`` + - A slice object with ints ``1:7`` + - A boolean array + + See more at :ref:`Selection by Position <indexing.integer>` + + - ``.ix`` supports mixed integer and label based access. It is primarily label based, but + will fall back to integer positional access. ``.ix`` is the most general and will support + any of the inputs to ``.loc`` and ``.iloc``, as well as support for floating point label schemes. + + As using integer slices with ``.ix`` has different behavior depending on whether the slice + is interpreted as integer location based or label position based, it's usually better to be + explicit and use ``.iloc`` (integer location) or ``.loc`` (label location). -Dtype Specification -~~~~~~~~~~~~~~~~~~~ + ``.ix`` is especially useful when dealing with mixed positional/label based hierarchical indexes. + + See more at :ref:`Advanced Indexing <indexing.advanced>` and :ref:`Advanced Hierarchical <indexing.advanced_hierarchical>` + + +Selection Deprecations +~~~~~~~~~~~~~~~~~~~~~~ + +Starting in version 0.11.0, these methods may be deprecated in future versions. + + - ``irow`` + - ``icol`` + - ``iget_value`` + +See the section :ref:`Selection by Position <indexing.integer>` for substitutes. + +Cross-sectional slices on non-hierarchical indices are now easily performed using +``.loc`` and/or ``.iloc``.
The methods: + + - ``xs`` (for DataFrame), + - ``minor_xs`` and ``major_xs`` (for Panel) + +now exist primarily for backward compatibility. + +See the section :ref:`Selection by Label ` for substitutes. + +Dtypes +~~~~~~ + +Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste. .. ipython:: python diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c0449faf40368..faac974ae9ddb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -568,16 +568,6 @@ def axes(self): def _constructor(self): return DataFrame - # Fancy indexing - _ix = None - - @property - def ix(self): - if self._ix is None: - self._ix = _NDFrameIndexer(self) - - return self._ix - @property def shape(self): return (len(self.index), len(self.columns)) @@ -1894,88 +1884,71 @@ def set_value(self, index, col, value): return result.set_value(index, col, value) def irow(self, i, copy=False): - """ - Retrieve the i-th row or rows of the DataFrame by location - - Parameters - ---------- - i : int, slice, or sequence of integers + return self._ixs(i,axis=0) - Notes - ----- - If slice passed, the resulting data will be a view + def icol(self, i): + return self._ixs(i,axis=1) - Returns - ------- - row : Series (int) or DataFrame (slice, sequence) + def _ixs(self, i, axis=0, copy=False): + """ + i : int, slice, or sequence of integers + axis : int """ - if isinstance(i, slice): - return self[i] - else: - label = self.index[i] - if isinstance(label, Index): - return self.reindex(label) - else: - try: - new_values = self._data.fast_2d_xs(i, copy=copy) - except: - new_values = self._data.fast_2d_xs(i, copy=True) - return Series(new_values, index=self.columns, - name=self.index[i]) - def icol(self, i): - """ - Retrieve the i-th column or columns of the DataFrame by location + # irow + if axis == 0: - Parameters - ---------- - i : int, slice, or sequence of integers + """ + Notes + ----- + If slice passed, the resulting data will be a view + """ - Notes - ----- - If slice passed, the resulting data will be a view + if isinstance(i, slice): + return self[i] + else: + label = self.index[i] + if isinstance(label, Index): + return self.reindex(label) + else: + try: + new_values = self._data.fast_2d_xs(i, copy=copy) + except: + new_values = self._data.fast_2d_xs(i, copy=True) + return Series(new_values, index=self.columns, + name=self.index[i]) - Returns - ------- - column : Series (int) or DataFrame (slice, sequence) - """ - label = self.columns[i] - if isinstance(i, slice): - # need to return view - lab_slice = slice(label[0], label[-1]) - return self.ix[:, lab_slice] + # icol else: - label = self.columns[i] - if isinstance(label, Index): - return self.take(i, axis=1) - values = self._data.iget(i) - return self._col_klass.from_array(values, index=self.index, - name=label) + """ + Notes + ----- + If slice passed, the resulting data will be a view + """ - def _ixs(self, i, axis=0): - if axis == 0: - return self.irow(i) - else: - return self.icol(i) + label = self.columns[i] + if isinstance(i, slice): + # need to return view + lab_slice = slice(label[0], label[-1]) + return self.ix[:, lab_slice] + else: + label = self.columns[i] + if isinstance(label, Index): - def iget_value(self, i, j): - """ - Return scalar value stored at row i and column j, where i and 
j are - integers + # if we have negative indicies, translate to postive here + # (take doesen't deal properly with these) + l = len(self.columns) + i = [ v if v >= 0 else l+v for v in i ] + + return self.take(i, axis=1) - Parameters - ---------- - i : int - j : int + values = self._data.iget(i) + return self._col_klass.from_array(values, index=self.index, + name=label) - Returns - ------- - value : scalar value - """ - row = self.index[i] - col = self.columns[j] - return self.get_value(row, col) + def iget_value(self, i, j): + return self.iat[i,j] def __getitem__(self, key): if isinstance(key, slice): @@ -2054,13 +2027,13 @@ def _getitem_frame(self, key): raise ValueError('Must pass DataFrame with boolean values only') return self.where(key) - def _slice(self, slobj, axis=0): + def _slice(self, slobj, axis=0, raise_on_error=False): if axis == 0: mgr_axis = 1 else: mgr_axis = 0 - new_data = self._data.get_slice(slobj, axis=mgr_axis) + new_data = self._data.get_slice(slobj, axis=mgr_axis, raise_on_error=raise_on_error) return self._constructor(new_data) def _box_item_values(self, key, values): @@ -2370,6 +2343,8 @@ def xs(self, key, axis=0, level=None, copy=True): result.index = new_index return result + _xs = xs + def lookup(self, row_labels, col_labels): """ Label-based "fancy indexing" function for DataFrame. Given equal-length diff --git a/pandas/core/generic.py b/pandas/core/generic.py index afe7f8775b1e9..c25e686afacbf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3,6 +3,7 @@ import numpy as np from pandas.core.index import MultiIndex +import pandas.core.indexing as indexing from pandas.tseries.index import DatetimeIndex import pandas.core.common as com import pandas.lib as lib @@ -59,6 +60,21 @@ def _get_axis(self, axis): name = self._get_axis_name(axis) return getattr(self, name) + #---------------------------------------------------------------------- + # Indexers + @classmethod + def _create_indexer(cls, name, indexer): + """ create an indexer like _name in the class """ + iname = '_%s' % name + setattr(cls,iname,None) + + def _indexer(self): + if getattr(self,iname,None) is None: + setattr(self,iname,indexer(self, name)) + return getattr(self,iname) + + setattr(cls,name,property(_indexer)) + def abs(self): """ Return an object with absolute value taken. 
Only applicable to objects @@ -396,10 +412,6 @@ def sort_index(self, axis=0, ascending=True): new_axis = labels.take(sort_index) return self.reindex(**{axis_name: new_axis}) - @property - def ix(self): - raise NotImplementedError - def reindex(self, *args, **kwds): raise NotImplementedError @@ -466,6 +478,9 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, np.putmask(rs.values, mask, np.nan) return rs +# install the indexerse +for _name, _indexer in indexing.get_indexers_list(): + PandasObject._create_indexer(_name,_indexer) class NDFrame(PandasObject): """ diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 8f812252134a1..b86518e8947ef 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,12 +1,23 @@ # pylint: disable=W0223 from pandas.core.common import _asarray_tuplesafe -from pandas.core.index import Index, MultiIndex +from pandas.core.index import Index, MultiIndex, _ensure_index import pandas.core.common as com import pandas.lib as lib import numpy as np +# the supported indexers +def get_indexers_list(): + + return [ + ('ix' ,_NDFrameIndexer), + ('iloc',_iLocIndexer ), + ('loc' ,_LocIndexer ), + ('at' ,_AtIndexer ), + ('iat' ,_iAtIndexer ), + ] + # "null slice" _NS = slice(None, None) @@ -17,9 +28,10 @@ class IndexingError(Exception): class _NDFrameIndexer(object): - def __init__(self, obj): + def __init__(self, obj, name): self.obj = obj self.ndim = obj.ndim + self.name = name def __iter__(self): raise NotImplementedError('ix is not iterable') @@ -43,15 +55,15 @@ def _get_label(self, label, axis=0): raise IndexingError('no slices here') try: - return self.obj.xs(label, axis=axis, copy=False) + return self.obj._xs(label, axis=axis, copy=False) except Exception: - return self.obj.xs(label, axis=axis, copy=True) + return self.obj._xs(label, axis=axis, copy=True) def _get_loc(self, key, axis=0): return self.obj._ixs(key, axis=axis) - def _slice(self, obj, axis=0): - return self.obj._slice(obj, axis=axis) + def _slice(self, obj, axis=0, raise_on_error=False): + return self.obj._slice(obj, axis=axis, raise_on_error=raise_on_error) def __setitem__(self, key, value): # kludgetastic @@ -74,6 +86,9 @@ def __setitem__(self, key, value): self._setitem_with_indexer(indexer, value) + def _has_valid_tuple(self, key): + pass + def _convert_tuple(self, key): keyidx = [] for i, k in enumerate(key): @@ -212,6 +227,9 @@ def _getitem_tuple(self, tup): if self._multi_take_opportunity(tup): return self._multi_take(tup) + # no multi-index, so validate all of the indexers + self._has_valid_tuple(tup) + # no shortcut needed retval = self.obj for i, key in enumerate(tup): @@ -221,7 +239,7 @@ def _getitem_tuple(self, tup): if _is_null_slice(key): continue - retval = retval.ix._getitem_axis(key, axis=i) + retval = getattr(retval,self.name)._getitem_axis(key, axis=i) return retval @@ -308,8 +326,12 @@ def _getitem_lowerdim(self, tup): if _is_label_like(key) or isinstance(key, tuple): section = self._getitem_axis(key, axis=i) + # we have yielded a scalar ? 
+ if not _is_list_like(section): + return section + # might have been a MultiIndex - if section.ndim == self.ndim: + elif section.ndim == self.ndim: new_key = tup[:i] + (_NS,) + tup[i + 1:] # new_key = tup[:i] + tup[i+1:] else: @@ -325,7 +347,7 @@ def _getitem_lowerdim(self, tup): if len(new_key) == 1: new_key, = new_key - return section.ix[new_key] + return getattr(section,self.name)[new_key] raise IndexingError('not applicable') @@ -593,6 +615,207 @@ def _get_slice_axis(self, slice_obj, axis=0): else: return self.obj.take(indexer, axis=axis) +class _LocationIndexer(_NDFrameIndexer): + _valid_types = None + _exception = Exception + + def _has_valid_type(self, k, axis): + raise NotImplementedError() + + def _has_valid_tuple(self, key): + """ check the key for valid keys across my indexer """ + for i, k in enumerate(key): + if i >= self.obj.ndim: + raise ValueError('Too many indexers') + if not self._has_valid_type(k,i): + raise ValueError("Location based indexing can only have [%s] types" % self._valid_types) + + def __getitem__(self, key): + if type(key) is tuple: + return self._getitem_tuple(key) + else: + return self._getitem_axis(key, axis=0) + + def _getitem_axis(self, key, axis=0): + raise NotImplementedError() + + def _getbool_axis(self, key, axis=0): + labels = self.obj._get_axis(axis) + key = _check_bool_indexer(labels, key) + inds, = key.nonzero() + try: + return self.obj.take(inds, axis=axis) + except (Exception), detail: + raise self._exception(detail) + +class _LocIndexer(_LocationIndexer): + """ purely label based location based indexing """ + _valid_types = "labels (MUST BE IN THE INDEX), slices of labels (BOTH endpoints included! Can be slices of integers if the index is integers), listlike of labels, boolean" + _exception = KeyError + + def _has_valid_type(self, key, axis): + ax = self.obj._get_axis(axis) + + # valid for a label where all labels are in the index + # slice of lables (where start-end in labels) + # slice of integers (only if in the lables) + # boolean + + if isinstance(key, slice): + + if key.start is not None: + if key.start not in ax: + raise KeyError("start bound [%s] is not the [%s]" % (key.start,self.obj._get_axis_name(axis))) + if key.stop is not None: + stop = key.stop + if com.is_integer(stop): + stop -= 1 + if stop not in ax: + raise KeyError("stop bound [%s] is not in the [%s]" % (stop,self.obj._get_axis_name(axis))) + + elif com._is_bool_indexer(key): + return True + + elif _is_list_like(key): + + # require all elements in the index + idx = _ensure_index(key) + if not idx.isin(ax).all(): + raise KeyError("[%s] are not in ALL in the [%s]" % (key,self.obj._get_axis_name(axis))) + + return True + + else: + + # if its empty we want a KeyError here + if not len(ax): + raise KeyError("The [%s] axis is empty" % self.obj._get_axis_name(axis)) + + if not key in ax: + raise KeyError("the label [%s] is not in the [%s]" % (key,self.obj._get_axis_name(axis))) + + return True + + def _getitem_axis(self, key, axis=0): + labels = self.obj._get_axis(axis) + + if isinstance(key, slice): + ltype = labels.inferred_type + if ltype == 'mixed-integer-float' or ltype == 'mixed-integer': + raise ValueError('cannot slice with a non-single type label array') + return self._get_slice_axis(key, axis=axis) + elif com._is_bool_indexer(key): + return self._getbool_axis(key, axis=axis) + elif _is_list_like(key) and not (isinstance(key, tuple) and + isinstance(labels, MultiIndex)): + + if hasattr(key, 'ndim') and key.ndim > 1: + raise ValueError('Cannot index with multidimensional 
key') + + return self._getitem_iterable(key, axis=axis) + else: + return self._get_label(key, axis=axis) + +class _iLocIndexer(_LocationIndexer): + """ purely integer based location based indexing """ + _valid_types = "integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array" + _exception = IndexError + + def _has_valid_type(self, key, axis): + return isinstance(key, slice) or com.is_integer(key) or com._is_bool_indexer(key) or _is_list_like(key) + + def _getitem_tuple(self, tup): + + self._has_valid_tuple(tup) + retval = self.obj + for i, key in enumerate(tup): + if _is_null_slice(key): + continue + + retval = getattr(retval,self.name)._getitem_axis(key, axis=i) + + return retval + + def _get_slice_axis(self, slice_obj, axis=0): + obj = self.obj + + if not _need_slice(slice_obj): + return obj + + if isinstance(slice_obj, slice): + return self._slice(slice_obj, axis=axis, raise_on_error=True) + else: + return self.obj.take(slice_obj, axis=axis) + + def _getitem_axis(self, key, axis=0): + + if isinstance(key, slice): + return self._get_slice_axis(key, axis=axis) + + elif com._is_bool_indexer(key): + return self._getbool_axis(key, axis=axis) + + # a single integer or a list of integers + else: + + if not (com.is_integer(key) or _is_list_like(key)): + raise ValueError("Cannot index by location index with a non-integer key") + + return self._get_loc(key,axis=axis) + + def _convert_to_indexer(self, obj, axis=0): + """ much simpler as we only have to deal with our valid types """ + if self._has_valid_type(obj,axis): + return obj + + raise ValueError("Can only index by location with a [%s]" % self._valid_types) + + +class _ScalarAccessIndexer(_NDFrameIndexer): + """ access scalars quickly """ + + def _convert_key(self, key): + return list(key) + + def __getitem__(self, key): + if not isinstance(key, tuple): + + # we could have a convertible item here (e.g. 
Timestamp) + if not _is_list_like(key): + key = tuple([ key ]) + else: + raise ValueError('Invalid call for scalar access (getting)!') + + if len(key) != self.obj.ndim: + raise ValueError('Not enough indexers for scalar access (getting)!') + key = self._convert_key(key) + return self.obj.get_value(*key) + + def __setitem__(self, key, value): + if not isinstance(key, tuple): + raise ValueError('Invalid call for scalar access (setting)!') + if len(key) != self.obj.ndim: + raise ValueError('Not enough indexers for scalar access (setting)!') + key = self._convert_key(key) + key.append(value) + self.obj.set_value(*key) + +class _AtIndexer(_ScalarAccessIndexer): + """ label based scalar accessor """ + pass + +class _iAtIndexer(_ScalarAccessIndexer): + """ integer based scalar accessor """ + + def _convert_key(self, key): + """ require integer args (and convert to label arguments) """ + ckey = [] + for a, i in zip(self.obj.axes,key): + if not com.is_integer(i): + raise ValueError("iAt based indexing can only have integer indexers") + ckey.append(a[i]) + return ckey + # 32-bit floating point machine epsilon _eps = np.finfo('f4').eps @@ -737,6 +960,17 @@ def _need_slice(obj): (obj.step is not None and obj.step != 1)) +def _check_slice_bounds(slobj, values): + l = len(values) + start = slobj.start + if start is not None: + if start < -l or start > l-1: + raise IndexError("out-of-bounds on slice (start)") + stop = slobj.stop + if stop is not None: + if stop < -l-1 or stop > l: + raise IndexError("out-of-bounds on slice (end)") + def _maybe_droplevels(index, key): # drop levels if isinstance(key, tuple): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 159393be38b07..5bf918aff6367 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -5,6 +5,7 @@ import numpy as np from pandas.core.index import Index, _ensure_index, _handle_legacy_indexes +from pandas.core.indexing import _check_slice_bounds import pandas.core.common as com import pandas.lib as lib import pandas.tslib as tslib @@ -1034,8 +1035,12 @@ def get_bool_data(self, copy=False, as_blocks=False): return self.get_numeric_data(copy=copy, type_list=(BoolBlock,), as_blocks=as_blocks) - def get_slice(self, slobj, axis=0): + def get_slice(self, slobj, axis=0, raise_on_error=False): new_axes = list(self.axes) + + if raise_on_error: + _check_slice_bounds(slobj, new_axes[axis]) + new_axes[axis] = new_axes[axis][slobj] if axis == 0: diff --git a/pandas/core/panel.py b/pandas/core/panel.py index b418995ce3085..9f91d8add1eac 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -12,7 +12,7 @@ from pandas.core.categorical import Factor from pandas.core.index import (Index, MultiIndex, _ensure_index, _get_combined_index) -from pandas.core.indexing import _NDFrameIndexer, _maybe_droplevels +from pandas.core.indexing import _maybe_droplevels, _is_list_like from pandas.core.internals import BlockManager, make_block, form_blocks from pandas.core.series import Series from pandas.core.frame import DataFrame @@ -540,16 +540,6 @@ def _get_plane_axes(self, axis): return index, columns - # Fancy indexing - _ix = None - - @property - def ix(self): - if self._ix is None: - self._ix = _NDFrameIndexer(self) - - return self._ix - def _wrap_array(self, arr, axes, copy=False): d = self._construct_axes_dict_from(self, axes, copy=copy) return self._constructor(arr, **d) @@ -679,8 +669,8 @@ def __getattr__(self, name): raise AttributeError("'%s' object has no attribute '%s'" % (type(self).__name__, name)) - def _slice(self, 
slobj, axis=0): - new_data = self._data.get_slice(slobj, axis=axis) + def _slice(self, slobj, axis=0, raise_on_error=False): + new_data = self._data.get_slice(slobj, axis=axis, raise_on_error=raise_on_error) return self._constructor(new_data) def __setitem__(self, key, value): @@ -1075,10 +1065,17 @@ def xs(self, key, axis=1, copy=True): new_data = self._data.xs(key, axis=axis_number, copy=copy) return self._constructor_sliced(new_data) + _xs = xs + def _ixs(self, i, axis=0): # for compatibility with .ix indexing # Won't work with hierarchical indexing yet key = self._get_axis(axis)[i] + + # xs cannot handle a non-scalar key, so just reindex here + if _is_list_like(key): + return self.reindex(**{ self._get_axis_name(axis) : key }) + return self.xs(key, axis=axis) def groupby(self, function, axis='major'): diff --git a/pandas/core/series.py b/pandas/core/series.py index b349dd65ff82d..27480d9e489be 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -20,7 +20,7 @@ _infer_dtype_from_scalar, is_list_like) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index, _handle_legacy_indexes) -from pandas.core.indexing import _SeriesIndexer, _check_bool_indexer +from pandas.core.indexing import _SeriesIndexer, _check_bool_indexer, _check_slice_bounds from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex, Period from pandas.util import py3compat @@ -547,15 +547,58 @@ def __setstate__(self, state): self.index = _handle_legacy_indexes([index])[0] self.name = name - _ix = None + # indexers + @property + def axes(self): + return [ self.index ] @property def ix(self): if self._ix is None: - self._ix = _SeriesIndexer(self) + self._ix = _SeriesIndexer(self, 'ix') return self._ix + def _xs(self, key, axis=0, level=None, copy=True): + return self.__getitem__(key) + + def _ixs(self, i, axis=0): + """ + Return the i-th value or values in the Series by location + + Parameters + ---------- + i : int, slice, or sequence of integers + + Returns + ------- + value : scalar (int) or Series (slice, sequence) + """ + try: + return _index.get_value_at(self, i) + except IndexError: + raise + except: + if isinstance(i, slice): + return self[i] + else: + label = self.index[i] + if isinstance(label, Index): + return self.reindex(label) + else: + return _index.get_value_at(self, i) + + + @property + def _is_mixed_type(self): + return False + + def _slice(self, slobj, axis=0, raise_on_error=False): + if raise_on_error: + _check_slice_bounds(slobj, self.values) + + return self._constructor(self.values[slobj], index=self.index[slobj]) + def __getitem__(self, key): try: return self.index.get_value(self, key) @@ -908,34 +951,9 @@ def get(self, label, default=None): except KeyError: return default - def iget_value(self, i): - """ - Return the i-th value or values in the Series by location - - Parameters - ---------- - i : int, slice, or sequence of integers - - Returns - ------- - value : scalar (int) or Series (slice, sequence) - """ - try: - return _index.get_value_at(self, i) - except IndexError: - raise - except: - if isinstance(i, slice): - return self[i] - else: - label = self.index[i] - if isinstance(label, Index): - return self.reindex(label) - else: - return _index.get_value_at(self, i) - - iget = iget_value - irow = iget_value + iget_value = _ixs + iget = _ixs + irow = _ixs def get_value(self, label): """ diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index bf978c322dbd2..f142b36534e22 100644 --- a/pandas/sparse/frame.py +++ 
b/pandas/sparse/frame.py @@ -10,6 +10,7 @@ from pandas.core.common import _pickle_array, _unpickle_array, _try_sort from pandas.core.index import Index, MultiIndex, _ensure_index +from pandas.core.indexing import _check_slice_bounds from pandas.core.series import Series from pandas.core.frame import (DataFrame, extract_index, _prep_ndarray, _default_index) @@ -416,11 +417,15 @@ def set_value(self, index, col, value): return dense.to_sparse(kind=self.default_kind, fill_value=self.default_fill_value) - def _slice(self, slobj, axis=0): + def _slice(self, slobj, axis=0, raise_on_error=False): if axis == 0: + if raise_on_error: + _check_slice_bounds(slobj, self.index) new_index = self.index[slobj] new_columns = self.columns else: + if raise_on_error: + _check_slice_bounds(slobj, self.columns) new_index = self.index new_columns = self.columns[slobj] diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 304072acc664e..d8dd2e8c6f0d0 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -6305,7 +6305,6 @@ def _check_set(df, cond, check_dtypes = True): econd = cond.reindex_like(df).fillna(True) expected = dfi.mask(~econd) - #import pdb; pdb.set_trace() dfi.where(cond, np.nan, inplace=True) assert_frame_equal(dfi, expected) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py new file mode 100644 index 0000000000000..e48d8dbdcb498 --- /dev/null +++ b/pandas/tests/test_indexing.py @@ -0,0 +1,678 @@ +# pylint: disable-msg=W0612,E1101 +import unittest +import nose +import itertools + +from numpy import random, nan +from numpy.random import randn +import numpy as np +from numpy.testing import assert_array_equal + +import pandas as pan +import pandas.core.common as com +from pandas.core.api import (DataFrame, Index, Series, Panel, notnull, isnull, + MultiIndex, DatetimeIndex, Timestamp) +from pandas.util.testing import (assert_almost_equal, assert_series_equal, + assert_frame_equal, assert_panel_equal) +from pandas.util import py3compat + +import pandas.util.testing as tm +import pandas.lib as lib +from pandas import date_range +from numpy.testing.decorators import slow + +_verbose = False + +#------------------------------------------------------------------------------- +# Indexing test cases + + +def _generate_indices(f, values=False): + """ generate the indicies + if values is True , use the axis values + is False, use the range + """ + + axes = f.axes + if values: + axes = [ range(len(a)) for a in axes ] + + return itertools.product(*axes) + +def _get_value(f, i, values=False): + """ return the value for the location i """ + + # check agains values + if values: + return f.values[i] + + # this is equiv of f[col][row]..... 
+ #v = f + #for a in reversed(i): + # v = v.__getitem__(a) + #return v + return f.ix[i] + +def _get_result(obj, method, key, axis): + """ return the result for this obj with this key and this axis """ + + if isinstance(key, dict): + key = key[axis] + + # use an artifical conversion to map the key as integers to the labels + # so ix can work for comparisions + if method == 'indexer': + method = 'ix' + key = obj._get_axis(axis)[key] + + # in case we actually want 0 index slicing + try: + xp = getattr(obj, method).__getitem__(_axify(obj,key,axis)) + except: + xp = getattr(obj, method).__getitem__(key) + + return xp + +def _axify(obj, key, axis): + # create a tuple accessor + if axis is not None: + axes = [ slice(None) ] * obj.ndim + axes[axis] = key + return tuple(axes) + return k + + +class TestIndexing(unittest.TestCase): + + _multiprocess_can_split_ = True + + _objs = set(['series','frame','panel']) + _typs = set(['ints','labels','mixed','ts','floats','empty']) + + def setUp(self): + import warnings + warnings.filterwarnings(action='ignore', category=FutureWarning) + + self.series_ints = Series(np.random.rand(4), index=range(0,8,2)) + self.frame_ints = DataFrame(np.random.randn(4, 4), index=range(0, 8, 2), columns=range(0,12,3)) + self.panel_ints = Panel(np.random.rand(4,4,4), items=range(0,8,2),major_axis=range(0,12,3),minor_axis=range(0,16,4)) + + self.series_labels = Series(np.random.randn(4), index=list('abcd')) + self.frame_labels = DataFrame(np.random.randn(4, 4), index=list('abcd'), columns=list('ABCD')) + self.panel_labels = Panel(np.random.randn(4,4,4), items=list('abcd'), major_axis=list('ABCD'), minor_axis=list('ZYXW')) + + self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8]) + self.frame_mixed = DataFrame(np.random.randn(4, 4), index=[2, 4, 'null', 8]) + self.panel_mixed = Panel(np.random.randn(4,4,4), items=[2,4,'null',8]) + + self.series_ts = Series(np.random.randn(4), index=date_range('20130101', periods=4)) + self.frame_ts = DataFrame(np.random.randn(4, 4), index=date_range('20130101', periods=4)) + self.panel_ts = Panel(np.random.randn(4, 4, 4), items=date_range('20130101', periods=4)) + + #self.series_floats = Series(np.random.randn(4), index=[1.00, 2.00, 3.00, 4.00]) + #self.frame_floats = DataFrame(np.random.randn(4, 4), columns=[1.00, 2.00, 3.00, 4.00]) + #self.panel_floats = Panel(np.random.rand(4,4,4), items = [1.00,2.00,3.00,4.00]) + + self.frame_empty = DataFrame({}) + self.series_empty = Series({}) + self.panel_empty = Panel({}) + + # form agglomerates + for o in self._objs: + + d = dict() + for t in self._typs: + d[t] = getattr(self,'%s_%s' % (o,t),None) + + setattr(self,o,d) + + def check_values(self, f, func, values = False): + + if f is None: return + axes = f.axes + indicies = itertools.product(*axes) + + for i in indicies: + result = getattr(f,func)[i] + + # check agains values + if values: + expected = f.values[i] + else: + expected = f + for a in reversed(i): + expected = expected.__getitem__(a) + + assert_almost_equal(result, expected) + + + def check_result(self, name, method1, key1, method2, key2, typs = None, objs = None, axes = None, fails = None): + + + def _eq(t, o, a, obj, k1, k2): + """ compare equal for these 2 keys """ + + if a is not None and a > obj.ndim-1: + return + + def _print(result, error = None): + if error is not None: + error = str(error) + v = "%-16.16s [%-16.16s]: [typ->%-8.8s,obj->%-8.8s,key1->(%-4.4s),key2->(%-4.4s),axis->%s] %s" % (name,result,t,o,method1,method2,a,error or '') + if _verbose: + print(v) + + 
try: + + ### good debug location ### + #if name == 'bool' and t == 'empty' and o == 'series' and method1 == 'loc': + # import pdb; pdb.set_trace() + + rs = getattr(obj, method1).__getitem__(_axify(obj,k1,a)) + + try: + xp = _get_result(obj,method2,k2,a) + except: + result = 'no comp' + _print(result) + return + + try: + if np.isscalar(rs) and np.isscalar(xp): + self.assert_(rs == xp) + elif xp.ndim == 1: + assert_series_equal(rs,xp) + elif xp.ndim == 2: + assert_frame_equal(rs,xp) + elif xp.ndim == 3: + assert_panel_equal(rs,xp) + result = 'ok' + except (AssertionError): + result = 'fail' + + # reverse the checks + if fails is True: + if result == 'fail': + result = 'ok (fail)' + + if not result.startswith('ok'): + raise AssertionError(_print(result)) + + _print(result) + + except (AssertionError): + raise + except (TypeError): + raise AssertionError(_print('type error')) + except (Exception), detail: + + # if we are in fails, the ok, otherwise raise it + if fails is not None: + if fails == type(detail): + result = 'ok (%s)' % type(detail).__name__ + _print(result) + return + + result = type(detail).__name__ + raise AssertionError(_print(result, error = detail)) + + if typs is None: + typs = self._typs + + if objs is None: + objs = self._objs + + if axes is not None: + if not isinstance(axes,(tuple,list)): + axes = [ axes ] + else: + axes = list(axes) + else: + axes = [ 0, 1, 2] + + # check + for o in objs: + if o not in self._objs: + continue + + d = getattr(self,o) + for a in axes: + for t in typs: + if t not in self._typs: + continue + + obj = d[t] + if obj is not None: + obj = obj.copy() + + k2 = key2 + _eq(t, o, a, obj, key1, k2) + + def test_at_and_iat_get(self): + + def _check(f, func, values = False): + + if f is not None: + indicies = _generate_indices(f, values) + for i in indicies: + result = getattr(f,func)[i] + expected = _get_value(f,i,values) + assert_almost_equal(result, expected) + + for o in self._objs: + + d = getattr(self,o) + + # iat + _check(d['ints'],'iat', values=True) + for f in [d['labels'],d['ts'],d['floats']]: + if f is not None: + self.assertRaises(ValueError, self.check_values, f, 'iat') + + # at + _check(d['ints'], 'at') + _check(d['labels'],'at') + _check(d['ts'], 'at') + _check(d['floats'],'at') + + def test_at_and_iat_set(self): + + def _check(f, func, values = False): + + if f is not None: + indicies = _generate_indices(f, values) + for i in indicies: + getattr(f,func)[i] = 1 + expected = _get_value(f,i,values) + assert_almost_equal(expected, 1) + + for t in self._objs: + + d = getattr(self,t) + + _check(d['ints'],'iat',values=True) + for f in [d['labels'],d['ts'],d['floats']]: + if f is not None: + self.assertRaises(ValueError, _check, f, 'iat') + + # at + _check(d['ints'], 'at') + _check(d['labels'],'at') + _check(d['ts'], 'at') + _check(d['floats'],'at') + + def test_at_timestamp(self): + + # as timestamp is not a tuple! 
+ dates = date_range('1/1/2000', periods=8) + df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) + s = df['A'] + + result = s.at[dates[5]] + xp = s.values[5] + self.assert_(result == xp) + + def test_iat_invalid_args(self): + pass + + def test_iloc_getitem_int(self): + + # integer + self.check_result('integer', 'iloc', 2, 'ix', { 0 : 4, 1: 6, 2: 8 }, typs = ['ints']) + self.check_result('integer', 'iloc', 2, 'indexer', 2, typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) + + def test_iloc_getitem_neg_int(self): + + # neg integer + self.check_result('neg int', 'iloc', -1, 'ix', { 0 : 6, 1: 9, 2: 12 }, typs = ['ints']) + self.check_result('neg int', 'iloc', -1, 'indexer', -1, typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) + + def test_iloc_getitem_list_int(self): + + # list of ints + self.check_result('list int', 'iloc', [0,1,2], 'ix', { 0 : [0,2,4], 1 : [0,3,6], 2: [0,4,8] }, typs = ['ints']) + self.check_result('list int', 'iloc', [0,1,2], 'indexer', [0,1,2], typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) + + def test_iloc_getitem_dups(self): + + # no dups in panel (bug?) + self.check_result('list int (dups)', 'iloc', [0,1,1,3], 'ix', { 0 : [0,2,2,6], 1 : [0,3,3,9] }, objs = ['series','frame'], typs = ['ints']) + + def test_iloc_getitem_array(self): + + # array like + s = Series(index=range(1,4)) + self.check_result('array like', 'iloc', s.index, 'ix', { 0 : [2,4,6], 1 : [3,6,9], 2: [4,8,12] }, typs = ['ints']) + + def test_iloc_getitem_bool(self): + + # boolean indexers + b = [True,False,True,False,] + self.check_result('bool', 'iloc', b, 'ix', b, typs = ['ints']) + self.check_result('bool', 'iloc', b, 'ix', b, typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) + + def test_iloc_getitem_slice(self): + + # slices + self.check_result('slice', 'iloc', slice(1,3), 'ix', { 0 : [2,4], 1: [3,6], 2: [4,8] }, typs = ['ints']) + self.check_result('slice', 'iloc', slice(1,3), 'indexer', slice(1,3), typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) + + def test_iloc_getitem_out_of_bounds(self): + + # out-of-bounds slice + self.assertRaises(IndexError, self.frame_ints.iloc.__getitem__, tuple([slice(None),slice(1,5,None)])) + self.assertRaises(IndexError, self.frame_ints.iloc.__getitem__, tuple([slice(None),slice(-5,3,None)])) + self.assertRaises(IndexError, self.frame_ints.iloc.__getitem__, tuple([slice(1,5,None)])) + self.assertRaises(IndexError, self.frame_ints.iloc.__getitem__, tuple([slice(-5,3,None)])) + + def test_iloc_setitem(self): + df = self.frame_ints + + df.iloc[1,1] = 1 + result = df.iloc[1,1] + self.assert_(result == 1) + + df.iloc[:,2:3] = 0 + expected = df.iloc[:,2:3] + result = df.iloc[:,2:3] + assert_frame_equal(result, expected) + + def test_iloc_multiindex(self): + df = DataFrame(np.random.randn(3, 3), + columns=[[2,2,4],[6,8,10]], + index=[[4,4,8],[8,10,12]]) + + rs = df.iloc[2] + xp = df.irow(2) + assert_series_equal(rs, xp) + + rs = df.iloc[:,2] + xp = df.icol(2) + assert_series_equal(rs, xp) + + rs = df.iloc[2,2] + xp = df.values[2,2] + self.assert_(rs == xp) + + def test_loc_getitem_int(self): + + # int label + self.check_result('int label', 'loc', 2, 'ix', 2, typs = ['ints'], axes = 0) + self.check_result('int label', 'loc', 3, 'ix', 3, typs = ['ints'], axes = 1) + self.check_result('int label', 'loc', 4, 'ix', 4, typs = ['ints'], axes = 2) + self.check_result('int label', 'loc', 2, 'ix', 2, typs = ['label'], fails = KeyError) + + def 
test_loc_getitem_label(self): + + # label + self.check_result('label', 'loc', 'c', 'ix', 'c', typs = ['labels'], axes=0) + self.check_result('label', 'loc', 'null', 'ix', 'null', typs = ['mixed'] , axes=0) + self.check_result('label', 'loc', 8, 'ix', 8, typs = ['mixed'] , axes=0) + self.check_result('label', 'loc', Timestamp('20130102'), 'ix', 1, typs = ['ts'], axes=0) + self.check_result('label', 'loc', 'c', 'ix', 'c', typs = ['empty'], fails = KeyError) + + def test_loc_getitem_label_out_of_range(self): + + # out of range label + self.check_result('label range', 'loc', 'f', 'ix', 'f', typs = ['ints','labels','mixed','ts','floats'], fails=KeyError) + + def test_loc_getitem_label_list(self): + + # list of labels + self.check_result('list lbl', 'loc', [0,2,4], 'ix', [0,2,4], typs = ['ints'], axes=0) + self.check_result('list lbl', 'loc', [3,6,9], 'ix', [3,6,9], typs = ['ints'], axes=1) + self.check_result('list lbl', 'loc', [4,8,12], 'ix', [4,8,12], typs = ['ints'], axes=2) + self.check_result('list lbl', 'loc', ['a','b','d'], 'ix', ['a','b','d'], typs = ['labels'], axes=0) + self.check_result('list lbl', 'loc', ['A','B','C'], 'ix', ['A','B','C'], typs = ['labels'], axes=1) + self.check_result('list lbl', 'loc', ['Z','Y','W'], 'ix', ['Z','Y','W'], typs = ['labels'], axes=2) + self.check_result('list lbl', 'loc', [2,8,'null'], 'ix', [2,8,'null'], typs = ['mixed'], axes=0) + self.check_result('list lbl', 'loc', [Timestamp('20130102'),Timestamp('20130103')], 'ix', + [Timestamp('20130102'),Timestamp('20130103')], typs = ['ts'], axes=0) + + # fails + self.check_result('list lbl', 'loc', [0,1,2], 'indexer', [0,1,2], typs = ['empty'], fails = KeyError) + self.check_result('list lbl', 'loc', [0,2,3], 'ix', [0,2,3], typs = ['ints'], axes=0, fails = KeyError) + self.check_result('list lbl', 'loc', [3,6,7], 'ix', [3,6,9], typs = ['ints'], axes=1, fails = KeyError) + self.check_result('list lbl', 'loc', [4,8,10], 'ix', [4,8,12], typs = ['ints'], axes=2, fails = KeyError) + + # array like + self.check_result('array like', 'loc', Series(index=[0,2,4]).index, 'ix', [0,2,4], typs = ['ints'], axes=0) + self.check_result('array like', 'loc', Series(index=[3,6,9]).index, 'ix', [3,6,9], typs = ['ints'], axes=1) + self.check_result('array like', 'loc', Series(index=[4,8,12]).index, 'ix', [4,8,12], typs = ['ints'], axes=2) + + def test_loc_getitem_bool(self): + + # boolean indexers + b = [True,False,True,False] + self.check_result('bool', 'loc', b, 'ix', b, typs = ['ints','labels','mixed','ts','floats']) + self.check_result('bool', 'loc', b, 'ix', b, typs = ['empty'], fails = KeyError) + + def test_loc_getitem_int_slice(self): + + # int slices in int + self.check_result('int slice1', 'loc', slice(1,3), 'ix', { 0 : [2,4], 1: [3,6], 2: [4,8] }, typs = ['ints'], fails=KeyError) + + # ok + self.check_result('int slice2', 'loc', slice(2,5), 'ix', [2,4], typs = ['ints'], axes = 0) + self.check_result('int slice2', 'loc', slice(3,7), 'ix', [3,6], typs = ['ints'], axes = 1) + self.check_result('int slice2', 'loc', slice(4,9), 'ix', [4,8], typs = ['ints'], axes = 2) + + def test_loc_getitem_label_slice(self): + + # label slices (with ints) + self.check_result('lab slice', 'loc', slice(1,3), 'ix', slice(1,3), typs = ['labels','mixed','ts','floats','empty'], fails=KeyError) + + # real label slices + self.check_result('lab slice', 'loc', slice('a','c'), 'ix', slice('a','c'), typs = ['labels'], axes=0) + self.check_result('lab slice', 'loc', slice('A','C'), 'ix', slice('A','C'), typs = ['labels'], axes=1) + 
self.check_result('lab slice', 'loc', slice('W','Z'), 'ix', slice('W','Z'), typs = ['labels'], axes=2) + + self.check_result('ts slice', 'loc', slice('20130102','20130104'), 'ix', slice('20130102','20130104'), typs = ['ts'], axes=0) + self.check_result('ts slice', 'loc', slice('20130102','20130104'), 'ix', slice('20130102','20130104'), typs = ['ts'], axes=1, fails=KeyError) + self.check_result('ts slice', 'loc', slice('20130102','20130104'), 'ix', slice('20130102','20130104'), typs = ['ts'], axes=2, fails=KeyError) + + self.check_result('mixed slice', 'loc', slice(2,8), 'ix', slice(2,8), typs = ['mixed'], axes=0, fails=KeyError) + self.check_result('mixed slice', 'loc', slice(2,8), 'ix', slice(2,8), typs = ['mixed'], axes=1, fails=KeyError) + self.check_result('mixed slice', 'loc', slice(2,8), 'ix', slice(2,8), typs = ['mixed'], axes=2, fails=KeyError) + + # you would think this would work, but we don't have an ordering, so fail + self.check_result('mixed slice', 'loc', slice(2,5,2), 'ix', slice(2,4,2), typs = ['mixed'], axes=0, fails=ValueError) + + def test_loc_general(self): + + # GH 2922 (these are fails) + df = DataFrame(np.random.rand(4,4),columns=['A','B','C','D']) + self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(0,2),slice(0,2)])) + + df = DataFrame(np.random.rand(4,4),columns=['A','B','C','D'], index=['A','B','C','D']) + self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(0,2),df.columns[0:2]])) + + # want this to work + result = df.loc[:,"A":"B"].iloc[0:2,:] + self.assert_((result.columns == ['A','B']).all() == True) + self.assert_((result.index == ['A','B']).all() == True) + + def test_loc_setitem_frame(self): + df = self.frame_labels + + result = df.iloc[0,0] + + df.loc['a','A'] = 1 + result = df.loc['a','A'] + self.assert_(result == 1) + + result = df.iloc[0,0] + self.assert_(result == 1) + + df.loc[:,'B':'D'] = 0 + expected = df.loc[:,'B':'D'] + result = df.ix[:,1:] + assert_frame_equal(result, expected) + + def test_iloc_getitem_frame(self): + """ originally from test_frame.py""" + df = DataFrame(np.random.randn(10, 4), index=range(0, 20, 2), columns=range(0,8,2)) + + result = df.iloc[2] + exp = df.ix[4] + assert_series_equal(result, exp) + + result = df.iloc[2,2] + exp = df.ix[4,4] + self.assert_(result == exp) + + # slice + result = df.iloc[4:8] + expected = df.ix[8:14] + assert_frame_equal(result, expected) + + result = df.iloc[:,2:3] + expected = df.ix[:,4:5] + assert_frame_equal(result, expected) + + # list of integers + result = df.iloc[[0,1,3]] + expected = df.ix[[0,2,6]] + assert_frame_equal(result, expected) + + result = df.iloc[[0,1,3],[0,1]] + expected = df.ix[[0,2,6],[0,2]] + assert_frame_equal(result, expected) + + # neg indicies + result = df.iloc[[-1,1,3],[-1,1]] + expected = df.ix[[18,2,6],[6,2]] + assert_frame_equal(result, expected) + + # dups indicies + result = df.iloc[[-1,-1,1,3],[-1,1]] + expected = df.ix[[18,18,2,6],[6,2]] + assert_frame_equal(result, expected) + + # with index-like + s = Series(index=range(1,5)) + result = df.iloc[s.index] + expected = df.ix[[2,4,6,8]] + assert_frame_equal(result, expected) + + # out-of-bounds slice + self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(None),slice(1,5,None)])) + self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(None),slice(-5,3,None)])) + self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(1,11,None)])) + self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(-11,3,None)])) + + # try with labelled frame + df = 
DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), columns=list('ABCD')) + + result = df.iloc[1,1] + exp = df.ix['b','B'] + self.assert_(result == exp) + + result = df.iloc[:,2:3] + expected = df.ix[:,['C']] + assert_frame_equal(result, expected) + + # negative indexing + result = df.iloc[-1,-1] + exp = df.ix['j','D'] + self.assert_(result == exp) + + # out-of-bounds exception + self.assertRaises(IndexError, df.iloc.__getitem__, tuple([10,5])) + + # trying to use a label + self.assertRaises(ValueError, df.iloc.__getitem__, tuple(['j','D'])) + + def test_iloc_setitem_series(self): + """ originally from test_series.py """ + df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), columns=list('ABCD')) + + df.iloc[1,1] = 1 + result = df.iloc[1,1] + self.assert_(result == 1) + + df.iloc[:,2:3] = 0 + expected = df.iloc[:,2:3] + result = df.iloc[:,2:3] + assert_frame_equal(result, expected) + + def test_iloc_setitem_series(self): + s = Series(np.random.randn(10), index=range(0,20,2)) + + s.iloc[1] = 1 + result = s.iloc[1] + self.assert_(result == 1) + + s.iloc[:4] = 0 + expected = s.iloc[:4] + result = s.iloc[:4] + assert_series_equal(result, expected) + + def test_iloc_multiindex(self): + mi_labels = DataFrame(np.random.randn(4, 3), columns=[['i', 'i', 'j'], + ['A', 'A', 'B']], + index=[['i', 'i', 'j', 'k'], ['X', 'X', 'Y','Y']]) + + mi_int = DataFrame(np.random.randn(3, 3), + columns=[[2,2,4],[6,8,10]], + index=[[4,4,8],[8,10,12]]) + + + # the first row + rs = mi_int.iloc[0] + xp = mi_int.ix[4].ix[8] + assert_series_equal(rs, xp) + + # 2nd (last) columns + rs = mi_int.iloc[:,2] + xp = mi_int.ix[:,2] + assert_series_equal(rs, xp) + + # corner column + rs = mi_int.iloc[2,2] + xp = mi_int.ix[:,2].ix[2] + self.assert_(rs == xp) + + # this is basically regular indexing + rs = mi_labels.iloc[2,2] + xp = mi_labels.ix['j'].ix[:,'j'].ix[0,0] + self.assert_(rs == xp) + + def test_loc_multiindex(self): + + mi_labels = DataFrame(np.random.randn(3, 3), columns=[['i', 'i', 'j'], + ['A', 'A', 'B']], + index=[['i', 'i', 'j'], ['X', 'X', 'Y']]) + + mi_int = DataFrame(np.random.randn(3, 3), + columns=[[2,2,4],[6,8,10]], + index=[[4,4,8],[8,10,12]]) + + # the first row + rs = mi_labels.loc['i'] + xp = mi_labels.ix['i'] + assert_frame_equal(rs, xp) + + # 2nd (last) columns + rs = mi_labels.loc[:,'j'] + xp = mi_labels.ix[:,'j'] + assert_frame_equal(rs, xp) + + # corner column + rs = mi_labels.loc['j'].loc[:,'j'] + xp = mi_labels.ix['j'].ix[:,'j'] + assert_frame_equal(rs,xp) + + # with a tuple + rs = mi_labels.loc[('i','X')] + xp = mi_labels.ix[('i','X')] + assert_frame_equal(rs,xp) + + rs = mi_int.loc[4] + xp = mi_int.ix[4] + assert_frame_equal(rs,xp) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False)
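
As a quick, illustrative aside for reviewers (not part of the patch itself): the sketch below exercises the strict label based and position based accessors added above, assuming a pandas build that ships ``.loc`` and ``.iloc``; the frame, index, and column labels are made up for illustration. Label slices include both endpoints and unknown labels raise ``KeyError``, while integer slices follow the usual Python/numpy end-exclusive convention and non-integer keys are rejected::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.arange(16).reshape(4, 4),
                      index=list('abcd'), columns=list('WXYZ'))

    # label based: both slice endpoints are included
    df.loc['a':'c', ['W', 'Y']]

    # a label that is not in the index raises KeyError (no positional fallback)
    try:
        df.loc['e']
    except KeyError:
        pass

    # position based: start included, stop excluded, as in numpy
    df.iloc[0:2, 1:3]

    # non-integer keys are rejected by .iloc
    try:
        df.iloc['a']
    except (TypeError, ValueError):
        pass  # this patch raises ValueError; later pandas versions raise TypeError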
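
Continuing with the same frame, a minimal sketch of the fast scalar accessors (again illustrative only): ``.at`` takes labels and ``.iat`` takes integer positions, covering the ``get_value``/``iget_value`` style access that this patch reroutes through ``_ixs``::

    # label based scalar access (get and set)
    df.at['b', 'X']
    df.at['b', 'X'] = 99

    # integer based scalar access
    df.iat[1, 1]

    # non-integer indexers are rejected
    try:
        df.iat['b', 'X']
    except ValueError:
        pass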