diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index bb424e29cfc21..acddf1bb3fe30 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -6,18 +6,16 @@ import numpy as np np.random.seed(123456) - from pandas import * - options.display.max_rows=15 - randn = np.random.randn np.set_printoptions(precision=4, suppress=True) - import matplotlib.pyplot as plt - plt.close('all') + import pandas as pd + pd.options.display.max_rows = 15 import matplotlib try: matplotlib.style.use('ggplot') except AttributeError: - options.display.mpl_style = 'default' - from pandas.compat import zip + pd.options.display.mpl_style = 'default' + import matplotlib.pyplot as plt + plt.close('all') ***************************** Group By: split-apply-combine @@ -105,11 +103,12 @@ consider the following DataFrame: .. ipython:: python - df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B' : ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C' : randn(8), 'D' : randn(8)}) + df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B' : ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C' : np.random.randn(8), + 'D' : np.random.randn(8)}) df We could naturally group by either the ``A`` or ``B`` columns or both: @@ -142,7 +141,7 @@ output of aggregation functions will only contain unique index values: lst = [1, 2, 3, 1, 2, 3] - s = Series([1, 2, 3, 10, 20, 30], lst) + s = pd.Series([1, 2, 3, 10, 20, 30], lst) grouped = s.groupby(level=0) @@ -189,7 +188,7 @@ however pass ``sort=False`` for potential speedups: .. 
ipython:: python - df2 = DataFrame({'X' : ['B', 'B', 'A', 'A'], 'Y' : [1, 2, 3, 4]}) + df2 = pd.DataFrame({'X' : ['B', 'B', 'A', 'A'], 'Y' : [1, 2, 3, 4]}) df2.groupby(['X'], sort=True).sum() df2.groupby(['X'], sort=False).sum() @@ -203,10 +202,10 @@ however pass ``sort=False`` for potential speedups: n = 10 weight = np.random.normal(166, 20, size=n) height = np.random.normal(60, 10, size=n) - time = date_range('1/1/2000', periods=n) + time = pd.date_range('1/1/2000', periods=n) gender = tm.choice(['male', 'female'], size=n) - df = DataFrame({'height': height, 'weight': weight, - 'gender': gender}, index=time) + df = pd.DataFrame({'height': height, 'weight': weight, + 'gender': gender}, index=time) .. ipython:: python @@ -226,11 +225,12 @@ however pass ``sort=False`` for potential speedups: .. ipython:: python :suppress: - df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B' : ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C' : randn(8), 'D' : randn(8)}) + df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B' : ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C' : np.random.randn(8), + 'D' : np.random.randn(8)}) .. _groupby.multiindex: @@ -248,8 +248,8 @@ natural to group by one of the levels of the hierarchy. ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] tuples = list(zip(*arrays)) tuples - index = MultiIndex.from_tuples(tuples, names=['first', 'second']) - s = Series(randn(8), index=index) + index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) + s = pd.Series(np.random.randn(8), index=index) .. ipython:: python @@ -281,13 +281,13 @@ Also as of v0.6, grouping with multiple levels is supported. 
['doo', 'doo', 'bee', 'bee', 'bop', 'bop', 'bop', 'bop'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] tuples = list(zip(*arrays)) - index = MultiIndex.from_tuples(tuples, names=['first', 'second', 'third']) - s = Series(randn(8), index=index) + index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second', 'third']) + s = pd.Series(np.random.randn(8), index=index) .. ipython:: python s - s.groupby(level=['first','second']).sum() + s.groupby(level=['first', 'second']).sum() More on the ``sum`` function and aggregation later. @@ -499,9 +499,9 @@ to standardize the data within each group: .. ipython:: python - index = date_range('10/1/1999', periods=1100) - ts = Series(np.random.normal(0.5, 2, 1100), index) - ts = rolling_mean(ts, 100, 100).dropna() + index = pd.date_range('10/1/1999', periods=1100) + ts = pd.Series(np.random.normal(0.5, 2, 1100), index) + ts = pd.rolling_mean(ts, 100, 100).dropna() ts.head() ts.tail() @@ -528,7 +528,7 @@ We can also visually compare the original and transformed data sets. .. ipython:: python - compare = DataFrame({'Original': ts, 'Transformed': transformed}) + compare = pd.DataFrame({'Original': ts, 'Transformed': transformed}) @savefig groupby_transform_plot.png compare.plot() @@ -539,11 +539,11 @@ Another common data transform is to replace missing data with the group mean. :suppress: cols = ['A', 'B', 'C'] - values = randn(1000, 3) + values = np.random.randn(1000, 3) values[np.random.randint(0, 1000, 100), 0] = np.nan values[np.random.randint(0, 1000, 50), 1] = np.nan values[np.random.randint(0, 1000, 200), 2] = np.nan - data_df = DataFrame(values, columns=cols) + data_df = pd.DataFrame(values, columns=cols) .. ipython:: python @@ -599,7 +599,7 @@ than 2. .. ipython:: python - sf = Series([1, 1, 2, 3, 3, 3]) + sf = pd.Series([1, 1, 2, 3, 3, 3]) sf.groupby(sf).filter(lambda x: x.sum() > 2) The argument of ``filter`` must be a function that, applied to the group as a @@ -610,7 +610,7 @@ with only a couple members. 
.. ipython:: python - dff = DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')}) + dff = pd.DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')}) dff.groupby('B').filter(lambda x: len(x) > 2) Alternatively, instead of dropping the offending groups, we can return a @@ -672,9 +672,9 @@ next). This enables some operations to be carried out rather succinctly: .. ipython:: python - tsdf = DataFrame(randn(1000, 3), - index=date_range('1/1/2000', periods=1000), - columns=['A', 'B', 'C']) + tsdf = pd.DataFrame(np.random.randn(1000, 3), + index=pd.date_range('1/1/2000', periods=1000), + columns=['A', 'B', 'C']) tsdf.ix[::2] = np.nan grouped = tsdf.groupby(lambda x: x.year) grouped.fillna(method='pad') @@ -689,8 +689,8 @@ The ``nlargest`` and ``nsmallest`` methods work on ``Series`` style groupbys: .. ipython:: python - s = Series([9, 8, 7, 5, 19, 1, 4.2, 3.3]) - g = Series(list('abababab')) + s = pd.Series([9, 8, 7, 5, 19, 1, 4.2, 3.3]) + g = pd.Series(list('abababab')) gb = s.groupby(g) gb.nlargest(3) gb.nsmallest(3) @@ -721,8 +721,8 @@ The dimension of the returned result can also change: In [8]: grouped = df.groupby('A')['C'] In [10]: def f(group): - ....: return DataFrame({'original' : group, - ....: 'demeaned' : group - group.mean()}) + ....: return pd.DataFrame({'original' : group, + ....: 'demeaned' : group - group.mean()}) ....: In [11]: grouped.apply(f) @@ -732,8 +732,8 @@ The dimension of the returned result can also change: .. ipython:: python def f(x): - return Series([ x, x**2 ], index = ['x', 'x^s']) - s = Series(np.random.rand(5)) + return pd.Series([ x, x**2 ], index = ['x', 'x^s']) + s = pd.Series(np.random.rand(5)) s s.apply(f) @@ -754,7 +754,7 @@ The dimension of the returned result can also change: .. ipython:: python - d = DataFrame({"a":["x", "y"], "b":[1,2]}) + d = pd.DataFrame({"a":["x", "y"], "b":[1,2]}) def identity(df): print df return df @@ -802,9 +802,9 @@ can be used as group keys. If so, the order of the levels will be preserved: .. 
ipython:: python - data = Series(np.random.randn(100)) + data = pd.Series(np.random.randn(100)) - factor = qcut(data, [0, .25, .5, .75, 1.]) + factor = pd.qcut(data, [0, .25, .5, .75, 1.]) data.groupby(factor).mean() @@ -813,27 +813,28 @@ can be used as group keys. If so, the order of the levels will be preserved: Grouping with a Grouper specification ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Your may need to specify a bit more data to properly group. You can +You may need to specify a bit more data to properly group. You can use the ``pd.Grouper`` to provide this local control. .. ipython:: python - import datetime as DT - - df = DataFrame({ - 'Branch' : 'A A A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), - 'Quantity': [1,3,5,1,8,1,9,3], - 'Date' : [ - DT.datetime(2013,1,1,13,0), - DT.datetime(2013,1,1,13,5), - DT.datetime(2013,10,1,20,0), - DT.datetime(2013,10,2,10,0), - DT.datetime(2013,10,1,20,0), - DT.datetime(2013,10,2,10,0), - DT.datetime(2013,12,2,12,0), - DT.datetime(2013,12,2,14,0), - ]}) + import datetime + + df = pd.DataFrame({ + 'Branch' : 'A A A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), + 'Quantity': [1,3,5,1,8,1,9,3], + 'Date' : [ + datetime.datetime(2013,1,1,13,0), + datetime.datetime(2013,1,1,13,5), + datetime.datetime(2013,10,1,20,0), + datetime.datetime(2013,10,2,10,0), + datetime.datetime(2013,10,1,20,0), + datetime.datetime(2013,10,2,10,0), + datetime.datetime(2013,12,2,12,0), + datetime.datetime(2013,12,2,14,0), + ] + }) df @@ -862,7 +863,7 @@ Just like for a DataFrame or Series you can call head and tail on a groupby: .. ipython:: python - df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) df g = df.groupby('A') @@ -894,7 +895,7 @@ To select from a DataFrame or Series the nth item, use the nth method. This is a .. 
ipython:: python - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) g = df.groupby('A') g.nth(0) @@ -919,7 +920,7 @@ As with other methods, passing ``as_index=False``, will achieve a filtration, wh .. ipython:: python - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) g = df.groupby('A',as_index=False) g.nth(0) @@ -929,8 +930,8 @@ You can also select multiple rows from each group by specifying multiple nth val .. ipython:: python - business_dates = date_range(start='4/1/2014', end='6/30/2014', freq='B') - df = DataFrame(1, index=business_dates, columns=['a', 'b']) + business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B') + df = pd.DataFrame(1, index=business_dates, columns=['a', 'b']) # get the first, 4th, and last date index for each month df.groupby((df.index.year, df.index.month)).nth([0, 3, -1]) @@ -961,7 +962,7 @@ the values in column 1 where the group is "B" are 3 higher on average. .. ipython:: python np.random.seed(1234) - df = DataFrame(np.random.randn(50, 2)) + df = pd.DataFrame(np.random.randn(50, 2)) df['g'] = np.random.choice(['A', 'B'], size=50) df.loc[df['g'] == 'B', 1] += 3 @@ -1010,11 +1011,11 @@ column index name will be used as the name of the inserted column: .. 
ipython:: python df = pd.DataFrame({ - 'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], - 'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1], - 'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], - 'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1], - }) + 'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], + 'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1], + 'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], + 'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1], + }) def compute_metrics(x): result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()} diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index a1912032bc3bf..618a2ae42c65f 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -6,15 +6,10 @@ :suppress: import numpy as np - import random np.random.seed(123456) - from pandas import * - options.display.max_rows=15 - import pandas as pd - randn = np.random.randn - randint = np.random.randint np.set_printoptions(precision=4, suppress=True) - from pandas.compat import range, zip + import pandas as pd + pd.options.display.max_rows = 15 *************************** Indexing and Selecting Data @@ -162,10 +157,10 @@ indexing functionality: .. ipython:: python - dates = date_range('1/1/2000', periods=8) - df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) + dates = pd.date_range('1/1/2000', periods=8) + df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) df - panel = Panel({'one' : df, 'two' : df - df.mean()}) + panel = pd.Panel({'one' : df, 'two' : df - df.mean()}) panel .. note:: @@ -208,7 +203,7 @@ as an attribute: .. ipython:: python - sa = Series([1,2,3],index=list('abc')) + sa = pd.Series([1,2,3],index=list('abc')) dfa = df.copy() .. ipython:: python @@ -307,7 +302,7 @@ Selection By Label .. ipython:: python - dfl = DataFrame(np.random.randn(5,4), columns=list('ABCD'), index=date_range('20130101',periods=5)) + dfl = pd.DataFrame(np.random.randn(5,4), columns=list('ABCD'), index=pd.date_range('20130101',periods=5)) dfl .. 
code-block:: python @@ -333,7 +328,7 @@ The ``.loc`` attribute is the primary access method. The following are valid inp .. ipython:: python - s1 = Series(np.random.randn(6),index=list('abcdef')) + s1 = pd.Series(np.random.randn(6),index=list('abcdef')) s1 s1.loc['c':] s1.loc['b'] @@ -349,9 +344,9 @@ With a DataFrame .. ipython:: python - df1 = DataFrame(np.random.randn(6,4), - index=list('abcdef'), - columns=list('ABCD')) + df1 = pd.DataFrame(np.random.randn(6,4), + index=list('abcdef'), + columns=list('ABCD')) df1 df1.loc[['a','b','d'],:] @@ -403,7 +398,7 @@ The ``.iloc`` attribute is the primary access method. The following are valid in .. ipython:: python - s1 = Series(np.random.randn(5),index=list(range(0,10,2))) + s1 = pd.Series(np.random.randn(5), index=list(range(0,10,2))) s1 s1.iloc[:3] s1.iloc[3] @@ -419,9 +414,9 @@ With a DataFrame .. ipython:: python - df1 = DataFrame(np.random.randn(6,4), - index=list(range(0,12,2)), - columns=list(range(0,8,2))) + df1 = pd.DataFrame(np.random.randn(6,4), + index=list(range(0,12,2)), + columns=list(range(0,8,2))) df1 Select via integer slicing @@ -472,7 +467,7 @@ Out of range slice indexes are handled gracefully just as in Python/Numpy. x x[4:10] x[8:10] - s = Series(x) + s = pd.Series(x) s s.iloc[4:10] s.iloc[8:10] @@ -488,7 +483,7 @@ returned) .. ipython:: python - dfl = DataFrame(np.random.randn(5,2),columns=list('AB')) + dfl = pd.DataFrame(np.random.randn(5,2), columns=list('AB')) dfl dfl.iloc[:,2:3] dfl.iloc[:,1:3] @@ -516,7 +511,7 @@ A random selection of rows or columns from a Series, DataFrame, or Panel with th .. ipython :: python - s = Series([0,1,2,3,4,5]) + s = pd.Series([0,1,2,3,4,5]) # When no arguments are passed, returns 1 row. s.sample() @@ -532,7 +527,7 @@ using the ``replace`` option: .. 
ipython :: python - s = Series([0,1,2,3,4,5]) + s = pd.Series([0,1,2,3,4,5]) # Without replacement (default): s.sample(n=6, replace=False) @@ -547,7 +542,7 @@ to have different probabilities, you can pass the ``sample`` function sampling w .. ipython :: python - s = Series([0,1,2,3,4,5]) + s = pd.Series([0,1,2,3,4,5]) example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4] s.sample(n=3, weights=example_weights) @@ -561,21 +556,21 @@ as a string. .. ipython :: python - df2 = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]}) + df2 = pd.DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]}) df2.sample(n = 3, weights = 'weight_column') ``sample`` also allows users to sample columns instead of rows using the ``axis`` argument. .. ipython :: python - df3 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]}) + df3 = pd.DataFrame({'col1':[1,2,3], 'col2':[2,3,4]}) df3.sample(n=1, axis=1) Finally, one can also set a seed for ``sample``'s random number generator using the ``random_state`` argument, which will accept either an integer (as a seed) or a numpy RandomState object. .. ipython :: python - df4 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]}) + df4 = pd.DataFrame({'col1':[1,2,3], 'col2':[2,3,4]}) # With a given seed, the sample will always draw the same rows. df4.sample(n=2, random_state=2) @@ -594,7 +589,7 @@ In the ``Series`` case this is effectively an appending operation .. ipython:: python - se = Series([1,2,3]) + se = pd.Series([1,2,3]) se se[5] = 5. se @@ -603,7 +598,7 @@ A ``DataFrame`` can be enlarged on either axis via ``.loc`` .. ipython:: python - dfi = DataFrame(np.arange(6).reshape(3,2), + dfi = pd.DataFrame(np.arange(6).reshape(3,2), columns=['A','B']) dfi dfi.loc[:,'C'] = dfi.loc[:,'A'] @@ -661,7 +656,7 @@ Using a boolean vector to index a Series works exactly as in a numpy ndarray: .. ipython:: python - s = Series(range(-3, 4)) + s = pd.Series(range(-3, 4)) s s[s > 0] s[(s < -1) | (s > 0.5)] @@ -680,9 +675,9 @@ more complex criteria: .. 
ipython:: python - df2 = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], - 'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'], - 'c' : randn(7)}) + df2 = pd.DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], + 'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'], + 'c' : np.random.randn(7)}) # only want 'two' or 'three' criterion = df2['a'].map(lambda x: x.startswith('t')) @@ -713,7 +708,7 @@ select rows where one or more columns have values you want: .. ipython:: python - s = Series(np.arange(5),index=np.arange(5)[::-1],dtype='int64') + s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64') s s.isin([2, 4, 6]) s[s.isin([2, 4, 6])] @@ -733,8 +728,8 @@ in the membership check: .. ipython:: python - s_mi = Series(np.arange(6), - index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']])) + s_mi = pd.Series(np.arange(6), + index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']])) s_mi s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])] s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)] @@ -746,8 +741,8 @@ wherever the element is in the sequence of values. .. ipython:: python - df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], - 'ids2': ['a', 'n', 'c', 'n']}) + df = pd.DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], + 'ids2': ['a', 'n', 'c', 'n']}) values = ['a', 'b', 1, 3] @@ -801,8 +796,8 @@ Equivalent is ``df.where(df < 0)`` .. ipython:: python :suppress: - dates = date_range('1/1/2000', periods=8) - df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) + dates = pd.date_range('1/1/2000', periods=8) + df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) .. ipython:: python @@ -889,16 +884,10 @@ method that allows selection using an expression. You can get the value of the frame where column ``b`` has values between the values of columns ``a`` and ``c``. For example: -.. 
ipython:: python - :suppress: - - from numpy.random import randint, rand - np.random.seed(1234) - .. ipython:: python n = 10 - df = DataFrame(rand(n, 3), columns=list('abc')) + df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc')) df # pure python @@ -912,7 +901,7 @@ with the name ``a``. .. ipython:: python - df = DataFrame(randint(n / 2, size=(n, 2)), columns=list('bc')) + df = pd.DataFrame(np.random.randint(n / 2, size=(n, 2)), columns=list('bc')) df.index.name = 'a' df df.query('a < b and b < c') @@ -928,7 +917,7 @@ If instead you don't want to or cannot name your index, you can use the name .. ipython:: python - df = DataFrame(randint(n, size=(n, 2)), columns=list('bc')) + df = pd.DataFrame(np.random.randint(n, size=(n, 2)), columns=list('bc')) df df.query('index < b < c') @@ -946,7 +935,7 @@ If instead you don't want to or cannot name your index, you can use the name .. ipython:: python - df = DataFrame({'a': randint(5, size=5)}) + df = pd.DataFrame({'a': np.random.randint(5, size=5)}) df.index.name = 'a' df.query('a > 2') # uses the column 'a', not the index @@ -970,23 +959,20 @@ You can also use the levels of a ``DataFrame`` with a .. ipython:: python - import pandas.util.testing as tm - n = 10 - colors = tm.choice(['red', 'green'], size=n) - foods = tm.choice(['eggs', 'ham'], size=n) + colors = np.random.choice(['red', 'green'], size=n) + foods = np.random.choice(['eggs', 'ham'], size=n) colors foods - index = MultiIndex.from_arrays([colors, foods], names=['color', 'food']) - df = DataFrame(randn(n, 2), index=index) + index = pd.MultiIndex.from_arrays([colors, foods], names=['color', 'food']) + df = pd.DataFrame(np.random.randn(n, 2), index=index) df df.query('color == "red"') If the levels of the ``MultiIndex`` are unnamed, you can refer to them using special names: - .. ipython:: python df.index.names = [None, None] @@ -1008,9 +994,9 @@ having to specify which frame you're interested in querying .. 
ipython:: python - df = DataFrame(rand(n, 3), columns=list('abc')) + df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc')) df - df2 = DataFrame(rand(n + 2, 3), columns=df.columns) + df2 = pd.DataFrame(np.random.rand(n + 2, 3), columns=df.columns) df2 expr = '0.0 <= a <= c <= 0.5' map(lambda frame: frame.query(expr), [df, df2]) @@ -1022,7 +1008,7 @@ Full numpy-like syntax .. ipython:: python - df = DataFrame(randint(n, size=(n, 3)), columns=list('abc')) + df = pd.DataFrame(np.random.randint(n, size=(n, 3)), columns=list('abc')) df df.query('(a < b) & (b < c)') df[(df.a < df.b) & (df.b < df.c)] @@ -1065,8 +1051,9 @@ The ``in`` and ``not in`` operators .. ipython:: python # get all rows where columns "a" and "b" have overlapping values - df = DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'), - 'c': randint(5, size=12), 'd': randint(9, size=12)}) + df = pd.DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'), + 'c': np.random.randint(5, size=12), + 'd': np.random.randint(9, size=12)}) df df.query('a in b') @@ -1139,8 +1126,8 @@ You can negate boolean expressions with the word ``not`` or the ``~`` operator. .. ipython:: python - df = DataFrame(rand(n, 3), columns=list('abc')) - df['bools'] = rand(len(df)) > 0.5 + df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc')) + df['bools'] = np.random.rand(len(df)) > 0.5 df.query('~bools') df.query('not bools') df.query('not bools') == df[~df.bools] @@ -1192,7 +1179,7 @@ floating point values generated using ``numpy.random.randn()``. .. ipython:: python :suppress: - df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) + df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) df2 = df.copy() @@ -1214,9 +1201,9 @@ should be taken instead. .. 
ipython:: python - df2 = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], - 'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'], - 'c' : np.random.randn(7)}) + df2 = pd.DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], + 'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'], + 'c' : np.random.randn(7)}) df2.duplicated(['a','b']) df2.drop_duplicates(['a','b']) df2.drop_duplicates(['a','b'], take_last=True) @@ -1242,7 +1229,7 @@ default value. .. ipython:: python - s = Series([1,2,3], index=['a','b','c']) + s = pd.Series([1,2,3], index=['a','b','c']) s.get('a') # equivalent to s['a'] s.get('x', default=-1) @@ -1267,7 +1254,7 @@ numpy array. For instance, .. ipython:: python - dflookup = DataFrame(np.random.rand(20,4), columns = ['A','B','C','D']) + dflookup = pd.DataFrame(np.random.rand(20,4), columns = ['A','B','C','D']) dflookup.lookup(list(range(0,10,2)), ['B','C','A','B','D']) .. _indexing.class: @@ -1287,7 +1274,7 @@ lookups, data alignment, and reindexing. The easiest way to create an .. ipython:: python - index = Index(['e', 'd', 'a', 'b']) + index = pd.Index(['e', 'd', 'a', 'b']) index 'd' in index @@ -1296,26 +1283,26 @@ You can also pass a ``name`` to be stored in the index: .. ipython:: python - index = Index(['e', 'd', 'a', 'b'], name='something') + index = pd.Index(['e', 'd', 'a', 'b'], name='something') index.name The name, if set, will be shown in the console display: .. ipython:: python - index = Index(list(range(5)), name='rows') - columns = Index(['A', 'B', 'C'], name='cols') - df = DataFrame(np.random.randn(5, 3), index=index, columns=columns) + index = pd.Index(list(range(5)), name='rows') + columns = pd.Index(['A', 'B', 'C'], name='cols') + df = pd.DataFrame(np.random.randn(5, 3), index=index, columns=columns) df df['A'] +.. _indexing.set_metadata: + Setting metadata ~~~~~~~~~~~~~~~~ .. versionadded:: 0.13.0 -.. 
_indexing.set_metadata: - Indexes are "mostly immutable", but it is possible to set and change their metadata, like the index ``name`` (or, for ``MultiIndex``, ``levels`` and ``labels``). @@ -1328,7 +1315,7 @@ See :ref:`Advanced Indexing ` for usage of MultiIndexes. .. ipython:: python - ind = Index([1, 2, 3]) + ind = pd.Index([1, 2, 3]) ind.rename("apple") ind ind.set_names(["apple"], inplace=True) @@ -1342,8 +1329,7 @@ See :ref:`Advanced Indexing ` for usage of MultiIndexes. .. ipython:: python - - index = MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second']) + index = pd.MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second']) index index.levels[1] index.set_levels(["a", "b"], level=1) @@ -1364,8 +1350,8 @@ operators. Difference is provided via the ``.difference()`` method. .. ipython:: python - a = Index(['c', 'b', 'a']) - b = Index(['c', 'e', 'd']) + a = pd.Index(['c', 'b', 'a']) + b = pd.Index(['c', 'e', 'd']) a | b a & b a.difference(b) @@ -1377,8 +1363,8 @@ with duplicates dropped. .. ipython:: python - idx1 = Index([1, 2, 3, 4]) - idx2 = Index([2, 3, 4, 5]) + idx1 = pd.Index([1, 2, 3, 4]) + idx2 = pd.Index([2, 3, 4, 5]) idx1.sym_diff(idx2) idx1 ^ idx2 @@ -1401,10 +1387,10 @@ indexed DataFrame: .. ipython:: python :suppress: - data = DataFrame({'a' : ['bar', 'bar', 'foo', 'foo'], - 'b' : ['one', 'two', 'one', 'two'], - 'c' : ['z', 'y', 'x', 'w'], - 'd' : [1., 2., 3, 4]}) + data = pd.DataFrame({'a' : ['bar', 'bar', 'foo', 'foo'], + 'b' : ['one', 'two', 'one', 'two'], + 'c' : ['z', 'y', 'x', 'w'], + 'd' : [1., 2., 3, 4]}) .. ipython:: python @@ -1482,12 +1468,12 @@ When setting values in a pandas object, care must be taken to avoid what is call .. 
ipython:: python - dfmi = DataFrame([list('abcd'), - list('efgh'), - list('ijkl'), - list('mnop')], - columns=MultiIndex.from_product([['one','two'], - ['first','second']])) + dfmi = pd.DataFrame([list('abcd'), + list('efgh'), + list('ijkl'), + list('mnop')], + columns=pd.MultiIndex.from_product([['one','two'], + ['first','second']])) dfmi Compare these two access methods: @@ -1543,9 +1529,9 @@ which can take the values ``['raise','warn',None]``, where showing a warning is .. ipython:: python :okwarning: - dfb = DataFrame({'a' : ['one', 'one', 'two', - 'three', 'two', 'one', 'six'], - 'c' : np.arange(7)}) + dfb = pd.DataFrame({'a' : ['one', 'one', 'two', + 'three', 'two', 'one', 'six'], + 'c' : np.arange(7)}) # This will show the SettingWithCopyWarning # but the frame values will be set @@ -1573,7 +1559,7 @@ This is the correct access method .. ipython:: python - dfc = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) + dfc = pd.DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) dfc.loc[0,'A'] = 11 dfc diff --git a/doc/source/internals.rst b/doc/source/internals.rst index 8b4f7360fc235..5899c3089cdac 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -6,15 +6,10 @@ :suppress: import numpy as np - import random np.random.seed(123456) - from pandas import * - options.display.max_rows=15 - import pandas as pd - randn = np.random.randn - randint = np.random.randint np.set_printoptions(precision=4, suppress=True) - from pandas.compat import range, zip + import pandas as pd + pd.options.display.max_rows = 15 ********* Internals @@ -81,7 +76,7 @@ integer **labels**, and the level **names**: .. 
ipython:: python - index = MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second']) + index = pd.MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second']) index index.levels index.labels @@ -210,7 +205,7 @@ Below is an example to define 2 original properties, "internal_cache" as a tempo class SubclassedDataFrame2(DataFrame): # temporary properties - _internal_names = DataFrame._internal_names + ['internal_cache'] + _internal_names = pd.DataFrame._internal_names + ['internal_cache'] _internal_names_set = set(_internal_names) # normal properties @@ -244,5 +239,3 @@ Below is an example to define 2 original properties, "internal_cache" as a tempo # properties defined in _metadata are retained >>> df[['A', 'B']].added_property property - - diff --git a/doc/source/io.rst b/doc/source/io.rst index 7a4318fb02cfc..4c829c3252533 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2407,7 +2407,7 @@ for some advanced strategies As of version 0.15.0, pandas requires ``PyTables`` >= 3.0.0. Stores written with prior versions of pandas / ``PyTables`` >= 2.3 are fully compatible (this was the previous minimum ``PyTables`` required version). .. warning:: - + There is a ``PyTables`` indexing bug which may appear when querying stores using an index. If you see a subset of results being returned, upgrade to ``PyTables`` >= 3.2. Stores created previously will need to be rewritten using the updated version. .. ipython:: python diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index 5a350b4d9a1e7..6f1a272e3c40d 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -82,7 +82,7 @@ pandas provides the :func:`~pandas.core.common.isnull` and .. ipython:: python df2['one'] - isnull(df2['one']) + pd.isnull(df2['one']) df2['four'].notnull() **Summary:** ``NaN`` and ``None`` (in object arrays) are considered @@ -99,7 +99,7 @@ pandas objects provide intercompatibility between ``NaT`` and ``NaN``. .. 
ipython:: python df2 = df.copy() - df2['timestamp'] = Timestamp('20120101') + df2['timestamp'] = pd.Timestamp('20120101') df2 df2.ix[['a','c','h'],['one','timestamp']] = np.nan df2 @@ -260,7 +260,7 @@ use case of this is to fill a DataFrame with the mean of that column. .. ipython:: python - dff = pd.DataFrame(np.random.randn(10,3),columns=list('ABC')) + dff = pd.DataFrame(np.random.randn(10,3), columns=list('ABC')) dff.iloc[3:5,0] = np.nan dff.iloc[4:6,1] = np.nan dff.iloc[5:8,2] = np.nan @@ -276,7 +276,7 @@ a Series in this case. .. ipython:: python - dff.where(notnull(dff),dff.mean(),axis='columns') + dff.where(pd.notnull(dff), dff.mean(), axis='columns') .. _missing_data.dropna: @@ -321,7 +321,7 @@ performs linear interpolation at missing datapoints. :suppress: np.random.seed(123456) - idx = date_range('1/1/2000', periods=100, freq='BM') + idx = pd.date_range('1/1/2000', periods=100, freq='BM') ts = pd.Series(np.random.randn(100), index=idx) ts[1:20] = np.nan ts[60:80] = np.nan @@ -368,7 +368,7 @@ You can also interpolate with a DataFrame: .. ipython:: python df = pd.DataFrame({'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8], - 'B': [.25, np.nan, np.nan, 4, 12.2, 14.4]}) + 'B': [.25, np.nan, np.nan, 4, 12.2, 14.4]}) df df.interpolate() @@ -425,7 +425,7 @@ at the new values. ser = pd.Series(np.sort(np.random.uniform(size=100))) # interpolate at new_index - new_index = ser.index | Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]) + new_index = ser.index | pd.Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]) interp_s = ser.reindex(new_index).interpolate(method='pchip') interp_s[49:51] diff --git a/doc/source/options.rst b/doc/source/options.rst index 9ede87422b21c..7e140b1b2deaf 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -154,7 +154,7 @@ lines are replaced by an ellipsis. .. 
ipython:: python - df=pd.DataFrame(np.random.randn(7,2)) + df = pd.DataFrame(np.random.randn(7,2)) pd.set_option('max_rows', 7) df pd.set_option('max_rows', 5) @@ -166,7 +166,7 @@ dataframes to stretch across pages, wrapped over the full column vs row-wise. .. ipython:: python - df=pd.DataFrame(np.random.randn(5,10)) + df = pd.DataFrame(np.random.randn(5,10)) pd.set_option('expand_frame_repr', True) df pd.set_option('expand_frame_repr', False) @@ -178,7 +178,7 @@ dataframes to stretch across pages, wrapped over the full column vs row-wise. .. ipython:: python - df=pd.DataFrame(np.random.randn(10,10)) + df = pd.DataFrame(np.random.randn(10,10)) pd.set_option('max_rows', 5) pd.set_option('large_repr', 'truncate') df @@ -192,8 +192,8 @@ of this length or longer will be truncated with an ellipsis. .. ipython:: python - df=pd.DataFrame(np.array([['foo', 'bar', 'bim', 'uncomfortably long string'], - ['horse', 'cow', 'banana', 'apple']])) + df = pd.DataFrame(np.array([['foo', 'bar', 'bim', 'uncomfortably long string'], + ['horse', 'cow', 'banana', 'apple']])) pd.set_option('max_colwidth',40) df pd.set_option('max_colwidth', 6) @@ -205,7 +205,7 @@ will be given. .. ipython:: python - df=pd.DataFrame(np.random.randn(10,10)) + df = pd.DataFrame(np.random.randn(10,10)) pd.set_option('max_info_columns', 11) df.info() pd.set_option('max_info_columns', 5) @@ -219,7 +219,7 @@ can specify the option ``df.info(null_counts=True)`` to override on showing a pa .. ipython:: python - df=pd.DataFrame(np.random.choice([0,1,np.nan],size=(10,10))) + df = pd.DataFrame(np.random.choice([0,1,np.nan], size=(10,10))) df pd.set_option('max_info_rows', 11) df.info() @@ -232,7 +232,7 @@ suggestion. .. ipython:: python - df=pd.DataFrame(np.random.randn(5,5)) + df = pd.DataFrame(np.random.randn(5,5)) pd.set_option('precision',7) df pd.set_option('precision',4) @@ -244,7 +244,7 @@ precision at which the number is stored. .. 
ipython:: python - df=pd.DataFrame(np.random.randn(6,6)) + df = pd.DataFrame(np.random.randn(6,6)) pd.set_option('chop_threshold', 0) df pd.set_option('chop_threshold', .5) @@ -256,7 +256,8 @@ Options are 'right', and 'left'. .. ipython:: python - df=pd.DataFrame(np.array([np.random.randn(6), np.random.randint(1,9,6)*.1, np.zeros(6)]).T, columns=['A', 'B', 'C'], dtype='float') + df = pd.DataFrame(np.array([np.random.randn(6), np.random.randint(1,9,6)*.1, np.zeros(6)]).T, + columns=['A', 'B', 'C'], dtype='float') pd.set_option('colheader_justify', 'right') df pd.set_option('colheader_justify', 'left') diff --git a/doc/source/r_interface.rst b/doc/source/r_interface.rst index da37c92c88ecf..74cdc5a526585 100644 --- a/doc/source/r_interface.rst +++ b/doc/source/r_interface.rst @@ -5,8 +5,8 @@ .. ipython:: python :suppress: - from pandas import * - options.display.max_rows=15 + import pandas as pd + pd.options.display.max_rows = 15 ****************** @@ -136,10 +136,8 @@ DataFrames into the equivalent R object (that is, **data.frame**): .. ipython:: python - from pandas import DataFrame - - df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C':[7,8,9]}, - index=["one", "two", "three"]) + df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C':[7,8,9]}, + index=["one", "two", "three"]) r_dataframe = com.convert_to_r_dataframe(df) print(type(r_dataframe))