diff --git a/doc/source/10min.rst b/doc/source/10min.rst index 94c2d921eb116..1714e00030026 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -6,18 +6,16 @@ :suppress: import numpy as np - import random + import pandas as pd import os np.random.seed(123456) - from pandas import options - import pandas as pd np.set_printoptions(precision=4, suppress=True) import matplotlib try: matplotlib.style.use('ggplot') except AttributeError: - options.display.mpl_style = 'default' - options.display.max_rows=15 + pd.options.display.mpl_style = 'default' + pd.options.display.max_rows = 15 #### portions of this were borrowed from the #### Pandas cheatsheet @@ -298,7 +296,7 @@ Using the :func:`~Series.isin` method for filtering: .. ipython:: python df2 = df.copy() - df2['E']=['one', 'one','two','three','four','three'] + df2['E'] = ['one', 'one','two','three','four','three'] df2 df2[df2['E'].isin(['two','four'])] @@ -310,7 +308,7 @@ by the indexes .. ipython:: python - s1 = pd.Series([1,2,3,4,5,6],index=pd.date_range('20130102',periods=6)) + s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6)) s1 df['F'] = s1 @@ -359,7 +357,7 @@ returns a copy of the data. .. ipython:: python - df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + ['E']) + df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E']) df1.loc[dates[0]:dates[1],'E'] = 1 df1 @@ -409,9 +407,9 @@ In addition, pandas automatically broadcasts along the specified dimension. .. ipython:: python - s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2) + s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2) s - df.sub(s,axis='index') + df.sub(s, axis='index') Apply @@ -431,7 +429,7 @@ See more at :ref:`Histogramming and Discretization ` .. ipython:: python - s = pd.Series(np.random.randint(0,7,size=10)) + s = pd.Series(np.random.randint(0, 7, size=10)) s s.value_counts() @@ -516,9 +514,9 @@ See the :ref:`Grouping section ` .. 
ipython:: python df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], + 'foo', 'bar', 'foo', 'foo'], 'B' : ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], + 'two', 'two', 'one', 'three'], 'C' : np.random.randn(8), 'D' : np.random.randn(8)}) df diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 688935c6b104d..656eff744bb47 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -6,15 +6,10 @@ :suppress: import numpy as np - import random - np.random.seed(123456) - from pandas import * - options.display.max_rows=15 import pandas as pd - randn = np.random.randn - randint = np.random.randint + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) - from pandas.compat import range, zip + pd.options.display.max_rows = 15 ****************************** MultiIndex / Advanced Indexing @@ -80,10 +75,10 @@ demo different ways to initialize MultiIndexes. tuples = list(zip(*arrays)) tuples - index = MultiIndex.from_tuples(tuples, names=['first', 'second']) + index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) index - s = Series(randn(8), index=index) + s = pd.Series(np.random.randn(8), index=index) s When you want every pairing of the elements in two iterables, it can be easier @@ -92,7 +87,7 @@ to use the ``MultiIndex.from_product`` function: .. 
ipython:: python iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']] - MultiIndex.from_product(iterables, names=['first', 'second']) + pd.MultiIndex.from_product(iterables, names=['first', 'second']) As a convenience, you can pass a list of arrays directly into Series or DataFrame to construct a MultiIndex automatically: @@ -101,9 +96,9 @@ DataFrame to construct a MultiIndex automatically: arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']), np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])] - s = Series(randn(8), index=arrays) + s = pd.Series(np.random.randn(8), index=arrays) s - df = DataFrame(randn(8, 4), index=arrays) + df = pd.DataFrame(np.random.randn(8, 4), index=arrays) df All of the ``MultiIndex`` constructors accept a ``names`` argument which stores @@ -119,9 +114,9 @@ of the index is up to you: .. ipython:: python - df = DataFrame(randn(3, 8), index=['A', 'B', 'C'], columns=index) + df = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=index) df - DataFrame(randn(6, 6), index=index[:6], columns=index[:6]) + pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6]) We've "sparsified" the higher levels of the indexes to make the console output a bit easier on the eyes. @@ -131,7 +126,7 @@ tuples as atomic labels on an axis: .. ipython:: python - Series(randn(8), index=tuples) + pd.Series(np.random.randn(8), index=tuples) The reason that the ``MultiIndex`` matters is that it can allow you to do grouping, selection, and reshaping operations as we will describe below and in @@ -282,16 +277,16 @@ As usual, **both sides** of the slicers are included as this is label indexing. 
def mklbl(prefix,n): return ["%s%s" % (prefix,i) for i in range(n)] - miindex = MultiIndex.from_product([mklbl('A',4), - mklbl('B',2), - mklbl('C',4), - mklbl('D',2)]) - micolumns = MultiIndex.from_tuples([('a','foo'),('a','bar'), - ('b','foo'),('b','bah')], - names=['lvl0', 'lvl1']) - dfmi = DataFrame(np.arange(len(miindex)*len(micolumns)).reshape((len(miindex),len(micolumns))), - index=miindex, - columns=micolumns).sortlevel().sortlevel(axis=1) + miindex = pd.MultiIndex.from_product([mklbl('A',4), + mklbl('B',2), + mklbl('C',4), + mklbl('D',2)]) + micolumns = pd.MultiIndex.from_tuples([('a','foo'),('a','bar'), + ('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + dfmi = pd.DataFrame(np.arange(len(miindex)*len(micolumns)).reshape((len(miindex),len(micolumns))), + index=miindex, + columns=micolumns).sortlevel().sortlevel(axis=1) dfmi Basic multi-index slicing using slices, lists, and labels. @@ -418,9 +413,9 @@ instance: .. ipython:: python - midx = MultiIndex(levels=[['zero', 'one'], ['x','y']], - labels=[[1,1,0,0],[1,0,1,0]]) - df = DataFrame(randn(4,2), index=midx) + midx = pd.MultiIndex(levels=[['zero', 'one'], ['x','y']], + labels=[[1,1,0,0],[1,0,1,0]]) + df = pd.DataFrame(np.random.randn(4,2), index=midx) df df2 = df.mean(level=0) df2 @@ -471,7 +466,7 @@ labels will be sorted lexicographically! .. ipython:: python import random; random.shuffle(tuples) - s = Series(randn(8), index=MultiIndex.from_tuples(tuples)) + s = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples)) s s.sortlevel(0) s.sortlevel(1) @@ -509,13 +504,13 @@ an exception. Here is a concrete example to illustrate this: .. 
ipython:: python tuples = [('a', 'a'), ('a', 'b'), ('b', 'a'), ('b', 'b')] - idx = MultiIndex.from_tuples(tuples) + idx = pd.MultiIndex.from_tuples(tuples) idx.lexsort_depth reordered = idx[[1, 0, 3, 2]] reordered.lexsort_depth - s = Series(randn(4), index=reordered) + s = pd.Series(np.random.randn(4), index=reordered) s.ix['a':'a'] However: @@ -540,7 +535,7 @@ index positions. ``take`` will also accept negative integers as relative positio .. ipython:: python - index = Index(randint(0, 1000, 10)) + index = pd.Index(np.random.randint(0, 1000, 10)) index positions = [0, 9, 3] @@ -548,7 +543,7 @@ index positions. ``take`` will also accept negative integers as relative positio index[positions] index.take(positions) - ser = Series(randn(10)) + ser = pd.Series(np.random.randn(10)) ser.iloc[positions] ser.take(positions) @@ -558,7 +553,7 @@ row or column positions. .. ipython:: python - frm = DataFrame(randn(5, 3)) + frm = pd.DataFrame(np.random.randn(5, 3)) frm.take([1, 4, 3]) @@ -569,11 +564,11 @@ intended to work on boolean indices and may return unexpected results. .. ipython:: python - arr = randn(10) + arr = np.random.randn(10) arr.take([False, False, True, True]) arr[[0, 1]] - ser = Series(randn(10)) + ser = pd.Series(np.random.randn(10)) ser.take([False, False, True, True]) ser.ix[[0, 1]] @@ -583,14 +578,14 @@ faster than fancy indexing. .. ipython:: - arr = randn(10000, 5) + arr = np.random.randn(10000, 5) indexer = np.arange(10000) random.shuffle(indexer) timeit arr[indexer] timeit arr.take(indexer, axis=0) - ser = Series(arr[:, 0]) + ser = pd.Series(arr[:, 0]) timeit ser.ix[indexer] timeit ser.take(indexer) @@ -608,10 +603,9 @@ setting the index of a ``DataFrame/Series`` with a ``category`` dtype would conv .. 
ipython:: python - df = DataFrame({'A' : np.arange(6), - 'B' : Series(list('aabbca')).astype('category', - categories=list('cab')) - }) + df = pd.DataFrame({'A': np.arange(6), + 'B': list('aabbca')}) + df['B'] = df['B'].astype('category', categories=list('cab')) df df.dtypes df.B.cat.categories @@ -669,10 +663,10 @@ values NOT in the categories, similarly to how you can reindex ANY pandas index. .. code-block:: python - In [10]: df3 = DataFrame({'A' : np.arange(6), - 'B' : Series(list('aabbca')).astype('category', - categories=list('abc')) - }).set_index('B') + In [9]: df3 = pd.DataFrame({'A' : np.arange(6), + 'B' : pd.Series(list('aabbca')).astype('category')}) + + In [10]: df3 = df3.set_index('B') In [11]: df3.index Out[11]: @@ -680,7 +674,7 @@ values NOT in the categories, similarly to how you can reindex ANY pandas index. categories=[u'a', u'b', u'c'], ordered=False) - In [12]: pd.concat([df2,df3] + In [12]: pd.concat([df2, df3]) TypeError: categories must match existing categories when appending .. _indexing.float64index: @@ -705,9 +699,9 @@ same. .. ipython:: python - indexf = Index([1.5, 2, 3, 4.5, 5]) + indexf = pd.Index([1.5, 2, 3, 4.5, 5]) indexf - sf = Series(range(5),index=indexf) + sf = pd.Series(range(5), index=indexf) sf Scalar selection for ``[],.ix,.loc`` will always be label based. An integer will match an equal float index (e.g. ``3`` is equivalent to ``3.0``) @@ -749,17 +743,17 @@ In non-float indexes, slicing using floats will raise a ``TypeError`` .. code-block:: python - In [1]: Series(range(5))[3.5] + In [1]: pd.Series(range(5))[3.5] TypeError: the label [3.5] is not a proper indexer for this index type (Int64Index) - In [1]: Series(range(5))[3.5:4.5] + In [1]: pd.Series(range(5))[3.5:4.5] TypeError: the slice start [3.5] is not a proper indexer for this index type (Int64Index) Using a scalar float indexer will be deprecated in a future version, but is allowed for now. .. 
code-block:: python - In [3]: Series(range(5))[3.0] + In [3]: pd.Series(range(5))[3.0] Out[3]: 3 Here is a typical use-case for using this type of indexing. Imagine that you have a somewhat @@ -768,12 +762,12 @@ example be millisecond offsets. .. ipython:: python - dfir = concat([DataFrame(randn(5,2), - index=np.arange(5) * 250.0, - columns=list('AB')), - DataFrame(randn(6,2), - index=np.arange(4,10) * 250.1, - columns=list('AB'))]) + dfir = pd.concat([pd.DataFrame(np.random.randn(5,2), + index=np.arange(5) * 250.0, + columns=list('AB')), + pd.DataFrame(np.random.randn(6,2), + index=np.arange(4,10) * 250.1, + columns=list('AB'))]) dfir Selection operations then will always work on a value basis, for all selection operators. diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 76efdc0553c7d..96372ddab68bc 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1,16 +1,14 @@ .. currentmodule:: pandas -.. _basics: .. ipython:: python :suppress: import numpy as np - from pandas import * - randn = np.random.randn + import pandas as pd np.set_printoptions(precision=4, suppress=True) - from pandas.compat import lrange - options.display.max_rows=15 + pd.options.display.max_rows = 15 +.. _basics: ============================== Essential Basic Functionality @@ -22,13 +20,13 @@ the previous section: .. 
ipython:: python - index = date_range('1/1/2000', periods=8) - s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) - df = DataFrame(randn(8, 3), index=index, - columns=['A', 'B', 'C']) - wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], - major_axis=date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) + index = pd.date_range('1/1/2000', periods=8) + s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) + df = pd.DataFrame(np.random.randn(8, 3), index=index, + columns=['A', 'B', 'C']) + wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=pd.date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) .. _basics.head_tail: @@ -41,7 +39,7 @@ of elements to display is five, but you may pass a custom number. .. ipython:: python - long_series = Series(randn(1000)) + long_series = pd.Series(np.random.randn(1000)) long_series.head() long_series.tail(3) @@ -143,9 +141,9 @@ either match on the *index* or *columns* via the **axis** keyword: .. ipython:: python - df = DataFrame({'one' : Series(randn(3), index=['a', 'b', 'c']), - 'two' : Series(randn(4), index=['a', 'b', 'c', 'd']), - 'three' : Series(randn(3), index=['b', 'c', 'd'])}) + df = pd.DataFrame({'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']), + 'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']), + 'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])}) df row = df.ix[1] column = df['two'] @@ -166,8 +164,8 @@ Furthermore you can align a level of a multi-indexed DataFrame with a Series. .. 
ipython:: python dfmi = df.copy() - dfmi.index = MultiIndex.from_tuples([(1,'a'),(1,'b'),(1,'c'),(2,'a')], - names=['first','second']) + dfmi.index = pd.MultiIndex.from_tuples([(1,'a'),(1,'b'),(1,'c'),(2,'a')], + names=['first','second']) dfmi.sub(column, axis=0, level='second') With Panel, describing the matching behavior is a bit more difficult, so @@ -256,17 +254,17 @@ You can test if a pandas object is empty, via the :attr:`~DataFrame.empty` prope .. ipython:: python df.empty - DataFrame(columns=list('ABC')).empty + pd.DataFrame(columns=list('ABC')).empty To evaluate single-element pandas objects in a boolean context, use the method :meth:`~DataFrame.bool`: .. ipython:: python - Series([True]).bool() - Series([False]).bool() - DataFrame([[True]]).bool() - DataFrame([[False]]).bool() + pd.Series([True]).bool() + pd.Series([False]).bool() + pd.DataFrame([[True]]).bool() + pd.DataFrame([[False]]).bool() .. warning:: @@ -327,8 +325,8 @@ equality to be True: .. ipython:: python - df1 = DataFrame({'col':['foo', 0, np.nan]}) - df2 = DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0]) + df1 = pd.DataFrame({'col':['foo', 0, np.nan]}) + df2 = pd.DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0]) df1.equals(df2) df1.equals(df2.sort()) @@ -348,10 +346,10 @@ which we illustrate: .. ipython:: python - df1 = DataFrame({'A' : [1., np.nan, 3., 5., np.nan], - 'B' : [np.nan, 2., 3., np.nan, 6.]}) - df2 = DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.], - 'B' : [np.nan, np.nan, 3., 4., 6., 8.]}) + df1 = pd.DataFrame({'A' : [1., np.nan, 3., 5., np.nan], + 'B' : [np.nan, 2., 3., np.nan, 6.]}) + df2 = pd.DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.], + 'B' : [np.nan, np.nan, 3., 4., 6., 8.]}) df1 df2 df1.combine_first(df2) @@ -368,7 +366,7 @@ So, for instance, to reproduce :meth:`~DataFrame.combine_first` as above: .. ipython:: python - combiner = lambda x, y: np.where(isnull(x), y, x) + combiner = lambda x, y: np.where(pd.isnull(x), y, x) df1.combine(df2, combiner) .. 
_basics.stats: @@ -467,7 +465,7 @@ number of unique non-null values: .. ipython:: python - series = Series(randn(500)) + series = pd.Series(np.random.randn(500)) series[20:500] = np.nan series[10:20] = 5 series.nunique() @@ -483,10 +481,10 @@ course): .. ipython:: python - series = Series(randn(1000)) + series = pd.Series(np.random.randn(1000)) series[::2] = np.nan series.describe() - frame = DataFrame(randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) frame.ix[::2] = np.nan frame.describe() @@ -503,7 +501,7 @@ summary of the number of unique values and most frequently occurring values: .. ipython:: python - s = Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a']) + s = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a']) s.describe() Note that on a mixed-type DataFrame object, :meth:`~DataFrame.describe` will @@ -512,7 +510,7 @@ categorical columns: .. ipython:: python - frame = DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)}) + frame = pd.DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)}) frame.describe() This behaviour can be controlled by providing a list of types as ``include``/``exclude`` @@ -538,11 +536,11 @@ corresponding values: .. ipython:: python - s1 = Series(randn(5)) + s1 = pd.Series(np.random.randn(5)) s1 s1.idxmin(), s1.idxmax() - df1 = DataFrame(randn(5,3), columns=['A','B','C']) + df1 = pd.DataFrame(np.random.randn(5,3), columns=['A','B','C']) df1 df1.idxmin(axis=0) df1.idxmax(axis=1) @@ -553,7 +551,7 @@ matching index: .. ipython:: python - df3 = DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba')) + df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba')) df3 df3['A'].idxmin() @@ -573,18 +571,18 @@ of a 1D array of values. 
It can also be used as a function on regular arrays: data = np.random.randint(0, 7, size=50) data - s = Series(data) + s = pd.Series(data) s.value_counts() - value_counts(data) + pd.value_counts(data) Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame: .. ipython:: python - s5 = Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7]) + s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7]) s5.mode() - df5 = DataFrame({"A": np.random.randint(0, 7, size=50), - "B": np.random.randint(-10, 15, size=50)}) + df5 = pd.DataFrame({"A": np.random.randint(0, 7, size=50), + "B": np.random.randint(-10, 15, size=50)}) df5.mode() @@ -597,10 +595,10 @@ and :func:`qcut` (bins based on sample quantiles) functions: .. ipython:: python arr = np.random.randn(20) - factor = cut(arr, 4) + factor = pd.cut(arr, 4) factor - factor = cut(arr, [-5, -1, 0, 1, 5]) + factor = pd.cut(arr, [-5, -1, 0, 1, 5]) factor :func:`qcut` computes sample quantiles. For example, we could slice up some @@ -609,16 +607,16 @@ normally distributed data into equal-size quartiles like so: .. ipython:: python arr = np.random.randn(30) - factor = qcut(arr, [0, .25, .5, .75, 1]) + factor = pd.qcut(arr, [0, .25, .5, .75, 1]) factor - value_counts(factor) + pd.value_counts(factor) We can also pass infinite values to define the bins: .. ipython:: python arr = np.random.randn(20) - factor = cut(arr, [-np.inf, 0, np.inf]) + factor = pd.cut(arr, [-np.inf, 0, np.inf]) factor .. _basics.apply: @@ -647,8 +645,8 @@ maximum value for each column occurred: .. ipython:: python - tsdf = DataFrame(randn(1000, 3), columns=['A', 'B', 'C'], - index=date_range('1/1/2000', periods=1000)) + tsdf = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'], + index=pd.date_range('1/1/2000', periods=1000)) tsdf.apply(lambda x: x.idxmax()) You may also pass additional arguments and keyword arguments to the :meth:`~DataFrame.apply` @@ -671,14 +669,14 @@ Series operation on each column or row: .. 
ipython:: python :suppress: - tsdf = DataFrame(randn(10, 3), columns=['A', 'B', 'C'], - index=date_range('1/1/2000', periods=10)) + tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'], + index=pd.date_range('1/1/2000', periods=10)) tsdf.values[3:7] = np.nan .. ipython:: python tsdf - tsdf.apply(Series.interpolate) + tsdf.apply(pd.Series.interpolate) Finally, :meth:`~DataFrame.apply` takes an argument ``raw`` which is False by default, which converts each row or column into a Series before applying the function. When @@ -718,9 +716,9 @@ to :ref:`merging/joining functionality `: .. ipython:: python - s = Series(['six', 'seven', 'six', 'seven', 'six'], - index=['a', 'b', 'c', 'd', 'e']) - t = Series({'six' : 6., 'seven' : 7.}) + s = pd.Series(['six', 'seven', 'six', 'seven', 'six'], + index=['a', 'b', 'c', 'd', 'e']) + t = pd.Series({'six' : 6., 'seven' : 7.}) s s.map(t) @@ -797,7 +795,7 @@ This is equivalent to the following .. ipython:: python - result = Panel(dict([ (ax,f(panel.loc[:,:,ax])) + result = pd.Panel(dict([ (ax, f(panel.loc[:,:,ax])) for ax in panel.minor_axis ])) result result.loc[:,:,'ItemA'] @@ -823,7 +821,7 @@ Here is a simple example: .. ipython:: python - s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) s s.reindex(['e', 'b', 'f', 'd']) @@ -909,7 +907,7 @@ It returns a tuple with both of the reindexed Series: .. ipython:: python - s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) s1 = s[:4] s2 = s[1:] s1.align(s2) @@ -960,8 +958,8 @@ We illustrate these fill methods on a simple Series: .. 
ipython:: python - rng = date_range('1/3/2000', periods=8) - ts = Series(randn(8), index=rng) + rng = pd.date_range('1/3/2000', periods=8) + ts = pd.Series(np.random.randn(8), index=rng) ts2 = ts[[0, 3, 6]] ts ts2 @@ -1095,11 +1093,11 @@ For instance, a contrived way to transpose the DataFrame would be: .. ipython:: python - df2 = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + df2 = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) print(df2) print(df2.T) - df2_t = DataFrame(dict((idx,values) for idx, values in df2.iterrows())) + df2_t = pd.DataFrame(dict((idx,values) for idx, values in df2.iterrows())) print(df2_t) .. note:: @@ -1109,7 +1107,7 @@ For instance, a contrived way to transpose the DataFrame would be: .. ipython:: python - df_iter = DataFrame([[1, 1.0]], columns=['x', 'y']) + df_iter = pd.DataFrame([[1, 1.0]], columns=['x', 'y']) row = next(df_iter.iterrows())[1] print(row['x'].dtype) print(df_iter['x'].dtype) @@ -1140,7 +1138,7 @@ This will return a Series, indexed like the existing Series. .. ipython:: python # datetime - s = Series(date_range('20130101 09:10:12',periods=4)) + s = pd.Series(pd.date_range('20130101 09:10:12',periods=4)) s s.dt.hour s.dt.second @@ -1171,7 +1169,7 @@ The ``.dt`` accessor works for period and timedelta dtypes. .. ipython:: python # period - s = Series(period_range('20130101',periods=4,freq='D')) + s = pd.Series(pd.period_range('20130101', periods=4,freq='D')) s s.dt.year s.dt.day @@ -1179,7 +1177,7 @@ The ``.dt`` accessor works for period and timedelta dtypes. .. ipython:: python # timedelta - s = Series(timedelta_range('1 day 00:00:05',periods=4,freq='s')) + s = pd.Series(pd.timedelta_range('1 day 00:00:05',periods=4,freq='s')) s s.dt.days s.dt.seconds @@ -1200,7 +1198,7 @@ built-in string methods. For example: .. 
ipython:: python - s = Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) + s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) s.str.lower() Powerful pattern-matching methods are provided as well, but note that @@ -1234,7 +1232,7 @@ determine the sort order: .. ipython:: python - df1 = DataFrame({'one':[2,1,1,1],'two':[1,3,2,4],'three':[5,4,3,2]}) + df1 = pd.DataFrame({'one':[2,1,1,1],'two':[1,3,2,4],'three':[5,4,3,2]}) df1.sort_index(by='two') The ``by`` argument can take a list of column names, e.g.: @@ -1265,12 +1263,12 @@ Series has the :meth:`~Series.searchsorted` method, which works similar to .. ipython:: python - ser = Series([1, 2, 3]) + ser = pd.Series([1, 2, 3]) ser.searchsorted([0, 3]) ser.searchsorted([0, 4]) ser.searchsorted([1, 3], side='right') ser.searchsorted([1, 3], side='left') - ser = Series([3, 1, 2]) + ser = pd.Series([3, 1, 2]) ser.searchsorted([0, 3], sorter=np.argsort(ser)) .. _basics.nsorted: @@ -1286,7 +1284,7 @@ faster than sorting the entire Series and calling ``head(n)`` on the result. .. ipython:: python - s = Series(np.random.permutation(10)) + s = pd.Series(np.random.permutation(10)) s s.order() s.nsmallest(3) @@ -1303,7 +1301,7 @@ all levels to ``by``. .. ipython:: python - df1.columns = MultiIndex.from_tuples([('a','one'),('a','two'),('b','three')]) + df1.columns = pd.MultiIndex.from_tuples([('a','one'),('a','two'),('b','three')]) df1.sort_index(by=('a','two')) @@ -1336,13 +1334,13 @@ attribute for DataFrames returns a Series with the data type of each column. .. 
ipython:: python - dft = DataFrame(dict( A = np.random.rand(3), - B = 1, - C = 'foo', - D = Timestamp('20010102'), - E = Series([1.0]*3).astype('float32'), - F = False, - G = Series([1]*3,dtype='int8'))) + dft = pd.DataFrame(dict(A = np.random.rand(3), + B = 1, + C = 'foo', + D = pd.Timestamp('20010102'), + E = pd.Series([1.0]*3).astype('float32'), + F = False, + G = pd.Series([1]*3,dtype='int8'))) dft dft.dtypes @@ -1359,10 +1357,10 @@ general). .. ipython:: python # these ints are coerced to floats - Series([1, 2, 3, 4, 5, 6.]) + pd.Series([1, 2, 3, 4, 5, 6.]) # string data forces an ``object`` dtype - Series([1, 2, 3, 6., 'foo']) + pd.Series([1, 2, 3, 6., 'foo']) The method :meth:`~DataFrame.get_dtype_counts` will return the number of columns of each type in a ``DataFrame``: @@ -1378,12 +1376,12 @@ different numeric dtypes will **NOT** be combined. The following example will gi .. ipython:: python - df1 = DataFrame(randn(8, 1), columns = ['A'], dtype = 'float32') + df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32') df1 df1.dtypes - df2 = DataFrame(dict( A = Series(randn(8),dtype='float16'), - B = Series(randn(8)), - C = Series(np.array(randn(8),dtype='uint8')) )) + df2 = pd.DataFrame(dict( A = pd.Series(np.random.randn(8), dtype='float16'), + B = pd.Series(np.random.randn(8)), + C = pd.Series(np.array(np.random.randn(8), dtype='uint8')) )) df2 df2.dtypes @@ -1395,16 +1393,16 @@ By default integer types are ``int64`` and float types are ``float64``, .. ipython:: python - DataFrame([1, 2], columns=['a']).dtypes - DataFrame({'a': [1, 2]}).dtypes - DataFrame({'a': 1 }, index=list(range(2))).dtypes + pd.DataFrame([1, 2], columns=['a']).dtypes + pd.DataFrame({'a': [1, 2]}).dtypes + pd.DataFrame({'a': 1 }, index=list(range(2))).dtypes Numpy, however will choose *platform-dependent* types when creating arrays. The following **WILL** result in ``int32`` on 32-bit platform. .. 
ipython:: python - frame = DataFrame(np.array([1, 2])) + frame = pd.DataFrame(np.array([1, 2])) upcasting @@ -1473,9 +1471,10 @@ but occasionally has non-dates intermixed and you want to represent as missing. .. ipython:: python - s = Series([datetime(2001,1,1,0,0), - 'foo', 1.0, 1, Timestamp('20010104'), - '20010105'],dtype='O') + import datetime + s = pd.Series([datetime.datetime(2001,1,1,0,0), + 'foo', 1.0, 1, pd.Timestamp('20010104'), + '20010105'], dtype='O') s s.convert_objects(convert_dates='coerce') @@ -1527,14 +1526,14 @@ dtypes: .. ipython:: python - df = DataFrame({'string': list('abc'), - 'int64': list(range(1, 4)), - 'uint8': np.arange(3, 6).astype('u1'), - 'float64': np.arange(4.0, 7.0), - 'bool1': [True, False, True], - 'bool2': [False, True, False], - 'dates': pd.date_range('now', periods=3).values, - 'category': pd.Categorical(list("ABC"))}) + df = pd.DataFrame({'string': list('abc'), + 'int64': list(range(1, 4)), + 'uint8': np.arange(3, 6).astype('u1'), + 'float64': np.arange(4.0, 7.0), + 'bool1': [True, False, True], + 'bool2': [False, True, False], + 'dates': pd.date_range('now', periods=3).values, + 'category': pd.Series(list("ABC")).astype('category')}) df['tdeltas'] = df.dates.diff() df['uint64'] = np.arange(3, 6).astype('u8') df['other_dates'] = pd.date_range('20130101', periods=3).values diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 11e7fb0fd4117..85fab1367114e 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -6,14 +6,10 @@ :suppress: import numpy as np - import random - import os - np.random.seed(123456) - from pandas import options - from pandas import * import pandas as pd + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) - options.display.max_rows=15 + pd.options.display.max_rows = 15 **************** @@ -65,14 +61,14 @@ By specifying ``dtype="category"`` when constructing a `Series`: .. 
ipython:: python - s = Series(["a","b","c","a"], dtype="category") + s = pd.Series(["a","b","c","a"], dtype="category") s By converting an existing `Series` or column to a ``category`` dtype: .. ipython:: python - df = DataFrame({"A":["a","b","c","a"]}) + df = pd.DataFrame({"A":["a","b","c","a"]}) df["B"] = df["A"].astype('category') df @@ -80,7 +76,7 @@ By using some special functions: .. ipython:: python - df = DataFrame({'value': np.random.randint(0, 100, 20)}) + df = pd.DataFrame({'value': np.random.randint(0, 100, 20)}) labels = [ "{0} - {1}".format(i, i + 9) for i in range(0, 100, 10) ] df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) @@ -92,11 +88,11 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to .. ipython:: python - raw_cat = Categorical(["a","b","c","a"], categories=["b","c","d"], + raw_cat = pd.Categorical(["a","b","c","a"], categories=["b","c","d"], ordered=False) - s = Series(raw_cat) + s = pd.Series(raw_cat) s - df = DataFrame({"A":["a","b","c","a"]}) + df = pd.DataFrame({"A":["a","b","c","a"]}) df["B"] = raw_cat df @@ -104,7 +100,7 @@ You can also specify differently ordered categories or make the resulting data o .. ipython:: python - s = Series(["a","b","c","a"]) + s = pd.Series(["a","b","c","a"]) s_cat = s.astype("category", categories=["b","c","d"], ordered=False) s_cat @@ -129,7 +125,7 @@ To get back to the original Series or `numpy` array, use ``Series.astype(origina .. ipython:: python - s = Series(["a","b","c","a"]) + s = pd.Series(["a","b","c","a"]) s s2 = s.astype('category') s2 @@ -143,7 +139,7 @@ constructor to save the factorize step during normal constructor mode: .. 
ipython:: python splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) - s = Series(Categorical.from_codes(splitter, categories=["train", "test"])) + s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) Description ----------- @@ -153,8 +149,8 @@ Using ``.describe()`` on categorical data will produce similar output to a `Seri .. ipython:: python - cat = Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan] ) - df = DataFrame({"cat":cat, "s":["a","c","c",np.nan]}) + cat = pd.Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan] ) + df = pd.DataFrame({"cat":cat, "s":["a","c","c",np.nan]}) df.describe() df["cat"].describe() @@ -168,7 +164,7 @@ passed in values. .. ipython:: python - s = Series(["a","b","c","a"], dtype="category") + s = pd.Series(["a","b","c","a"], dtype="category") s.cat.categories s.cat.ordered @@ -176,7 +172,7 @@ It's also possible to pass in the categories in a specific order: .. ipython:: python - s = Series(Categorical(["a","b","c","a"], categories=["c","b","a"])) + s = pd.Series(pd.Categorical(["a","b","c","a"], categories=["c","b","a"])) s.cat.categories s.cat.ordered @@ -194,7 +190,7 @@ by using the :func:`Categorical.rename_categories` method: .. ipython:: python - s = Series(["a","b","c","a"], dtype="category") + s = pd.Series(["a","b","c","a"], dtype="category") s s.cat.categories = ["Group %s" % g for g in s.cat.categories] s @@ -247,7 +243,7 @@ Removing unused categories can also be done: .. ipython:: python - s = Series(Categorical(["a","b","a"], categories=["a","b","c","d"])) + s = pd.Series(pd.Categorical(["a","b","a"], categories=["a","b","c","d"])) s s.cat.remove_unused_categories() @@ -259,7 +255,7 @@ or simply set the categories to a predefined scale, use :func:`Categorical.set_c .. 
ipython:: python - s = Series(["one","two","four", "-"], dtype="category") + s = pd.Series(["one","two","four", "-"], dtype="category") s s = s.cat.set_categories(["one","two","three","four"]) s @@ -283,9 +279,9 @@ meaning and certain operations are possible. If the categorical is unordered, `` .. ipython:: python - s = Series(Categorical(["a","b","c","a"], ordered=False)) + s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=False)) s.sort() - s = Series(["a","b","c","a"]).astype('category', ordered=True) + s = pd.Series(["a","b","c","a"]).astype('category', ordered=True) s.sort() s s.min(), s.max() @@ -303,7 +299,7 @@ This is even true for strings and numeric data: .. ipython:: python - s = Series([1,2,3,1], dtype="category") + s = pd.Series([1,2,3,1], dtype="category") s = s.cat.set_categories([2,3,1], ordered=True) s s.sort() @@ -321,7 +317,7 @@ necessarily make the sort order the same as the categories order. .. ipython:: python - s = Series([1,2,3,1], dtype="category") + s = pd.Series([1,2,3,1], dtype="category") s = s.cat.reorder_categories([2,3,1], ordered=True) s s.sort() @@ -351,8 +347,8 @@ The ordering of the categorical is determined by the ``categories`` of that colu .. ipython:: python - dfs = DataFrame({'A' : Categorical(list('bbeebbaa'), categories=['e','a','b'], ordered=True), - 'B' : [1,2,1,2,2,1,2,1] }) + dfs = pd.DataFrame({'A' : pd.Categorical(list('bbeebbaa'), categories=['e','a','b'], ordered=True), + 'B' : [1,2,1,2,2,1,2,1] }) dfs.sort(['A', 'B']) Reordering the ``categories`` changes a future sort. @@ -385,9 +381,9 @@ categories or a categorical with any list-like object, will raise a TypeError. .. 
ipython:: python - cat = Series([1,2,3]).astype("category", categories=[3,2,1], ordered=True) - cat_base = Series([2,2,2]).astype("category", categories=[3,2,1], ordered=True) - cat_base2 = Series([2,2,2]).astype("category", ordered=True) + cat = pd.Series([1,2,3]).astype("category", categories=[3,2,1], ordered=True) + cat_base = pd.Series([2,2,2]).astype("category", categories=[3,2,1], ordered=True) + cat_base2 = pd.Series([2,2,2]).astype("category", ordered=True) cat cat_base @@ -443,19 +439,19 @@ present in the data: .. ipython:: python - s = Series(Categorical(["a","b","c","c"], categories=["c","a","b","d"])) + s = pd.Series(pd.Categorical(["a","b","c","c"], categories=["c","a","b","d"])) s.value_counts() Groupby will also show "unused" categories: .. ipython:: python - cats = Categorical(["a","b","b","b","c","c","c"], categories=["a","b","c","d"]) - df = DataFrame({"cats":cats,"values":[1,2,2,2,3,4,5]}) + cats = pd.Categorical(["a","b","b","b","c","c","c"], categories=["a","b","c","d"]) + df = pd.DataFrame({"cats":cats,"values":[1,2,2,2,3,4,5]}) df.groupby("cats").mean() - cats2 = Categorical(["a","a","b","b"], categories=["a","b","c"]) - df2 = DataFrame({"cats":cats2,"B":["c","d","c","d"], "values":[1,2,3,4]}) + cats2 = pd.Categorical(["a","a","b","b"], categories=["a","b","c"]) + df2 = pd.DataFrame({"cats":cats2,"B":["c","d","c","d"], "values":[1,2,3,4]}) df2.groupby(["cats","B"]).mean() @@ -463,8 +459,8 @@ Pivot tables: .. ipython:: python - raw_cat = Categorical(["a","a","b","b"], categories=["a","b","c"]) - df = DataFrame({"A":raw_cat,"B":["c","d","c","d"], "values":[1,2,3,4]}) + raw_cat = pd.Categorical(["a","a","b","b"], categories=["a","b","c"]) + df = pd.DataFrame({"A":raw_cat,"B":["c","d","c","d"], "values":[1,2,3,4]}) pd.pivot_table(df, values='values', index=['A', 'B']) Data munging @@ -482,10 +478,10 @@ the ``category`` dtype is preserved. .. 
ipython:: python - idx = Index(["h","i","j","k","l","m","n",]) - cats = Series(["a","b","b","b","c","c","c"], dtype="category", index=idx) + idx = pd.Index(["h","i","j","k","l","m","n",]) + cats = pd.Series(["a","b","b","b","c","c","c"], dtype="category", index=idx) values= [1,2,2,2,3,4,5] - df = DataFrame({"cats":cats,"values":values}, index=idx) + df = pd.DataFrame({"cats":cats,"values":values}, index=idx) df.iloc[2:4,:] df.iloc[2:4,:].dtypes df.loc["h":"j","cats"] @@ -527,10 +523,10 @@ Setting values in a categorical column (or `Series`) works as long as the value .. ipython:: python - idx = Index(["h","i","j","k","l","m","n"]) - cats = Categorical(["a","a","a","a","a","a","a"], categories=["a","b"]) + idx = pd.Index(["h","i","j","k","l","m","n"]) + cats = pd.Categorical(["a","a","a","a","a","a","a"], categories=["a","b"]) values = [1,1,1,1,1,1,1] - df = DataFrame({"cats":cats,"values":values}, index=idx) + df = pd.DataFrame({"cats":cats,"values":values}, index=idx) df.iloc[2:4,:] = [["b",2],["b",2]] df @@ -543,10 +539,10 @@ Setting values by assigning categorical data will also check that the `categorie .. ipython:: python - df.loc["j":"k","cats"] = Categorical(["a","a"], categories=["a","b"]) + df.loc["j":"k","cats"] = pd.Categorical(["a","a"], categories=["a","b"]) df try: - df.loc["j":"k","cats"] = Categorical(["b","b"], categories=["a","b","c"]) + df.loc["j":"k","cats"] = pd.Categorical(["b","b"], categories=["a","b","c"]) except ValueError as e: print("ValueError: " + str(e)) @@ -554,9 +550,9 @@ Assigning a `Categorical` to parts of a column of other types will use the value .. 
ipython:: python - df = DataFrame({"a":[1,1,1,1,1], "b":["a","a","a","a","a"]}) - df.loc[1:2,"a"] = Categorical(["b","b"], categories=["a","b"]) - df.loc[2:3,"b"] = Categorical(["b","b"], categories=["a","b"]) + df = pd.DataFrame({"a":[1,1,1,1,1], "b":["a","a","a","a","a"]}) + df.loc[1:2,"a"] = pd.Categorical(["b","b"], categories=["a","b"]) + df.loc[2:3,"b"] = pd.Categorical(["b","b"], categories=["a","b"]) df df.dtypes @@ -569,9 +565,9 @@ but the categories of these categoricals need to be the same: .. ipython:: python - cat = Series(["a","b"], dtype="category") + cat = pd.Series(["a","b"], dtype="category") vals = [1,2] - df = DataFrame({"cats":cat, "vals":vals}) + df = pd.DataFrame({"cats":cat, "vals":vals}) res = pd.concat([df,df]) res res.dtypes @@ -611,12 +607,12 @@ relevant columns back to `category` and assign the right categories and categori .. ipython:: python - s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'd'])) + s = pd.Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'd'])) # rename the categories s.cat.categories = ["very good", "good", "bad"] # reorder the categories and add missing categories s = s.cat.set_categories(["very bad", "bad", "medium", "good", "very good"]) - df = DataFrame({"cats":s, "vals":[1,2,3,4,5,6]}) + df = pd.DataFrame({"cats":s, "vals":[1,2,3,4,5,6]}) csv = StringIO() df.to_csv(csv) df2 = pd.read_csv(StringIO(csv.getvalue())) @@ -643,10 +639,10 @@ available ("missing value") or `np.nan` is a valid category. .. ipython:: python - s = Series(["a","b",np.nan,"a"], dtype="category") + s = pd.Series(["a","b",np.nan,"a"], dtype="category") # only two categories s - s2 = Series(["a","b","c","a"], dtype="category") + s2 = pd.Series(["a","b","c","a"], dtype="category") s2.cat.categories = [1,2,np.nan] # three categories, np.nan included s2 @@ -660,11 +656,11 @@ available ("missing value") or `np.nan` is a valid category. .. 
ipython:: python - c = Series(["a","b",np.nan], dtype="category") + c = pd.Series(["a","b",np.nan], dtype="category") c.cat.set_categories(["a","b",np.nan], inplace=True) # will be inserted as a NA category: c[0] = np.nan - s = Series(c) + s = pd.Series(c) s pd.isnull(s) s.fillna("a") @@ -697,7 +693,7 @@ an ``object`` dtype is a constant times the length of the data. .. ipython:: python - s = Series(['foo','bar']*1000) + s = pd.Series(['foo','bar']*1000) # object dtype s.nbytes @@ -712,7 +708,7 @@ an ``object`` dtype is a constant times the length of the data. .. ipython:: python - s = Series(['foo%04d' % i for i in range(2000)]) + s = pd.Series(['foo%04d' % i for i in range(2000)]) # object dtype s.nbytes @@ -734,7 +730,7 @@ will work with the current pandas version, resulting in subtle bugs: .. code-block:: python - >>> cat = Categorical([1,2], [1,2,3]) + >>> cat = pd.Categorical([1,2], [1,2,3]) >>> # old version >>> cat.get_values() array([2, 3], dtype=int64) @@ -762,7 +758,7 @@ object and not as a low-level `numpy` array dtype. This leads to some problems. except TypeError as e: print("TypeError: " + str(e)) - dtype = Categorical(["a"]).dtype + dtype = pd.Categorical(["a"]).dtype try: np.dtype(dtype) except TypeError as e: @@ -780,15 +776,15 @@ To check if a Series contains Categorical data, with pandas 0.16 or later, use .. ipython:: python - hasattr(Series(['a'], dtype='category'), 'cat') - hasattr(Series(['a']), 'cat') + hasattr(pd.Series(['a'], dtype='category'), 'cat') + hasattr(pd.Series(['a']), 'cat') Using `numpy` functions on a `Series` of type ``category`` should not work as `Categoricals` are not numeric data (even in the case that ``.categories`` is numeric). .. ipython:: python - s = Series(Categorical([1,2,3,4])) + s = pd.Series(pd.Categorical([1,2,3,4])) try: np.sum(s) #same with np.log(s),.. @@ -807,9 +803,9 @@ basic type) and applying along columns will also convert to object. .. 
ipython:: python - df = DataFrame({"a":[1,2,3,4], - "b":["a","b","c","d"], - "cats":Categorical([1,2,3,2])}) + df = pd.DataFrame({"a":[1,2,3,4], + "b":["a","b","c","d"], + "cats":pd.Categorical([1,2,3,2])}) df.apply(lambda row: type(row["cats"]), axis=1) df.apply(lambda col: col.dtype, axis=0) @@ -822,10 +818,10 @@ ordering of the categories: .. ipython:: python - cats = Categorical([1,2,3,4], categories=[4,2,3,1]) + cats = pd.Categorical([1,2,3,4], categories=[4,2,3,1]) strings = ["a","b","c","d"] values = [4,2,3,1] - df = DataFrame({"strings":strings, "values":values}, index=cats) + df = pd.DataFrame({"strings":strings, "values":values}, index=cats) df.index # This should sort by categories but does not as there is no CategoricalIndex! df.sort_index() @@ -843,12 +839,12 @@ means that changes to the `Series` will in most cases change the original `Categ .. ipython:: python - cat = Categorical([1,2,3,10], categories=[1,2,3,4,10]) - s = Series(cat, name="cat") + cat = pd.Categorical([1,2,3,10], categories=[1,2,3,4,10]) + s = pd.Series(cat, name="cat") cat s.iloc[0:2] = 10 cat - df = DataFrame(s) + df = pd.DataFrame(s) df["cat"].cat.categories = [1,2,3,4,5] cat @@ -856,8 +852,8 @@ Use ``copy=True`` to prevent such a behaviour or simply don't reuse `Categorical .. ipython:: python - cat = Categorical([1,2,3,10], categories=[1,2,3,4,10]) - s = Series(cat, name="cat", copy=True) + cat = pd.Categorical([1,2,3,10], categories=[1,2,3,4,10]) + s = pd.Series(cat, name="cat", copy=True) cat s.iloc[0:2] = 10 cat diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 4621d7bd9b216..dfb9fab19bf31 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -258,7 +258,7 @@ These functions can be applied to ndarrays or Series objects: ts.plot(style='k--') @savefig rolling_mean_ex.png - rolling_mean(ts, 60).plot(style='k') + pd.rolling_mean(ts, 60).plot(style='k') They can also be applied to DataFrame objects. 
This is really just syntactic sugar for applying the moving window operator to all of the DataFrame's columns: @@ -275,7 +275,7 @@ sugar for applying the moving window operator to all of the DataFrame's columns: df = df.cumsum() @savefig rolling_mean_frame.png - rolling_sum(df, 60).plot(subplots=True) + pd.rolling_sum(df, 60).plot(subplots=True) The ``rolling_apply`` function takes an extra ``func`` argument and performs generic rolling computations. The ``func`` argument should be a single function @@ -286,7 +286,7 @@ compute the mean absolute deviation on a rolling basis: mad = lambda x: np.fabs(x - x.mean()).mean() @savefig rolling_apply_ex.png - rolling_apply(ts, 60, mad).plot(style='k') + pd.rolling_apply(ts, 60, mad).plot(style='k') The ``rolling_window`` function performs a generic rolling window computation on the input data. The weights used in the window are specified by the ``win_type`` @@ -311,21 +311,21 @@ keyword. The list of recognized types are: ser = pd.Series(np.random.randn(10), index=pd.date_range('1/1/2000', periods=10)) - rolling_window(ser, 5, 'triang') + pd.rolling_window(ser, 5, 'triang') Note that the ``boxcar`` window is equivalent to ``rolling_mean``. .. ipython:: python - rolling_window(ser, 5, 'boxcar') + pd.rolling_window(ser, 5, 'boxcar') - rolling_mean(ser, 5) + pd.rolling_mean(ser, 5) For some windowing functions, additional parameters must be specified: .. ipython:: python - rolling_window(ser, 5, 'gaussian', std=0.1) + pd.rolling_window(ser, 5, 'gaussian', std=0.1) By default the labels are set to the right edge of the window, but a ``center`` keyword is available so the labels can be set at the center. @@ -333,11 +333,11 @@ This keyword is available in other rolling functions as well. .. 
ipython:: python - rolling_window(ser, 5, 'boxcar') + pd.rolling_window(ser, 5, 'boxcar') - rolling_window(ser, 5, 'boxcar', center=True) + pd.rolling_window(ser, 5, 'boxcar', center=True) - rolling_mean(ser, 5, center=True) + pd.rolling_mean(ser, 5, center=True) .. _stats.moments.normalization: @@ -376,7 +376,7 @@ For example: .. ipython:: python df2 = df[:20] - rolling_corr(df2, df2['B'], window=5) + pd.rolling_corr(df2, df2['B'], window=5) .. _stats.moments.corr_pairwise: @@ -401,12 +401,12 @@ can even be omitted: .. ipython:: python - covs = rolling_cov(df[['B','C','D']], df[['A','B','C']], 50, pairwise=True) + covs = pd.rolling_cov(df[['B','C','D']], df[['A','B','C']], 50, pairwise=True) covs[df.index[-50]] .. ipython:: python - correls = rolling_corr(df, 50) + correls = pd.rolling_corr(df, 50) correls[df.index[-50]] .. note:: @@ -440,9 +440,9 @@ they are implemented in pandas such that the following two calls are equivalent: .. ipython:: python - rolling_mean(df, window=len(df), min_periods=1)[:5] + pd.rolling_mean(df, window=len(df), min_periods=1)[:5] - expanding_mean(df)[:5] + pd.expanding_mean(df)[:5] Like the ``rolling_`` functions, the following methods are included in the ``pandas`` namespace or can be located in ``pandas.stats.moments``. @@ -501,7 +501,7 @@ relative impact of an individual data point. As an example, here is the ts.plot(style='k--') @savefig expanding_mean_frame.png - expanding_mean(ts).plot(style='k') + pd.expanding_mean(ts).plot(style='k') .. _stats.moments.exponentially_weighted: @@ -583,7 +583,7 @@ Here is an example for a univariate time series: ts.plot(style='k--') @savefig ewma_ex.png - ewma(ts, span=20).plot(style='k') + pd.ewma(ts, span=20).plot(style='k') All the EW functions have a ``min_periods`` argument, which has the same meaning it does for all the ``expanding_`` and ``rolling_`` functions: