From 25dc280c94605174cbfc3faf4082abb62fde573c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 25 Apr 2015 14:00:54 +0200 Subject: [PATCH] DOC: clean up / consistent imports (GH9886) --- doc/source/computation.rst | 41 ++++++------ doc/source/missing_data.rst | 105 +++++++++++++------------------ doc/source/reshaping.rst | 97 ++++++++++++++-------------- doc/source/visualization.rst | 119 +++++++++++++++-------------------- 4 files changed, 159 insertions(+), 203 deletions(-) diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 4b0fe39d929a9..4621d7bd9b216 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -1,23 +1,22 @@ .. currentmodule:: pandas -.. _computation: .. ipython:: python :suppress: import numpy as np np.random.seed(123456) - from pandas import * - import pandas.util.testing as tm - randn = np.random.randn np.set_printoptions(precision=4, suppress=True) + import pandas as pd import matplotlib try: matplotlib.style.use('ggplot') except AttributeError: - options.display.mpl_style = 'default' + pd.options.display.mpl_style = 'default' import matplotlib.pyplot as plt plt.close('all') - options.display.max_rows=15 + pd.options.display.max_rows=15 + +.. _computation: Computational tools =================== @@ -36,13 +35,13 @@ NA/null values *before* computing the percent change). .. ipython:: python - ser = Series(randn(8)) + ser = pd.Series(np.random.randn(8)) ser.pct_change() .. ipython:: python - df = DataFrame(randn(10, 4)) + df = pd.DataFrame(np.random.randn(10, 4)) df.pct_change(periods=3) @@ -56,8 +55,8 @@ The ``Series`` object has a method ``cov`` to compute covariance between series .. 
ipython:: python - s1 = Series(randn(1000)) - s2 = Series(randn(1000)) + s1 = pd.Series(np.random.randn(1000)) + s2 = pd.Series(np.random.randn(1000)) s1.cov(s2) Analogously, ``DataFrame`` has a method ``cov`` to compute pairwise covariances @@ -78,7 +77,7 @@ among the series in the DataFrame, also excluding NA/null values. .. ipython:: python - frame = DataFrame(randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) frame.cov() ``DataFrame.cov`` also supports an optional ``min_periods`` keyword that @@ -87,7 +86,7 @@ in order to have a valid result. .. ipython:: python - frame = DataFrame(randn(20, 3), columns=['a', 'b', 'c']) + frame = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) frame.ix[:5, 'a'] = np.nan frame.ix[5:10, 'b'] = np.nan @@ -123,7 +122,7 @@ All of these are currently computed using pairwise complete observations. .. ipython:: python - frame = DataFrame(randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) frame.ix[::2] = np.nan # Series with Series @@ -140,7 +139,7 @@ Like ``cov``, ``corr`` also supports the optional ``min_periods`` keyword: .. ipython:: python - frame = DataFrame(randn(20, 3), columns=['a', 'b', 'c']) + frame = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) frame.ix[:5, 'a'] = np.nan frame.ix[5:10, 'b'] = np.nan @@ -157,8 +156,8 @@ objects. index = ['a', 'b', 'c', 'd', 'e'] columns = ['one', 'two', 'three', 'four'] - df1 = DataFrame(randn(5, 4), index=index, columns=columns) - df2 = DataFrame(randn(4, 4), index=index[:4], columns=columns) + df1 = pd.DataFrame(np.random.randn(5, 4), index=index, columns=columns) + df2 = pd.DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns) df1.corrwith(df2) df2.corrwith(df1, axis=1) @@ -172,7 +171,7 @@ of the ranks (by default) for the group: .. 
ipython:: python - s = Series(np.random.randn(5), index=list('abcde')) + s = pd.Series(np.random.randn(5), index=list('abcde')) s['d'] = s['b'] # so there's a tie s.rank() @@ -181,7 +180,7 @@ or the columns (``axis=1``). ``NaN`` values are excluded from the ranking. .. ipython:: python - df = DataFrame(np.random.randn(10, 6)) + df = pd.DataFrame(np.random.randn(10, 6)) df[4] = df[2][:5] # some ties df df.rank(1) @@ -253,7 +252,7 @@ These functions can be applied to ndarrays or Series objects: .. ipython:: python - ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) ts = ts.cumsum() ts.plot(style='k--') @@ -271,7 +270,7 @@ sugar for applying the moving window operator to all of the DataFrame's columns: .. ipython:: python - df = DataFrame(randn(1000, 4), index=ts.index, + df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=['A', 'B', 'C', 'D']) df = df.cumsum() @@ -310,7 +309,7 @@ keyword. The list of recognized types are: .. ipython:: python - ser = Series(randn(10), index=date_range('1/1/2000', periods=10)) + ser = pd.Series(np.random.randn(10), index=pd.date_range('1/1/2000', periods=10)) rolling_window(ser, 5, 'triang') diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index 4505d256d31f6..04a6302f958a2 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -1,11 +1,19 @@ .. currentmodule:: pandas -.. _missing_data: .. ipython:: python :suppress: - from pandas import * - options.display.max_rows=15 + import numpy as np + import pandas as pd + pd.options.display.max_rows=15 + import matplotlib + try: + matplotlib.style.use('ggplot') + except AttributeError: + pd.options.display.mpl_style = 'default' + import matplotlib.pyplot as plt + +..
_missing_data: ************************* Working with missing data @@ -14,14 +22,6 @@ Working with missing data In this section, we will discuss missing (also referred to as NA) values in pandas. -.. ipython:: python - :suppress: - - import numpy as np; randn = np.random.randn; randint =np.random.randint - from pandas import * - import matplotlib.pyplot as plt - from pandas.compat import lrange - .. note:: The choice of using ``NaN`` internally to denote missing data was largely @@ -50,8 +50,8 @@ a data set is by reindexing. For example .. ipython:: python - df = DataFrame(randn(5, 3), index=['a', 'c', 'e', 'f', 'h'], - columns=['one', 'two', 'three']) + df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'], + columns=['one', 'two', 'three']) df['four'] = 'bar' df['five'] = df['one'] > 0 df @@ -118,7 +118,7 @@ the missing value type chosen: .. ipython:: python - s = Series([1, 2, 3]) + s = pd.Series([1, 2, 3]) s.loc[0] = None s @@ -128,7 +128,7 @@ For object containers, pandas will use the value given: .. ipython:: python - s = Series(["a", "b", "c"]) + s = pd.Series(["a", "b", "c"]) s.loc[0] = None s.loc[1] = np.nan s @@ -255,7 +255,7 @@ use case of this is to fill a DataFrame with the mean of that column. .. ipython:: python - dff = DataFrame(np.random.randn(10,3),columns=list('ABC')) + dff = pd.DataFrame(np.random.randn(10,3),columns=list('ABC')) dff.iloc[3:5,0] = np.nan dff.iloc[4:6,1] = np.nan dff.iloc[5:8,2] = np.nan @@ -307,7 +307,7 @@ Interpolation .. versionadded:: 0.13.0 :meth:`~pandas.DataFrame.interpolate`, and :meth:`~pandas.Series.interpolate` have - revamped interpolation methods and functionaility. + revamped interpolation methods and functionality. Both Series and Dataframe objects have an ``interpolate`` method that, by default, performs linear interpolation at missing datapoints. @@ -317,7 +317,7 @@ performs linear interpolation at missing datapoints. 
np.random.seed(123456) idx = date_range('1/1/2000', periods=100, freq='BM') - ts = Series(randn(100), index=idx) + ts = pd.Series(np.random.randn(100), index=idx) ts[1:20] = np.nan ts[60:80] = np.nan ts = ts.cumsum() @@ -328,7 +328,6 @@ performs linear interpolation at missing datapoints. ts.count() ts.interpolate().count() - plt.figure() @savefig series_interpolate.png ts.interpolate().plot() @@ -351,7 +350,7 @@ For a floating-point index, use ``method='values'``: :suppress: idx = [0., 1., 10.] - ser = Series([0., np.nan, 10.], idx) + ser = pd.Series([0., np.nan, 10.], idx) .. ipython:: python @@ -363,7 +362,7 @@ You can also interpolate with a DataFrame: .. ipython:: python - df = DataFrame({'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8], + df = pd.DataFrame({'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8], 'B': [.25, np.nan, np.nan, 4, 12.2, 14.4]}) df df.interpolate() @@ -401,13 +400,12 @@ Compare several methods: np.random.seed(2) - ser = Series(np.arange(1, 10.1, .25)**2 + np.random.randn(37)) + ser = pd.Series(np.arange(1, 10.1, .25)**2 + np.random.randn(37)) bad = np.array([4, 13, 14, 15, 16, 17, 18, 20, 29]) ser[bad] = np.nan methods = ['linear', 'quadratic', 'cubic'] - df = DataFrame({m: ser.interpolate(method=m) for m in methods}) - plt.figure() + df = pd.DataFrame({m: ser.interpolate(method=m) for m in methods}) @savefig compare_interpolations.png df.plot() @@ -419,7 +417,7 @@ at the new values. .. ipython:: python - ser = Series(np.sort(np.random.uniform(size=100))) + ser = pd.Series(np.sort(np.random.uniform(size=100))) # interpolate at new_index new_index = ser.index | Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]) @@ -438,7 +436,7 @@ observation: .. ipython:: python - ser = Series([1, 3, np.nan, np.nan, np.nan, 11]) + ser = pd.Series([1, 3, np.nan, np.nan, np.nan, 11]) ser.interpolate(limit=2) .. _missing_data.replace: @@ -454,7 +452,7 @@ value: .. 
ipython:: python - ser = Series([0., 1., 2., 3., 4.]) + ser = pd.Series([0., 1., 2., 3., 4.]) ser.replace(0, 5) @@ -474,7 +472,7 @@ For a DataFrame, you can specify individual values by column: .. ipython:: python - df = DataFrame({'a': [0, 1, 2, 3, 4], 'b': [5, 6, 7, 8, 9]}) + df = pd.DataFrame({'a': [0, 1, 2, 3, 4], 'b': [5, 6, 7, 8, 9]}) df.replace({'a': 0, 'b': 5}, 100) @@ -502,31 +500,24 @@ String/Regular Expression Replacement Replace the '.' with ``nan`` (str -> str) -.. ipython:: python - :suppress: - - from numpy.random import rand, randn - from numpy import nan - from pandas import DataFrame - .. ipython:: python - d = {'a': list(range(4)), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} - df = DataFrame(d) - df.replace('.', nan) + d = {'a': list(range(4)), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']} + df = pd.DataFrame(d) + df.replace('.', np.nan) Now do it with a regular expression that removes surrounding whitespace (regex -> regex) .. ipython:: python - df.replace(r'\s*\.\s*', nan, regex=True) + df.replace(r'\s*\.\s*', np.nan, regex=True) Replace a few different values (list -> list) .. ipython:: python - df.replace(['a', '.'], ['b', nan]) + df.replace(['a', '.'], ['b', np.nan]) list of regex -> list of regex @@ -538,14 +529,14 @@ Only search in column ``'b'`` (dict -> dict) .. ipython:: python - df.replace({'b': '.'}, {'b': nan}) + df.replace({'b': '.'}, {'b': np.nan}) Same as the previous example, but use a regular expression for searching instead (dict of regex -> dict) .. ipython:: python - df.replace({'b': r'\s*\.\s*'}, {'b': nan}, regex=True) + df.replace({'b': r'\s*\.\s*'}, {'b': np.nan}, regex=True) You can pass nested dictionaries of regular expressions that use ``regex=True`` @@ -557,7 +548,7 @@ or you can pass the nested dictionary like so .. 
ipython:: python - df.replace(regex={'b': {r'\s*\.\s*': nan}}) + df.replace(regex={'b': {r'\s*\.\s*': np.nan}}) You can also use the group of a regular expression match when replacing (dict of regex -> dict of regex), this works for lists as well @@ -571,7 +562,7 @@ will be replaced with a scalar (list of regex -> regex) .. ipython:: python - df.replace([r'\s*\.\s*', r'a|b'], nan, regex=True) + df.replace([r'\s*\.\s*', r'a|b'], np.nan, regex=True) All of the regular expression examples can also be passed with the ``to_replace`` argument as the ``regex`` argument. In this case the ``value`` @@ -580,7 +571,7 @@ dictionary. The previous example, in this case, would then be .. ipython:: python - df.replace(regex=[r'\s*\.\s*', r'a|b'], value=nan) + df.replace(regex=[r'\s*\.\s*', r'a|b'], value=np.nan) This can be convenient if you do not want to pass ``regex=True`` every time you want to use a regular expression. @@ -595,33 +586,25 @@ Numeric Replacement Similar to ``DataFrame.fillna`` -.. ipython:: python - :suppress: - - from numpy.random import rand, randn - from numpy import nan - from pandas import DataFrame - from pandas.util.testing import assert_frame_equal - .. ipython:: python - df = DataFrame(randn(10, 2)) - df[rand(df.shape[0]) > 0.5] = 1.5 - df.replace(1.5, nan) + df = pd.DataFrame(np.random.randn(10, 2)) + df[np.random.rand(df.shape[0]) > 0.5] = 1.5 + df.replace(1.5, np.nan) Replacing more than one value via lists works as well .. ipython:: python df00 = df.values[0, 0] - df.replace([1.5, df00], [nan, 'a']) + df.replace([1.5, df00], [np.nan, 'a']) df[1].dtype You can also operate on the DataFrame in place .. ipython:: python - df.replace(1.5, nan, inplace=True) + df.replace(1.5, np.nan, inplace=True) .. warning:: @@ -631,7 +614,7 @@ You can also operate on the DataFrame in place .. 
code-block:: python - s = Series([True, False, True]) + s = pd.Series([True, False, True]) s.replace({'a string': 'new value', True: False}) # raises TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' @@ -643,7 +626,7 @@ You can also operate on the DataFrame in place .. ipython:: python - s = Series([True, False, True]) + s = pd.Series([True, False, True]) s.replace('a string', 'another string') the original ``NDFrame`` object will be returned untouched. We're working on @@ -672,7 +655,7 @@ For example: .. ipython:: python - s = Series(randn(5), index=[0, 2, 4, 6, 7]) + s = pd.Series(np.random.randn(5), index=[0, 2, 4, 6, 7]) s > 0 (s > 0).dtype crit = (s > 0).reindex(list(range(8))) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index dc13ce3e5c4da..26aaf9c2be69d 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -6,14 +6,9 @@ import numpy as np np.random.seed(123456) - from pandas import * - options.display.max_rows=15 - from pandas.core.reshape import * - import pandas.util.testing as tm - randn = np.random.randn + import pandas as pd + pd.options.display.max_rows=15 np.set_printoptions(precision=4, suppress=True) - from pandas.tools.tile import * - from pandas.compat import zip ************************** Reshaping and Pivot Tables @@ -56,7 +51,7 @@ For the curious here is how the above DataFrame was created: data = {'value' : frame.values.ravel('F'), 'variable' : np.asarray(frame.columns).repeat(N), 'date' : np.tile(np.asarray(frame.index), K)} - return DataFrame(data, columns=['date', 'variable', 'value']) + return pd.DataFrame(data, columns=['date', 'variable', 'value']) df = unpivot(tm.makeTimeDataFrame()) To select out everything for variable ``A`` we could do: @@ -119,11 +114,11 @@ from the hierarchical indexing section: .. 
ipython:: python tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', - 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', - 'one', 'two', 'one', 'two']])) - index = MultiIndex.from_tuples(tuples, names=['first', 'second']) - df = DataFrame(randn(8, 2), index=index, columns=['A', 'B']) + 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', + 'one', 'two', 'one', 'two']])) + index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) + df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B']) df2 = df[:4] df2 @@ -166,8 +161,8 @@ will result in a **sorted** copy of the original DataFrame or Series: .. ipython:: python - index = MultiIndex.from_product([[2,1], ['a', 'b']]) - df = DataFrame(randn(4), index=index, columns=['A']) + index = pd.MultiIndex.from_product([[2,1], ['a', 'b']]) + df = pd.DataFrame(np.random.randn(4), index=index, columns=['A']) df all(df.unstack().stack() == df.sort()) @@ -185,13 +180,13 @@ processed individually. .. ipython:: python - columns = MultiIndex.from_tuples([ + columns = pd.MultiIndex.from_tuples([ ('A', 'cat', 'long'), ('B', 'cat', 'long'), ('A', 'dog', 'short'), ('B', 'dog', 'short') ], names=['exp', 'animal', 'hair_length'] ) - df = DataFrame(randn(4, 4), columns=columns) + df = pd.DataFrame(np.random.randn(4, 4), columns=columns) df df.stack(level=['animal', 'hair_length']) @@ -215,12 +210,13 @@ calling ``sortlevel``, of course). Here is a more complex example: .. 
ipython:: python - columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), - ('B', 'cat'), ('A', 'dog')], - names=['exp', 'animal']) - index = MultiIndex.from_product([('bar', 'baz', 'foo', 'qux'), ('one', 'two')], - names=['first', 'second']) - df = DataFrame(randn(8, 4), index=index, columns=columns) + columns = pd.MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), + ('B', 'cat'), ('A', 'dog')], + names=['exp', 'animal']) + index = pd.MultiIndex.from_product([('bar', 'baz', 'foo', 'qux'), + ('one', 'two')], + names=['first', 'second']) + df = pd.DataFrame(np.random.randn(8, 4), index=index, columns=columns) df2 = df.ix[[0, 1, 2, 4, 5, 7]] df2 @@ -259,13 +255,13 @@ For instance, .. ipython:: python - cheese = DataFrame({'first' : ['John', 'Mary'], - 'last' : ['Doe', 'Bo'], - 'height' : [5.5, 6.0], - 'weight' : [130, 150]}) + cheese = pd.DataFrame({'first' : ['John', 'Mary'], + 'last' : ['Doe', 'Bo'], + 'height' : [5.5, 6.0], + 'weight' : [130, 150]}) cheese - melt(cheese, id_vars=['first', 'last']) - melt(cheese, id_vars=['first', 'last'], var_name='quantity') + pd.melt(cheese, id_vars=['first', 'last']) + pd.melt(cheese, id_vars=['first', 'last'], var_name='quantity') Another way to transform is to use the ``wide_to_long`` panel data convenience function. @@ -324,22 +320,22 @@ Consider a data set like this: .. 
ipython:: python import datetime - df = DataFrame({'A' : ['one', 'one', 'two', 'three'] * 6, - 'B' : ['A', 'B', 'C'] * 8, - 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4, - 'D' : np.random.randn(24), - 'E' : np.random.randn(24), - 'F' : [datetime.datetime(2013, i, 1) for i in range(1, 13)] + - [datetime.datetime(2013, i, 15) for i in range(1, 13)]}) + df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 6, + 'B': ['A', 'B', 'C'] * 8, + 'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4, + 'D': np.random.randn(24), + 'E': np.random.randn(24), + 'F': [datetime.datetime(2013, i, 1) for i in range(1, 13)] + + [datetime.datetime(2013, i, 15) for i in range(1, 13)]}) df We can produce pivot tables from this data very easily: .. ipython:: python - pivot_table(df, values='D', index=['A', 'B'], columns=['C']) - pivot_table(df, values='D', index=['B'], columns=['A', 'C'], aggfunc=np.sum) - pivot_table(df, values=['D','E'], index=['B'], columns=['A', 'C'], aggfunc=np.sum) + pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']) + pd.pivot_table(df, values='D', index=['B'], columns=['A', 'C'], aggfunc=np.sum) + pd.pivot_table(df, values=['D','E'], index=['B'], columns=['A', 'C'], aggfunc=np.sum) The result object is a DataFrame having potentially hierarchical indexes on the rows and columns. If the ``values`` column name is not given, the pivot table @@ -348,20 +344,20 @@ hierarchy in the columns: .. ipython:: python - pivot_table(df, index=['A', 'B'], columns=['C']) + pd.pivot_table(df, index=['A', 'B'], columns=['C']) Also, you can use ``Grouper`` for ``index`` and ``columns`` keywords. For detail of ``Grouper``, see :ref:`Grouping with a Grouper specification <groupby.specify>`. .. ipython:: python - pivot_table(df, values='D', index=Grouper(freq='M', key='F'), columns='C') + pd.pivot_table(df, values='D', index=pd.Grouper(freq='M', key='F'), columns='C') You can render a nice output of the table omitting the missing values by calling ``to_string`` if you wish: ..
ipython:: python - table = pivot_table(df, index=['A', 'B'], columns=['C']) + table = pd.pivot_table(df, index=['A', 'B'], columns=['C']) print(table.to_string(na_rep='')) Note that ``pivot_table`` is also available as an instance method on DataFrame. @@ -397,7 +393,7 @@ For example: a = np.array([foo, foo, bar, bar, foo, foo], dtype=object) b = np.array([one, one, two, one, two, one], dtype=object) c = np.array([dull, dull, shiny, dull, dull, shiny], dtype=object) - crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) + pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) .. _reshaping.pivot.margins: @@ -428,14 +424,14 @@ variables: ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60]) - cut(ages, bins=3) + pd.cut(ages, bins=3) If the ``bins`` keyword is an integer, then equal-width bins are formed. Alternatively we can specify custom bin-edges: .. ipython:: python - cut(ages, bins=[0, 18, 35, 70]) + pd.cut(ages, bins=[0, 18, 35, 70]) .. _reshaping.dummies: @@ -449,17 +445,16 @@ containing ``k`` columns of 1s and 0s: .. ipython:: python - df = DataFrame({'key': list('bbacab'), 'data1': range(6)}) + df = pd.DataFrame({'key': list('bbacab'), 'data1': range(6)}) - - get_dummies(df['key']) + pd.get_dummies(df['key']) Sometimes it's useful to prefix the column names, for example when merging the result with the original DataFrame: .. ipython:: python - dummies = get_dummies(df['key'], prefix='key') + dummies = pd.get_dummies(df['key'], prefix='key') dummies @@ -469,14 +464,14 @@ This function is often used along with discretization functions like ``cut``: .. ipython:: python - values = randn(10) + values = np.random.randn(10) values bins = [0, 0.2, 0.4, 0.6, 0.8, 1] - get_dummies(cut(values, bins)) + pd.get_dummies(pd.cut(values, bins)) See also :func:`Series.str.get_dummies `. 
diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 43fa6ea759b33..6dfeeadeb0167 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -6,20 +6,16 @@ import numpy as np import pandas as pd - from numpy.random import randn, rand, randint np.random.seed(123456) - from pandas import DataFrame, Series, date_range, options - import pandas.util.testing as tm np.set_printoptions(precision=4, suppress=True) - import matplotlib.pyplot as plt - plt.close('all') + pd.options.display.max_rows = 15 import matplotlib try: matplotlib.style.use('ggplot') except AttributeError: - options.display.mpl_style = 'default' - options.display.max_rows = 15 - from pandas.compat import lrange + pd.options.display.mpl_style = 'default' + import matplotlib.pyplot as plt + plt.close('all') ******** Plotting @@ -68,7 +64,7 @@ The ``plot`` method on Series and DataFrame is just a simple wrapper around .. ipython:: python - ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) ts = ts.cumsum() @savefig series_plot_basic.png @@ -87,7 +83,7 @@ On DataFrame, :meth:`~DataFrame.plot` is a convenience to plot all of the column .. ipython:: python - df = DataFrame(randn(1000, 4), index=ts.index, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list('ABCD')) df = df.cumsum() @savefig frame_plot_basic.png @@ -105,8 +101,8 @@ You can plot one column versus another using the `x` and `y` keywords in .. ipython:: python - df3 = DataFrame(randn(1000, 2), columns=['B', 'C']).cumsum() - df3['A'] = Series(list(range(len(df)))) + df3 = pd.DataFrame(np.random.randn(1000, 2), columns=['B', 'C']).cumsum() + df3['A'] = pd.Series(list(range(len(df)))) @savefig df_plot_xy.png df3.plot(x='A', y='B') @@ -182,7 +178,7 @@ bar plot: .. 
ipython:: python - df2 = DataFrame(rand(10, 4), columns=['a', 'b', 'c', 'd']) + df2 = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd']) @savefig bar_plot_multi_ex.png df2.plot(kind='bar'); @@ -224,8 +220,8 @@ Histogram can be drawn specifying ``kind='hist'``. .. ipython:: python - df4 = DataFrame({'a': randn(1000) + 1, 'b': randn(1000), - 'c': randn(1000) - 1}, columns=['a', 'b', 'c']) + df4 = pd.DataFrame({'a': np.random.randn(1000) + 1, 'b': np.random.randn(1000), + 'c': np.random.randn(1000) - 1}, columns=['a', 'b', 'c']) plt.figure(); @@ -308,10 +304,10 @@ The ``by`` keyword can be specified to plot grouped histograms: .. ipython:: python - data = Series(randn(1000)) + data = pd.Series(np.random.randn(1000)) @savefig grouped_hist.png - data.hist(by=randint(0, 4, 1000), figsize=(6, 4)) + data.hist(by=np.random.randint(0, 4, 1000), figsize=(6, 4)) .. _visualization.box: @@ -337,7 +333,7 @@ a uniform random variable on [0,1). .. ipython:: python - df = DataFrame(rand(10, 5), columns=['A', 'B', 'C', 'D', 'E']) + df = pd.DataFrame(np.random.rand(10, 5), columns=['A', 'B', 'C', 'D', 'E']) @savefig box_plot_new.png df.plot(kind='box') @@ -392,7 +388,7 @@ The existing interface ``DataFrame.boxplot`` to plot boxplot still can be used. .. ipython:: python - df = DataFrame(rand(10,5)) + df = pd.DataFrame(np.random.rand(10,5)) plt.figure(); @savefig box_plot_ex.png @@ -410,8 +406,8 @@ groupings. For instance, .. ipython:: python :okwarning: - df = DataFrame(rand(10,2), columns=['Col1', 'Col2'] ) - df['X'] = Series(['A','A','A','A','A','B','B','B','B','B']) + df = pd.DataFrame(np.random.rand(10,2), columns=['Col1', 'Col2'] ) + df['X'] = pd.Series(['A','A','A','A','A','B','B','B','B','B']) plt.figure(); @@ -430,9 +426,9 @@ columns: ..
ipython:: python :okwarning: - df = DataFrame(rand(10,3), columns=['Col1', 'Col2', 'Col3']) - df['X'] = Series(['A','A','A','A','A','B','B','B','B','B']) - df['Y'] = Series(['A','B','A','B','A','B','A','B','A','B']) + df = pd.DataFrame(np.random.rand(10,3), columns=['Col1', 'Col2', 'Col3']) + df['X'] = pd.Series(['A','A','A','A','A','B','B','B','B','B']) + df['Y'] = pd.Series(['A','B','A','B','A','B','A','B','A','B']) plt.figure(); @@ -473,7 +469,7 @@ DataFrame. :okwarning: np.random.seed(1234) - df_box = DataFrame(np.random.randn(50, 2)) + df_box = pd.DataFrame(np.random.randn(50, 2)) df_box['g'] = np.random.choice(['A', 'B'], size=50) df_box.loc[df_box['g'] == 'B', 1] += 3 @@ -517,7 +513,7 @@ When input data contains `NaN`, it will be automatically filled by 0. If you wan .. ipython:: python - df = DataFrame(rand(10, 4), columns=['a', 'b', 'c', 'd']) + df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd']) @savefig area_plot_stacked.png df.plot(kind='area'); @@ -555,7 +551,7 @@ These can be specified by ``x`` and ``y`` keywords each. .. ipython:: python - df = DataFrame(rand(50, 4), columns=['a', 'b', 'c', 'd']) + df = pd.DataFrame(np.random.rand(50, 4), columns=['a', 'b', 'c', 'd']) @savefig scatter_plot.png df.plot(kind='scatter', x='a', y='b'); @@ -626,7 +622,7 @@ too dense to plot each point individually. .. ipython:: python - df = DataFrame(randn(1000, 2), columns=['a', 'b']) + df = pd.DataFrame(np.random.randn(1000, 2), columns=['a', 'b']) df['b'] = df['b'] + np.arange(1000) @savefig hexbin_plot.png @@ -654,7 +650,7 @@ given by column ``z``. The bins are aggregated with numpy's ``max`` function. .. ipython:: python - df = DataFrame(randn(1000, 2), columns=['a', 'b']) + df = pd.DataFrame(np.random.randn(1000, 2), columns=['a', 'b']) df['b'] = df['b'] = df['b'] + np.arange(1000) df['z'] = np.random.uniform(0, 3, 1000) @@ -689,7 +685,7 @@ A ``ValueError`` will be raised if there are any negative values in your data. .. 
ipython:: python - series = Series(3 * rand(4), index=['a', 'b', 'c', 'd'], name='series') + series = pd.Series(3 * np.random.rand(4), index=['a', 'b', 'c', 'd'], name='series') @savefig series_pie_plot.png series.plot(kind='pie', figsize=(6, 6)) @@ -716,7 +712,7 @@ A legend will be drawn in each pie plots by default; specify ``legend=False`` to .. ipython:: python - df = DataFrame(3 * rand(4, 2), index=['a', 'b', 'c', 'd'], columns=['x', 'y']) + df = pd.DataFrame(3 * np.random.rand(4, 2), index=['a', 'b', 'c', 'd'], columns=['x', 'y']) @savefig df_pie_plot.png df.plot(kind='pie', subplots=True, figsize=(8, 4)) @@ -759,7 +755,7 @@ If you pass values whose sum total is less than 1.0, matplotlib draws a semicirc .. ipython:: python - series = Series([0.1] * 4, index=['a', 'b', 'c', 'd'], name='series2') + series = pd.Series([0.1] * 4, index=['a', 'b', 'c', 'd'], name='series2') @savefig series_pie_plot_semi.png series.plot(kind='pie', figsize=(6, 6)) @@ -835,7 +831,7 @@ You can create a scatter plot matrix using the .. ipython:: python from pandas.tools.plotting import scatter_matrix - df = DataFrame(randn(1000, 4), columns=['a', 'b', 'c', 'd']) + df = pd.DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd']) @savefig scatter_matrix_kde.png scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde') @@ -863,7 +859,7 @@ setting ``kind='kde'``: .. ipython:: python - ser = Series(randn(1000)) + ser = pd.Series(np.random.randn(1000)) @savefig kde_plot.png ser.plot(kind='kde') @@ -888,10 +884,9 @@ of the same class will usually be closer together and form larger structures. .. ipython:: python - from pandas import read_csv from pandas.tools.plotting import andrews_curves - data = read_csv('data/iris.data') + data = pd.read_csv('data/iris.data') plt.figure() @@ -911,10 +906,9 @@ represents one data point. Points that tend to cluster will appear closer togeth .. 
ipython:: python - from pandas import read_csv from pandas.tools.plotting import parallel_coordinates - data = read_csv('data/iris.data') + data = pd.read_csv('data/iris.data') plt.figure() @@ -946,8 +940,8 @@ implies that the underlying data are not random. plt.figure() - data = Series(0.1 * rand(1000) + - 0.9 * np.sin(np.linspace(-99 * np.pi, 99 * np.pi, num=1000))) + data = pd.Series(0.1 * np.random.rand(1000) + + 0.9 * np.sin(np.linspace(-99 * np.pi, 99 * np.pi, num=1000))) @savefig lag_plot.png lag_plot(data) @@ -981,7 +975,7 @@ confidence band. plt.figure() - data = Series(0.7 * rand(1000) + + data = pd.Series(0.7 * np.random.rand(1000) + 0.3 * np.sin(np.linspace(-9 * np.pi, 9 * np.pi, num=1000))) @savefig autocorrelation_plot.png @@ -1012,7 +1006,7 @@ are what constitutes the bootstrap plot. from pandas.tools.plotting import bootstrap_plot - data = Series(rand(1000)) + data = pd.Series(np.random.rand(1000)) @savefig bootstrap_plot.png bootstrap_plot(data, size=50, samples=500, color='grey') @@ -1042,10 +1036,9 @@ be colored differently. .. ipython:: python - from pandas import read_csv from pandas.tools.plotting import radviz - data = read_csv('data/iris.data') + data = pd.read_csv('data/iris.data') plt.figure() @@ -1095,7 +1088,7 @@ shown by default. .. ipython:: python - df = DataFrame(randn(1000, 4), index=ts.index, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list('ABCD')) df = df.cumsum() @savefig frame_plot_basic_noleg.png @@ -1119,7 +1112,7 @@ You may pass ``logy`` to get a log-scale Y axis. .. ipython:: python - ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) ts = np.exp(ts.cumsum()) @savefig series_plot_logy.png @@ -1227,8 +1220,6 @@ in ``pandas.plot_params`` can be used in a `with statement`: .. 
ipython:: python - import pandas as pd - plt.figure() @savefig ser_plot_suppress_context.png @@ -1325,10 +1316,10 @@ Another option is passing an ``ax`` argument to :meth:`Series.plot` to plot on a :suppress: np.random.seed(123456) - ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) ts = ts.cumsum() - df = DataFrame(randn(1000, 4), index=ts.index, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list('ABCD')) df = df.cumsum() .. ipython:: python @@ -1410,7 +1401,7 @@ Plotting with matplotlib table is now supported in :meth:`DataFrame.plot` and : .. ipython:: python fig, ax = plt.subplots(1, 1) - df = DataFrame(rand(5, 3), columns=['a', 'b', 'c']) + df = pd.DataFrame(np.random.rand(5, 3), columns=['a', 'b', 'c']) ax.get_xaxis().set_visible(False) # Hide Ticks @savefig line_plot_table_true.png @@ -1482,7 +1473,7 @@ To use the cubehelix colormap, we can simply pass ``'cubehelix'`` to ``colormap= .. ipython:: python - df = DataFrame(randn(1000, 10), index=ts.index) + df = pd.DataFrame(np.random.randn(1000, 10), index=ts.index) df = df.cumsum() plt.figure() @@ -1520,7 +1511,7 @@ Colormaps can also be used other plot types, like bar charts: .. ipython:: python - dd = DataFrame(randn(10, 10)).applymap(abs) + dd = pd.DataFrame(np.random.randn(10, 10)).applymap(abs) dd = dd.cumsum() plt.figure() @@ -1587,8 +1578,8 @@ when plotting a large number of points. .. ipython:: python - price = Series(randn(150).cumsum(), - index=date_range('2000-1-1', periods=150, freq='B')) + price = pd.Series(np.random.randn(150).cumsum(), + index=pd.date_range('2000-1-1', periods=150, freq='B')) ma = pd.rolling_mean(price, 20) mstd = pd.rolling_std(price, 20) @@ -1624,18 +1615,8 @@ Trellis plotting interface .. 
ipython:: python :suppress: - import numpy as np - np.random.seed(123456) - from pandas import * - options.display.max_rows=15 - import pandas.util.testing as tm - randn = np.random.randn - np.set_printoptions(precision=4, suppress=True) - import matplotlib.pyplot as plt - tips_data = read_csv('data/tips.csv') - iris_data = read_csv('data/iris.data') - from pandas import read_csv - from pandas.tools.plotting import radviz + tips_data = pd.read_csv('data/tips.csv') + iris_data = pd.read_csv('data/iris.data') plt.close('all') @@ -1646,8 +1627,7 @@ Trellis plotting interface .. code-block:: python - from pandas import read_csv - tips_data = read_csv('tips.csv') + tips_data = pd.read_csv('tips.csv') from the directory where you downloaded the file. @@ -1668,7 +1648,6 @@ In the example below, data from the tips data set is arranged by the attributes values, the resulting grid has two columns and two rows. A histogram is displayed for each cell of the grid. - .. ipython:: python plt.figure()