diff --git a/doc/source/io.rst b/doc/source/io.rst index 2b91836d5449d..372a7b8a325e7 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -5,25 +5,23 @@ .. ipython:: python :suppress: - import os import csv - from pandas.compat import StringIO, BytesIO - import pandas as pd - ExcelWriter = pd.ExcelWriter + import os + import matplotlib.pyplot as plt import numpy as np - np.random.seed(123456) + import pandas as pd + from pandas.compat import StringIO, BytesIO + + randn = np.random.randn np.set_printoptions(precision=4, suppress=True) - - import matplotlib.pyplot as plt plt.close('all') - - import pandas.util.testing as tm pd.options.display.max_rows = 15 clipdf = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': ['p', 'q', 'r']}, index=['x', 'y', 'z']) + =============================== IO Tools (Text, CSV, HDF5, ...) =============================== @@ -146,7 +144,10 @@ usecols : list-like or callable, default ``None`` .. ipython:: python - data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + data = ('col1,col2,col3\n' + 'a,b,1\n' + 'a,b,2\n' + 'c,d,3') pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3']) @@ -192,7 +193,10 @@ skiprows : list-like or integer, default ``None`` .. ipython:: python - data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + data = ('col1,col2,col3\n' + 'a,b,1\n' + 'a,b,2\n' + 'c,d,3') pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0) @@ -367,7 +371,10 @@ columns: .. ipython:: python - data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9' + data = ('a,b,c\n' + '1,2,3\n' + '4,5,6\n' + '7,8,9') print(data) df = pd.read_csv(StringIO(data), dtype=object) @@ -388,7 +395,11 @@ of :func:`~pandas.read_csv`: .. ipython:: python - data = "col_1\n1\n2\n'A'\n4.22" + data = ("col_1\n" + "1\n" + "2\n" + "'A'\n" + "4.22") df = pd.read_csv(StringIO(data), converters={'col_1': str}) df df['col_1'].apply(type).value_counts() @@ -427,7 +438,8 @@ worth trying. .. ipython:: python :okwarning: - df = pd.DataFrame({'col_1': list(range(500000)) + ['a', 'b'] + list(range(500000))}) + col_1 = list(range(500000)) + ['a', 'b'] + list(range(500000)) + df = pd.DataFrame({'col_1': col_1}) df.to_csv('foo.csv') mixed_df = pd.read_csv('foo.csv') mixed_df['col_1'].apply(type).value_counts() @@ -455,7 +467,10 @@ Specifying Categorical dtype .. ipython:: python - data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + data = ('col1,col2,col3\n' + 'a,b,1\n' + 'a,b,2\n' + 'c,d,3') pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data)).dtypes @@ -479,7 +494,6 @@ that column's ``dtype``. .. ipython:: python from pandas.api.types import CategoricalDtype - dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True) pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes @@ -525,7 +539,10 @@ used as the column names: .. ipython:: python - data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9' + data = ('a,b,c\n' + '1,2,3\n' + '4,5,6\n' + '7,8,9') print(data) pd.read_csv(StringIO(data)) @@ -544,7 +561,11 @@ If the header is in a row other than the first, pass the row number to .. ipython:: python - data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9' + data = ('skip this skip it\n' + 'a,b,c\n' + '1,2,3\n' + '4,5,6\n' + '7,8,9') pd.read_csv(StringIO(data), header=1) .. note:: @@ -565,7 +586,9 @@ distinguish between them so as to prevent overwriting data: .. 
ipython :: python - data = 'a,b,a\n0,1,2\n3,4,5' + data = ('a,b,a\n' + '0,1,2\n' + '3,4,5') pd.read_csv(StringIO(data)) There is no more duplicate data because ``mangle_dupe_cols=True`` by default, @@ -633,7 +656,13 @@ be ignored. By default, completely blank lines will be ignored as well. .. ipython:: python - data = '\na,b,c\n \n# commented line\n1,2,3\n\n4,5,6' + data = ('\n' + 'a,b,c\n' + ' \n' + '# commented line\n' + '1,2,3\n' + '\n' + '4,5,6') print(data) pd.read_csv(StringIO(data), comment='#') @@ -641,7 +670,12 @@ If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines: .. ipython:: python - data = 'a,b,c\n\n1,2,3\n\n\n4,5,6' + data = ('a,b,c\n' + '\n' + '1,2,3\n' + '\n' + '\n' + '4,5,6') pd.read_csv(StringIO(data), skip_blank_lines=False) .. warning:: @@ -652,20 +686,32 @@ If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines: .. ipython:: python - data = '#comment\na,b,c\nA,B,C\n1,2,3' + data = ('#comment\n' + 'a,b,c\n' + 'A,B,C\n' + '1,2,3') pd.read_csv(StringIO(data), comment='#', header=1) - data = 'A,B,C\n#comment\na,b,c\n1,2,3' + data = ('A,B,C\n' + '#comment\n' + 'a,b,c\n' + '1,2,3') pd.read_csv(StringIO(data), comment='#', skiprows=2) If both ``header`` and ``skiprows`` are specified, ``header`` will be relative to the end of ``skiprows``. For example: - .. ipython:: python +.. ipython:: python - data = ('# empty\n# second empty line\n# third empty' - 'line\nX,Y,Z\n1,2,3\nA,B,C\n1,2.,4.\n5.,NaN,10.0') - print(data) - pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1) + data = ('# empty\n' + '# second empty line\n' + '# third emptyline\n' + 'X,Y,Z\n' + '1,2,3\n' + 'A,B,C\n' + '1,2.,4.\n' + '5.,NaN,10.0\n') + print(data) + pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1) .. _io.comments: @@ -677,10 +723,10 @@ Sometimes comments or meta data may be included in a file: .. ipython:: python :suppress: - data = ("ID,level,category\n" - "Patient1,123000,x # really unpleasant\n" - "Patient2,23000,y # wouldn't take his medicine\n" - "Patient3,1234018,z # awesome") + data = ("ID,level,category\n" + "Patient1,123000,x # really unpleasant\n" + "Patient2,23000,y # wouldn't take his medicine\n" + "Patient3,1234018,z # awesome") with open('tmp.csv', 'w') as fh: fh.write(data) @@ -718,7 +764,10 @@ result in byte strings being decoded to unicode in the result: .. ipython:: python - data = b'word,length\nTr\xc3\xa4umen,7\nGr\xc3\xbc\xc3\x9fe,5'.decode('utf8').encode('latin-1') + data = (b'word,length\n' + b'Tr\xc3\xa4umen,7\n' + b'Gr\xc3\xbc\xc3\x9fe,5') + data = data.decode('utf8').encode('latin-1') df = pd.read_csv(BytesIO(data), encoding='latin-1') df df['word'][1] @@ -738,12 +787,16 @@ first column will be used as the ``DataFrame``'s row names: .. ipython:: python - data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10' + data = ('a,b,c\n' + '4,apple,bat,5.7\n' + '8,orange,cow,10') pd.read_csv(StringIO(data)) .. ipython:: python - data = 'index,a,b,c\n4,apple,bat,5.7\n8,orange,cow,10' + data = ('index,a,b,c\n' + '4,apple,bat,5.7\n' + '8,orange,cow,10') pd.read_csv(StringIO(data), index_col=0) Ordinarily, you can achieve this behavior using the ``index_col`` option. @@ -754,7 +807,9 @@ index column inference and discard the last column, pass ``index_col=False``: .. 
ipython:: python - data = 'a,b,c\n4,apple,bat,\n8,orange,cow,' + data = ('a,b,c\n' + '4,apple,bat,\n' + '8,orange,cow,') print(data) pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), index_col=False) @@ -764,7 +819,9 @@ If a subset of data is being parsed using the ``usecols`` option, the .. ipython:: python - data = 'a,b,c\n4,apple,bat,\n8,orange,cow,' + data = ('a,b,c\n' + '4,apple,bat,\n' + '8,orange,cow,') print(data) pd.read_csv(StringIO(data), usecols=['b', 'c']) pd.read_csv(StringIO(data), usecols=['b', 'c'], index_col=0) @@ -812,12 +869,12 @@ column names: .. ipython:: python :suppress: - data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900") + data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900") with open('tmp.csv', 'w') as fh: fh.write(data) @@ -895,9 +952,8 @@ take full advantage of the flexibility of the date parsing API: .. ipython:: python - import pandas.io.date_converters as conv df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec, - date_parser=conv.parse_date_time) + date_parser=pd.io.date_converters.parse_date_time) df Pandas will try to call the ``date_parser`` function in three different ways. If @@ -990,9 +1046,12 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided: .. ipython:: python :suppress: - data = "date,value,cat\n1/6/2000,5,a\n2/6/2000,10,b\n3/6/2000,15,c" + data = ("date,value,cat\n" + "1/6/2000,5,a\n" + "2/6/2000,10,b\n" + "3/6/2000,15,c") with open('tmp.csv', 'w') as fh: - fh.write(data) + fh.write(data) .. ipython:: python @@ -1016,9 +1075,12 @@ writing to a file). For example: val = '0.3066101993807095471566981359501369297504425048828125' data = 'a,b,c\n1,2,{0}'.format(val) - abs(pd.read_csv(StringIO(data), engine='c', float_precision=None)['c'][0] - float(val)) - abs(pd.read_csv(StringIO(data), engine='c', float_precision='high')['c'][0] - float(val)) - abs(pd.read_csv(StringIO(data), engine='c', float_precision='round_trip')['c'][0] - float(val)) + abs(pd.read_csv(StringIO(data), engine='c', + float_precision=None)['c'][0] - float(val)) + abs(pd.read_csv(StringIO(data), engine='c', + float_precision='high')['c'][0] - float(val)) + abs(pd.read_csv(StringIO(data), engine='c', + float_precision='round_trip')['c'][0] - float(val)) .. _io.thousands: @@ -1033,10 +1095,10 @@ correctly: .. ipython:: python :suppress: - data = ("ID|level|category\n" - "Patient1|123,000|x\n" - "Patient2|23,000|y\n" - "Patient3|1,234,018|z") + data = ("ID|level|category\n" + "Patient1|123,000|x\n" + "Patient2|23,000|y\n" + "Patient3|1,234,018|z") with open('tmp.csv', 'w') as fh: fh.write(data) @@ -1132,10 +1194,10 @@ as a ``Series``: .. 
ipython:: python :suppress: - data = ("level\n" - "Patient1,123000\n" - "Patient2,23000\n" - "Patient3,1234018") + data = ("level\n" + "Patient1,123000\n" + "Patient2,23000\n" + "Patient3,1234018") with open('tmp.csv', 'w') as fh: fh.write(data) @@ -1144,7 +1206,7 @@ as a ``Series``: print(open('tmp.csv').read()) - output = pd.read_csv('tmp.csv', squeeze=True) + output = pd.read_csv('tmp.csv', squeeze=True) output type(output) @@ -1166,7 +1228,9 @@ options as follows: .. ipython:: python - data= 'a,b,c\n1,Yes,2\n3,No,4' + data = ('a,b,c\n' + '1,Yes,2\n' + '3,No,4') print(data) pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No']) @@ -1183,11 +1247,17 @@ too many fields will raise an error by default: .. ipython:: python :suppress: - data = 'a,b,c\n1,2,3\n4,5,6,7\n8,9,10' + data = ('a,b,c\n' + '1,2,3\n' + '4,5,6,7\n' + '8,9,10') .. code-block:: ipython - In [27]: data = 'a,b,c\n1,2,3\n4,5,6,7\n8,9,10' + In [27]: data = ('a,b,c\n' + '1,2,3\n' + '4,5,6,7\n' + '8,9,10') In [28]: pd.read_csv(StringIO(data)) --------------------------------------------------------------------------- @@ -1437,7 +1507,7 @@ returned object: .. ipython:: python - df = pd.read_csv("data/mindex_ex.csv", index_col=[0,1]) + df = pd.read_csv("data/mindex_ex.csv", index_col=[0, 1]) df df.loc[1978] @@ -1480,7 +1550,6 @@ with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index wi .. ipython:: python :suppress: - import os os.remove('mi.csv') os.remove('mi2.csv') @@ -1966,9 +2035,8 @@ Preserve string indices: .. ipython:: python - si = pd.DataFrame(np.zeros((4, 4)), - columns=list(range(4)), - index=[str(i) for i in range(4)]) + si = pd.DataFrame(np.zeros((4, 4)), columns=list(range(4)), + index=[str(i) for i in range(4)]) si si.index si.columns @@ -2020,11 +2088,11 @@ data: .. ipython:: python - timeit pd.read_json(jsonfloats) + %timeit pd.read_json(jsonfloats) .. ipython:: python - timeit pd.read_json(jsonfloats, numpy=True) + %timeit pd.read_json(jsonfloats, numpy=True) The speedup is less noticeable for smaller datasets: @@ -2034,11 +2102,11 @@ The speedup is less noticeable for smaller datasets: .. ipython:: python - timeit pd.read_json(jsonfloats) + %timeit pd.read_json(jsonfloats) .. ipython:: python - timeit pd.read_json(jsonfloats, numpy=True) + %timeit pd.read_json(jsonfloats, numpy=True) .. warning:: @@ -2059,7 +2127,6 @@ The speedup is less noticeable for smaller datasets: .. ipython:: python :suppress: - import os os.remove('test.json') .. _io.json_normalize: @@ -2081,20 +2148,16 @@ into a flat table. .. ipython:: python data = [{'state': 'Florida', - 'shortname': 'FL', - 'info': { - 'governor': 'Rick Scott' - }, - 'counties': [{'name': 'Dade', 'population': 12345}, + 'shortname': 'FL', + 'info': {'governor': 'Rick Scott'}, + 'counties': [{'name': 'Dade', 'population': 12345}, {'name': 'Broward', 'population': 40000}, {'name': 'Palm Beach', 'population': 60000}]}, - {'state': 'Ohio', - 'shortname': 'OH', - 'info': { - 'governor': 'John Kasich' - }, - 'counties': [{'name': 'Summit', 'population': 1234}, - {'name': 'Cuyahoga', 'population': 1337}]}] + {'state': 'Ohio', + 'shortname': 'OH', + 'info': {'governor': 'John Kasich'}, + 'counties': [{'name': 'Summit', 'population': 1234}, + {'name': 'Cuyahoga', 'population': 1337}]}] json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']]) @@ -2142,11 +2205,10 @@ a JSON string with two fields, ``schema`` and ``data``. .. 
ipython:: python - df = pd.DataFrame( - {'A': [1, 2, 3], - 'B': ['a', 'b', 'c'], - 'C': pd.date_range('2016-01-01', freq='d', periods=3), - }, index=pd.Index(range(3), name='idx')) + df = pd.DataFrame({'A': [1, 2, 3], + 'B': ['a', 'b', 'c'], + 'C': pd.date_range('2016-01-01', freq='d', periods=3)}, + index=pd.Index(range(3), name='idx')) df df.to_json(orient='table', date_format="iso") @@ -2322,7 +2384,6 @@ as a string: .. ipython:: python :suppress: - import os file_path = os.path.abspath(os.path.join('source', '_static', 'banklist.html')) .. ipython:: python @@ -2820,8 +2881,8 @@ For example, to read in a ``MultiIndex`` index without names: .. ipython:: python - df = pd.DataFrame({'a':[1, 2, 3, 4], 'b':[5, 6, 7, 8]}, - index=pd.MultiIndex.from_product([['a', 'b'],['c', 'd']])) + df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}, + index=pd.MultiIndex.from_product([['a', 'b'], ['c', 'd']])) df.to_excel('path_to_file.xlsx') df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1]) df @@ -2842,7 +2903,8 @@ should be passed to ``index_col`` and ``header``: .. ipython:: python - df.columns = pd.MultiIndex.from_product([['a'], ['b', 'd']], names=['c1', 'c2']) + df.columns = pd.MultiIndex.from_product([['a'], ['b', 'd']], + names=['c1', 'c2']) df.to_excel('path_to_file.xlsx') df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1], header=[0, 1]) df @@ -2850,7 +2912,6 @@ should be passed to ``index_col`` and ``header``: .. ipython:: python :suppress: - import os os.remove('path_to_file.xlsx') @@ -2997,7 +3058,7 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`. .. code-block:: python - with ExcelWriter('path_to_file.xlsx') as writer: + with pd.ExcelWriter('path_to_file.xlsx') as writer: df1.to_excel(writer, sheet_name='Sheet1') df2.to_excel(writer, sheet_name='Sheet2') @@ -3029,7 +3090,7 @@ Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` bio = BytesIO() # By setting the 'engine' in the ExcelWriter constructor. - writer = ExcelWriter(bio, engine='xlsxwriter') + writer = pd.ExcelWriter(bio, engine='xlsxwriter') df.to_excel(writer, sheet_name='Sheet1') # Save the workbook @@ -3082,7 +3143,7 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are: df.to_excel('path_to_file.xlsx', sheet_name='Sheet1', engine='xlsxwriter') # By setting the 'engine' in the ExcelWriter constructor. - writer = ExcelWriter('path_to_file.xlsx', engine='xlsxwriter') + writer = pd.ExcelWriter('path_to_file.xlsx', engine='xlsxwriter') # Or via pandas configuration. from pandas import options # noqa: E402 @@ -3172,7 +3233,6 @@ any pickled pandas object (or any other pickled object) from file: .. ipython:: python :suppress: - import os os.remove('foo.pkl') .. warning:: @@ -3249,7 +3309,6 @@ The default is to 'infer': .. ipython:: python :suppress: - import os os.remove("data.pkl.compress") os.remove("data.pkl.xz") os.remove("data.pkl.gz") @@ -3306,7 +3365,7 @@ pandas objects. .. ipython:: python - pd.to_msgpack('foo2.msg', {'dict': [{ 'df': df }, {'string': 'foo'}, + pd.to_msgpack('foo2.msg', {'dict': [{'df': df}, {'string': 'foo'}, {'scalar': 1.}, {'s': s}]}) pd.read_msgpack('foo2.msg') @@ -3365,7 +3424,6 @@ dict: .. 
ipython:: python - np.random.seed(1234) index = pd.date_range('1/1/2000', periods=8) s = pd.Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) df = pd.DataFrame(randn(8, 3), index=index, @@ -3421,7 +3479,6 @@ Closing a Store and using a context manager: :suppress: store.close() - import os os.remove('store.h5') @@ -3434,8 +3491,8 @@ similar to how ``read_csv`` and ``to_csv`` work. .. ipython:: python - df_tl = pd.DataFrame(dict(A=list(range(5)), B=list(range(5)))) - df_tl.to_hdf('store_tl.h5','table', append=True) + df_tl = pd.DataFrame({'A': list(range(5)), 'B': list(range(5))}) + df_tl.to_hdf('store_tl.h5', 'table', append=True) pd.read_hdf('store_tl.h5', 'table', where=['index>2']) .. ipython:: python @@ -3447,10 +3504,6 @@ similar to how ``read_csv`` and ``to_csv`` work. HDFStore will by default not drop rows that are all missing. This behavior can be changed by setting ``dropna=True``. -.. ipython:: python - :suppress: - - import os .. ipython:: python @@ -3459,12 +3512,12 @@ HDFStore will by default not drop rows that are all missing. This behavior can b df_with_missing df_with_missing.to_hdf('file.h5', 'df_with_missing', - format='table', mode='w') + format='table', mode='w') pd.read_hdf('file.h5', 'df_with_missing') df_with_missing.to_hdf('file.h5', 'df_with_missing', - format='table', mode='w', dropna=True) + format='table', mode='w', dropna=True) pd.read_hdf('file.h5', 'df_with_missing') @@ -3478,13 +3531,13 @@ This is also true for the major axis of a ``Panel``: .. ipython:: python matrix = [[[np.nan, np.nan, np.nan], [1, np.nan, np.nan]], - [[np.nan, np.nan, np.nan], [np.nan, 5, 6]], - [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]] + [[np.nan, np.nan, np.nan], [np.nan, 5, 6]], + [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]] - panel_with_major_axis_all_missing=pd.Panel(matrix, - items=['Item1', 'Item2', 'Item3'], - major_axis=[1, 2], - minor_axis=['A', 'B', 'C']) + panel_with_major_axis_all_missing = pd.Panel(matrix, + items=['Item1', 'Item2', 'Item3'], + major_axis=[1, 2], + minor_axis=['A', 'B', 'C']) panel_with_major_axis_all_missing @@ -3585,7 +3638,7 @@ everything in the sub-store and **below**, so be *careful*. store.put('foo/bar/bah', df) store.append('food/orange', df) - store.append('food/apple', df) + store.append('food/apple', df) store # a list of keys are returned @@ -3660,14 +3713,15 @@ defaults to `nan`. df_mixed = pd.DataFrame({'A': randn(8), 'B': randn(8), 'C': np.array(randn(8), dtype='float32'), - 'string':'string', + 'string': 'string', 'int': 1, 'bool': True, 'datetime64': pd.Timestamp('20010102')}, index=list(range(8))) - df_mixed.loc[df_mixed.index[3:5], ['A', 'B', 'string', 'datetime64']] = np.nan + df_mixed.loc[df_mixed.index[3:5], + ['A', 'B', 'string', 'datetime64']] = np.nan - store.append('df_mixed', df_mixed, min_itemsize = {'values': 50}) + store.append('df_mixed', df_mixed, min_itemsize={'values': 50}) df_mixed1 = store.select('df_mixed') df_mixed1 df_mixed1.get_dtype_counts() @@ -3820,7 +3874,8 @@ Works with a Panel as well. store.append('wp', wp) store - store.select('wp', "major_axis>pd.Timestamp('20000102') & minor_axis=['A', 'B']") + store.select('wp', + "major_axis>pd.Timestamp('20000102') & minor_axis=['A', 'B']") The ``columns`` keyword can be supplied to select a list of columns to be returned, this is equivalent to passing a @@ -3863,7 +3918,10 @@ specified in the format: ``()``, where float may be signed (and fra .. 
ipython:: python from datetime import timedelta - dftd = pd.DataFrame(dict(A = pd.Timestamp('20130101'), B = [ pd.Timestamp('20130101') + timedelta(days=i, seconds=10) for i in range(10) ])) + dftd = pd.DataFrame({'A': pd.Timestamp('20130101'), + 'B': [pd.Timestamp('20130101') + timedelta(days=i, + seconds=10) + for i in range(10)]}) dftd['C'] = dftd['A'] - dftd['B'] dftd store.append('dftd', dftd, data_columns=True) @@ -3940,14 +3998,14 @@ be ``data_columns``. df_dc = df.copy() df_dc['string'] = 'foo' - df_dc.loc[df_dc.index[4: 6], 'string'] = np.nan - df_dc.loc[df_dc.index[7: 9], 'string'] = 'bar' + df_dc.loc[df_dc.index[4:6], 'string'] = np.nan + df_dc.loc[df_dc.index[7:9], 'string'] = 'bar' df_dc['string2'] = 'cool' - df_dc.loc[df_dc.index[1: 3], ['B', 'C']] = 1.0 + df_dc.loc[df_dc.index[1:3], ['B', 'C']] = 1.0 df_dc # on-disk operations - store.append('df_dc', df_dc, data_columns = ['B', 'C', 'string', 'string2']) + store.append('df_dc', df_dc, data_columns=['B', 'C', 'string', 'string2']) store.select('df_dc', where='B > 0') # getting creative @@ -3976,7 +4034,7 @@ The default is 50,000 rows returned in a chunk. .. ipython:: python for df in store.select('df', chunksize=3): - print(df) + print(df) .. note:: @@ -4003,12 +4061,12 @@ chunks. store.append('dfeq', dfeq, data_columns=['number']) def chunks(l, n): - return [l[i: i+n] for i in range(0, len(l), n)] + return [l[i:i + n] for i in range(0, len(l), n)] evens = [2, 4, 6, 8, 10] coordinates = store.select_as_coordinates('dfeq', 'number=evens') for c in chunks(coordinates, 2): - print(store.select('dfeq', where=c)) + print(store.select('dfeq', where=c)) Advanced Queries ++++++++++++++++ @@ -4105,13 +4163,13 @@ results. .. ipython:: python df_mt = pd.DataFrame(randn(8, 6), index=pd.date_range('1/1/2000', periods=8), - columns=['A', 'B', 'C', 'D', 'E', 'F']) + columns=['A', 'B', 'C', 'D', 'E', 'F']) df_mt['foo'] = 'bar' df_mt.loc[df_mt.index[1], ('A', 'B')] = np.nan # you can also create the tables individually - store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None }, - df_mt, selector='df1_mt') + store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None}, + df_mt, selector='df1_mt') store # individual tables were created @@ -4120,7 +4178,7 @@ results. # as a multiple store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'], - selector = 'df1_mt') + selector='df1_mt') Delete from a Table @@ -4159,7 +4217,7 @@ the table using a ``where`` that selects all but the missing data. .. ipython:: python # returns the number of rows deleted - store.remove('wp', 'major_axis > 20000102' ) + store.remove('wp', 'major_axis > 20000102') store.select('wp') .. warning:: @@ -4332,7 +4390,7 @@ stored in a more efficient manner. .. ipython:: python dfcat = pd.DataFrame({'A': pd.Series(list('aabbcdba')).astype('category'), - 'B': np.random.randn(8) }) + 'B': np.random.randn(8)}) dfcat dfcat.dtypes cstore = pd.HDFStore('cats.h5', mode='w') @@ -4346,7 +4404,6 @@ stored in a more efficient manner. :okexcept: cstore.close() - import os os.remove('cats.h5') @@ -4374,7 +4431,7 @@ Passing a ``min_itemsize`` dict will cause all passed columns to be created as * .. ipython:: python - dfs = pd.DataFrame(dict(A='foo', B='bar'), index=list(range(5))) + dfs = pd.DataFrame({'A': 'foo', 'B': 'bar'}, index=list(range(5))) dfs # A and B have a size of 30 @@ -4393,7 +4450,7 @@ You could inadvertently turn an actual ``nan`` value into a missing value. .. 
ipython:: python - dfss = pd.DataFrame(dict(A=['foo', 'bar', 'nan'])) + dfss = pd.DataFrame({'A': ['foo', 'bar', 'nan']}) dfss store.append('dfss', dfss) @@ -4420,11 +4477,10 @@ It is possible to write an ``HDFStore`` object that can easily be imported into .. ipython:: python - np.random.seed(1) df_for_r = pd.DataFrame({"first": np.random.rand(100), "second": np.random.rand(100), "class": np.random.randint(0, 2, (100, ))}, - index=range(100)) + index=range(100)) df_for_r.head() store_export = pd.HDFStore('export.h5') @@ -4435,7 +4491,6 @@ It is possible to write an ``HDFStore`` object that can easily be imported into :suppress: store_export.close() - import os os.remove('export.h5') In R this file can be read into a ``data.frame`` object using the ``rhdf5`` @@ -4523,7 +4578,6 @@ Performance :suppress: store.close() - import os os.remove('store.h5') @@ -4589,7 +4643,6 @@ Read from a feather file. .. ipython:: python :suppress: - import os os.remove('example.feather') @@ -4673,7 +4726,6 @@ Read only certain columns of a parquet file. .. ipython:: python :suppress: - import os os.remove('example_pa.parquet') os.remove('example_fp.parquet') @@ -4722,7 +4774,8 @@ Parquet supports partitioning of data based on the values of one or more columns .. ipython:: python df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1]}) - df.to_parquet(fname='test', engine='pyarrow', partition_cols=['a'], compression=None) + df.to_parquet(fname='test', engine='pyarrow', + partition_cols=['a'], compression=None) The `fname` specifies the parent directory to which data will be saved. The `partition_cols` are the column names by which the dataset will be partitioned. @@ -4835,14 +4888,15 @@ the database using :func:`~pandas.DataFrame.to_sql`. import datetime c = ['id', 'Date', 'Col_1', 'Col_2', 'Col_3'] - d = [(26, datetime.datetime(2010,10,18), 'X', 27.5, True), - (42, datetime.datetime(2010,10,19), 'Y', -12.5, False), - (63, datetime.datetime(2010,10,20), 'Z', 5.73, True)] + d = [(26, datetime.datetime(2010, 10, 18), 'X', 27.5, True), + (42, datetime.datetime(2010, 10, 19), 'Y', -12.5, False), + (63, datetime.datetime(2010, 10, 20), 'Z', 5.73, True)] - data = pd.DataFrame(d, columns=c) + data = pd.DataFrame(d, columns=c) .. ipython:: python + data data.to_sql('data', engine) With some databases, writing large DataFrames can result in errors due to @@ -4999,7 +5053,8 @@ Specifying this will return an iterator through chunks of the query result: .. ipython:: python - for chunk in pd.read_sql_query("SELECT * FROM data_chunks", engine, chunksize=5): + for chunk in pd.read_sql_query("SELECT * FROM data_chunks", + engine, chunksize=5): print(chunk) You can also run a plain query without creating a ``DataFrame`` with @@ -5064,12 +5119,12 @@ If you have an SQLAlchemy description of your database you can express where con metadata = sa.MetaData() data_table = sa.Table('data', metadata, - sa.Column('index', sa.Integer), - sa.Column('Date', sa.DateTime), - sa.Column('Col_1', sa.String), - sa.Column('Col_2', sa.Float), - sa.Column('Col_3', sa.Boolean), - ) + sa.Column('index', sa.Integer), + sa.Column('Date', sa.DateTime), + sa.Column('Col_1', sa.String), + sa.Column('Col_2', sa.Float), + sa.Column('Col_3', sa.Boolean), + ) pd.read_sql(sa.select([data_table]).where(data_table.c.Col_3 == True), engine) @@ -5239,7 +5294,6 @@ values will have ``object`` data type. .. ipython:: python :suppress: - import os os.remove('stata.dta') .. _io.stata-categorical: @@ -5452,9 +5506,6 @@ And here's the code: .. 
code-block:: python

-   import os
-   import pandas as pd
-   import sqlite3
    from numpy.random import randn

    sz = 1000000