|
2 | 2 |
|
3 | 3 | .. currentmodule:: pandas
|
4 | 4 |
|
| 5 | +.. ipython:: python |
| 6 | + :suppress: |
| 7 | +
|
| 8 | + import numpy as np |
| 9 | + np.random.seed(123456) |
| 10 | + from pandas import * |
| 11 | + from StringIO import StringIO |
| 12 | + import pandas.util.testing as tm |
| 13 | + randn = np.random.randn |
| 14 | + np.set_printoptions(precision=4, suppress=True) |
| 15 | + import matplotlib.pyplot as plt |
| 16 | + plt.close('all') |
| 17 | +
|
5 | 18 | *******************************
|
6 | 19 | IO Tools (Text, CSV, HDF5, ...)
|
7 | 20 | *******************************
|
8 | 21 |
|
| 22 | +Text files |
| 23 | +---------- |
| 24 | + |
| 25 | +The two workhorse functions for reading text (a.k.a. flat) files are |
| 26 | +``read_csv`` and ``read_table``. They both utilize the same parsing code for |
| 27 | +intelligently converting tabular data into a DataFrame object. They take a |
| 28 | +number of different arguments: |
| 29 | + |
| 30 | + - ``path_or_buffer``: Either a string path to a file or any object (such as |
| 31 | + an open ``file`` or ``StringIO``) with a ``read`` method. |
| 32 | + - ``delimiter``: For ``read_table`` only, a regular expression to split |
| 33 | + fields on. ``read_csv`` uses the ``csv`` module to do this and hence only |
| 34 | + supports comma-separated values. |
| 35 | + - ``skiprows``: Rows in the file to skip |
| 36 | + - ``header``: row number to use as the column names, defaults to 0 (first row) |
| 37 | + - ``index_col``: integer, defaulting to 0 (the first column), instructing the |
| 38 | + parser to use a particular column as the ``index`` (row labels) of the |
| 39 | + resulting DataFrame |
| 40 | + - ``na_values``: optional list of strings to recognize as NA/NaN |
| 41 | + - ``date_parser``: function to use to parse strings into datetime |
| 42 | + objects. Defaults to the very robust ``dateutil.parser`` |
| 43 | + - ``names``: optional list of column names for the data. Otherwise will be |
| 44 | + read from the file |
| 45 | + |
| 46 | +.. code-block:: ipython |
| 47 | +
|
| 48 | + In [2]: print open('foo.csv').read() |
| 49 | + A,B,C |
| 50 | + 20090101,a,1,2 |
| 51 | + 20090102,b,3,4 |
| 52 | + 20090103,c,4,5 |
| 53 | +
|
| 54 | + In [3]: df = read_csv('foo.csv') |
| 55 | +
|
| 56 | + In [4]: df |
| 57 | + Out[4]: |
| 58 | + A B C |
| 59 | + 2009-01-01 a 1 2 |
| 60 | + 2009-01-02 b 3 4 |
| 61 | + 2009-01-03 c 4 5 |
| 62 | +
|
| 63 | + # dates parsed to datetime |
| 64 | + In [16]: df.index |
| 65 | + Out[16]: Index([2009-01-01 00:00:00, 2009-01-02 00:00:00, |
| 66 | + 2009-01-03 00:00:00], dtype=object) |
| 67 | +
|
| 68 | +If ``index_col=None``, the index will be a generic ``0...nrows-1``: |
| 69 | + |
| 70 | +.. code-block:: ipython |
| 71 | +
|
| 72 | + In [1]: print open('foo.csv').read() |
| 73 | + index,A,B,C |
| 74 | + 20090101,a,1,2 |
| 75 | + 20090102,b,3,4 |
| 76 | + 20090103,c,4,5 |
| 77 | +
|
| 78 | + In [2]: read_csv('foo.csv') |
| 79 | + Out[2]: |
| 80 | + A B C |
| 81 | + 2009-01-01 a 1 2 |
| 82 | + 2009-01-02 b 3 4 |
| 83 | + 2009-01-03 c 4 5 |
| 84 | +
|
| 85 | +
|
| 86 | + In [3]: read_csv('foo.csv', index_col=None) |
| 87 | + Out[3]: |
| 88 | + index A B C |
| 89 | + 0 20090101 a 1 2 |
| 90 | + 1 20090102 b 3 4 |
| 91 | + 2 20090103 c 4 5 |
| 92 | +
|
| 93 | +
|
| 94 | +The parsers make every attempt to "do the right thing" and not be very |
| 95 | +fragile. Type inference is a big part of this: if a column can be coerced to |
| 96 | +integer dtype without altering the contents, the parser will do so. Any non-numeric |
| 97 | +columns will come through as object dtype, as with the rest of pandas objects. |
| 98 | + |
| 99 | +Excel 2003 files |
| 100 | +---------------- |
| 101 | + |
| 102 | +The ``ExcelFile`` class can read an Excel 2003 file using the ``xlrd`` Python |
| 103 | +module and use the same parsing code as the above to convert tabular data into |
| 104 | +a DataFrame. To use it, create the ``ExcelFile`` object: |
| 105 | + |
| 106 | +.. code-block:: python |
| 107 | +
|
| 108 | + xls = ExcelFile('path_to_file.xls') |
| 109 | +
|
| 110 | +Then call the ``parse`` instance method with a sheet name, passing the same |
| 111 | +additional arguments accepted by the parsers above: |
| 112 | + |
| 113 | +.. code-block:: python |
| 114 | +
|
| 115 | + xls.parse('Sheet1', index_col=None, na_values=['NA']) |
| 116 | +
|
9 | 117 | HDF5 (PyTables)
|
10 | 118 | ---------------
|
11 | 119 |
|
12 |
| -.. .. autosummary:: |
13 |
| -.. :toctree: generated/ |
| 120 | +``HDFStore`` is a dict-like object which reads and writes pandas objects in |
| 121 | +the high-performance HDF5 format using the excellent `PyTables |
| 122 | +<http://www.pytables.org/>`__ library. |
14 | 123 |
|
15 |
| -.. HDFStore |
| 124 | +.. ipython:: python |
| 125 | + :suppress: |
16 | 126 |
|
17 |
| -Text files |
18 |
| ----------- |
| 127 | + import os |
| 128 | + os.remove('store.h5') |
| 129 | +
|
| 130 | +.. ipython:: python |
| 131 | +
|
| 132 | + store = HDFStore('store.h5') |
| 133 | + print store |
| 134 | +
|
| 135 | +Objects can be written to the file just like adding key-value pairs to a dict: |
| 136 | + |
| 137 | +.. ipython:: python |
| 138 | +
|
| 139 | + index = DateRange('1/1/2000', periods=8) |
| 140 | + s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) |
| 141 | + df = DataFrame(randn(8, 3), index=index, |
| 142 | + columns=['A', 'B', 'C']) |
| 143 | + wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], |
| 144 | + major_axis=DateRange('1/1/2000', periods=5), |
| 145 | + minor_axis=['A', 'B', 'C', 'D']) |
| 146 | +
|
| 147 | + store['s'] = s |
| 148 | + store['df'] = df |
| 149 | + store['wp'] = wp |
| 150 | + store |
| 151 | +
|
| 152 | +In a current or later Python session, you can retrieve stored objects: |
| 153 | + |
| 154 | +.. ipython:: python |
| 155 | +
|
| 156 | + store['df'] |
| 157 | +
|
| 158 | +Storing in Table format |
| 159 | +~~~~~~~~~~~~~~~~~~~~~~~ |
| 160 | + |
| 161 | +Querying objects stored in Table format |
| 162 | +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 163 | + |
| 164 | +.. ipython:: python |
| 165 | + :suppress: |
19 | 166 |
|
20 |
| -.. .. automodule:: pandas.io.parsers |
21 |
| -.. :members: |
| 167 | + store.close() |
| 168 | + import os |
| 169 | + os.remove('store.h5') |
0 commit comments