Skip to content

Commit c64555b

Browse files
committed
TST: more test cases
ENH: catching some invalid option combinations BUG: fix as_recarray DOC: io.rst updated
1 parent cc93d61 commit c64555b

File tree

4 files changed

+74
-4
lines changed

4 files changed

+74
-4
lines changed

doc/source/io.rst

+19-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,10 @@ They can take a number of arguments:
5757
specified, data types will be inferred.
5858
- ``header``: row number to use as the column names, and the start of the
5959
data. Defaults to 0 if no ``names`` passed, otherwise ``None``. Explicitly
60-
pass ``header=0`` to be able to replace existing names.
60+
pass ``header=0`` to be able to replace existing names. The header can be
61+
a list of integers that specify row locations for a multi-index on the columns
62+
E.g. [0,1,3]. Intervening rows that are not specified will be skipped.
63+
(e.g. row 2 in this example is skipped).
6164
- ``skiprows``: A collection of numbers for rows in the file to skip. Can
6265
also be an integer to skip the first ``n`` rows
6366
- ``index_col``: column number, column name, or list of column numbers/names,
@@ -253,6 +256,21 @@ If the header is in a row other than the first, pass the row number to
253256
data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9'
254257
pd.read_csv(StringIO(data), header=1)
255258
259+
.. _io.multi_index_columns:
260+
261+
Specifying multi-index columns
262+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
263+
264+
By specifying a list of row locations for the ``header`` argument, you
265+
can read in a multi-index for the columns. Specifying non-consecutive
266+
rows will skip the intervening rows. The ``index_col`` must also be
267+
specified.
268+
269+
.. ipython:: python
270+
271+
data = 'C0,C_l0_g0,C_l0_g1\nC1,C_l1_g0,C_l1_g1\nR0,,\nR_l0_g0,R0C0,R0C1\nR_l0_g1,R1C0,R1C1\nR_l0_g2,R2C0,R2C1\n'
272+
pd.read_csv(StringIO(data), header=[0,1], index_col=[0])
273+
256274
.. _io.usecols:
257275

258276
Filtering columns (``usecols``)

pandas/io/parsers.py

+19-3
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,11 @@ class DateConversionError(Exception):
5252
dialect : string or csv.Dialect instance, default None
5353
If None defaults to Excel dialect. Ignored if sep longer than 1 char
5454
See csv.Dialect documentation for more details
55-
header : int, default 0 if names parameter not specified, otherwise None
55+
header : int, default 0 if names parameter not specified, otherwise None
5656
Row to use for the column labels of the parsed DataFrame. Specify None if
57-
there is no header row.
57+
there is no header row. Can be a list of integers that specify row
58+
locations for a multi-index on the columns E.g. [0,1,3]. Intervening
59+
rows that are not specified will be skipped (e.g. row 2 in this example)
5860
skiprows : list-like or integer
5961
Row numbers to skip (0-indexed) or number of rows to skip (int)
6062
at the start of the file
@@ -531,6 +533,16 @@ def __init__(self, f, engine='python', **kwds):
531533
if kwds.get('header', 'infer') == 'infer':
532534
kwds['header'] = 0 if kwds.get('names') is None else None
533535

536+
# validate header options for mi
537+
h = kwds['header']
538+
if isinstance(h,(list,tuple,np.ndarray)):
539+
if kwds.get('index_col') is None:
540+
raise Exception("must have an index_col when have a "
541+
"multi-index header is specified")
542+
if kwds.get('as_recarray'):
543+
raise Exception("cannot specify as_recarray when "
544+
"specifying a multi-index header")
545+
534546
self.orig_options = kwds
535547

536548
# miscellanea
@@ -965,7 +977,8 @@ def extract(r):
965977
self.col_names = [ r[0] if len(r[0]) else None for r in header ]
966978
passed_names = True
967979
else:
968-
raise Exception("must have an index_col when have a multi-index specified")
980+
raise Exception("must have an index_col when have a multi-index "
981+
"header is specified")
969982
else:
970983
self.names = list(self._reader.header[0])
971984

@@ -1381,6 +1394,9 @@ def _infer_columns(self):
13811394
names = self.names
13821395

13831396
if self.header is not None:
1397+
if isinstance(self.header,(list,tuple,np.ndarray)):
1398+
raise Exception("PythonParser does not support a multi-index header")
1399+
13841400
if len(self.buf) > 0:
13851401
line = self.buf[0]
13861402
else:

pandas/io/tests/test_parsers.py

+33
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
TextFileReader, TextParser)
2121
from pandas.util.testing import (assert_almost_equal,
2222
assert_series_equal,
23+
makeCustomDataframe as mkdf,
2324
network,
2425
ensure_clean)
2526
import pandas.util.testing as tm
@@ -994,6 +995,38 @@ def test_header_not_first_line(self):
994995
expected = self.read_csv(StringIO(data2), header=0, index_col=0)
995996
tm.assert_frame_equal(df, expected)
996997

998+
def test_header_multi_index(self):
999+
expected = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4)
1000+
1001+
data = """\
1002+
C0,,C_l0_g0,C_l0_g1,C_l0_g2
1003+
1004+
C1,,C_l1_g0,C_l1_g1,C_l1_g2
1005+
C2,,C_l2_g0,C_l2_g1,C_l2_g2
1006+
C3,,C_l3_g0,C_l3_g1,C_l3_g2
1007+
R0,R1,,,
1008+
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
1009+
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
1010+
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
1011+
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
1012+
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
1013+
"""
1014+
1015+
# python-engine
1016+
self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
1017+
index_col=[0,1], engine='python')
1018+
1019+
# must specify index_col
1020+
self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3])
1021+
1022+
# no as_recarray
1023+
self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
1024+
index_col=[0,1], as_recarray=True)
1025+
1026+
# skipping lines in the header
1027+
df = read_csv(StringIO(data), header=[0,2,3,4],index_col=[0,1])
1028+
tm.assert_frame_equal(df, expected)
1029+
9971030
def test_pass_names_with_index(self):
9981031
lines = self.data1.split('\n')
9991032
no_header = '\n'.join(lines[1:])

pandas/src/parser.pyx

+3
Original file line numberDiff line numberDiff line change
@@ -1789,6 +1789,9 @@ def _to_structured_array(dict columns, object names):
17891789

17901790
if names is None:
17911791
names = ['%d' % i for i in range(len(columns))]
1792+
else:
1793+
# single line header
1794+
names = names[0]
17921795

17931796
dt = np.dtype([(str(name), columns[i].dtype)
17941797
for i, name in enumerate(names)])

0 commit comments

Comments
 (0)