Skip to content

Commit d6573f5

Browse files
committed
ENH/CLN: refactor to support PythonParser as well as CParser
1 parent c64555b commit d6573f5

File tree

3 files changed

+108
-69
lines changed

3 files changed

+108
-69
lines changed

pandas/io/parsers.py

+99-54
Original file line numberDiff line numberDiff line change
@@ -755,7 +755,42 @@ def _should_parse_dates(self, i):
755755
else:
756756
return (j in self.parse_dates) or (name in self.parse_dates)
757757

758-
def _make_index(self, data, alldata, columns):
758+
759+
def _extract_multi_indexer_columns(self, header, index_names, col_names, passed_names=False):
    """Split a multi-row header into column names, index names and level names.

    Parameters
    ----------
    header : list of lists
        Header rows as returned from the parsers. With two or more rows,
        the last row is read as the row holding the index names and the
        preceding rows as the column levels. The list is not modified.
    index_names, col_names, passed_names
        Passed straight through when ``header`` has fewer than two rows.

    Returns
    -------
    tuple
        ``(names, index_names, col_names, passed_names)``.

        For a single-row header this is ``header[0]`` plus the three
        arguments unchanged. Otherwise:

        * ``names`` is the ``self.index_col`` positions followed by one
          tuple per non-index field (one entry per column-level row),
        * ``index_names`` are the entries of the last header row at the
          ``self.index_col`` positions,
        * ``col_names`` is the first field of each column-level row, or
          ``None`` where that field is empty,
        * ``passed_names`` is ``True``.
    """
    if len(header) < 2:
        return header[0], index_names, col_names, passed_names

    # the names are the tuples of the header that are not the index cols
    # 0 is the name of the index, assuming index_col is a list of column
    # numbers
    ic = self.index_col
    if isinstance(ic, (list, tuple, np.ndarray)):
        # normalise to a list so the concatenation below is plain list + list
        ic = list(ic)
    else:
        ic = [ic]
    sic = set(ic)

    # pop from a shallow copy so the caller's header keeps its last row
    header = list(header)
    index_names = header.pop(-1)
    index_names = [index_names[i] for i in ic]
    field_count = len(header[0])

    def extract(r):
        return tuple(r[i] for i in range(field_count) if i not in sic)

    # list(zip(...)) keeps this working where zip returns an iterator
    names = ic + list(zip(*[extract(r) for r in header]))
    col_names = [r[0] if len(r[0]) else None for r in header]
    passed_names = True

    return names, index_names, col_names, passed_names
786+
787+
def _maybe_make_multi_index_columns(self, columns, col_names=None):
    """Return ``columns`` as a MultiIndex when every entry is a tuple.

    ``columns`` is handed back untouched when it is empty, is already a
    MultiIndex, or contains at least one entry that is not a tuple.
    Otherwise it is converted with ``MultiIndex.from_tuples``, using
    ``col_names`` as the level names.
    """
    # nothing to build from, or nothing left to do
    if not len(columns) or isinstance(columns, MultiIndex):
        return columns

    # a single non-tuple entry means these are ordinary flat columns
    for entry in columns:
        if not isinstance(entry, tuple):
            return columns

    return MultiIndex.from_tuples(columns, names=col_names)
792+
793+
def _make_index(self, data, alldata, columns, indexnamerow=False):
759794
if not _is_index_col(self.index_col) or len(self.index_col) == 0:
760795
index = None
761796

@@ -772,7 +807,15 @@ def _make_index(self, data, alldata, columns):
772807
index = self._get_complex_date_index(data, columns)
773808
index = self._agg_index(index, try_parse_dates=False)
774809

775-
return index
810+
# add names for the index
811+
if indexnamerow:
812+
coffset = len(indexnamerow) - len(columns)
813+
index.names = indexnamerow[:coffset]
814+
815+
# maybe create a mi on the columns
816+
columns = self._maybe_make_multi_index_columns(columns, self.col_names)
817+
818+
return index, columns
776819

777820
_implicit_index = False
778821

@@ -955,27 +998,11 @@ def __init__(self, src, **kwds):
955998
self.names = None
956999
else:
9571000
if len(self._reader.header) > 1:
958-
# the names are the tuples of the header that are not the index cols
959-
# 0 is the name of the index, assuming index_col is a list of column
960-
# numbers
1001+
# we have a multi index in the columns
9611002
if (self._reader.leading_cols == 0 and
9621003
_is_index_col(self.index_col)):
963-
ic = self.index_col
964-
if not isinstance(ic, (list,tuple,np.ndarray)):
965-
ic = [ ic ]
966-
sic = set(ic)
967-
968-
header = list(self._reader.header)
969-
index_names = header.pop(-1)
970-
self.index_names = [ index_names[i] for i in ic ]
971-
field_count = len(header[0])
972-
973-
def extract(r):
974-
return tuple([ r[i] for i in range(field_count) if i not in sic ])
975-
976-
self.names = ic + zip(*[ extract(r) for r in header ])
977-
self.col_names = [ r[0] if len(r[0]) else None for r in header ]
978-
passed_names = True
1004+
self.names, self.index_names, self.col_names, passed_names = self._extract_multi_indexer_columns(
1005+
self._reader.header, self.index_names, self.col_names, passed_names)
9791006
else:
9801007
raise Exception("must have an index_col when have a multi-index "
9811008
"header is specified")
@@ -1089,11 +1116,10 @@ def read(self, nrows=None):
10891116
data = dict((k, v) for k, (i, v) in zip(names, data))
10901117

10911118
names, data = self._do_date_conversions(names, data)
1092-
index = self._make_index(data, alldata, names)
1119+
index, names = self._make_index(data, alldata, names)
10931120

1094-
# possibly create a column mi here
1095-
if all([ isinstance(c,tuple) for c in names]):
1096-
names = MultiIndex.from_tuples(names,names=self.col_names)
1121+
# maybe create a mi on the columns
1122+
names = self._maybe_make_multi_index_columns(names, self.col_names)
10971123

10981124
return index, names, data
10991125

@@ -1252,16 +1278,25 @@ def __init__(self, f, **kwds):
12521278
self.data = f
12531279
self.columns = self._infer_columns()
12541280

1281+
# we are processing a multi index column
1282+
if len(self.columns) > 1:
1283+
self.columns, self.index_names, self.col_names, _ = self._extract_multi_indexer_columns(
1284+
self.columns, self.index_names, self.col_names)
1285+
else:
1286+
self.columns = self.columns[0]
1287+
12551288
# get popped off for index
12561289
self.orig_names = list(self.columns)
12571290

12581291
# needs to be cleaned/refactored
12591292
# multiple date column thing turning into a real spaghetti factory
12601293

12611294
if not self._has_complex_date_col:
1262-
(self.index_names,
1295+
(index_names,
12631296
self.orig_names, _) = self._get_index_name(self.columns)
12641297
self._name_processed = True
1298+
if self.index_names is None:
1299+
self.index_names = index_names
12651300
self._first_chunk = True
12661301

12671302
def _make_reader(self, f):
@@ -1365,10 +1400,7 @@ def read(self, rows=None):
13651400
columns, data = self._do_date_conversions(self.columns, data)
13661401

13671402
data = self._convert_data(data)
1368-
index = self._make_index(data, alldata, columns)
1369-
if indexnamerow:
1370-
coffset = len(indexnamerow) - len(columns)
1371-
index.names = indexnamerow[:coffset]
1403+
index, columns = self._make_index(data, alldata, columns, indexnamerow)
13721404

13731405
return index, columns, data
13741406

@@ -1394,39 +1426,52 @@ def _infer_columns(self):
13941426
names = self.names
13951427

13961428
if self.header is not None:
1397-
if isinstance(self.header,(list,tuple,np.ndarray)):
1398-
raise Exception("PythonParser does not support a multi-index header")
1429+
header = self.header
13991430

1400-
if len(self.buf) > 0:
1401-
line = self.buf[0]
1431+
# we have multi-index columns, so read an extra line
1432+
if isinstance(header,(list,tuple,np.ndarray)):
1433+
header = list(header) + [header[-1]+1]
14021434
else:
1403-
line = self._next_line()
1404-
1405-
while self.pos <= self.header:
1406-
line = self._next_line()
1435+
header = [ header ]
14071436

14081437
columns = []
1409-
for i, c in enumerate(line):
1410-
if c == '':
1411-
columns.append('Unnamed: %d' % i)
1438+
for hr in header:
1439+
1440+
if len(self.buf) > 0:
1441+
line = self.buf[0]
14121442
else:
1413-
columns.append(c)
1443+
line = self._next_line()
14141444

1415-
if self.mangle_dupe_cols:
1416-
counts = {}
1417-
for i, col in enumerate(columns):
1418-
cur_count = counts.get(col, 0)
1419-
if cur_count > 0:
1420-
columns[i] = '%s.%d' % (col, cur_count)
1421-
counts[col] = cur_count + 1
1445+
while self.pos <= hr:
1446+
line = self._next_line()
1447+
1448+
this_columns = []
1449+
for i, c in enumerate(line):
1450+
if c == '':
1451+
this_columns.append('Unnamed: %d' % i)
1452+
else:
1453+
this_columns.append(c)
1454+
1455+
if self.mangle_dupe_cols:
1456+
counts = {}
1457+
for i, col in enumerate(this_columns):
1458+
cur_count = counts.get(col, 0)
1459+
if cur_count > 0:
1460+
this_columns[i] = '%s.%d' % (col, cur_count)
1461+
counts[col] = cur_count + 1
1462+
1463+
columns.append(this_columns)
14221464

14231465
self._clear_buffer()
14241466

14251467
if names is not None:
1426-
if len(names) != len(columns):
1468+
if len(names) != len(columns[0]):
14271469
raise Exception('Number of passed names did not match '
14281470
'number of header fields in the file')
1429-
columns = names
1471+
if len(columns) > 1:
1472+
raise Exception('Cannot pass names with multi-index columns')
1473+
columns = [ names ]
1474+
14301475
else:
14311476
if len(self.buf) > 0:
14321477
line = self.buf[0]
@@ -1436,11 +1481,11 @@ def _infer_columns(self):
14361481
ncols = len(line)
14371482
if not names:
14381483
if self.prefix:
1439-
columns = ['X%d' % i for i in range(ncols)]
1484+
columns = [ ['X%d' % i for i in range(ncols)] ]
14401485
else:
1441-
columns = range(ncols)
1486+
columns = [ range(ncols) ]
14421487
else:
1443-
columns = names
1488+
columns = [ names ]
14441489

14451490
return columns
14461491

pandas/io/tests/test_parsers.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -1012,9 +1012,10 @@ def test_header_multi_index(self):
10121012
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
10131013
"""
10141014

1015-
# python-engine
1016-
self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
1017-
index_col=[0,1], engine='python')
1015+
# basic test with both engines
1016+
for engine in ['c','python']:
1017+
df = read_csv(StringIO(data), header=[0,2,3,4],index_col=[0,1], engine=engine)
1018+
tm.assert_frame_equal(df, expected)
10181019

10191020
# must specify index_col
10201021
self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3])

pandas/tests/test_frame.py

+5-12
Original file line numberDiff line numberDiff line change
@@ -4755,9 +4755,13 @@ def test_to_csv_moar(self):
47554755
def _do_test(df,path,r_dtype=None,c_dtype=None,rnlvl=None,cnlvl=None,
47564756
dupe_col=False):
47574757

4758+
header = 0
4759+
if cnlvl:
4760+
header = range(cnlvl)
4761+
47584762
with ensure_clean(path) as path:
47594763
df.to_csv(path,encoding='utf8',chunksize=chunksize)
4760-
recons = DataFrame.from_csv(path,parse_dates=False)
4764+
recons = DataFrame.from_csv(path,header=header,parse_dates=False)
47614765

47624766
def _to_uni(x):
47634767
if not isinstance(x,unicode):
@@ -4773,16 +4777,6 @@ def _to_uni(x):
47734777
recons.index = ix
47744778
recons = recons.iloc[:,rnlvl-1:]
47754779

4776-
if cnlvl:
4777-
def stuple_to_tuple(x):
4778-
import re
4779-
x = x.split(",")
4780-
x = map(lambda x: re.sub("[\'\"\s\(\)]","",x),x)
4781-
return x
4782-
4783-
cols=MultiIndex.from_tuples(map(stuple_to_tuple,recons.columns))
4784-
recons.columns = cols
4785-
47864780
type_map = dict(i='i',f='f',s='O',u='O',dt='O',p='O')
47874781
if r_dtype:
47884782
if r_dtype == 'u': # unicode
@@ -4827,7 +4821,6 @@ def stuple_to_tuple(x):
48274821

48284822
assert_frame_equal(df, recons,check_names=False,check_less_precise=True)
48294823

4830-
48314824
N = 100
48324825
chunksize=1000
48334826

0 commit comments

Comments
 (0)