Skip to content

Commit 82fdb7d

Browse files
committed
now sectionwise: parser / skip rows in between pandas-dev#4340
1 parent 8f417ac commit 82fdb7d

File tree

2 files changed

+63
-2
lines changed

2 files changed

+63
-2
lines changed

pandas/io/parsers.py

+30-2
Original file line numberDiff line numberDiff line change
@@ -1150,7 +1150,11 @@ def TextParser(*args, **kwds):
11501150
returns Series if only one column
11511151
"""
11521152
kwds['engine'] = 'python'
1153-
return TextFileReader(*args, **kwds)
1153+
1154+
res = TextFileReader(*args, **kwds)
1155+
1156+
1157+
return res
11541158

11551159
# delimiter=None, dialect=None, names=None, header=0,
11561160
# index_col=None,
@@ -1385,6 +1389,7 @@ def _convert_data(self, data):
13851389
clean_conv)
13861390

13871391
def _infer_columns(self):
1392+
# TODO: this whole part is too complex and somewhat strange!
13881393
names = self.names
13891394

13901395
if self.header is not None:
@@ -1396,13 +1401,20 @@ def _infer_columns(self):
13961401
header = list(header) + [header[-1]+1]
13971402
else:
13981403
have_mi_columns = False
1404+
# TODO: explain why header (in this case a single number) needs to be a list
13991405
header = [ header ]
14001406

14011407
columns = []
14021408
for level, hr in enumerate(header):
1403-
1409+
#TODO: explain why self.buf is needed.
1410+
# the header is correctly retrieved in excel.py by
1411+
# data[header] = _trim_excel_header(data[header])
14041412
if len(self.buf) > 0:
14051413
line = self.buf[0]
1414+
1415+
elif (header[0] == hr) and (level == 0) and (header[0] > 0):
1416+
line = self._get_header()
1417+
14061418
else:
14071419
line = self._next_line()
14081420

@@ -1456,8 +1468,24 @@ def _infer_columns(self):
14561468
columns = [ names ]
14571469

14581470
return columns
1471+
1472+
def _get_header(self):
1473+
''' reads header if e.g. header
1474+
FIXME: this tshoul be turned into something much less complicates
1475+
FIXME: all due to the header assuming that there is never a row between
1476+
data and header
1477+
'''
1478+
if isinstance(self.data, list):
1479+
line = self.data[self.header]
1480+
self.pos = self.header +1
1481+
else:
1482+
line = self._next_line()
1483+
1484+
return line
14591485

14601486
def _next_line(self):
1487+
# FIXME: why is self.data sometimes a list and sometimes a _csv.reader?
1488+
# reduce complexity here!!!
14611489
if isinstance(self.data, list):
14621490
while self.pos in self.skiprows:
14631491
self.pos += 1

pandas/io/tests/test_parsers.py

+33
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
# pylint: disable=E1101
2+
from __future__ import absolute_import
3+
24

35
from datetime import datetime
46
import csv
@@ -38,6 +40,13 @@
3840

3941
from pandas.parser import OverflowError
4042

43+
def _skip_if_no_mpl():
    """Skip the current test when matplotlib cannot be imported.

    Needed because pandas.tseries.converter imports matplotlib.
    """
    try:
        # Imported only to probe availability; the name itself is unused.
        import matplotlib
    except ImportError:
        raise nose.SkipTest('matplotlib not installed, skipping')
49+
4150

4251
class ParserTests(object):
4352
"""
@@ -2015,6 +2024,30 @@ def test_iteration_open_handle(self):
20152024
expected = Series(['DDD', 'EEE', 'FFF', 'GGG'])
20162025
tm.assert_series_equal(result, expected)
20172026

2027+
def test_infer_columns(self):
    """Check the column labels read from an Excel sheet.

    Parses sheet 'min' of the example workbook with ``header=10`` and
    ``skiprows=12`` and compares the resulting column labels with the
    expected list.
    """
    _skip_if_no_mpl()
    from pandas.io.excel import ExcelFile
    from . import test_excel
    date_parser = test_excel._correct_date_time
    test_excel._skip_if_no_excelsuite()

    # column names expected from the excel file
    expected_columns = ['blank', 'temperature', 'precipitation', 'Area']

    # read the excel sheet into a DataFrame
    workbook_path = os.path.join(self.dirpath,
                                 'example_file_2013-07-25.xlsx')
    frame = ExcelFile(workbook_path).parse(
        'min', skiprows=12, header=10, index_col=1,
        parse_dates=False, date_parser=date_parser)

    self.assertEqual(frame.columns.tolist(), expected_columns)
2049+
2050+
20182051
class TestCParserHighMemory(ParserTests, unittest.TestCase):
20192052

20202053
def read_csv(self, *args, **kwds):

0 commit comments

Comments
 (0)