now sectionwise: parser / skip rows in between pandas-dev#4340

timmie · timmie · commit 82fdb7d049c3 · 2013-08-22T03:56:33.000+02:00
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1150,7 +1150,11 @@ def TextParser(*args, **kwds):
         returns Series if only one column
     """
     kwds['engine'] = 'python'
-    return TextFileReader(*args, **kwds)
+    
+    res = TextFileReader(*args, **kwds)
+    
+    
+    return res
 
 # delimiter=None, dialect=None, names=None, header=0,
 # index_col=None,
@@ -1385,6 +1389,7 @@ def _convert_data(self, data):
                                          clean_conv)
 
     def _infer_columns(self):
+        #TODO: this whole part is too complex and somewhat strange!!!
         names = self.names
 
         if self.header is not None:
@@ -1396,13 +1401,20 @@ def _infer_columns(self):
                 header = list(header) + [header[-1]+1]
             else:
                 have_mi_columns = False
+                #TODO: explain why header (here a single number) needs to be a list???
                 header = [ header ]
 
             columns = []
             for level, hr in enumerate(header):
-
+                #TODO: explain why self.buf is needed.
+                #      the header is correctly retrieved in excel.py by
+                #      data[header] = _trim_excel_header(data[header])
                 if len(self.buf) > 0:
                     line = self.buf[0]
+
+                elif (header[0] == hr) and (level == 0) and (header[0] > 0):
+                     line = self._get_header()
+                    
                 else:
                     line = self._next_line()
 
@@ -1456,8 +1468,24 @@ def _infer_columns(self):
                 columns = [ names ]
 
         return columns
+        
+    def _get_header(self):
+        '''Return the header row (for list data: the row at index self.header).
+        FIXME: this should be turned into something much less complicated.
+        FIXME: needed because header inference assumes that there is never a
+               row between the header and the data.
+        '''
+        if isinstance(self.data, list):
+            line = self.data[self.header]  # NOTE(review): assumes self.header is an int - confirm
+            self.pos = self.header +1  # continue reading right after the header row
+        else:
+            line = self._next_line()  # non-list data: fall back to a sequential read
+        
+        return line
 
     def _next_line(self):
+        #FIXME: why is self.data sometimes a list and sometimes a _csv.reader??
+        #       reduce complexity here!!!
         if isinstance(self.data, list):
             while self.pos in self.skiprows:
                 self.pos += 1
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -1,4 +1,6 @@
 # pylint: disable=E1101
+from __future__ import absolute_import
+
 
 from datetime import datetime
 import csv
@@ -38,6 +40,13 @@
 
 from pandas.parser import OverflowError
 
+def _skip_if_no_mpl():
+    '''Raise nose.SkipTest if matplotlib cannot be imported.'''
+    try:
+        import matplotlib  # original note: pandas.tseries.converter imports matplotlib
+    except ImportError:
+        raise nose.SkipTest('matplotlib not installed, skipping')
+
 
 class ParserTests(object):
     """
@@ -2015,6 +2024,30 @@ def test_iteration_open_handle(self):
                 expected = Series(['DDD', 'EEE', 'FFF', 'GGG'])
                 tm.assert_series_equal(result, expected)
 
+    def test_infer_columns(self):
+        '''Check column names of an xlsx sheet parsed with header=10, skiprows=12.'''
+        _skip_if_no_mpl()
+        from pandas.io.excel import ExcelFile
+        from . import test_excel
+        correct_date_time = test_excel._correct_date_time
+        test_excel._skip_if_no_excelsuite()
+
+        # test whether the header row is read in correctly
+        # list with the expected column names from the excel file
+        headercols_target = ['blank', 'temperature', 'precipitation', 'Area']
+
+        # read the 'min' sheet of the excel file into a DataFrame
+        filename = 'example_file_2013-07-25.xlsx'
+        pth = os.path.join(self.dirpath, filename)
+        xlsx = ExcelFile(pth)
+        df = xlsx.parse('min', skiprows=12, header=10, index_col=1,
+                        parse_dates=False, date_parser=correct_date_time)
+        # column names as parsed from the excel file
+        headercols_df_in = df.columns.tolist()
+
+        self.assertEqual(headercols_df_in, headercols_target)
+
+
 class TestCParserHighMemory(ParserTests, unittest.TestCase):
 
     def read_csv(self, *args, **kwds):