311
311
fields of each line as half-open intervals (i.e., [from, to[ ).
312
312
String value 'infer' can be used to instruct the parser to try
313
313
detecting the column specifications from the first 100 rows of
314
- the data (default='infer').
314
+ the data which are not being skipped via skiprows (default='infer').
315
315
widths : list of ints, optional
316
316
A list of field widths which can be used instead of 'colspecs' if
317
317
the intervals are contiguous.
@@ -2852,13 +2852,15 @@ class FixedWidthReader(BaseIterator):
2852
2852
A reader of fixed-width lines.
2853
2853
"""
2854
2854
2855
- def __init__ (self , f , colspecs , delimiter , comment ):
2855
+ def __init__ (self , f , colspecs , delimiter , comment , skiprows = None ):
2856
2856
self .f = f
2857
2857
self .buffer = None
2858
2858
self .delimiter = '\r \n ' + delimiter if delimiter else '\n \r \t '
2859
2859
self .comment = comment
2860
+ if skiprows is None :
2861
+ skiprows = set ()
2860
2862
if colspecs == 'infer' :
2861
- self .colspecs = self .detect_colspecs ()
2863
+ self .colspecs = self .detect_colspecs (skiprows = skiprows )
2862
2864
else :
2863
2865
self .colspecs = colspecs
2864
2866
@@ -2875,20 +2877,34 @@ def __init__(self, f, colspecs, delimiter, comment):
2875
2877
raise TypeError ('Each column specification must be '
2876
2878
'2 element tuple or list of integers' )
2877
2879
2878
def get_rows(self, n, skiprows=None):
    """
    Read rows from ``self.f`` until ``n`` non-skipped rows are collected.

    Two lists are maintained while reading.  Every row consumed from
    ``self.f`` is stored, in order, in an iterator assigned to
    ``self.buffer``, so skipped rows are still available to the code that
    applies ``skiprows`` later on.  Only the rows that are *not* skipped
    are returned, since those are the ones column detection should see.

    Parameters
    ----------
    n : int
        Number of non-skipped rows to collect before reading stops.
    skiprows : collection of int, callable or None, default None
        Zero-based indices of the rows to leave out of the returned list,
        or a callable taking a row index and returning True when that row
        must be left out.  ``None`` means no row is skipped.

    Returns
    -------
    detect_rows : list
        At most ``n`` rows (fewer if ``self.f`` is exhausted first), or a
        single row when ``n`` is not positive, because the stop condition
        is only checked after a row has been read.
    """
    if skiprows is None:
        skiprows = set()
    # A callable cannot be used with the ``in`` operator, so normalise
    # both accepted forms of ``skiprows`` to a single predicate.
    if callable(skiprows):
        is_skipped = skiprows
    else:
        def is_skipped(index):
            return index in skiprows

    buffer_rows = []
    detect_rows = []
    for i, row in enumerate(self.f):
        if not is_skipped(i):
            detect_rows.append(row)
        # Skipped rows are buffered as well so row positions stay intact
        # for whoever consumes ``self.buffer``.
        buffer_rows.append(row)
        if len(detect_rows) >= n:
            break
    self.buffer = iter(buffer_rows)
    return detect_rows
2886
2900
2887
- def detect_colspecs (self , n = 100 ):
2901
+ def detect_colspecs (self , n = 100 , skiprows = None ):
2888
2902
# Regex escape the delimiters
2889
2903
delimiters = '' .join ([r'\%s' % x for x in self .delimiter ])
2890
2904
pattern = re .compile ('([^%s]+)' % delimiters )
2891
- rows = self .get_rows (n )
2905
+ rows = self .get_rows (n , skiprows )
2906
+ if not rows :
2907
+ raise EmptyDataError ("No rows from which to infer column width" )
2892
2908
max_len = max (map (len , rows ))
2893
2909
mask = np .zeros (max_len + 1 , dtype = int )
2894
2910
if self .comment is not None :
@@ -2899,7 +2915,8 @@ def detect_colspecs(self, n=100):
2899
2915
shifted = np .roll (mask , 1 )
2900
2916
shifted [0 ] = 0
2901
2917
edges = np .where ((mask ^ shifted ) == 1 )[0 ]
2902
- return list (zip (edges [::2 ], edges [1 ::2 ]))
2918
+ edge_pairs = list (zip (edges [::2 ], edges [1 ::2 ]))
2919
+ return edge_pairs
2903
2920
2904
2921
def __next__ (self ):
2905
2922
if self .buffer is not None :
@@ -2924,9 +2941,8 @@ class FixedWidthFieldParser(PythonParser):
2924
2941
def __init__(self, f, **kwds):
    """
    Store the column specifications and initialise the base parser.

    Parameters
    ----------
    f : object
        Data source, forwarded unchanged to ``PythonParser.__init__``.
    **kwds
        Parser options.  Must contain ``'colspecs'``; it is removed here
        and every remaining option is forwarded to the base initialiser.
    """
    # 'colspecs' is taken out of the options before they are handed to
    # the base class, and kept on the instance instead.
    # NOTE(review): presumably it has to be set before the base
    # initialiser runs because that builds the reader -- confirm.
    column_specs = kwds.pop('colspecs')
    self.colspecs = column_specs
    PythonParser.__init__(self, f, **kwds)
2929
2945
2930
2946
def _make_reader(self, f):
    """
    Build the fixed-width reader for ``f`` and store it on ``self.data``.

    The reader receives this parser's column specifications, delimiter,
    comment marker and rows to skip.
    """
    reader = FixedWidthReader(
        f,
        self.colspecs,
        self.delimiter,
        self.comment,
        self.skiprows,
    )
    self.data = reader
0 commit comments