Skip to content

Commit c8ab2dd

Browse files
committed
Merge pull request pandas-dev#4955 from alefnula/iss4488
ENH: Added colspecs detection to read_fwf
2 parents 5168a57 + 9f5e5ff commit c8ab2dd

File tree

6 files changed

+199
-38
lines changed

6 files changed

+199
-38
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -41,3 +41,4 @@ pandas/io/*.json
4141

4242
.project
4343
.pydevproject
44+
.settings

doc/source/io.rst

+19-4
Original file line numberDiff line numberDiff line change
@@ -742,10 +742,13 @@ function works with data files that have known and fixed column widths.
742742
The function parameters to ``read_fwf`` are largely the same as `read_csv` with
743743
two extra parameters:
744744

745-
- ``colspecs``: a list of pairs (tuples), giving the extents of the
746-
fixed-width fields of each line as half-open intervals [from, to[
747-
- ``widths``: a list of field widths, which can be used instead of
748-
``colspecs`` if the intervals are contiguous
745+
- ``colspecs``: A list of pairs (tuples) giving the extents of the
746+
fixed-width fields of each line as half-open intervals (i.e., [from, to[ ).
747+
String value 'infer' can be used to instruct the parser to try detecting
748+
the column specifications from the first 100 rows of the data. Default
749+
behaviour, if not specified, is to infer.
750+
- ``widths``: A list of field widths which can be used instead of 'colspecs'
751+
if the intervals are contiguous.
749752

750753
.. ipython:: python
751754
:suppress:
@@ -789,6 +792,18 @@ column widths for contiguous columns:
789792
The parser will take care of extra white spaces around the columns
790793
so it's ok to have extra separation between the columns in the file.
791794

795+
.. versionadded:: 0.13.0
796+
797+
By default, ``read_fwf`` will try to infer the file's ``colspecs`` by using the
798+
first 100 rows of the file. It can do so only in cases when the columns are
799+
aligned and correctly separated by the provided ``delimiter`` (default delimiter
800+
is whitespace).
801+
802+
.. ipython:: python
803+
804+
df = pd.read_fwf('bar.csv', header=None, index_col=0)
805+
df
806+
792807
.. ipython:: python
793808
:suppress:
794809

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ New features
5959
- Added ``isin`` method to DataFrame (:issue:`4211`)
6060
- Clipboard functionality now works with PySide (:issue:`4282`)
6161
- New ``extract`` string method returns regex matches more conveniently (:issue:`4685`)
62+
- Auto-detect field widths in ``read_fwf`` when unspecified (:issue:`4488`)
6263

6364
Experimental Features
6465
~~~~~~~~~~~~~~~~~~~~~

doc/source/v0.13.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,9 @@ Enhancements
421421

422422
can also be used.
423423
- ``read_stata`` now accepts Stata 13 format (:issue:`4291`)
424+
- ``read_fwf`` now infers the column specifications from the first 100 rows of
425+
the file if the data has correctly separated and properly aligned columns
426+
using the delimiter provided to the function (:issue:`4488`).
424427

425428
.. _whatsnew_0130.experimental:
426429

pandas/io/parsers.py

+69-29
Original file line numberDiff line numberDiff line change
@@ -160,11 +160,15 @@
160160
""" % (_parser_params % _table_sep)
161161

162162
_fwf_widths = """\
163-
colspecs : a list of pairs (tuples), giving the extents
164-
of the fixed-width fields of each line as half-open internals
165-
(i.e., [from, to[ ).
166-
widths : a list of field widths, which can be used instead of
167-
'colspecs' if the intervals are contiguous.
163+
colspecs : list of pairs (int, int) or 'infer'. optional
164+
A list of pairs (tuples) giving the extents of the fixed-width
165+
fields of each line as half-open intervals (i.e., [from, to[ ).
166+
String value 'infer' can be used to instruct the parser to try
167+
detecting the column specifications from the first 100 rows of
168+
the data (default='infer').
169+
widths : list of ints. optional
170+
A list of field widths which can be used instead of 'colspecs' if
171+
the intervals are contiguous.
168172
"""
169173

170174
_read_fwf_doc = """
@@ -184,7 +188,8 @@ def _read(filepath_or_buffer, kwds):
184188
if skipfooter is not None:
185189
kwds['skip_footer'] = skipfooter
186190

187-
filepath_or_buffer, _ = get_filepath_or_buffer(filepath_or_buffer)
191+
filepath_or_buffer, _ = get_filepath_or_buffer(filepath_or_buffer,
192+
encoding)
188193

189194
if kwds.get('date_parser', None) is not None:
190195
if isinstance(kwds['parse_dates'], bool):
@@ -267,8 +272,8 @@ def _read(filepath_or_buffer, kwds):
267272
}
268273

269274
_fwf_defaults = {
270-
'colspecs': None,
271-
'widths': None
275+
'colspecs': 'infer',
276+
'widths': None,
272277
}
273278

274279
_c_unsupported = set(['skip_footer'])
@@ -412,15 +417,15 @@ def parser_f(filepath_or_buffer,
412417

413418

414419
@Appender(_read_fwf_doc)
415-
def read_fwf(filepath_or_buffer, colspecs=None, widths=None, **kwds):
420+
def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, **kwds):
416421
# Check input arguments.
417422
if colspecs is None and widths is None:
418423
raise ValueError("Must specify either colspecs or widths")
419-
elif colspecs is not None and widths is not None:
424+
elif colspecs not in (None, 'infer') and widths is not None:
420425
raise ValueError("You must specify only one of 'widths' and "
421426
"'colspecs'")
422427

423-
# Compute 'colspec' from 'widths', if specified.
428+
# Compute 'colspecs' from 'widths', if specified.
424429
if widths is not None:
425430
colspecs, col = [], 0
426431
for w in widths:
@@ -519,7 +524,8 @@ def _clean_options(self, options, engine):
519524
engine = 'python'
520525
elif sep is not None and len(sep) > 1:
521526
# wait until regex engine integrated
522-
engine = 'python'
527+
if engine not in ('python', 'python-fwf'):
528+
engine = 'python'
523529

524530
# C engine not supported yet
525531
if engine == 'c':
@@ -2012,31 +2018,65 @@ class FixedWidthReader(object):
20122018
"""
20132019
A reader of fixed-width lines.
20142020
"""
2015-
def __init__(self, f, colspecs, filler, thousands=None, encoding=None):
2021+
def __init__(self, f, colspecs, delimiter, comment):
20162022
self.f = f
2017-
self.colspecs = colspecs
2018-
self.filler = filler # Empty characters between fields.
2019-
self.thousands = thousands
2020-
if encoding is None:
2021-
encoding = get_option('display.encoding')
2022-
self.encoding = encoding
2023-
2024-
if not isinstance(colspecs, (tuple, list)):
2023+
self.buffer = None
2024+
self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t '
2025+
self.comment = comment
2026+
if colspecs == 'infer':
2027+
self.colspecs = self.detect_colspecs()
2028+
else:
2029+
self.colspecs = colspecs
2030+
2031+
if not isinstance(self.colspecs, (tuple, list)):
20252032
raise TypeError("column specifications must be a list or tuple, "
20262033
"input was a %r" % type(colspecs).__name__)
20272034

2028-
for colspec in colspecs:
2035+
for colspec in self.colspecs:
20292036
if not (isinstance(colspec, (tuple, list)) and
2030-
len(colspec) == 2 and
2031-
isinstance(colspec[0], int) and
2032-
isinstance(colspec[1], int)):
2037+
len(colspec) == 2 and
2038+
isinstance(colspec[0], (int, np.integer)) and
2039+
isinstance(colspec[1], (int, np.integer))):
20332040
raise TypeError('Each column specification must be '
20342041
'2 element tuple or list of integers')
20352042

2043+
def get_rows(self, n):
2044+
rows = []
2045+
for i, row in enumerate(self.f, 1):
2046+
rows.append(row)
2047+
if i >= n:
2048+
break
2049+
self.buffer = iter(rows)
2050+
return rows
2051+
2052+
def detect_colspecs(self, n=100):
2053+
# Regex escape the delimiters
2054+
delimiters = ''.join([r'\%s' % x for x in self.delimiter])
2055+
pattern = re.compile('([^%s]+)' % delimiters)
2056+
rows = self.get_rows(n)
2057+
max_len = max(map(len, rows))
2058+
mask = np.zeros(max_len + 1, dtype=int)
2059+
if self.comment is not None:
2060+
rows = [row.partition(self.comment)[0] for row in rows]
2061+
for row in rows:
2062+
for m in pattern.finditer(row):
2063+
mask[m.start():m.end()] = 1
2064+
shifted = np.roll(mask, 1)
2065+
shifted[0] = 0
2066+
edges = np.where((mask ^ shifted) == 1)[0]
2067+
return list(zip(edges[::2], edges[1::2]))
2068+
20362069
def next(self):
2037-
line = next(self.f)
2070+
if self.buffer is not None:
2071+
try:
2072+
line = next(self.buffer)
2073+
except StopIteration:
2074+
self.buffer = None
2075+
line = next(self.f)
2076+
else:
2077+
line = next(self.f)
20382078
# Note: 'colspecs' is a sequence of half-open intervals.
2039-
return [line[fromm:to].strip(self.filler or ' ')
2079+
return [line[fromm:to].strip(self.delimiter)
20402080
for (fromm, to) in self.colspecs]
20412081

20422082
# Iterator protocol in Python 3 uses __next__()
@@ -2050,10 +2090,10 @@ class FixedWidthFieldParser(PythonParser):
20502090
"""
20512091
def __init__(self, f, **kwds):
20522092
# Support iterators, convert to a list.
2053-
self.colspecs = list(kwds.pop('colspecs'))
2093+
self.colspecs = kwds.pop('colspecs')
20542094

20552095
PythonParser.__init__(self, f, **kwds)
20562096

20572097
def _make_reader(self, f):
20582098
self.data = FixedWidthReader(f, self.colspecs, self.delimiter,
2059-
encoding=self.encoding)
2099+
self.comment)

pandas/io/tests/test_parsers.py

+106-5
Original file line numberDiff line numberDiff line change
@@ -1706,7 +1706,7 @@ def test_utf16_example(self):
17061706
self.assertEquals(len(result), 50)
17071707

17081708
def test_converters_corner_with_nas(self):
1709-
# skip aberration observed on Win64 Python 3.2.2
1709+
# skip aberration observed on Win64 Python 3.2.2
17101710
if hash(np.int64(-1)) != -2:
17111711
raise nose.SkipTest("skipping because of windows hash on Python"
17121712
" 3.2.2")
@@ -2078,19 +2078,19 @@ def test_fwf(self):
20782078
read_fwf(StringIO(data3), colspecs=colspecs, widths=[6, 10, 10, 7])
20792079

20802080
with tm.assertRaisesRegexp(ValueError, "Must specify either"):
2081-
read_fwf(StringIO(data3))
2081+
read_fwf(StringIO(data3), colspecs=None, widths=None)
20822082

20832083
def test_fwf_colspecs_is_list_or_tuple(self):
20842084
with tm.assertRaisesRegexp(TypeError,
20852085
'column specifications must be a list or '
20862086
'tuple.+'):
2087-
fwr = pd.io.parsers.FixedWidthReader(StringIO(self.data1),
2088-
{'a': 1}, ',')
2087+
pd.io.parsers.FixedWidthReader(StringIO(self.data1),
2088+
{'a': 1}, ',', '#')
20892089

20902090
def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(self):
20912091
with tm.assertRaisesRegexp(TypeError,
20922092
'Each column specification must be.+'):
2093-
read_fwf(StringIO(self.data1), {'a': 1})
2093+
read_fwf(StringIO(self.data1), [('a', 1)])
20942094

20952095
def test_fwf_regression(self):
20962096
# GH 3594
@@ -2223,6 +2223,107 @@ def test_iteration_open_handle(self):
22232223
expected = Series(['DDD', 'EEE', 'FFF', 'GGG'])
22242224
tm.assert_series_equal(result, expected)
22252225

2226+
2227+
class TestFwfColspaceSniffing(unittest.TestCase):
2228+
def test_full_file(self):
2229+
# File with all values
2230+
test = '''index A B C
2231+
2000-01-03T00:00:00 0.980268513777 3 foo
2232+
2000-01-04T00:00:00 1.04791624281 -4 bar
2233+
2000-01-05T00:00:00 0.498580885705 73 baz
2234+
2000-01-06T00:00:00 1.12020151869 1 foo
2235+
2000-01-07T00:00:00 0.487094399463 0 bar
2236+
2000-01-10T00:00:00 0.836648671666 2 baz
2237+
2000-01-11T00:00:00 0.157160753327 34 foo'''
2238+
colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
2239+
expected = read_fwf(StringIO(test), colspecs=colspecs)
2240+
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
2241+
2242+
def test_full_file_with_missing(self):
2243+
# File with missing values
2244+
test = '''index A B C
2245+
2000-01-03T00:00:00 0.980268513777 3 foo
2246+
2000-01-04T00:00:00 1.04791624281 -4 bar
2247+
0.498580885705 73 baz
2248+
2000-01-06T00:00:00 1.12020151869 1 foo
2249+
2000-01-07T00:00:00 0 bar
2250+
2000-01-10T00:00:00 0.836648671666 2 baz
2251+
34'''
2252+
colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
2253+
expected = read_fwf(StringIO(test), colspecs=colspecs)
2254+
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
2255+
2256+
def test_full_file_with_spaces(self):
2257+
# File with spaces in columns
2258+
test = '''
2259+
Account Name Balance CreditLimit AccountCreated
2260+
101 Keanu Reeves 9315.45 10000.00 1/17/1998
2261+
312 Gerard Butler 90.00 1000.00 8/6/2003
2262+
868 Jennifer Love Hewitt 0 17000.00 5/25/1985
2263+
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
2264+
317 Bill Murray 789.65 5000.00 2/5/2007
2265+
'''.strip('\r\n')
2266+
colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
2267+
expected = read_fwf(StringIO(test), colspecs=colspecs)
2268+
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
2269+
2270+
def test_full_file_with_spaces_and_missing(self):
2271+
# File with spaces and missing values in columns
2272+
test = '''
2273+
Account Name Balance CreditLimit AccountCreated
2274+
101 10000.00 1/17/1998
2275+
312 Gerard Butler 90.00 1000.00 8/6/2003
2276+
868 5/25/1985
2277+
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
2278+
317 Bill Murray 789.65
2279+
'''.strip('\r\n')
2280+
colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
2281+
expected = read_fwf(StringIO(test), colspecs=colspecs)
2282+
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
2283+
2284+
def test_messed_up_data(self):
2285+
# Completely messed up file
2286+
test = '''
2287+
Account Name Balance Credit Limit Account Created
2288+
101 10000.00 1/17/1998
2289+
312 Gerard Butler 90.00 1000.00
2290+
2291+
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
2292+
317 Bill Murray 789.65
2293+
'''.strip('\r\n')
2294+
colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
2295+
expected = read_fwf(StringIO(test), colspecs=colspecs)
2296+
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
2297+
2298+
def test_multiple_delimiters(self):
2299+
test = r'''
2300+
col1~~~~~col2 col3++++++++++++++++++col4
2301+
~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
2302+
33+++122.33\\\bar.........Gerard Butler
2303+
++44~~~~12.01 baz~~Jennifer Love Hewitt
2304+
~~55 11+++foo++++Jada Pinkett-Smith
2305+
..66++++++.03~~~bar Bill Murray
2306+
'''.strip('\r\n')
2307+
colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
2308+
expected = read_fwf(StringIO(test), colspecs=colspecs,
2309+
delimiter=' +~.\\')
2310+
tm.assert_frame_equal(expected, read_fwf(StringIO(test),
2311+
delimiter=' +~.\\'))
2312+
2313+
def test_variable_width_unicode(self):
2314+
if not compat.PY3:
2315+
raise nose.SkipTest('Bytes-related test - only needs to work on Python 3')
2316+
test = '''
2317+
שלום שלום
2318+
ום שלל
2319+
של ום
2320+
'''.strip('\r\n')
2321+
expected = pd.read_fwf(BytesIO(test.encode('utf8')),
2322+
colspecs=[(0, 4), (5, 9)], header=None)
2323+
tm.assert_frame_equal(expected, read_fwf(BytesIO(test.encode('utf8')),
2324+
header=None))
2325+
2326+
22262327
class TestCParserHighMemory(ParserTests, unittest.TestCase):
22272328

22282329
def read_csv(self, *args, **kwds):

0 commit comments

Comments
 (0)