Skip to content

Commit b0d75dc

Browse files
committed
BUG: Disable multichar/regex sep for Python engine in read_csv
Closes gh-13374 (pandas-dev/pandas).
1 parent e1cdc4b commit b0d75dc

File tree

8 files changed

+57
-18
lines changed

8 files changed

+57
-18
lines changed

doc/source/whatsnew/v0.20.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ Backwards incompatible API changes
4848
Other API Changes
4949
^^^^^^^^^^^^^^^^^
5050

51+
- ``pd.read_csv()`` will raise a ``ValueError`` for the Python engine if a multi-character or regex separator is used while quoting is enabled, i.e. when ``quoting`` is anything other than ``csv.QUOTE_NONE`` (:issue:`13374`)
52+
5153
.. _whatsnew_0200.deprecations:
5254

5355
Deprecations

pandas/io/parsers.py

+11
Original file line numberDiff line numberDiff line change
@@ -784,7 +784,9 @@ def _clean_options(self, options, engine):
784784
" skipfooter"
785785
engine = 'python'
786786

787+
quoting = options['quoting']
787788
encoding = sys.getfilesystemencoding() or 'utf-8'
789+
788790
if sep is None and not delim_whitespace:
789791
if engine == 'c':
790792
fallback_reason = "the 'c' engine does not support"\
@@ -801,6 +803,15 @@ def _clean_options(self, options, engine):
801803
" different from '\s+' are"\
802804
" interpreted as regex)"
803805
engine = 'python'
806+
elif quoting != csv.QUOTE_NONE:
807+
# gh-13374: The Python engine breaks with quotation
808+
# marks because we do not properly handle quoted
809+
# fields with multi-char / regex separators.
810+
msg = ("the %r engine does not support regex separators "
811+
"(separators > 1 char and different from '\s+' are "
812+
"interpreted as regex) with quoted fields") % engine
813+
raise ValueError(msg)
814+
804815
elif delim_whitespace:
805816
if 'python' in engine:
806817
result['delimiter'] = '\s+'

pandas/io/tests/parser/common.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -836,7 +836,8 @@ def test_integer_overflow_bug(self):
836836
result = self.read_csv(StringIO(data), header=None, sep=' ')
837837
self.assertTrue(result[0].dtype == np.float64)
838838

839-
result = self.read_csv(StringIO(data), header=None, sep='\s+')
839+
result = self.read_csv(StringIO(data), header=None,
840+
quoting=csv.QUOTE_NONE, sep='\s+')
840841
self.assertTrue(result[0].dtype == np.float64)
841842

842843
def test_catch_too_many_names(self):
@@ -852,7 +853,8 @@ def test_catch_too_many_names(self):
852853
def test_ignore_leading_whitespace(self):
853854
# see gh-3374, gh-6607
854855
data = ' a b c\n 1 2 3\n 4 5 6\n 7 8 9'
855-
result = self.read_table(StringIO(data), sep='\s+')
856+
result = self.read_table(StringIO(data), sep='\s+',
857+
quoting=csv.QUOTE_NONE)
856858
expected = DataFrame({'a': [1, 4, 7], 'b': [2, 5, 8], 'c': [3, 6, 9]})
857859
tm.assert_frame_equal(result, expected)
858860

@@ -1157,7 +1159,8 @@ def test_empty_lines(self):
11571159
[-70., .4, 1.]])
11581160
df = self.read_csv(StringIO(data))
11591161
tm.assert_numpy_array_equal(df.values, expected)
1160-
df = self.read_csv(StringIO(data.replace(',', ' ')), sep='\s+')
1162+
df = self.read_csv(StringIO(data.replace(',', ' ')),
1163+
quoting=csv.QUOTE_NONE, sep='\s+')
11611164
tm.assert_numpy_array_equal(df.values, expected)
11621165
expected = np.array([[1., 2., 4.],
11631166
[np.nan, np.nan, np.nan],
@@ -1189,14 +1192,16 @@ def test_regex_separator(self):
11891192
b 1 2 3 4
11901193
c 1 2 3 4
11911194
"""
1192-
df = self.read_table(StringIO(data), sep='\s+')
1195+
df = self.read_table(StringIO(data), sep='\s+',
1196+
quoting=csv.QUOTE_NONE)
11931197
expected = self.read_csv(StringIO(re.sub('[ ]+', ',', data)),
11941198
index_col=0)
11951199
self.assertIsNone(expected.index.name)
11961200
tm.assert_frame_equal(df, expected)
11971201

11981202
data = ' a b c\n1 2 3 \n4 5 6\n 7 8 9'
1199-
result = self.read_table(StringIO(data), sep='\s+')
1203+
result = self.read_table(StringIO(data), sep='\s+',
1204+
quoting=csv.QUOTE_NONE)
12001205
expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
12011206
columns=['a', 'b', 'c'])
12021207
tm.assert_frame_equal(result, expected)
@@ -1580,7 +1585,8 @@ def test_temporary_file(self):
15801585
new_file.flush()
15811586
new_file.seek(0)
15821587

1583-
result = self.read_csv(new_file, sep='\s+', header=None)
1588+
result = self.read_csv(new_file, sep='\s+', header=None,
1589+
quoting=csv.QUOTE_NONE)
15841590
new_file.close()
15851591
expected = DataFrame([[0, 0]])
15861592
tm.assert_frame_equal(result, expected)

pandas/io/tests/parser/python_parser_only.py

+13-7
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
arguments when parsing.
88
"""
99

10+
import csv
1011
import sys
1112
import nose
1213

@@ -81,7 +82,8 @@ def test_BytesIO_input(self):
8182
"Bytes-related test - only needs to work on Python 3")
8283

8384
data = BytesIO("שלום::1234\n562::123".encode('cp1255'))
84-
result = self.read_table(data, sep="::", encoding='cp1255')
85+
result = self.read_table(data, sep="::", quoting=csv.QUOTE_NONE,
86+
encoding='cp1255')
8587
expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
8688
tm.assert_frame_equal(result, expected)
8789

@@ -140,15 +142,17 @@ def test_decompression_regex_sep(self):
140142
tmp.write(data)
141143
tmp.close()
142144

143-
result = self.read_csv(path, sep='::', compression='gzip')
145+
result = self.read_csv(path, sep='::', quoting=csv.QUOTE_NONE,
146+
compression='gzip')
144147
tm.assert_frame_equal(result, expected)
145148

146149
with tm.ensure_clean() as path:
147150
tmp = bz2.BZ2File(path, mode='wb')
148151
tmp.write(data)
149152
tmp.close()
150153

151-
result = self.read_csv(path, sep='::', compression='bz2')
154+
result = self.read_csv(path, sep='::', quoting=csv.QUOTE_NONE,
155+
compression='bz2')
152156
tm.assert_frame_equal(result, expected)
153157

154158
self.assertRaises(ValueError, self.read_csv,
@@ -162,15 +166,17 @@ def test_read_table_buglet_4x_multiindex(self):
162166
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
163167
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
164168

165-
df = self.read_table(StringIO(text), sep='\s+')
169+
df = self.read_table(StringIO(text), sep='\s+',
170+
quoting=csv.QUOTE_NONE)
166171
self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))
167172

168173
# see gh-6893
169174
data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
170175
expected = DataFrame.from_records(
171176
[(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
172177
columns=list('abcABC'), index=list('abc'))
173-
actual = self.read_table(StringIO(data), sep='\s+')
178+
actual = self.read_table(StringIO(data), sep='\s+',
179+
quoting=csv.QUOTE_NONE)
174180
tm.assert_frame_equal(actual, expected)
175181

176182
def test_skipfooter_with_decimal(self):
@@ -201,6 +207,6 @@ def test_encoding_non_utf8_multichar_sep(self):
201207
'utf-32', 'cp037']:
202208
encoded_data = data.encode(encoding)
203209
result = self.read_csv(BytesIO(encoded_data),
204-
sep=sep, names=['a', 'b'],
205-
encoding=encoding)
210+
sep=sep, quoting=csv.QUOTE_NONE,
211+
names=['a', 'b'], encoding=encoding)
206212
tm.assert_frame_equal(result, expected)

pandas/io/tests/parser/test_read_fwf.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from datetime import datetime
1010

11+
import csv
1112
import nose
1213
import numpy as np
1314
import pandas as pd
@@ -327,9 +328,10 @@ def test_multiple_delimiters(self):
327328
'''.strip('\r\n')
328329
colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
329330
expected = read_fwf(StringIO(test), colspecs=colspecs,
330-
delimiter=' +~.\\')
331+
delimiter=' +~.\\', quoting=csv.QUOTE_NONE)
331332
tm.assert_frame_equal(expected, read_fwf(StringIO(test),
332-
delimiter=' +~.\\'))
333+
delimiter=' +~.\\',
334+
quoting=csv.QUOTE_NONE))
333335

334336
def test_variable_width_unicode(self):
335337
if not compat.PY3:

pandas/io/tests/parser/test_unsupported.py

+7
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
test suite as new feature support is added to the parsers.
1010
"""
1111

12+
import csv
1213
import nose
1314

1415
import pandas.io.parsers as parsers
@@ -117,6 +118,12 @@ def test_python_engine(self):
117118
with tm.assertRaisesRegexp(ValueError, msg):
118119
read_csv(StringIO(data), engine=engine, **kwargs)
119120

121+
msg = "the %r engine does not support " % engine
122+
kwargs = {'sep': '\s+', 'quoting': csv.QUOTE_MINIMAL}
123+
124+
with tm.assertRaisesRegexp(ValueError, msg):
125+
read_csv(StringIO(data), engine=engine, **kwargs)
126+
120127

121128
class TestDeprecatedFeatures(tm.TestCase):
122129
def test_deprecated_args(self):

pandas/io/tests/parser/usecols.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
for all of the parsers defined in parsers.py
66
"""
77

8+
import csv
89
import nose
910

1011
import numpy as np
@@ -139,7 +140,8 @@ def test_usecols_regex_sep(self):
139140
# see gh-2733
140141
data = 'a b c\n4 apple bat 5.7\n8 orange cow 10'
141142

142-
df = self.read_csv(StringIO(data), sep='\s+', usecols=('a', 'b'))
143+
df = self.read_csv(StringIO(data), sep='\s+', usecols=('a', 'b'),
144+
quoting=csv.QUOTE_NONE)
143145

144146
expected = DataFrame({'a': ['apple', 'orange'],
145147
'b': ['bat', 'cow']}, index=[4, 8])

pandas/tests/test_multilevel.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import datetime
44
import itertools
55
import nose
6+
import csv
67

78
from numpy.random import randn
89
import numpy as np
@@ -554,7 +555,8 @@ def test_xs_level_multiple(self):
554555
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
555556
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
556557

557-
df = read_table(StringIO(text), sep='\s+', engine='python')
558+
df = read_table(StringIO(text), sep='\s+', engine='python',
559+
quoting=csv.QUOTE_NONE)
558560

559561
result = df.xs(('a', 4), level=['one', 'four'])
560562
expected = df.xs('a').xs(4, level='four')
@@ -588,7 +590,8 @@ def test_xs_level0(self):
588590
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
589591
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
590592

591-
df = read_table(StringIO(text), sep='\s+', engine='python')
593+
df = read_table(StringIO(text), sep='\s+', engine='python',
594+
quoting=csv.QUOTE_NONE)
592595

593596
result = df.xs('a', level=0)
594597
expected = df.xs('a')

0 commit comments

Comments
 (0)