Skip to content

Commit 19ebee5

Browse files
Camilo Cota, jreback
Camilo Cota
authored and committed
ENH: support decimal option in PythonParser #12933
closes #12933

Author: Camilo Cota <[email protected]>

Closes #13189 from camilocot/12933 and squashes the following commits:

465272e [Camilo Cota] Benchmark decimal option in read_csv for c engine
9f42d0c [Camilo Cota] double backticks around decimal and engine='python'
dc8ca62 [Camilo Cota] fix test_empty_decimal_marker comment
49613fe [Camilo Cota] Assert read_csv error message in test_empty_decimal_marker
d821052 [Camilo Cota] fix test_empty_decimal_marker comment
f71509d [Camilo Cota] Include descritive what's new line
803356e [Camilo Cota] set nonnum regex in init method
1472d80 [Camilo Cota] Include the issue number in what's new
b560fda [Camilo Cota] Fix what's new
dc7acd1 [Camilo Cota] ENH: support decimal option in PythonParser #12933
1 parent b88eb35 commit 19ebee5

File tree

5 files changed

+137
-60
lines changed

5 files changed

+137
-60
lines changed

asv_bench/benchmarks/parser_vb.py

+56-4
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,42 @@ class read_csv_default_converter(object):
2323
goal_time = 0.2
2424

2525
def setup(self):
26-
self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n '
26+
self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n
27+
0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n
28+
0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n
29+
0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n
30+
0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n"""
2731
self.data = (self.data * 200)
2832

2933
def time_read_csv_default_converter(self):
3034
read_csv(StringIO(self.data), sep=',', header=None, float_precision=None)
3135

3236

37+
class read_csv_default_converter_with_decimal(object):
38+
goal_time = 0.2
39+
40+
def setup(self):
41+
self.data = """0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n
42+
0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n
43+
0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n
44+
0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n
45+
0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n"""
46+
self.data = (self.data * 200)
47+
48+
def time_read_csv_default_converter_with_decimal(self):
49+
read_csv(StringIO(self.data), sep=';', header=None,
50+
float_precision=None, decimal=',')
51+
52+
3353
class read_csv_precise_converter(object):
3454
goal_time = 0.2
3555

3656
def setup(self):
37-
self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n '
57+
self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n
58+
0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n
59+
0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n
60+
0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n
61+
0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n"""
3862
self.data = (self.data * 200)
3963

4064
def time_read_csv_precise_converter(self):
@@ -45,7 +69,11 @@ class read_csv_roundtrip_converter(object):
4569
goal_time = 0.2
4670

4771
def setup(self):
48-
self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n '
72+
self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n
73+
0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n
74+
0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n
75+
0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n
76+
0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n"""
4977
self.data = (self.data * 200)
5078

5179
def time_read_csv_roundtrip_converter(self):
@@ -109,4 +137,28 @@ def setup(self):
109137
self.data = (self.data * 200)
110138

111139
def time_read_table_multiple_date_baseline(self):
112-
read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1])
140+
read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1])
141+
142+
143+
class read_csv_default_converter_python_engine(object):
144+
goal_time = 0.2
145+
146+
def setup(self):
147+
self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n '
148+
self.data = (self.data * 200)
149+
150+
def time_read_csv_default_converter(self):
151+
read_csv(StringIO(self.data), sep=',', header=None,
152+
float_precision=None, engine='python')
153+
154+
155+
class read_csv_default_converter_with_decimal_python_engine(object):
156+
goal_time = 0.2
157+
158+
def setup(self):
159+
self.data = '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n 0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n 0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n 0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n 0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n '
160+
self.data = (self.data * 200)
161+
162+
def time_read_csv_default_converter_with_decimal(self):
163+
read_csv(StringIO(self.data), sep=';', header=None,
164+
float_precision=None, decimal=',', engine='python')

doc/source/whatsnew/v0.18.2.txt

+2
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ Other enhancements
4747

4848
pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30)
4949

50+
- ``pd.read_csv()`` with ``engine='python'`` now supports the ``decimal`` option (:issue:`12933`)
51+
5052
.. _whatsnew_0182.api:
5153

5254
API changes

pandas/io/parsers.py

+30-7
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,7 @@ def _read(filepath_or_buffer, kwds):
348348
'keep_default_na': True,
349349
'thousands': None,
350350
'comment': None,
351+
'decimal': b'.',
351352

352353
# 'engine': 'c',
353354
'parse_dates': False,
@@ -383,7 +384,6 @@ def _read(filepath_or_buffer, kwds):
383384
'error_bad_lines': True,
384385
'warn_bad_lines': True,
385386
'dtype': None,
386-
'decimal': b'.',
387387
'float_precision': None
388388
}
389389

@@ -404,7 +404,6 @@ def _read(filepath_or_buffer, kwds):
404404
'error_bad_lines',
405405
'warn_bad_lines',
406406
'dtype',
407-
'decimal',
408407
'float_precision',
409408
])
410409

@@ -1582,6 +1581,7 @@ def __init__(self, f, **kwds):
15821581
self.converters = kwds['converters']
15831582

15841583
self.thousands = kwds['thousands']
1584+
self.decimal = kwds['decimal']
15851585
self.comment = kwds['comment']
15861586
self._comment_lines = []
15871587

@@ -1639,6 +1639,15 @@ def __init__(self, f, **kwds):
16391639
else:
16401640
self._no_thousands_columns = None
16411641

1642+
if len(self.decimal) != 1:
1643+
raise ValueError('Only length-1 decimal markers supported')
1644+
1645+
if self.thousands is None:
1646+
self.nonnum = re.compile('[^-^0-9^%s]+' % self.decimal)
1647+
else:
1648+
self.nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands,
1649+
self.decimal))
1650+
16421651
def _set_no_thousands_columns(self):
16431652
# Create a set of column ids that are not to be stripped of thousands
16441653
# operators.
@@ -2050,22 +2059,35 @@ def _check_empty(self, lines):
20502059
def _check_thousands(self, lines):
20512060
if self.thousands is None:
20522061
return lines
2053-
nonnum = re.compile('[^-^0-9^%s^.]+' % self.thousands)
2062+
2063+
return self._search_replace_num_columns(lines=lines,
2064+
search=self.thousands,
2065+
replace='')
2066+
2067+
def _search_replace_num_columns(self, lines, search, replace):
20542068
ret = []
20552069
for l in lines:
20562070
rl = []
20572071
for i, x in enumerate(l):
20582072
if (not isinstance(x, compat.string_types) or
2059-
self.thousands not in x or
2073+
search not in x or
20602074
(self._no_thousands_columns and
20612075
i in self._no_thousands_columns) or
2062-
nonnum.search(x.strip())):
2076+
self.nonnum.search(x.strip())):
20632077
rl.append(x)
20642078
else:
2065-
rl.append(x.replace(self.thousands, ''))
2079+
rl.append(x.replace(search, replace))
20662080
ret.append(rl)
20672081
return ret
20682082

2083+
def _check_decimal(self, lines):
2084+
if self.decimal == _parser_defaults['decimal']:
2085+
return lines
2086+
2087+
return self._search_replace_num_columns(lines=lines,
2088+
search=self.decimal,
2089+
replace='.')
2090+
20692091
def _clear_buffer(self):
20702092
self.buf = []
20712093

@@ -2249,7 +2271,8 @@ def _get_lines(self, rows=None):
22492271
lines = self._check_comments(lines)
22502272
if self.skip_blank_lines:
22512273
lines = self._check_empty(lines)
2252-
return self._check_thousands(lines)
2274+
lines = self._check_thousands(lines)
2275+
return self._check_decimal(lines)
22532276

22542277

22552278
def _make_date_converter(date_parser=None, dayfirst=False,

pandas/io/tests/parser/c_parser_only.py

-45
Original file line numberDiff line numberDiff line change
@@ -353,17 +353,6 @@ def test_disable_bool_parsing(self):
353353
result = self.read_csv(StringIO(data), dtype=object, na_filter=False)
354354
self.assertEqual(result['B'][2], '')
355355

356-
def test_euro_decimal_format(self):
357-
data = """Id;Number1;Number2;Text1;Text2;Number3
358-
1;1521,1541;187101,9543;ABC;poi;4,738797819
359-
2;121,12;14897,76;DEF;uyt;0,377320872
360-
3;878,158;108013,434;GHI;rez;2,735694704"""
361-
362-
df2 = self.read_csv(StringIO(data), sep=';', decimal=',')
363-
self.assertEqual(df2['Number1'].dtype, float)
364-
self.assertEqual(df2['Number2'].dtype, float)
365-
self.assertEqual(df2['Number3'].dtype, float)
366-
367356
def test_custom_lineterminator(self):
368357
data = 'a,b,c~1,2,3~4,5,6'
369358

@@ -444,40 +433,6 @@ def test_raise_on_no_columns(self):
444433
data = "\n\n\n"
445434
self.assertRaises(ValueError, self.read_csv, StringIO(data))
446435

447-
def test_1000_sep_with_decimal(self):
448-
data = """A|B|C
449-
1|2,334.01|5
450-
10|13|10.
451-
"""
452-
expected = DataFrame({
453-
'A': [1, 10],
454-
'B': [2334.01, 13],
455-
'C': [5, 10.]
456-
})
457-
458-
tm.assert_equal(expected.A.dtype, 'int64')
459-
tm.assert_equal(expected.B.dtype, 'float')
460-
tm.assert_equal(expected.C.dtype, 'float')
461-
462-
df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
463-
tm.assert_frame_equal(df, expected)
464-
465-
df = self.read_table(StringIO(data), sep='|',
466-
thousands=',', decimal='.')
467-
tm.assert_frame_equal(df, expected)
468-
469-
data_with_odd_sep = """A|B|C
470-
1|2.334,01|5
471-
10|13|10,
472-
"""
473-
df = self.read_csv(StringIO(data_with_odd_sep),
474-
sep='|', thousands='.', decimal=',')
475-
tm.assert_frame_equal(df, expected)
476-
477-
df = self.read_table(StringIO(data_with_odd_sep),
478-
sep='|', thousands='.', decimal=',')
479-
tm.assert_frame_equal(df, expected)
480-
481436
def test_grow_boundary_at_cap(self):
482437
# See gh-12494
483438
#

pandas/io/tests/parser/common.py

+49-4
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,10 @@ def test_empty_decimal_marker(self):
4141
1|2,334|5
4242
10|13|10.
4343
"""
44-
# C parser: supports only length-1 decimals
45-
# Python parser: 'decimal' not supported yet
46-
self.assertRaises(ValueError, self.read_csv,
47-
StringIO(data), decimal='')
44+
# Parsers support only length-1 decimals
45+
msg = 'Only length-1 decimal markers supported'
46+
with tm.assertRaisesRegexp(ValueError, msg):
47+
self.read_csv(StringIO(data), decimal='')
4848

4949
def test_read_csv(self):
5050
if not compat.PY3:
@@ -1236,3 +1236,48 @@ def test_iteration_open_handle(self):
12361236
result = self.read_table(f, squeeze=True, header=None)
12371237
expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0)
12381238
tm.assert_series_equal(result, expected)
1239+
1240+
def test_1000_sep_with_decimal(self):
1241+
data = """A|B|C
1242+
1|2,334.01|5
1243+
10|13|10.
1244+
"""
1245+
expected = DataFrame({
1246+
'A': [1, 10],
1247+
'B': [2334.01, 13],
1248+
'C': [5, 10.]
1249+
})
1250+
1251+
tm.assert_equal(expected.A.dtype, 'int64')
1252+
tm.assert_equal(expected.B.dtype, 'float')
1253+
tm.assert_equal(expected.C.dtype, 'float')
1254+
1255+
df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
1256+
tm.assert_frame_equal(df, expected)
1257+
1258+
df = self.read_table(StringIO(data), sep='|',
1259+
thousands=',', decimal='.')
1260+
tm.assert_frame_equal(df, expected)
1261+
1262+
data_with_odd_sep = """A|B|C
1263+
1|2.334,01|5
1264+
10|13|10,
1265+
"""
1266+
df = self.read_csv(StringIO(data_with_odd_sep),
1267+
sep='|', thousands='.', decimal=',')
1268+
tm.assert_frame_equal(df, expected)
1269+
1270+
df = self.read_table(StringIO(data_with_odd_sep),
1271+
sep='|', thousands='.', decimal=',')
1272+
tm.assert_frame_equal(df, expected)
1273+
1274+
def test_euro_decimal_format(self):
1275+
data = """Id;Number1;Number2;Text1;Text2;Number3
1276+
1;1521,1541;187101,9543;ABC;poi;4,738797819
1277+
2;121,12;14897,76;DEF;uyt;0,377320872
1278+
3;878,158;108013,434;GHI;rez;2,735694704"""
1279+
1280+
df2 = self.read_csv(StringIO(data), sep=';', decimal=',')
1281+
self.assertEqual(df2['Number1'].dtype, float)
1282+
self.assertEqual(df2['Number2'].dtype, float)
1283+
self.assertEqual(df2['Number3'].dtype, float)

0 commit comments

Comments
 (0)