Skip to content

Commit dc7acd1

Browse files
author
Camilo Cota
committed
ENH: support decimal option in PythonParser pandas-dev#12933
1 parent af7bdd3 commit dc7acd1

File tree

4 files changed

+76
-52
lines changed

4 files changed

+76
-52
lines changed

doc/source/whatsnew/v0.18.2.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ Other enhancements
3131

3232
- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raise a ``NonExistentTimeError`` (:issue:`13057`)
3333

34-
34+
- Support the ``decimal`` option in ``PythonParser`` (:issue:`12933`)
3535

3636

3737
.. _whatsnew_0182.api:

pandas/io/parsers.py

+30-6
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,7 @@ def _read(filepath_or_buffer, kwds):
348348
'keep_default_na': True,
349349
'thousands': None,
350350
'comment': None,
351+
'decimal': b'.',
351352

352353
# 'engine': 'c',
353354
'parse_dates': False,
@@ -383,7 +384,6 @@ def _read(filepath_or_buffer, kwds):
383384
'error_bad_lines': True,
384385
'warn_bad_lines': True,
385386
'dtype': None,
386-
'decimal': b'.',
387387
'float_precision': None
388388
}
389389

@@ -404,7 +404,6 @@ def _read(filepath_or_buffer, kwds):
404404
'error_bad_lines',
405405
'warn_bad_lines',
406406
'dtype',
407-
'decimal',
408407
'float_precision',
409408
])
410409

@@ -1582,6 +1581,7 @@ def __init__(self, f, **kwds):
15821581
self.converters = kwds['converters']
15831582

15841583
self.thousands = kwds['thousands']
1584+
self.decimal = kwds['decimal']
15851585
self.comment = kwds['comment']
15861586
self._comment_lines = []
15871587

@@ -1639,6 +1639,9 @@ def __init__(self, f, **kwds):
16391639
else:
16401640
self._no_thousands_columns = None
16411641

1642+
if len(self.decimal) != 1:
1643+
raise ValueError('Only length-1 decimal markers supported')
1644+
16421645
def _set_no_thousands_columns(self):
16431646
# Create a set of column ids that are not to be stripped of thousands
16441647
# operators.
@@ -2050,22 +2053,42 @@ def _check_empty(self, lines):
20502053
def _check_thousands(self, lines):
20512054
if self.thousands is None:
20522055
return lines
2053-
nonnum = re.compile('[^-^0-9^%s^.]+' % self.thousands)
2056+
nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands, self.decimal))
2057+
return self._search_replace_num_columns(lines=lines,
2058+
search=self.thousands,
2059+
replace='',
2060+
nonnum=nonnum)
2061+
2062+
def _search_replace_num_columns(self, lines, search, replace, nonnum):
20542063
ret = []
20552064
for l in lines:
20562065
rl = []
20572066
for i, x in enumerate(l):
20582067
if (not isinstance(x, compat.string_types) or
2059-
self.thousands not in x or
2068+
search not in x or
20602069
(self._no_thousands_columns and
20612070
i in self._no_thousands_columns) or
20622071
nonnum.search(x.strip())):
20632072
rl.append(x)
20642073
else:
2065-
rl.append(x.replace(self.thousands, ''))
2074+
rl.append(x.replace(search, replace))
20662075
ret.append(rl)
20672076
return ret
20682077

2078+
def _check_decimal(self, lines):
2079+
if self.decimal == b'.':
2080+
return lines
2081+
2082+
if self.thousands is None:
2083+
nonnum = re.compile('[^-^0-9^%s]+' % self.decimal)
2084+
else:
2085+
nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands,
2086+
self.decimal))
2087+
return self._search_replace_num_columns(lines=lines,
2088+
search=self.decimal,
2089+
replace='.',
2090+
nonnum=nonnum)
2091+
20692092
def _clear_buffer(self):
20702093
self.buf = []
20712094

@@ -2249,7 +2272,8 @@ def _get_lines(self, rows=None):
22492272
lines = self._check_comments(lines)
22502273
if self.skip_blank_lines:
22512274
lines = self._check_empty(lines)
2252-
return self._check_thousands(lines)
2275+
lines = self._check_thousands(lines)
2276+
return self._check_decimal(lines)
22532277

22542278

22552279
def _make_date_converter(date_parser=None, dayfirst=False,

pandas/io/tests/parser/c_parser_only.py

-45
Original file line numberDiff line numberDiff line change
@@ -353,17 +353,6 @@ def test_disable_bool_parsing(self):
353353
result = self.read_csv(StringIO(data), dtype=object, na_filter=False)
354354
self.assertEqual(result['B'][2], '')
355355

356-
def test_euro_decimal_format(self):
357-
data = """Id;Number1;Number2;Text1;Text2;Number3
358-
1;1521,1541;187101,9543;ABC;poi;4,738797819
359-
2;121,12;14897,76;DEF;uyt;0,377320872
360-
3;878,158;108013,434;GHI;rez;2,735694704"""
361-
362-
df2 = self.read_csv(StringIO(data), sep=';', decimal=',')
363-
self.assertEqual(df2['Number1'].dtype, float)
364-
self.assertEqual(df2['Number2'].dtype, float)
365-
self.assertEqual(df2['Number3'].dtype, float)
366-
367356
def test_custom_lineterminator(self):
368357
data = 'a,b,c~1,2,3~4,5,6'
369358

@@ -444,40 +433,6 @@ def test_raise_on_no_columns(self):
444433
data = "\n\n\n"
445434
self.assertRaises(ValueError, self.read_csv, StringIO(data))
446435

447-
def test_1000_sep_with_decimal(self):
448-
data = """A|B|C
449-
1|2,334.01|5
450-
10|13|10.
451-
"""
452-
expected = DataFrame({
453-
'A': [1, 10],
454-
'B': [2334.01, 13],
455-
'C': [5, 10.]
456-
})
457-
458-
tm.assert_equal(expected.A.dtype, 'int64')
459-
tm.assert_equal(expected.B.dtype, 'float')
460-
tm.assert_equal(expected.C.dtype, 'float')
461-
462-
df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
463-
tm.assert_frame_equal(df, expected)
464-
465-
df = self.read_table(StringIO(data), sep='|',
466-
thousands=',', decimal='.')
467-
tm.assert_frame_equal(df, expected)
468-
469-
data_with_odd_sep = """A|B|C
470-
1|2.334,01|5
471-
10|13|10,
472-
"""
473-
df = self.read_csv(StringIO(data_with_odd_sep),
474-
sep='|', thousands='.', decimal=',')
475-
tm.assert_frame_equal(df, expected)
476-
477-
df = self.read_table(StringIO(data_with_odd_sep),
478-
sep='|', thousands='.', decimal=',')
479-
tm.assert_frame_equal(df, expected)
480-
481436
def test_grow_boundary_at_cap(self):
482437
# See gh-12494
483438
#

pandas/io/tests/parser/common.py

+45
Original file line numberDiff line numberDiff line change
@@ -1236,3 +1236,48 @@ def test_iteration_open_handle(self):
12361236
result = self.read_table(f, squeeze=True, header=None)
12371237
expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0)
12381238
tm.assert_series_equal(result, expected)
1239+
1240+
def test_1000_sep_with_decimal(self):
1241+
data = """A|B|C
1242+
1|2,334.01|5
1243+
10|13|10.
1244+
"""
1245+
expected = DataFrame({
1246+
'A': [1, 10],
1247+
'B': [2334.01, 13],
1248+
'C': [5, 10.]
1249+
})
1250+
1251+
tm.assert_equal(expected.A.dtype, 'int64')
1252+
tm.assert_equal(expected.B.dtype, 'float')
1253+
tm.assert_equal(expected.C.dtype, 'float')
1254+
1255+
df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
1256+
tm.assert_frame_equal(df, expected)
1257+
1258+
df = self.read_table(StringIO(data), sep='|',
1259+
thousands=',', decimal='.')
1260+
tm.assert_frame_equal(df, expected)
1261+
1262+
data_with_odd_sep = """A|B|C
1263+
1|2.334,01|5
1264+
10|13|10,
1265+
"""
1266+
df = self.read_csv(StringIO(data_with_odd_sep),
1267+
sep='|', thousands='.', decimal=',')
1268+
tm.assert_frame_equal(df, expected)
1269+
1270+
df = self.read_table(StringIO(data_with_odd_sep),
1271+
sep='|', thousands='.', decimal=',')
1272+
tm.assert_frame_equal(df, expected)
1273+
1274+
def test_euro_decimal_format(self):
1275+
data = """Id;Number1;Number2;Text1;Text2;Number3
1276+
1;1521,1541;187101,9543;ABC;poi;4,738797819
1277+
2;121,12;14897,76;DEF;uyt;0,377320872
1278+
3;878,158;108013,434;GHI;rez;2,735694704"""
1279+
1280+
df2 = self.read_csv(StringIO(data), sep=';', decimal=',')
1281+
self.assertEqual(df2['Number1'].dtype, float)
1282+
self.assertEqual(df2['Number2'].dtype, float)
1283+
self.assertEqual(df2['Number3'].dtype, float)

0 commit comments

Comments
 (0)