Skip to content

ENH: support decimal option in PythonParser #12933 #13189

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ Other enhancements
idx = pd.Index(["a1a2", "b1", "c1"])
idx.str.extractall("[ab](?P<digit>\d)")

- Support ``decimal`` option in ``PythonParser`` (:issue:`12933`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

include the issue number


.. _whatsnew_0182.api:

API changes
Expand Down
36 changes: 30 additions & 6 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ def _read(filepath_or_buffer, kwds):
'keep_default_na': True,
'thousands': None,
'comment': None,
'decimal': b'.',

# 'engine': 'c',
'parse_dates': False,
Expand Down Expand Up @@ -383,7 +384,6 @@ def _read(filepath_or_buffer, kwds):
'error_bad_lines': True,
'warn_bad_lines': True,
'dtype': None,
'decimal': b'.',
'float_precision': None
}

Expand All @@ -404,7 +404,6 @@ def _read(filepath_or_buffer, kwds):
'error_bad_lines',
'warn_bad_lines',
'dtype',
'decimal',
'float_precision',
])

Expand Down Expand Up @@ -1582,6 +1581,7 @@ def __init__(self, f, **kwds):
self.converters = kwds['converters']

self.thousands = kwds['thousands']
self.decimal = kwds['decimal']
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please check/update io.rst and the docstring. IIRC, we note in an option's description when a particular engine does not support it (so that note can now be removed).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, might not be the case, but pls check.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback I can't find any reference in io.rst to a particular engine where the `decimal` option is described.

self.comment = kwds['comment']
self._comment_lines = []

Expand Down Expand Up @@ -1639,6 +1639,9 @@ def __init__(self, f, **kwds):
else:
self._no_thousands_columns = None

if len(self.decimal) != 1:
raise ValueError('Only length-1 decimal markers supported')

def _set_no_thousands_columns(self):
# Create a set of column ids that are not to be stripped of thousands
# operators.
Expand Down Expand Up @@ -2050,22 +2053,42 @@ def _check_empty(self, lines):
def _check_thousands(self, lines):
    """
    Strip the thousands separator from cells that look numeric.

    Parameters
    ----------
    lines : list of lists
        Rows of cells, passed through to
        ``_search_replace_num_columns``.

    Returns
    -------
    list of lists
        ``lines`` itself when ``self.thousands`` is None; otherwise the
        result of ``_search_replace_num_columns`` with the thousands
        separator replaced by the empty string.
    """
    if self.thousands is None:
        return lines

    # The parser default for ``decimal`` is the bytes object b'.'.  On
    # Python 3, interpolating bytes with %s inserts its repr ("b'.'")
    # into the character class, which would whitelist the letters ``b``
    # and ``'``.  Normalise to text first; on Python 2 ``bytes is str``,
    # so nothing changes there.
    decimal = self.decimal
    if isinstance(decimal, bytes) and not isinstance(decimal, str):
        decimal = decimal.decode('latin-1')

    # Escape both markers so characters such as ']' or '\\' cannot break
    # the character class.
    # NOTE: only the leading '^' negates the class; the later '^' are
    # literal carets, so '^' is also accepted as a "numeric" character.
    # Kept as-is for backward compatibility.
    nonnum = re.compile('[^-^0-9^%s^%s]+' % (re.escape(self.thousands),
                                             re.escape(decimal)))
    return self._search_replace_num_columns(lines=lines,
                                            search=self.thousands,
                                            replace='',
                                            nonnum=nonnum)

def _search_replace_num_columns(self, lines, search, replace, nonnum):
    """
    Replace ``search`` with ``replace`` in every cell that looks numeric.

    A cell is copied through unchanged when any of the following holds:
    it is not a string, it does not contain ``search``, its column index
    is listed in ``self._no_thousands_columns``, or ``nonnum`` matches
    somewhere in the stripped cell.

    Parameters
    ----------
    lines : list of lists
        Rows of cells.
    search : str
        Substring to look for (e.g. the thousands or decimal marker).
    replace : str
        Replacement text for ``search``.
    nonnum : compiled regular expression
        Pattern whose match marks a cell as non-numeric.

    Returns
    -------
    list of lists
        New rows with exactly one output cell per input cell.
    """
    # Only the generic ``search``/``replace`` arguments are used here.
    # Testing ``self.thousands`` would raise TypeError whenever no
    # thousands separator is configured (None), e.g. on the decimal path.
    skip_columns = self._no_thousands_columns
    ret = []
    for l in lines:
        rl = []
        for i, x in enumerate(l):
            if (not isinstance(x, compat.string_types) or
                    search not in x or
                    (skip_columns and i in skip_columns) or
                    nonnum.search(x.strip())):
                rl.append(x)
            else:
                rl.append(x.replace(search, replace))
        ret.append(rl)
    return ret

def _check_decimal(self, lines):
    """
    Convert a custom decimal marker to ``'.'`` in cells that look numeric.

    Parameters
    ----------
    lines : list of lists
        Rows of cells, passed through to
        ``_search_replace_num_columns``.

    Returns
    -------
    list of lists
        ``lines`` itself when the decimal marker is already ``'.'``;
        otherwise the result of ``_search_replace_num_columns`` with the
        marker replaced by ``'.'``.
    """
    # The parser default is the bytes object b'.', while callers pass
    # text.  On Python 3 ``'.' == b'.'`` is False, so normalise to text
    # before comparing or building the pattern.  On Python 2
    # ``bytes is str`` and nothing changes.
    decimal = self.decimal
    if isinstance(decimal, bytes) and not isinstance(decimal, str):
        decimal = decimal.decode('latin-1')

    if decimal == '.':
        return lines

    # NOTE(review): a reviewer asked for these patterns to be compiled
    # once in __init__ instead of on every call.
    # Markers are escaped so characters such as ']' or '\\' cannot break
    # the character class.  Only the leading '^' negates the class; the
    # later '^' are literal carets, kept for backward compatibility.
    allowed = re.escape(decimal)
    if self.thousands is not None:
        allowed = '%s^%s' % (re.escape(self.thousands), allowed)
    nonnum = re.compile('[^-^0-9^%s]+' % allowed)

    return self._search_replace_num_columns(lines=lines,
                                            search=decimal,
                                            replace='.',
                                            nonnum=nonnum)

def _clear_buffer(self):
self.buf = []

Expand Down Expand Up @@ -2249,7 +2272,8 @@ def _get_lines(self, rows=None):
lines = self._check_comments(lines)
if self.skip_blank_lines:
lines = self._check_empty(lines)
return self._check_thousands(lines)
lines = self._check_thousands(lines)
return self._check_decimal(lines)


def _make_date_converter(date_parser=None, dayfirst=False,
Expand Down
45 changes: 0 additions & 45 deletions pandas/io/tests/parser/c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,17 +353,6 @@ def test_disable_bool_parsing(self):
result = self.read_csv(StringIO(data), dtype=object, na_filter=False)
self.assertEqual(result['B'][2], '')

def test_euro_decimal_format(self):
    # A semicolon-separated file using ',' as the decimal marker must
    # yield float dtype for every numeric column.
    csv_text = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

    frame = self.read_csv(StringIO(csv_text), sep=';', decimal=',')
    for column in ('Number1', 'Number2', 'Number3'):
        self.assertEqual(frame[column].dtype, float)

def test_custom_lineterminator(self):
data = 'a,b,c~1,2,3~4,5,6'

Expand Down Expand Up @@ -444,40 +433,6 @@ def test_raise_on_no_columns(self):
data = "\n\n\n"
self.assertRaises(ValueError, self.read_csv, StringIO(data))

def test_1000_sep_with_decimal(self):
    # The same table is written twice: once with ',' thousands and '.'
    # decimal, once with the two markers swapped.  Both must parse to
    # the same frame through read_csv and read_table.
    comma_thousands = """A|B|C
1|2,334.01|5
10|13|10.
"""
    expected = DataFrame({
        'A': [1, 10],
        'B': [2334.01, 13],
        'C': [5, 10.]
    })

    # Sanity-check the dtypes of the expected frame itself.
    for column, dtype_name in (('A', 'int64'),
                               ('B', 'float'),
                               ('C', 'float')):
        tm.assert_equal(expected[column].dtype, dtype_name)

    dot_thousands = """A|B|C
1|2.334,01|5
10|13|10,
"""
    scenarios = ((comma_thousands, ',', '.'),
                 (dot_thousands, '.', ','))
    for text, thousands, decimal in scenarios:
        for reader in (self.read_csv, self.read_table):
            result = reader(StringIO(text), sep='|',
                            thousands=thousands, decimal=decimal)
            tm.assert_frame_equal(result, expected)

def test_grow_boundary_at_cap(self):
# See gh-12494
#
Expand Down
45 changes: 45 additions & 0 deletions pandas/io/tests/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1236,3 +1236,48 @@ def test_iteration_open_handle(self):
result = self.read_table(f, squeeze=True, header=None)
expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0)
tm.assert_series_equal(result, expected)

def test_1000_sep_with_decimal(self):
    # The same table is written twice: once with ',' thousands and '.'
    # decimal, once with the two markers swapped.  Both must parse to
    # the same frame through read_csv and read_table.
    comma_thousands = """A|B|C
1|2,334.01|5
10|13|10.
"""
    expected = DataFrame({
        'A': [1, 10],
        'B': [2334.01, 13],
        'C': [5, 10.]
    })

    # Sanity-check the dtypes of the expected frame itself.
    for column, dtype_name in (('A', 'int64'),
                               ('B', 'float'),
                               ('C', 'float')):
        tm.assert_equal(expected[column].dtype, dtype_name)

    dot_thousands = """A|B|C
1|2.334,01|5
10|13|10,
"""
    scenarios = ((comma_thousands, ',', '.'),
                 (dot_thousands, '.', ','))
    for text, thousands, decimal in scenarios:
        for reader in (self.read_csv, self.read_table):
            result = reader(StringIO(text), sep='|',
                            thousands=thousands, decimal=decimal)
            tm.assert_frame_equal(result, expected)

def test_euro_decimal_format(self):
    # A semicolon-separated file using ',' as the decimal marker must
    # yield float dtype for every numeric column.
    csv_text = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

    frame = self.read_csv(StringIO(csv_text), sep=';', decimal=',')
    for column in ('Number1', 'Number2', 'Number3'):
        self.assertEqual(frame[column].dtype, float)