ENH: support decimal option in PythonParser pandas-dev#12933

Camilo Cota · Camilo Cota · commit dc7acd1d6f04 · 2016-05-15T20:41:01.000+02:00
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -31,7 +31,7 @@ Other enhancements
 
 - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`)
 
-
+- Support the ``decimal`` option in ``PythonParser`` (:issue:`12933`)
 
 
 .. _whatsnew_0182.api:
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -348,6 +348,7 @@ def _read(filepath_or_buffer, kwds):
     'keep_default_na': True,
     'thousands': None,
     'comment': None,
+    'decimal': b'.',
 
     # 'engine': 'c',
     'parse_dates': False,
@@ -383,7 +384,6 @@ def _read(filepath_or_buffer, kwds):
     'error_bad_lines': True,
     'warn_bad_lines': True,
     'dtype': None,
-    'decimal': b'.',
     'float_precision': None
 }
 
@@ -404,7 +404,6 @@ def _read(filepath_or_buffer, kwds):
     'error_bad_lines',
     'warn_bad_lines',
     'dtype',
-    'decimal',
     'float_precision',
 ])
 
@@ -1582,6 +1581,7 @@ def __init__(self, f, **kwds):
         self.converters = kwds['converters']
 
         self.thousands = kwds['thousands']
+        self.decimal = kwds['decimal']
         self.comment = kwds['comment']
         self._comment_lines = []
 
@@ -1639,6 +1639,9 @@ def __init__(self, f, **kwds):
         else:
             self._no_thousands_columns = None
 
+        if len(self.decimal) != 1:
+            raise ValueError('Only length-1 decimal markers supported')
+
     def _set_no_thousands_columns(self):
         # Create a set of column ids that are not to be stripped of thousands
         # operators.
@@ -2050,22 +2053,42 @@ def _check_empty(self, lines):
     def _check_thousands(self, lines):
         if self.thousands is None:
             return lines
-        nonnum = re.compile('[^-^0-9^%s^.]+' % self.thousands)
+        nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands, self.decimal))
+        return self._search_replace_num_columns(lines=lines,
+                                                search=self.thousands,
+                                                replace='',
+                                                nonnum=nonnum)
+
+    def _search_replace_num_columns(self, lines, search, replace, nonnum):
         ret = []
         for l in lines:
             rl = []
             for i, x in enumerate(l):
                 if (not isinstance(x, compat.string_types) or
-                    self.thousands not in x or
+                    search not in x or
                     (self._no_thousands_columns and
                      i in self._no_thousands_columns) or
                         nonnum.search(x.strip())):
                     rl.append(x)
                 else:
-                    rl.append(x.replace(self.thousands, ''))
+                    rl.append(x.replace(search, replace))
             ret.append(rl)
         return ret
 
+    def _check_decimal(self, lines):
+        if self.decimal == b'.':
+            return lines
+
+        if self.thousands is None:
+            nonnum = re.compile('[^-^0-9^%s]+' % self.decimal)
+        else:
+            nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands,
+                                                     self.decimal))
+        return self._search_replace_num_columns(lines=lines,
+                                                search=self.decimal,
+                                                replace='.',
+                                                nonnum=nonnum)
+
     def _clear_buffer(self):
         self.buf = []
 
@@ -2249,7 +2272,8 @@ def _get_lines(self, rows=None):
         lines = self._check_comments(lines)
         if self.skip_blank_lines:
             lines = self._check_empty(lines)
-        return self._check_thousands(lines)
+        lines = self._check_thousands(lines)
+        return self._check_decimal(lines)
 
 
 def _make_date_converter(date_parser=None, dayfirst=False,
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
@@ -353,17 +353,6 @@ def test_disable_bool_parsing(self):
         result = self.read_csv(StringIO(data), dtype=object, na_filter=False)
         self.assertEqual(result['B'][2], '')
 
-    def test_euro_decimal_format(self):
-        data = """Id;Number1;Number2;Text1;Text2;Number3
-1;1521,1541;187101,9543;ABC;poi;4,738797819
-2;121,12;14897,76;DEF;uyt;0,377320872
-3;878,158;108013,434;GHI;rez;2,735694704"""
-
-        df2 = self.read_csv(StringIO(data), sep=';', decimal=',')
-        self.assertEqual(df2['Number1'].dtype, float)
-        self.assertEqual(df2['Number2'].dtype, float)
-        self.assertEqual(df2['Number3'].dtype, float)
-
     def test_custom_lineterminator(self):
         data = 'a,b,c~1,2,3~4,5,6'
 
@@ -444,40 +433,6 @@ def test_raise_on_no_columns(self):
         data = "\n\n\n"
         self.assertRaises(ValueError, self.read_csv, StringIO(data))
 
-    def test_1000_sep_with_decimal(self):
-        data = """A|B|C
-1|2,334.01|5
-10|13|10.
-"""
-        expected = DataFrame({
-            'A': [1, 10],
-            'B': [2334.01, 13],
-            'C': [5, 10.]
-        })
-
-        tm.assert_equal(expected.A.dtype, 'int64')
-        tm.assert_equal(expected.B.dtype, 'float')
-        tm.assert_equal(expected.C.dtype, 'float')
-
-        df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
-        tm.assert_frame_equal(df, expected)
-
-        df = self.read_table(StringIO(data), sep='|',
-                             thousands=',', decimal='.')
-        tm.assert_frame_equal(df, expected)
-
-        data_with_odd_sep = """A|B|C
-1|2.334,01|5
-10|13|10,
-"""
-        df = self.read_csv(StringIO(data_with_odd_sep),
-                           sep='|', thousands='.', decimal=',')
-        tm.assert_frame_equal(df, expected)
-
-        df = self.read_table(StringIO(data_with_odd_sep),
-                             sep='|', thousands='.', decimal=',')
-        tm.assert_frame_equal(df, expected)
-
     def test_grow_boundary_at_cap(self):
         # See gh-12494
         #
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
@@ -1236,3 +1236,48 @@ def test_iteration_open_handle(self):
                     result = self.read_table(f, squeeze=True, header=None)
                     expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0)
                     tm.assert_series_equal(result, expected)
+
+    def test_1000_sep_with_decimal(self):
+        data = """A|B|C
+1|2,334.01|5
+10|13|10.
+"""
+        expected = DataFrame({
+            'A': [1, 10],
+            'B': [2334.01, 13],
+            'C': [5, 10.]
+        })
+
+        tm.assert_equal(expected.A.dtype, 'int64')
+        tm.assert_equal(expected.B.dtype, 'float')
+        tm.assert_equal(expected.C.dtype, 'float')
+
+        df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
+        tm.assert_frame_equal(df, expected)
+
+        df = self.read_table(StringIO(data), sep='|',
+                             thousands=',', decimal='.')
+        tm.assert_frame_equal(df, expected)
+
+        data_with_odd_sep = """A|B|C
+1|2.334,01|5
+10|13|10,
+"""
+        df = self.read_csv(StringIO(data_with_odd_sep),
+                           sep='|', thousands='.', decimal=',')
+        tm.assert_frame_equal(df, expected)
+
+        df = self.read_table(StringIO(data_with_odd_sep),
+                             sep='|', thousands='.', decimal=',')
+        tm.assert_frame_equal(df, expected)
+
+    def test_euro_decimal_format(self):
+        data = """Id;Number1;Number2;Text1;Text2;Number3
+1;1521,1541;187101,9543;ABC;poi;4,738797819
+2;121,12;14897,76;DEF;uyt;0,377320872
+3;878,158;108013,434;GHI;rez;2,735694704"""
+
+        df2 = self.read_csv(StringIO(data), sep=';', decimal=',')
+        self.assertEqual(df2['Number1'].dtype, float)
+        self.assertEqual(df2['Number2'].dtype, float)
+        self.assertEqual(df2['Number3'].dtype, float)