ENH: support decimal option in PythonParser #12933

Camilo Cota · jreback · commit 19ebee56951b · 2016-05-22T16:00:11.000-04:00
closes #12933 Author: Camilo Cota <ccota@riplife.es> Closes #13189 from camilocot/12933 and squashes the following commits: 465272e [Camilo Cota] Benchmark decimal option in read_csv for c engine 9f42d0c [Camilo Cota] double backticks around decimal and engine='python' dc8ca62 [Camilo Cota] fix test_empty_decimal_marker comment 49613fe [Camilo Cota] Assert read_csv error message in test_empty_decimal_marker d821052 [Camilo Cota] fix test_empty_decimal_marker comment f71509d [Camilo Cota] Include descritive what's new line 803356e [Camilo Cota] set nonnum regex in init method 1472d80 [Camilo Cota] Include the issue number in what's new b560fda [Camilo Cota] Fix what's new dc7acd1 [Camilo Cota] ENH: support decimal option in PythonParser #12933
diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py
@@ -23,18 +23,42 @@ class read_csv_default_converter(object):
     goal_time = 0.2
 
     def setup(self):
-        self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n        0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n        0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n        0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n        0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n        '
+        self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n
+0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n
+0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n
+0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n
+0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n"""
         self.data = (self.data * 200)
 
     def time_read_csv_default_converter(self):
         read_csv(StringIO(self.data), sep=',', header=None, float_precision=None)
 
 
+class read_csv_default_converter_with_decimal(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = """0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n
+0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n
+0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n
+0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n
+0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n"""
+        self.data = (self.data * 200)
+
+    def time_read_csv_default_converter_with_decimal(self):
+        read_csv(StringIO(self.data), sep=';', header=None,
+                 float_precision=None, decimal=',')
+
+
 class read_csv_precise_converter(object):
     goal_time = 0.2
 
     def setup(self):
-        self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n        0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n        0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n        0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n        0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n        '
+        self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n
+0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n
+0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n
+0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n
+0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n"""
         self.data = (self.data * 200)
 
     def time_read_csv_precise_converter(self):
@@ -45,7 +69,11 @@ class read_csv_roundtrip_converter(object):
     goal_time = 0.2
 
     def setup(self):
-        self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n        0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n        0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n        0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n        0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n        '
+        self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n
+0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n
+0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n
+0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n
+0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n"""
         self.data = (self.data * 200)
 
     def time_read_csv_roundtrip_converter(self):
@@ -109,4 +137,28 @@ def setup(self):
         self.data = (self.data * 200)
 
     def time_read_table_multiple_date_baseline(self):
-        read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1])
+        read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1])
+
+
+class read_csv_default_converter_python_engine(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n        0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n        0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n        0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n        0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n        '
+        self.data = (self.data * 200)
+
+    def time_read_csv_default_converter(self):
+        read_csv(StringIO(self.data), sep=',', header=None,
+                 float_precision=None, engine='python')
+
+
+class read_csv_default_converter_with_decimal_python_engine(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n        0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n        0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n        0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n        0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n        '
+        self.data = (self.data * 200)
+
+    def time_read_csv_default_converter_with_decimal(self):
+        read_csv(StringIO(self.data), sep=';', header=None,
+                 float_precision=None, decimal=',', engine='python')
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -47,6 +47,8 @@ Other enhancements
 
     pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30)
 
+- ``pd.read_csv()`` with ``engine='python'`` now supports the ``decimal`` option (:issue:`12933`)
+
 .. _whatsnew_0182.api:
 
 API changes
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -348,6 +348,7 @@ def _read(filepath_or_buffer, kwds):
     'keep_default_na': True,
     'thousands': None,
     'comment': None,
+    'decimal': b'.',
 
     # 'engine': 'c',
     'parse_dates': False,
@@ -383,7 +384,6 @@ def _read(filepath_or_buffer, kwds):
     'error_bad_lines': True,
     'warn_bad_lines': True,
     'dtype': None,
-    'decimal': b'.',
     'float_precision': None
 }
 
@@ -404,7 +404,6 @@ def _read(filepath_or_buffer, kwds):
     'error_bad_lines',
     'warn_bad_lines',
     'dtype',
-    'decimal',
     'float_precision',
 ])
 
@@ -1582,6 +1581,7 @@ def __init__(self, f, **kwds):
         self.converters = kwds['converters']
 
         self.thousands = kwds['thousands']
+        self.decimal = kwds['decimal']
         self.comment = kwds['comment']
         self._comment_lines = []
 
@@ -1639,6 +1639,15 @@ def __init__(self, f, **kwds):
         else:
             self._no_thousands_columns = None
 
+        if len(self.decimal) != 1:
+            raise ValueError('Only length-1 decimal markers supported')
+
+        if self.thousands is None:
+            self.nonnum = re.compile('[^-^0-9^%s]+' % self.decimal)
+        else:
+            self.nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands,
+                                                          self.decimal))
+
     def _set_no_thousands_columns(self):
         # Create a set of column ids that are not to be stripped of thousands
         # operators.
@@ -2050,22 +2059,35 @@ def _check_empty(self, lines):
     def _check_thousands(self, lines):
         if self.thousands is None:
             return lines
-        nonnum = re.compile('[^-^0-9^%s^.]+' % self.thousands)
+
+        return self._search_replace_num_columns(lines=lines,
+                                                search=self.thousands,
+                                                replace='')
+
+    def _search_replace_num_columns(self, lines, search, replace):
         ret = []
         for l in lines:
             rl = []
             for i, x in enumerate(l):
                 if (not isinstance(x, compat.string_types) or
-                    self.thousands not in x or
+                    search not in x or
                     (self._no_thousands_columns and
                      i in self._no_thousands_columns) or
-                        nonnum.search(x.strip())):
+                        self.nonnum.search(x.strip())):
                     rl.append(x)
                 else:
-                    rl.append(x.replace(self.thousands, ''))
+                    rl.append(x.replace(search, replace))
             ret.append(rl)
         return ret
 
+    def _check_decimal(self, lines):
+        if self.decimal == _parser_defaults['decimal']:
+            return lines
+
+        return self._search_replace_num_columns(lines=lines,
+                                                search=self.decimal,
+                                                replace='.')
+
     def _clear_buffer(self):
         self.buf = []
 
@@ -2249,7 +2271,8 @@ def _get_lines(self, rows=None):
         lines = self._check_comments(lines)
         if self.skip_blank_lines:
             lines = self._check_empty(lines)
-        return self._check_thousands(lines)
+        lines = self._check_thousands(lines)
+        return self._check_decimal(lines)
 
 
 def _make_date_converter(date_parser=None, dayfirst=False,
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
@@ -353,17 +353,6 @@ def test_disable_bool_parsing(self):
         result = self.read_csv(StringIO(data), dtype=object, na_filter=False)
         self.assertEqual(result['B'][2], '')
 
-    def test_euro_decimal_format(self):
-        data = """Id;Number1;Number2;Text1;Text2;Number3
-1;1521,1541;187101,9543;ABC;poi;4,738797819
-2;121,12;14897,76;DEF;uyt;0,377320872
-3;878,158;108013,434;GHI;rez;2,735694704"""
-
-        df2 = self.read_csv(StringIO(data), sep=';', decimal=',')
-        self.assertEqual(df2['Number1'].dtype, float)
-        self.assertEqual(df2['Number2'].dtype, float)
-        self.assertEqual(df2['Number3'].dtype, float)
-
     def test_custom_lineterminator(self):
         data = 'a,b,c~1,2,3~4,5,6'
 
@@ -444,40 +433,6 @@ def test_raise_on_no_columns(self):
         data = "\n\n\n"
         self.assertRaises(ValueError, self.read_csv, StringIO(data))
 
-    def test_1000_sep_with_decimal(self):
-        data = """A|B|C
-1|2,334.01|5
-10|13|10.
-"""
-        expected = DataFrame({
-            'A': [1, 10],
-            'B': [2334.01, 13],
-            'C': [5, 10.]
-        })
-
-        tm.assert_equal(expected.A.dtype, 'int64')
-        tm.assert_equal(expected.B.dtype, 'float')
-        tm.assert_equal(expected.C.dtype, 'float')
-
-        df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
-        tm.assert_frame_equal(df, expected)
-
-        df = self.read_table(StringIO(data), sep='|',
-                             thousands=',', decimal='.')
-        tm.assert_frame_equal(df, expected)
-
-        data_with_odd_sep = """A|B|C
-1|2.334,01|5
-10|13|10,
-"""
-        df = self.read_csv(StringIO(data_with_odd_sep),
-                           sep='|', thousands='.', decimal=',')
-        tm.assert_frame_equal(df, expected)
-
-        df = self.read_table(StringIO(data_with_odd_sep),
-                             sep='|', thousands='.', decimal=',')
-        tm.assert_frame_equal(df, expected)
-
     def test_grow_boundary_at_cap(self):
         # See gh-12494
         #
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
@@ -41,10 +41,10 @@ def test_empty_decimal_marker(self):
 1|2,334|5
 10|13|10.
 """
-        # C parser: supports only length-1 decimals
-        # Python parser: 'decimal' not supported yet
-        self.assertRaises(ValueError, self.read_csv,
-                          StringIO(data), decimal='')
+        # Parsers support only length-1 decimals
+        msg = 'Only length-1 decimal markers supported'
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv(StringIO(data), decimal='')
 
     def test_read_csv(self):
         if not compat.PY3:
@@ -1236,3 +1236,48 @@ def test_iteration_open_handle(self):
                     result = self.read_table(f, squeeze=True, header=None)
                     expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0)
                     tm.assert_series_equal(result, expected)
+
+    def test_1000_sep_with_decimal(self):
+        data = """A|B|C
+1|2,334.01|5
+10|13|10.
+"""
+        expected = DataFrame({
+            'A': [1, 10],
+            'B': [2334.01, 13],
+            'C': [5, 10.]
+        })
+
+        tm.assert_equal(expected.A.dtype, 'int64')
+        tm.assert_equal(expected.B.dtype, 'float')
+        tm.assert_equal(expected.C.dtype, 'float')
+
+        df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
+        tm.assert_frame_equal(df, expected)
+
+        df = self.read_table(StringIO(data), sep='|',
+                             thousands=',', decimal='.')
+        tm.assert_frame_equal(df, expected)
+
+        data_with_odd_sep = """A|B|C
+1|2.334,01|5
+10|13|10,
+"""
+        df = self.read_csv(StringIO(data_with_odd_sep),
+                           sep='|', thousands='.', decimal=',')
+        tm.assert_frame_equal(df, expected)
+
+        df = self.read_table(StringIO(data_with_odd_sep),
+                             sep='|', thousands='.', decimal=',')
+        tm.assert_frame_equal(df, expected)
+
+    def test_euro_decimal_format(self):
+        data = """Id;Number1;Number2;Text1;Text2;Number3
+1;1521,1541;187101,9543;ABC;poi;4,738797819
+2;121,12;14897,76;DEF;uyt;0,377320872
+3;878,158;108013,434;GHI;rez;2,735694704"""
+
+        df2 = self.read_csv(StringIO(data), sep=';', decimal=',')
+        self.assertEqual(df2['Number1'].dtype, float)
+        self.assertEqual(df2['Number2'].dtype, float)
+        self.assertEqual(df2['Number3'].dtype, float)