From dc7acd1d6f043afbc445b729a39a11bd8e13f454 Mon Sep 17 00:00:00 2001
From: Camilo Cota <ccota@riplife.es>
Date: Sun, 15 May 2016 20:29:06 +0200
Subject: [PATCH 1/8] ENH: support decimal option in PythonParser #12933

---
 doc/source/whatsnew/v0.18.2.txt         |  2 +-
 pandas/io/parsers.py                    | 36 ++++++++++++++++----
 pandas/io/tests/parser/c_parser_only.py | 45 -------------------------
 pandas/io/tests/parser/common.py        | 45 +++++++++++++++++++++++++
 4 files changed, 76 insertions(+), 52 deletions(-)

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index fa426aa30bc65..4a393cc24f123 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -31,7 +31,7 @@ Other enhancements
 
 - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`)
 
-
+- Support decimal option in PythonParser
 
 
 .. _whatsnew_0182.api:
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index f4527df56db88..93e4d23c000fc 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -348,6 +348,7 @@ def _read(filepath_or_buffer, kwds):
     'keep_default_na': True,
     'thousands': None,
     'comment': None,
+    'decimal': b'.',
 
     # 'engine': 'c',
     'parse_dates': False,
@@ -383,7 +384,6 @@ def _read(filepath_or_buffer, kwds):
     'error_bad_lines': True,
     'warn_bad_lines': True,
     'dtype': None,
-    'decimal': b'.',
     'float_precision': None
 }
 
@@ -404,7 +404,6 @@ def _read(filepath_or_buffer, kwds):
     'error_bad_lines',
     'warn_bad_lines',
     'dtype',
-    'decimal',
     'float_precision',
 ])
 
@@ -1582,6 +1581,7 @@ def __init__(self, f, **kwds):
         self.converters = kwds['converters']
 
         self.thousands = kwds['thousands']
+        self.decimal = kwds['decimal']
         self.comment = kwds['comment']
         self._comment_lines = []
 
@@ -1639,6 +1639,9 @@ def __init__(self, f, **kwds):
         else:
             self._no_thousands_columns = None
 
+        if len(self.decimal) != 1:
+            raise ValueError('Only length-1 decimal markers supported')
+
     def _set_no_thousands_columns(self):
         # Create a set of column ids that are not to be stripped of thousands
         # operators.
@@ -2050,22 +2053,42 @@ def _check_empty(self, lines):
     def _check_thousands(self, lines):
         if self.thousands is None:
             return lines
-        nonnum = re.compile('[^-^0-9^%s^.]+' % self.thousands)
+        nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands, self.decimal))
+        return self._search_replace_num_columns(lines=lines,
+                                                search=self.thousands,
+                                                replace='',
+                                                nonnum=nonnum)
+
+    def _search_replace_num_columns(self, lines, search, replace, nonnum):
         ret = []
         for l in lines:
             rl = []
             for i, x in enumerate(l):
                 if (not isinstance(x, compat.string_types) or
-                    self.thousands not in x or
+                    search not in x or
                     (self._no_thousands_columns and
                      i in self._no_thousands_columns) or
                         nonnum.search(x.strip())):
                     rl.append(x)
                 else:
-                    rl.append(x.replace(self.thousands, ''))
+                    rl.append(x.replace(search, replace))
             ret.append(rl)
         return ret
 
+    def _check_decimal(self, lines):
+        if self.decimal == b'.':
+            return lines
+
+        if self.thousands is None:
+            nonnum = re.compile('[^-^0-9^%s]+' % self.decimal)
+        else:
+            nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands,
+                                                     self.decimal))
+        return self._search_replace_num_columns(lines=lines,
+                                                search=self.decimal,
+                                                replace='.',
+                                                nonnum=nonnum)
+
     def _clear_buffer(self):
         self.buf = []
 
@@ -2249,7 +2272,8 @@ def _get_lines(self, rows=None):
         lines = self._check_comments(lines)
         if self.skip_blank_lines:
             lines = self._check_empty(lines)
-        return self._check_thousands(lines)
+        lines = self._check_thousands(lines)
+        return self._check_decimal(lines)
 
 
 def _make_date_converter(date_parser=None, dayfirst=False,
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
index 24c670abe8158..8e44802adf744 100644
--- a/pandas/io/tests/parser/c_parser_only.py
+++ b/pandas/io/tests/parser/c_parser_only.py
@@ -353,17 +353,6 @@ def test_disable_bool_parsing(self):
         result = self.read_csv(StringIO(data), dtype=object, na_filter=False)
         self.assertEqual(result['B'][2], '')
 
-    def test_euro_decimal_format(self):
-        data = """Id;Number1;Number2;Text1;Text2;Number3
-1;1521,1541;187101,9543;ABC;poi;4,738797819
-2;121,12;14897,76;DEF;uyt;0,377320872
-3;878,158;108013,434;GHI;rez;2,735694704"""
-
-        df2 = self.read_csv(StringIO(data), sep=';', decimal=',')
-        self.assertEqual(df2['Number1'].dtype, float)
-        self.assertEqual(df2['Number2'].dtype, float)
-        self.assertEqual(df2['Number3'].dtype, float)
-
     def test_custom_lineterminator(self):
         data = 'a,b,c~1,2,3~4,5,6'
 
@@ -444,40 +433,6 @@ def test_raise_on_no_columns(self):
         data = "\n\n\n"
         self.assertRaises(ValueError, self.read_csv, StringIO(data))
 
-    def test_1000_sep_with_decimal(self):
-        data = """A|B|C
-1|2,334.01|5
-10|13|10.
-"""
-        expected = DataFrame({
-            'A': [1, 10],
-            'B': [2334.01, 13],
-            'C': [5, 10.]
-        })
-
-        tm.assert_equal(expected.A.dtype, 'int64')
-        tm.assert_equal(expected.B.dtype, 'float')
-        tm.assert_equal(expected.C.dtype, 'float')
-
-        df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
-        tm.assert_frame_equal(df, expected)
-
-        df = self.read_table(StringIO(data), sep='|',
-                             thousands=',', decimal='.')
-        tm.assert_frame_equal(df, expected)
-
-        data_with_odd_sep = """A|B|C
-1|2.334,01|5
-10|13|10,
-"""
-        df = self.read_csv(StringIO(data_with_odd_sep),
-                           sep='|', thousands='.', decimal=',')
-        tm.assert_frame_equal(df, expected)
-
-        df = self.read_table(StringIO(data_with_odd_sep),
-                             sep='|', thousands='.', decimal=',')
-        tm.assert_frame_equal(df, expected)
-
     def test_grow_boundary_at_cap(self):
         # See gh-12494
         #
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
index 4d9ce922184d9..09bd6ee838d16 100644
--- a/pandas/io/tests/parser/common.py
+++ b/pandas/io/tests/parser/common.py
@@ -1236,3 +1236,48 @@ def test_iteration_open_handle(self):
                     result = self.read_table(f, squeeze=True, header=None)
                     expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0)
                     tm.assert_series_equal(result, expected)
+
+    def test_1000_sep_with_decimal(self):
+        data = """A|B|C
+1|2,334.01|5
+10|13|10.
+"""
+        expected = DataFrame({
+            'A': [1, 10],
+            'B': [2334.01, 13],
+            'C': [5, 10.]
+        })
+
+        tm.assert_equal(expected.A.dtype, 'int64')
+        tm.assert_equal(expected.B.dtype, 'float')
+        tm.assert_equal(expected.C.dtype, 'float')
+
+        df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
+        tm.assert_frame_equal(df, expected)
+
+        df = self.read_table(StringIO(data), sep='|',
+                             thousands=',', decimal='.')
+        tm.assert_frame_equal(df, expected)
+
+        data_with_odd_sep = """A|B|C
+1|2.334,01|5
+10|13|10,
+"""
+        df = self.read_csv(StringIO(data_with_odd_sep),
+                           sep='|', thousands='.', decimal=',')
+        tm.assert_frame_equal(df, expected)
+
+        df = self.read_table(StringIO(data_with_odd_sep),
+                             sep='|', thousands='.', decimal=',')
+        tm.assert_frame_equal(df, expected)
+
+    def test_euro_decimal_format(self):
+        data = """Id;Number1;Number2;Text1;Text2;Number3
+1;1521,1541;187101,9543;ABC;poi;4,738797819
+2;121,12;14897,76;DEF;uyt;0,377320872
+3;878,158;108013,434;GHI;rez;2,735694704"""
+
+        df2 = self.read_csv(StringIO(data), sep=';', decimal=',')
+        self.assertEqual(df2['Number1'].dtype, float)
+        self.assertEqual(df2['Number2'].dtype, float)
+        self.assertEqual(df2['Number3'].dtype, float)

From 1472d80bccd29f66bc86d3ee0878f40744a5bf9e Mon Sep 17 00:00:00 2001
From: Camilo Cota <ccota@riplife.es>
Date: Mon, 16 May 2016 18:53:15 +0200
Subject: [PATCH 2/8] Include the issue number in what's new

---
 doc/source/whatsnew/v0.18.2.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index 99cbf11a108c1..8b96ac71924bf 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -38,7 +38,7 @@ Other enhancements
      idx = pd.Index(["a1a2", "b1", "c1"])
      idx.str.extractall("[ab](?P<digit>\d)")
 
-- Support decimal option in PythonParser
+- Support decimal option in PythonParser (:issue:`12933`)
 
 .. _whatsnew_0182.api:
 

From 803356ef42543140c96699ad27091438277e760b Mon Sep 17 00:00:00 2001
From: Camilo Cota <ccota@riplife.es>
Date: Mon, 16 May 2016 18:55:38 +0200
Subject: [PATCH 3/8] set nonnum regex in init method

---
 pandas/io/parsers.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index acef52db1de49..07b92fd6bfd28 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1642,6 +1642,12 @@ def __init__(self, f, **kwds):
         if len(self.decimal) != 1:
             raise ValueError('Only length-1 decimal markers supported')
 
+        if self.thousands is None:
+            self.nonnum = re.compile('[^-^0-9^%s]+' % self.decimal)
+        else:
+            self.nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands,
+                                                          self.decimal))
+
     def _set_no_thousands_columns(self):
         # Create a set of column ids that are not to be stripped of thousands
         # operators.
@@ -2053,13 +2059,12 @@ def _check_empty(self, lines):
     def _check_thousands(self, lines):
         if self.thousands is None:
             return lines
-        nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands, self.decimal))
+
         return self._search_replace_num_columns(lines=lines,
                                                 search=self.thousands,
-                                                replace='',
-                                                nonnum=nonnum)
+                                                replace='')
 
-    def _search_replace_num_columns(self, lines, search, replace, nonnum):
+    def _search_replace_num_columns(self, lines, search, replace):
         ret = []
         for l in lines:
             rl = []
@@ -2068,7 +2073,7 @@ def _search_replace_num_columns(self, lines, search, replace, nonnum):
                     search not in x or
                     (self._no_thousands_columns and
                      i in self._no_thousands_columns) or
-                        nonnum.search(x.strip())):
+                        self.nonnum.search(x.strip())):
                     rl.append(x)
                 else:
                     rl.append(x.replace(search, replace))
@@ -2076,18 +2081,12 @@ def _search_replace_num_columns(self, lines, search, replace, nonnum):
         return ret
 
     def _check_decimal(self, lines):
-        if self.decimal == b'.':
+        if self.decimal == _parser_defaults['decimal']:
             return lines
 
-        if self.thousands is None:
-            nonnum = re.compile('[^-^0-9^%s]+' % self.decimal)
-        else:
-            nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands,
-                                                     self.decimal))
         return self._search_replace_num_columns(lines=lines,
                                                 search=self.decimal,
-                                                replace='.',
-                                                nonnum=nonnum)
+                                                replace='.')
 
     def _clear_buffer(self):
         self.buf = []

From f71509dfb82d2b3dfc2d33abd913a1cc736b07e2 Mon Sep 17 00:00:00 2001
From: Camilo Cota <ccota@riplife.es>
Date: Mon, 16 May 2016 22:23:14 +0200
Subject: [PATCH 4/8] Include descriptive what's new line

---
 doc/source/whatsnew/v0.18.2.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index 8b96ac71924bf..020a8dd3a27d6 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -38,7 +38,7 @@ Other enhancements
      idx = pd.Index(["a1a2", "b1", "c1"])
      idx.str.extractall("[ab](?P<digit>\d)")
 
-- Support decimal option in PythonParser (:issue:`12933`)
+- ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`)
 
 .. _whatsnew_0182.api:
 

From d8210529ec455934e45bc4558e93a2ce99c24d79 Mon Sep 17 00:00:00 2001
From: Camilo Cota <ccota@riplife.es>
Date: Mon, 16 May 2016 22:27:18 +0200
Subject: [PATCH 5/8] fix test_empty_decimal_marker comment

---
 pandas/io/tests/parser/common.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
index 09bd6ee838d16..1aef496300ad8 100644
--- a/pandas/io/tests/parser/common.py
+++ b/pandas/io/tests/parser/common.py
@@ -41,8 +41,7 @@ def test_empty_decimal_marker(self):
 1|2,334|5
 10|13|10.
 """
-        # C parser: supports only length-1 decimals
-        # Python parser: 'decimal' not supported yet
+        # Parsers support only length-1 decimals
         self.assertRaises(ValueError, self.read_csv,
                           StringIO(data), decimal='')
 

From 49613fe3590660deaee9274aca701304dd137b09 Mon Sep 17 00:00:00 2001
From: Camilo Cota <ccota@riplife.es>
Date: Tue, 17 May 2016 21:24:30 +0200
Subject: [PATCH 6/8] Assert read_csv error message in
 test_empty_decimal_marker

---
 pandas/io/tests/parser/common.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
index 1aef496300ad8..57ab9477302c1 100644
--- a/pandas/io/tests/parser/common.py
+++ b/pandas/io/tests/parser/common.py
@@ -42,8 +42,9 @@ def test_empty_decimal_marker(self):
 10|13|10.
 """
         # Parsers support only length-1 decimals
-        self.assertRaises(ValueError, self.read_csv,
-                          StringIO(data), decimal='')
+        msg = 'Only length-1 decimal markers supported'
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv(StringIO(data), decimal='')
 
     def test_read_csv(self):
         if not compat.PY3:

From dc8ca622ec771a1c6688ad065f1606c2d9439fb2 Mon Sep 17 00:00:00 2001
From: Camilo Cota <ccota@riplife.es>
Date: Wed, 18 May 2016 21:51:16 +0200
Subject: [PATCH 7/8] Add asv benchmarks for decimal option with python engine

---
 asv_bench/benchmarks/parser_vb.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py
index 18cd4de6cc9c5..afe6efc9a903d 100644
--- a/asv_bench/benchmarks/parser_vb.py
+++ b/asv_bench/benchmarks/parser_vb.py
@@ -29,7 +29,6 @@ def setup(self):
     def time_read_csv_default_converter(self):
         read_csv(StringIO(self.data), sep=',', header=None, float_precision=None)
 
-
 class read_csv_precise_converter(object):
     goal_time = 0.2
 
@@ -109,4 +108,20 @@ def setup(self):
         self.data = (self.data * 200)
 
     def time_read_table_multiple_date_baseline(self):
-        read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1])
\ No newline at end of file
+        read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1])
+
+
+class read_csv_python_engine(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data_decimal = '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n        0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n        0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n        0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n        0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n        '
+        self.data_decimal = (self.data_decimal * 200)
+        self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n        0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n        0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n        0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n        0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n        '
+        self.data = (self.data * 200)
+
+    def time_read_csv_default_converter_with_decimal(self):
+        read_csv(StringIO(self.data_decimal), sep=';', header=None, float_precision=None, decimal=',', engine='python')
+
+    def time_read_csv_default_converter(self):
+        read_csv(StringIO(self.data), sep=',', header=None, float_precision=None, engine='python')
\ No newline at end of file

From 465272e11770435cc5b03ec342af19ac1e8bd5a6 Mon Sep 17 00:00:00 2001
From: Camilo Cota <ccota@riplife.es>
Date: Sun, 22 May 2016 17:44:40 +0200
Subject: [PATCH 8/8] Benchmark decimal option in read_csv for c engine

---
 asv_bench/benchmarks/parser_vb.py | 35 ++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)

diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py
index afe6efc9a903d..25fe1bf80a438 100644
--- a/asv_bench/benchmarks/parser_vb.py
+++ b/asv_bench/benchmarks/parser_vb.py
@@ -29,6 +29,19 @@ def setup(self):
     def time_read_csv_default_converter(self):
         read_csv(StringIO(self.data), sep=',', header=None, float_precision=None)
 
+
+class read_csv_default_converter_with_decimal(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n        0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n        0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n        0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n        0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n        '
+        self.data = (self.data * 200)
+
+    def time_read_csv_default_converter_with_decimal(self):
+        read_csv(StringIO(self.data), sep=';', header=None,
+                 float_precision=None, decimal=',')
+
+
 class read_csv_precise_converter(object):
     goal_time = 0.2
 
@@ -111,17 +124,25 @@ def time_read_table_multiple_date_baseline(self):
         read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1])
 
 
-class read_csv_python_engine(object):
+class read_csv_default_converter_python_engine(object):
     goal_time = 0.2
 
     def setup(self):
-        self.data_decimal = '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n        0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n        0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n        0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n        0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n        '
-        self.data_decimal = (self.data_decimal * 200)
         self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n        0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n        0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n        0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n        0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n        '
         self.data = (self.data * 200)
 
-    def time_read_csv_default_converter_with_decimal(self):
-        read_csv(StringIO(self.data_decimal), sep=';', header=None, float_precision=None, decimal=',', engine='python')
-
     def time_read_csv_default_converter(self):
-        read_csv(StringIO(self.data), sep=',', header=None, float_precision=None, engine='python')
\ No newline at end of file
+        read_csv(StringIO(self.data), sep=',', header=None,
+                 float_precision=None, engine='python')
+
+
+class read_csv_default_converter_with_decimal_python_engine(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n        0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n        0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n        0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n        0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n        '
+        self.data = (self.data * 200)
+
+    def time_read_csv_default_converter_with_decimal(self):
+        read_csv(StringIO(self.data), sep=';', header=None,
+                 float_precision=None, decimal=',', engine='python')