From dc7acd1d6f043afbc445b729a39a11bd8e13f454 Mon Sep 17 00:00:00 2001 From: Camilo Cota Date: Sun, 15 May 2016 20:29:06 +0200 Subject: [PATCH 1/8] ENH: support decimal option in PythonParser #12933 --- doc/source/whatsnew/v0.18.2.txt | 2 +- pandas/io/parsers.py | 36 ++++++++++++++++---- pandas/io/tests/parser/c_parser_only.py | 45 ------------------------- pandas/io/tests/parser/common.py | 45 +++++++++++++++++++++++++ 4 files changed, 76 insertions(+), 52 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index fa426aa30bc65..4a393cc24f123 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -31,7 +31,7 @@ Other enhancements - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`) - +- Support decimal option in PythonParser .. _whatsnew_0182.api: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f4527df56db88..93e4d23c000fc 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -348,6 +348,7 @@ def _read(filepath_or_buffer, kwds): 'keep_default_na': True, 'thousands': None, 'comment': None, + 'decimal': b'.', # 'engine': 'c', 'parse_dates': False, @@ -383,7 +384,6 @@ def _read(filepath_or_buffer, kwds): 'error_bad_lines': True, 'warn_bad_lines': True, 'dtype': None, - 'decimal': b'.', 'float_precision': None } @@ -404,7 +404,6 @@ def _read(filepath_or_buffer, kwds): 'error_bad_lines', 'warn_bad_lines', 'dtype', - 'decimal', 'float_precision', ]) @@ -1582,6 +1581,7 @@ def __init__(self, f, **kwds): self.converters = kwds['converters'] self.thousands = kwds['thousands'] + self.decimal = kwds['decimal'] self.comment = kwds['comment'] self._comment_lines = [] @@ -1639,6 +1639,9 @@ def __init__(self, f, **kwds): else: self._no_thousands_columns = None + if len(self.decimal) != 1: + raise ValueError('Only length-1 decimal markers supported') + def _set_no_thousands_columns(self): # Create a set of column ids that are not to be stripped of thousands # operators. @@ -2050,22 +2053,42 @@ def _check_empty(self, lines): def _check_thousands(self, lines): if self.thousands is None: return lines - nonnum = re.compile('[^-^0-9^%s^.]+' % self.thousands) + nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands, self.decimal)) + return self._search_replace_num_columns(lines=lines, + search=self.thousands, + replace='', + nonnum=nonnum) + + def _search_replace_num_columns(self, lines, search, replace, nonnum): ret = [] for l in lines: rl = [] for i, x in enumerate(l): if (not isinstance(x, compat.string_types) or - self.thousands not in x or + search not in x or (self._no_thousands_columns and i in self._no_thousands_columns) or nonnum.search(x.strip())): rl.append(x) else: - rl.append(x.replace(self.thousands, '')) + rl.append(x.replace(search, replace)) ret.append(rl) return ret + def _check_decimal(self, lines): + if self.decimal == b'.': + return lines + + if self.thousands is None: + nonnum = re.compile('[^-^0-9^%s]+' % self.decimal) + else: + nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands, + self.decimal)) + return self._search_replace_num_columns(lines=lines, + search=self.decimal, + replace='.', + nonnum=nonnum) + def _clear_buffer(self): self.buf = [] @@ -2249,7 +2272,8 @@ def _get_lines(self, rows=None): lines = self._check_comments(lines) if self.skip_blank_lines: lines = self._check_empty(lines) - return self._check_thousands(lines) + lines = self._check_thousands(lines) + return self._check_decimal(lines) def _make_date_converter(date_parser=None, dayfirst=False, diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 24c670abe8158..8e44802adf744 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -353,17 +353,6 @@ def test_disable_bool_parsing(self): result = self.read_csv(StringIO(data), dtype=object, na_filter=False) self.assertEqual(result['B'][2], '') - def test_euro_decimal_format(self): - data = """Id;Number1;Number2;Text1;Text2;Number3 -1;1521,1541;187101,9543;ABC;poi;4,738797819 -2;121,12;14897,76;DEF;uyt;0,377320872 -3;878,158;108013,434;GHI;rez;2,735694704""" - - df2 = self.read_csv(StringIO(data), sep=';', decimal=',') - self.assertEqual(df2['Number1'].dtype, float) - self.assertEqual(df2['Number2'].dtype, float) - self.assertEqual(df2['Number3'].dtype, float) - def test_custom_lineterminator(self): data = 'a,b,c~1,2,3~4,5,6' @@ -444,40 +433,6 @@ def test_raise_on_no_columns(self): data = "\n\n\n" self.assertRaises(ValueError, self.read_csv, StringIO(data)) - def test_1000_sep_with_decimal(self): - data = """A|B|C -1|2,334.01|5 -10|13|10. -""" - expected = DataFrame({ - 'A': [1, 10], - 'B': [2334.01, 13], - 'C': [5, 10.] - }) - - tm.assert_equal(expected.A.dtype, 'int64') - tm.assert_equal(expected.B.dtype, 'float') - tm.assert_equal(expected.C.dtype, 'float') - - df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.') - tm.assert_frame_equal(df, expected) - - df = self.read_table(StringIO(data), sep='|', - thousands=',', decimal='.') - tm.assert_frame_equal(df, expected) - - data_with_odd_sep = """A|B|C -1|2.334,01|5 -10|13|10, -""" - df = self.read_csv(StringIO(data_with_odd_sep), - sep='|', thousands='.', decimal=',') - tm.assert_frame_equal(df, expected) - - df = self.read_table(StringIO(data_with_odd_sep), - sep='|', thousands='.', decimal=',') - tm.assert_frame_equal(df, expected) - def test_grow_boundary_at_cap(self): # See gh-12494 # diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 4d9ce922184d9..09bd6ee838d16 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1236,3 +1236,48 @@ def test_iteration_open_handle(self): result = self.read_table(f, squeeze=True, header=None) expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0) tm.assert_series_equal(result, expected) + + def test_1000_sep_with_decimal(self): + data = """A|B|C +1|2,334.01|5 +10|13|10. +""" + expected = DataFrame({ + 'A': [1, 10], + 'B': [2334.01, 13], + 'C': [5, 10.] + }) + + tm.assert_equal(expected.A.dtype, 'int64') + tm.assert_equal(expected.B.dtype, 'float') + tm.assert_equal(expected.C.dtype, 'float') + + df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.') + tm.assert_frame_equal(df, expected) + + df = self.read_table(StringIO(data), sep='|', + thousands=',', decimal='.') + tm.assert_frame_equal(df, expected) + + data_with_odd_sep = """A|B|C +1|2.334,01|5 +10|13|10, +""" + df = self.read_csv(StringIO(data_with_odd_sep), + sep='|', thousands='.', decimal=',') + tm.assert_frame_equal(df, expected) + + df = self.read_table(StringIO(data_with_odd_sep), + sep='|', thousands='.', decimal=',') + tm.assert_frame_equal(df, expected) + + def test_euro_decimal_format(self): + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,738797819 +2;121,12;14897,76;DEF;uyt;0,377320872 +3;878,158;108013,434;GHI;rez;2,735694704""" + + df2 = self.read_csv(StringIO(data), sep=';', decimal=',') + self.assertEqual(df2['Number1'].dtype, float) + self.assertEqual(df2['Number2'].dtype, float) + self.assertEqual(df2['Number3'].dtype, float) From 1472d80bccd29f66bc86d3ee0878f40744a5bf9e Mon Sep 17 00:00:00 2001 From: Camilo Cota Date: Mon, 16 May 2016 18:53:15 +0200 Subject: [PATCH 2/8] Include the issue number in what's new --- doc/source/whatsnew/v0.18.2.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 99cbf11a108c1..8b96ac71924bf 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -38,7 +38,7 @@ Other enhancements idx = pd.Index(["a1a2", "b1", "c1"]) idx.str.extractall("[ab](?P\d)") -- Support decimal option in PythonParser +- Support decimal option in PythonParser (:issue:`12933`) .. _whatsnew_0182.api: From 803356ef42543140c96699ad27091438277e760b Mon Sep 17 00:00:00 2001 From: Camilo Cota Date: Mon, 16 May 2016 18:55:38 +0200 Subject: [PATCH 3/8] set nonnum regex in init method --- pandas/io/parsers.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index acef52db1de49..07b92fd6bfd28 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1642,6 +1642,12 @@ def __init__(self, f, **kwds): if len(self.decimal) != 1: raise ValueError('Only length-1 decimal markers supported') + if self.thousands is None: + self.nonnum = re.compile('[^-^0-9^%s]+' % self.decimal) + else: + self.nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands, + self.decimal)) + def _set_no_thousands_columns(self): # Create a set of column ids that are not to be stripped of thousands # operators. @@ -2053,13 +2059,12 @@ def _check_empty(self, lines): def _check_thousands(self, lines): if self.thousands is None: return lines - nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands, self.decimal)) + return self._search_replace_num_columns(lines=lines, search=self.thousands, - replace='', - nonnum=nonnum) + replace='') - def _search_replace_num_columns(self, lines, search, replace, nonnum): + def _search_replace_num_columns(self, lines, search, replace): ret = [] for l in lines: rl = [] @@ -2068,7 +2073,7 @@ def _search_replace_num_columns(self, lines, search, replace, nonnum): search not in x or (self._no_thousands_columns and i in self._no_thousands_columns) or - nonnum.search(x.strip())): + self.nonnum.search(x.strip())): rl.append(x) else: rl.append(x.replace(search, replace)) @@ -2076,18 +2081,12 @@ def _search_replace_num_columns(self, lines, search, replace, nonnum): return ret def _check_decimal(self, lines): - if self.decimal == b'.': + if self.decimal == _parser_defaults['decimal']: return lines - if self.thousands is None: - nonnum = re.compile('[^-^0-9^%s]+' % self.decimal) - else: - nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands, - self.decimal)) return self._search_replace_num_columns(lines=lines, search=self.decimal, - replace='.', - nonnum=nonnum) + replace='.') def _clear_buffer(self): self.buf = [] From f71509dfb82d2b3dfc2d33abd913a1cc736b07e2 Mon Sep 17 00:00:00 2001 From: Camilo Cota Date: Mon, 16 May 2016 22:23:14 +0200 Subject: [PATCH 4/8] Include descritive what's new line --- doc/source/whatsnew/v0.18.2.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 8b96ac71924bf..020a8dd3a27d6 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -38,7 +38,7 @@ Other enhancements idx = pd.Index(["a1a2", "b1", "c1"]) idx.str.extractall("[ab](?P\d)") -- Support decimal option in PythonParser (:issue:`12933`) +- The ``pd.read_csv()`` with engine='python' has gained support for the decimal option (:issue:`12933`) .. _whatsnew_0182.api: From d8210529ec455934e45bc4558e93a2ce99c24d79 Mon Sep 17 00:00:00 2001 From: Camilo Cota Date: Mon, 16 May 2016 22:27:18 +0200 Subject: [PATCH 5/8] fix test_empty_decimal_marker comment --- pandas/io/tests/parser/common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 09bd6ee838d16..1aef496300ad8 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -41,8 +41,7 @@ def test_empty_decimal_marker(self): 1|2,334|5 10|13|10. """ - # C parser: supports only length-1 decimals - # Python parser: 'decimal' not supported yet + # Parsers support only length-1 decimals self.assertRaises(ValueError, self.read_csv, StringIO(data), decimal='') From 49613fe3590660deaee9274aca701304dd137b09 Mon Sep 17 00:00:00 2001 From: Camilo Cota Date: Tue, 17 May 2016 21:24:30 +0200 Subject: [PATCH 6/8] Assert read_csv error message in test_empty_decimal_marker --- pandas/io/tests/parser/common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 1aef496300ad8..57ab9477302c1 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -42,8 +42,9 @@ def test_empty_decimal_marker(self): 10|13|10. """ # Parsers support only length-1 decimals - self.assertRaises(ValueError, self.read_csv, - StringIO(data), decimal='') + msg = 'Only length-1 decimal markers supported' + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(data), decimal='') def test_read_csv(self): if not compat.PY3: From dc8ca622ec771a1c6688ad065f1606c2d9439fb2 Mon Sep 17 00:00:00 2001 From: Camilo Cota Date: Wed, 18 May 2016 21:51:16 +0200 Subject: [PATCH 7/8] fix test_empty_decimal_marker comment --- asv_bench/benchmarks/parser_vb.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py index 18cd4de6cc9c5..afe6efc9a903d 100644 --- a/asv_bench/benchmarks/parser_vb.py +++ b/asv_bench/benchmarks/parser_vb.py @@ -29,7 +29,6 @@ def setup(self): def time_read_csv_default_converter(self): read_csv(StringIO(self.data), sep=',', header=None, float_precision=None) - class read_csv_precise_converter(object): goal_time = 0.2 @@ -109,4 +108,20 @@ def setup(self): self.data = (self.data * 200) def time_read_table_multiple_date_baseline(self): - read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1]) \ No newline at end of file + read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1]) + + +class read_csv_python_engine(object): + goal_time = 0.2 + + def setup(self): + self.data_decimal = '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n 0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n 0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n 0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n 0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n ' + self.data_decimal = (self.data_decimal * 200) + self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' + self.data = (self.data * 200) + + def time_read_csv_default_converter_with_decimal(self): + read_csv(StringIO(self.data_decimal), sep=';', header=None, float_precision=None, decimal=',', engine='python') + + def time_read_csv_default_converter(self): + read_csv(StringIO(self.data), sep=',', header=None, float_precision=None, engine='python') \ No newline at end of file From 465272e11770435cc5b03ec342af19ac1e8bd5a6 Mon Sep 17 00:00:00 2001 From: Camilo Cota Date: Sun, 22 May 2016 17:44:40 +0200 Subject: [PATCH 8/8] Benchmark decimal option in read_csv for c engine --- asv_bench/benchmarks/parser_vb.py | 35 ++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py index afe6efc9a903d..25fe1bf80a438 100644 --- a/asv_bench/benchmarks/parser_vb.py +++ b/asv_bench/benchmarks/parser_vb.py @@ -29,6 +29,19 @@ def setup(self): def time_read_csv_default_converter(self): read_csv(StringIO(self.data), sep=',', header=None, float_precision=None) + +class read_csv_default_converter_with_decimal(object): + goal_time = 0.2 + + def setup(self): + self.data = '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n 0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n 0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n 0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n 0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n ' + self.data = (self.data * 200) + + def time_read_csv_default_converter_with_decimal(self): + read_csv(StringIO(self.data), sep=';', header=None, + float_precision=None, decimal=',') + + class read_csv_precise_converter(object): goal_time = 0.2 @@ -111,17 +124,25 @@ def time_read_table_multiple_date_baseline(self): read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1]) -class read_csv_python_engine(object): +class read_csv_default_converter_python_engine(object): goal_time = 0.2 def setup(self): - self.data_decimal = '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n 0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n 0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n 0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n 0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n ' - self.data_decimal = (self.data_decimal * 200) self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' self.data = (self.data * 200) - def time_read_csv_default_converter_with_decimal(self): - read_csv(StringIO(self.data_decimal), sep=';', header=None, float_precision=None, decimal=',', engine='python') - def time_read_csv_default_converter(self): - read_csv(StringIO(self.data), sep=',', header=None, float_precision=None, engine='python') \ No newline at end of file + read_csv(StringIO(self.data), sep=',', header=None, + float_precision=None, engine='python') + + +class read_csv_default_converter_with_decimal_python_engine(object): + goal_time = 0.2 + + def setup(self): + self.data = '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n 0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n 0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n 0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n 0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n ' + self.data = (self.data * 200) + + def time_read_csv_default_converter_with_decimal(self): + read_csv(StringIO(self.data), sep=';', header=None, + float_precision=None, decimal=',', engine='python')