From 3080721f1bfa4b3688b0e050f80521339ac9f8dc Mon Sep 17 00:00:00 2001 From: Jeffrey Tratner Date: Thu, 11 Jul 2013 20:51:59 -0400 Subject: [PATCH] ENH: Use case-insensitive checks for 'inf' in parser. Instead of just 'inf' or '-inf', can test for 'Inf', '-Inf', 'INF', etc. Uses strcasecmp under the hood. (also, small fix to assert_almost_equal to make string comparisons clearer) --- doc/source/release.rst | 3 +++ doc/source/v0.13.0.txt | 3 +++ pandas/io/tests/test_parsers.py | 17 +++++++++++++++-- pandas/parser.pyx | 10 +++++----- pandas/util/testing.py | 2 +- 5 files changed, 27 insertions(+), 8 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 478e7375b0b30..496294fd86b19 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -41,6 +41,9 @@ pandas 0.13 - ``read_excel`` now supports an integer in its ``sheetname`` argument giving the index of the sheet to read in (:issue:`4301`). - Added a test for ``read_clipboard()`` and ``to_clipboard()`` (:issue:`4282`) + - Text parser now treats anything that reads like inf ("inf", "Inf", "-Inf", + "iNf", etc.) as infinity (:issue:`4220`, :issue:`4219`), affecting + ``read_table``, ``read_csv``, etc. **API Changes** diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index 24d1b30d470ee..5af972ad4253c 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -11,6 +11,9 @@ API changes - ``read_excel`` now supports an integer in its ``sheetname`` argument giving the index of the sheet to read in (:issue:`4301`). + - Text parser now treats anything that reads like inf ("inf", "Inf", "-Inf", + "iNf", etc.) as infinity (:issue:`4220`, :issue:`4219`), affecting + ``read_table``, ``read_csv``, etc. 
Enhancements ~~~~~~~~~~~~ diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 784d650a524a7..b88b1ab776ab4 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -167,9 +167,22 @@ def test_inf_parsing(self): data = """\ ,A a,inf -b,-inf""" +b,-inf +c,Inf +d,-Inf +e,INF +f,-INF +g,INf +h,-INf +i,inF +j,-inF""" + inf = float('inf') + expected = Series([inf, -inf] * 5) df = read_csv(StringIO(data), index_col=0) - self.assertTrue(np.isinf(np.abs(df['A'])).all()) + assert_almost_equal(df['A'].values, expected.values) + df = read_csv(StringIO(data), index_col=0, na_filter=False) + print df['A'].values + assert_almost_equal(df['A'].values, expected.values) def test_multiple_date_col(self): # Can use multiple date parsers diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 185cf1a752803..71d2e1c1e5381 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -3,7 +3,7 @@ from libc.stdio cimport fopen, fclose from libc.stdlib cimport malloc, free -from libc.string cimport strncpy, strlen, strcmp +from libc.string cimport strncpy, strlen, strcmp, strcasecmp cimport libc.stdio as stdio from cpython cimport (PyObject, PyBytes_FromString, @@ -1399,9 +1399,9 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, else: error = to_double(word, data, parser.sci, parser.decimal) if error != 1: - if strcmp(word, cinf) == 0: + if strcasecmp(word, cinf) == 0: data[0] = INF - elif strcmp(word, cneginf) == 0: + elif strcasecmp(word, cneginf) == 0: data[0] = NEGINF else: return None, None @@ -1415,9 +1415,9 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, word = COLITER_NEXT(it) error = to_double(word, data, parser.sci, parser.decimal) if error != 1: - if strcmp(word, cinf) == 0: + if strcasecmp(word, cinf) == 0: data[0] = INF - elif strcmp(word, cneginf) == 0: + elif strcasecmp(word, cneginf) == 0: data[0] = NEGINF else: return None, None diff --git 
a/pandas/util/testing.py b/pandas/util/testing.py index 0dc80f59e4699..275853d4533c9 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -126,7 +126,7 @@ def assert_almost_equal(a, b, check_less_precise = False): return assert_dict_equal(a, b) if isinstance(a, basestring): - assert a == b, "%s != %s" % (a, b) + assert a == b, "%r != %r" % (a, b) return True if isiterable(a):