Skip to content

Commit 0922599

Browse files
committed
BUG: fixes issue pandas-dev#4322
Adds support for the thousands character in csv parser for floats. Updated docs to reflect bug fix.
1 parent cba88ed commit 0922599

File tree

7 files changed

+121
-25
lines changed

7 files changed

+121
-25
lines changed

doc/source/io.rst

+11-9
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,11 @@ They can take a number of arguments:
118118
date_converters.py
119119
- ``dayfirst``: if True then uses the DD/MM international/European date format
120120
(This is False by default)
121-
- ``thousands``: sepcifies the thousands separator. If not None, then parser
122-
will try to look for it in the output and parse relevant data to integers.
123-
Because it has to essentially scan through the data again, this causes a
121+
- ``thousands``: specifies the thousands separator. If not None, this character will
122+
be stripped from numeric dtypes. However, if it is the first character in a field,
123+
that column will be imported as a string. In the PythonParser, if not None,
124+
then the parser will try to look for it in the output and parse relevant data to numeric
125+
dtypes. Because it has to essentially scan through the data again, this causes a
124126
significant performance hit so only use if necessary.
125127
- ``lineterminator`` : string (length 1), default ``None``, Character to break file into lines. Only valid with C parser
126128
- ``quotechar`` : string, The character to used to denote the start and end of a quoted item.
@@ -506,8 +508,8 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided:
506508

507509
Thousand Separators
508510
~~~~~~~~~~~~~~~~~~~
509-
For large integers that have been written with a thousands separator, you can
510-
set the ``thousands`` keyword to ``True`` so that integers will be parsed
511+
For large numbers that have been written with a thousands separator, you can
512+
set the ``thousands`` keyword to a string of length 1 so that integers will be parsed
511513
correctly:
512514

513515
.. ipython:: python
@@ -521,7 +523,7 @@ correctly:
521523
with open('tmp.csv', 'w') as fh:
522524
fh.write(data)
523525
524-
By default, integers with a thousands separator will be parsed as strings
526+
By default, numbers with a thousands separator will be parsed as strings
525527

526528
.. ipython:: python
527529
@@ -1123,7 +1125,7 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series``
11231125
- ``numpy`` : direct decoding to numpy arrays. default is False;
11241126
Note that the JSON ordering **MUST** be the same for each term if ``numpy=True``
11251127
- ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality
1126-
- ``date_unit`` : string, the timestamp unit to detect if converting dates. Default
1128+
- ``date_unit`` : string, the timestamp unit to detect if converting dates. Default
11271129
None. By default the timestamp precision will be detected, if this is not desired
11281130
then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision to
11291131
seconds, milliseconds, microseconds or nanoseconds respectively.
@@ -1201,11 +1203,11 @@ nanoseconds
12011203
dfju
12021204
12031205
# Let Pandas detect the correct precision
1204-
dfju = pd.read_json(json)
1206+
dfju = pd.read_json(json)
12051207
dfju
12061208
12071209
# Or specify that all timestamps are in nanoseconds
1208-
dfju = pd.read_json(json, date_unit='ns')
1210+
dfju = pd.read_json(json, date_unit='ns')
12091211
dfju
12101212
12111213
.. ipython:: python

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,8 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
272272
- Fix selection with ``ix/loc`` and non_unique selectors (:issue:`4619`)
273273
- Fix assignment with iloc/loc involving a dtype change in an existing column (:issue:`4312`)
274274
have internal setitem_with_indexer in core/indexing to use Block.setitem
275+
- Fixed bug where thousands separator was not handled correctly for floating point numbers
276+
in csv_import (:issue:`4322`)
275277

276278
pandas 0.12
277279
===========

doc/source/v0.13.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,9 @@ Bug Fixes
278278

279279
- Suppressed DeprecationWarning associated with internal calls issued by repr() (:issue:`4391`)
280280

281+
- Fixed bug where thousands separator was not handled correctly for floating point numbers
282+
in csv_import (:issue:`4322`)
283+
281284
See the :ref:`full release notes
282285
<release>` or issue tracker
283286
on GitHub for a complete list.

pandas/io/tests/test_parsers.py

+87-5
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
import pandas.io.parsers as parsers
2020
from pandas.io.parsers import (read_csv, read_table, read_fwf,
2121
TextFileReader, TextParser)
22-
from pandas.util.testing import (assert_almost_equal,
22+
from pandas.util.testing import (assert_equal,
23+
assert_almost_equal,
2324
assert_series_equal,
2425
makeCustomDataframe as mkdf,
2526
network,
@@ -67,6 +68,35 @@ def setUp(self):
6768
self.csv2 = os.path.join(self.dirpath, 'test2.csv')
6869
self.xls1 = os.path.join(self.dirpath, 'test.xls')
6970

71+
def test_multi_character_decimal_marker(self):
72+
data = """A|B|C
73+
1|2,334|5
74+
10|13|10.
75+
"""
76+
self.assertRaises(ValueError, read_csv, StringIO(data), decimal=',,')
77+
78+
def test_empty_decimal_marker(self):
79+
data = """A|B|C
80+
1|2,334|5
81+
10|13|10.
82+
"""
83+
self.assertRaises(ValueError, read_csv, StringIO(data), decimal='')
84+
85+
def test_empty_thousands_marker(self):
86+
data = """A|B|C
87+
1|2,334|5
88+
10|13|10.
89+
"""
90+
self.assertRaises(ValueError, read_csv, StringIO(data), thousands='')
91+
92+
93+
def test_multi_character_thousands_marker(self):
94+
data = """A|B|C
95+
1|2,334|5
96+
10|13|10.
97+
"""
98+
self.assertRaises(ValueError, read_csv, StringIO(data), thousands=',,')
99+
70100
def test_empty_string(self):
71101
data = """\
72102
One,Two,Three
@@ -164,14 +194,48 @@ def test_1000_sep(self):
164194
1|2,334|5
165195
10|13|10.
166196
"""
167-
expected = [[1, 2334., 5],
168-
[10, 13, 10]]
197+
expected = DataFrame({
198+
'A': [1, 10],
199+
'B': [2334, 13],
200+
'C': [5, 10.]
201+
})
169202

170203
df = self.read_csv(StringIO(data), sep='|', thousands=',')
171-
assert_almost_equal(df.values, expected)
204+
tm.assert_frame_equal(df, expected)
172205

173206
df = self.read_table(StringIO(data), sep='|', thousands=',')
174-
assert_almost_equal(df.values, expected)
207+
tm.assert_frame_equal(df, expected)
208+
209+
def test_1000_sep_with_decimal(self):
210+
data = """A|B|C
211+
1|2,334.01|5
212+
10|13|10.
213+
"""
214+
expected = DataFrame({
215+
'A': [1, 10],
216+
'B': [2334.01, 13],
217+
'C': [5, 10.]
218+
})
219+
220+
assert_equal(expected.A.dtype, 'int64')
221+
assert_equal(expected.B.dtype, 'float')
222+
assert_equal(expected.C.dtype, 'float')
223+
224+
df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
225+
tm.assert_frame_equal(df, expected)
226+
227+
df = self.read_table(StringIO(data), sep='|', thousands=',', decimal='.')
228+
tm.assert_frame_equal(df, expected)
229+
230+
data_with_odd_sep = """A|B|C
231+
1|2.334,01|5
232+
10|13|10,
233+
"""
234+
df = self.read_csv(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',')
235+
tm.assert_frame_equal(df, expected)
236+
237+
df = self.read_table(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',')
238+
tm.assert_frame_equal(df, expected)
175239

176240
def test_squeeze(self):
177241
data = """\
@@ -1862,6 +1926,24 @@ def test_1000_fwf(self):
18621926
thousands=',')
18631927
assert_almost_equal(df.values, expected)
18641928

1929+
def test_1000_sep_with_decimal(self):
1930+
data = """A|B|C
1931+
1|2,334.01|5
1932+
10|13|10.
1933+
"""
1934+
1935+
expected = DataFrame({
1936+
'A': [1, 10],
1937+
'B': [2334.01, 13],
1938+
'C': [5, 10.]
1939+
})
1940+
1941+
df = self.read_csv(StringIO(data), sep='|', thousands=',')
1942+
tm.assert_frame_equal(df, expected)
1943+
1944+
df = self.read_table(StringIO(data), sep='|', thousands=',')
1945+
tm.assert_frame_equal(df, expected)
1946+
18651947
def test_comment_fwf(self):
18661948
data = """
18671949
1 2. 4 #hello world

pandas/parser.pyx

+4-4
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ cdef extern from "parser/tokenizer.h":
186186
uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)
187187

188188
inline int to_double(char *item, double *p_value,
189-
char sci, char decimal)
189+
char sci, char decimal, char thousands)
190190
inline int to_complex(char *item, double *p_real,
191191
double *p_imag, char sci, char decimal)
192192
inline int to_longlong(char *item, long long *p_value)
@@ -355,7 +355,7 @@ cdef class TextReader:
355355

356356
if thousands is not None:
357357
if len(thousands) != 1:
358-
raise ValueError('Only length-1 decimal markers supported')
358+
raise ValueError('Only length-1 thousands markers supported')
359359
self.parser.thousands = ord(thousands)
360360

361361
if escapechar is not None:
@@ -1397,7 +1397,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
13971397
na_count += 1
13981398
data[0] = NA
13991399
else:
1400-
error = to_double(word, data, parser.sci, parser.decimal)
1400+
error = to_double(word, data, parser.sci, parser.decimal, parser.thousands)
14011401
if error != 1:
14021402
if strcasecmp(word, cinf) == 0:
14031403
data[0] = INF
@@ -1413,7 +1413,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
14131413
else:
14141414
for i in range(lines):
14151415
word = COLITER_NEXT(it)
1416-
error = to_double(word, data, parser.sci, parser.decimal)
1416+
error = to_double(word, data, parser.sci, parser.decimal, parser.thousands)
14171417
if error != 1:
14181418
if strcasecmp(word, cinf) == 0:
14191419
data[0] = INF

pandas/src/parser/tokenizer.c

+13-6
Original file line numberDiff line numberDiff line change
@@ -1633,7 +1633,7 @@ void test_count_lines(char *fname) {
16331633

16341634

16351635
// forward declaration
1636-
static double xstrtod(const char *p, char **q, char decimal, char sci, int skip_trailing);
1636+
static double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing);
16371637

16381638

16391639
P_INLINE void lowercase(char *p) {
@@ -1661,11 +1661,11 @@ P_INLINE void uppercase(char *p) {
16611661
*
16621662
*/
16631663

1664-
int to_double(char *item, double *p_value, char sci, char decimal)
1664+
int to_double(char *item, double *p_value, char sci, char decimal, char tsep)
16651665
{
16661666
char *p_end;
16671667

1668-
*p_value = xstrtod(item, &p_end, decimal, sci, TRUE);
1668+
*p_value = xstrtod(item, &p_end, decimal, sci, tsep, TRUE);
16691669

16701670
return (errno == 0) && (!*p_end);
16711671
}
@@ -1675,7 +1675,7 @@ int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, ch
16751675
{
16761676
char *p_end;
16771677

1678-
*p_real = xstrtod(item, &p_end, decimal, sci, FALSE);
1678+
*p_real = xstrtod(item, &p_end, decimal, sci, '\0', FALSE);
16791679
if (*p_end == '\0') {
16801680
*p_imag = 0.0;
16811681
return errno == 0;
@@ -1689,7 +1689,7 @@ int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, ch
16891689
if (*p_end == '+') {
16901690
++p_end;
16911691
}
1692-
*p_imag = xstrtod(p_end, &p_end, decimal, sci, FALSE);
1692+
*p_imag = xstrtod(p_end, &p_end, decimal, sci, '\0', FALSE);
16931693
if (errno || ((*p_end != 'i') && (*p_end != 'j'))) {
16941694
return FALSE;
16951695
}
@@ -1856,10 +1856,12 @@ int main(int argc, char *argv[])
18561856
// * Added decimal and sci arguments.
18571857
// * Skip trailing spaces.
18581858
// * Commented out the other functions.
1859+
// Modifications by Richard T Guy, August 2013:
1860+
// * Add tsep argument for thousands separator
18591861
//
18601862

18611863
static double xstrtod(const char *str, char **endptr, char decimal,
1862-
char sci, int skip_trailing)
1864+
char sci, char tsep, int skip_trailing)
18631865
{
18641866
double number;
18651867
int exponent;
@@ -1894,6 +1896,11 @@ static double xstrtod(const char *str, char **endptr, char decimal,
18941896
number = number * 10. + (*p - '0');
18951897
p++;
18961898
num_digits++;
1899+
1900+
if (tsep != '\0' && *p == tsep)
1901+
{
1902+
++p;
1903+
}
18971904
}
18981905

18991906
// Process decimal part

pandas/src/parser/tokenizer.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min,
255255
int64_t int_max, int *error, char tsep);
256256
uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error);
257257

258-
int P_INLINE to_double(char *item, double *p_value, char sci, char decimal);
258+
int P_INLINE to_double(char *item, double *p_value, char sci, char decimal, char tsep);
259259
int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal);
260260
int P_INLINE to_longlong(char *item, long long *p_value);
261261
int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep);

0 commit comments

Comments
 (0)