BUG: Conflict between thousands sep and date parser.

guyrt · guyrt · commit c6bf2eb01030 · 2013-09-23T15:26:20.000-04:00
Fixes an issue where the thousands separator could conflict with date parsing. The fix applies only to the C parser; the Python parser is not yet supported. Closes issue #4678.
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -447,6 +447,7 @@ Bug Fixes
   - Fixed a bug in ``convert_objects`` for > 2 ndims (:issue:`4937`)
   - Fixed a bug in DataFrame/Panel cache insertion and subsequent indexing (:issue:`4939`)
   - Fixed string methods for ``FrozenNDArray`` and ``FrozenList`` (:issue:`4929`)
+  - Fixed conflict between thousands separator and date parser in csv_parser (:issue:`4678`)
 
 pandas 0.12.0
 -------------
diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py
@@ -26,7 +26,7 @@ def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col,
     minute_col = _maybe_cast(minute_col)
     second_col = _maybe_cast(second_col)
     return lib.try_parse_datetime_components(year_col, month_col, day_col,
-                                             hour_col, minute_col, second_col)
+                    hour_col, minute_col, second_col)
 
 
 def generic_parser(parse_func, *cols):
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1020,6 +1020,14 @@ def _set(x):
                 else:
                     _set(val)
 
+        elif isinstance(self.parse_dates, dict):
+            for val in self.parse_dates.values():
+                if isinstance(val, list):
+                    for k in val:
+                        _set(k)
+                else:
+                    _set(val)
+
     def set_error_bad_lines(self, status):
         self._reader.set_error_bad_lines(int(status))
 
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -233,6 +233,18 @@ def test_1000_sep_with_decimal(self):
         df = self.read_table(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',')
         tm.assert_frame_equal(df, expected)
 
+    def test_separator_date_conflict(self):
+        # Regression test for issue #4678: make sure thousands separator and
+        # date parsing do not conflict.
+        data = '06-02-2013;13:00;1-000.215'
+        expected = DataFrame(
+            [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]],
+            columns=['Date', 2]
+        )
+
+        df = self.read_csv(StringIO(data), sep=';', thousands='-', parse_dates={'Date': [0, 1]}, header=None)
+        tm.assert_frame_equal(df, expected)
+
     def test_squeeze(self):
         data = """\
 a,1
@@ -1940,6 +1952,9 @@ def test_1000_sep_with_decimal(self):
         df = self.read_table(StringIO(data), sep='|', thousands=',')
         tm.assert_frame_equal(df, expected)
 
+    def test_separator_date_conflict(self):
+        raise nose.SkipTest("Not supported in Python parser.")
+
     def test_comment_fwf(self):
         data = """
   1   2.   4  #hello world
diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx
@@ -708,20 +708,22 @@ def try_parse_datetime_components(ndarray[object] years,
         Py_ssize_t i, n
         ndarray[object] result
         int secs
+        double float_secs
         double micros
 
     from datetime import datetime
 
     n = len(years)
-    if (len(months) != n and len(days) != n and len(hours) != n and
-        len(minutes) != n and len(seconds) != n):
+    if (len(months) != n or len(days) != n or len(hours) != n or
+        len(minutes) != n or len(seconds) != n):
         raise ValueError('Length of all datetime components must be equal')
     result = np.empty(n, dtype='O')
 
     for i from 0 <= i < n:
-        secs = int(seconds[i])
+        float_secs = float(seconds[i])
+        secs = int(float_secs)
 
-        micros = seconds[i] - secs
+        micros = float_secs - secs
         if micros > 0:
             micros = micros * 1000000