# Patch: stop the thousands separator from corrupting date columns in the
# Python CSV parser (pandas issue 4678).
#
# Summary of the hunks below:
#   * doc/source/release.rst          - adds the Bug Fixes entry for the issue.
#   * pandas/io/date_converters.py    - whitespace-only re-indent of one
#                                       continuation line (tokens unchanged).
#   * pandas/io/parsers.py            - the no-convert column loop also accepts
#                                       dict-valued parse_dates; the Python
#                                       parser records the column indices named
#                                       in parse_dates and _check_thousands
#                                       leaves those columns untouched;
#                                       _next_line no longer calls
#                                       _check_thousands; an unused local
#                                       (row_num) is dropped.
#   * pandas/io/tests/test_parsers.py - regression test for the conflict.
#   * pandas/src/inference.pyx        - component length check uses `or`
#                                       instead of `and`; seconds go through
#                                       float() before int().
#
# NOTE(review): this file was recovered from a copy whose newlines and
# indentation had been collapsed. Line breaks were restored so that every hunk
# matches its @@ header counts; indentation was restored from the code
# structure. The column widths in the date_converters.py hunk, the bullet
# indent in release.rst and the continuation indent in _check_thousands are
# assumptions - confirm against the target tree before applying.
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 75097ee50e8c1..cd0a2ba6e3884 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -447,6 +447,7 @@ Bug Fixes
   - Fixed a bug in ``convert_objects`` for > 2 ndims (:issue:`4937`)
   - Fixed a bug in DataFrame/Panel cache insertion and subsequent indexing (:issue:`4939`)
   - Fixed string methods for ``FrozenNDArray`` and ``FrozenList`` (:issue:`4929`)
+  - Fixed conflict between thousands separator and date parser in csv_parser (:issue:`4678`)
 
 pandas 0.12.0
 -------------
diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py
index 2be477f49e28b..5c99ab4d0a664 100644
--- a/pandas/io/date_converters.py
+++ b/pandas/io/date_converters.py
@@ -26,7 +26,7 @@ def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col,
     minute_col = _maybe_cast(minute_col)
     second_col = _maybe_cast(second_col)
     return lib.try_parse_datetime_components(year_col, month_col, day_col,
-                                              hour_col, minute_col, second_col)
+                                             hour_col, minute_col, second_col)
 
 
 def generic_parser(parse_func, *cols):
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 7b9347a821fad..c109508ea2c19 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1020,6 +1020,14 @@ def _set(x):
                 else:
                     _set(val)
 
+        elif isinstance(self.parse_dates, dict):
+            for val in self.parse_dates.values():
+                if isinstance(val, list):
+                    for k in val:
+                        _set(k)
+                else:
+                    _set(val)
+
     def set_error_bad_lines(self, status):
         self._reader.set_error_bad_lines(int(status))
 
@@ -1269,6 +1277,7 @@ def __init__(self, f, **kwds):
             self._make_reader(f)
         else:
             self.data = f
+
         self.columns = self._infer_columns()
 
         # we are processing a multi index column
@@ -1292,6 +1301,38 @@ def __init__(self, f, **kwds):
         self.index_names = index_names
         self._first_chunk = True
 
+        if self.parse_dates:
+            self._no_thousands_columns = self._set_no_thousands_columns()
+        else:
+            self._no_thousands_columns = None
+
+    def _set_no_thousands_columns(self):
+        # Create a set of column ids that are not to be stripped of thousands operators.
+        noconvert_columns = set()
+
+        def _set(x):
+            if com.is_integer(x):
+                noconvert_columns.add(x)
+            else:
+                noconvert_columns.add(self.columns.index(x))
+
+        if isinstance(self.parse_dates, list):
+            for val in self.parse_dates:
+                if isinstance(val, list):
+                    for k in val:
+                        _set(k)
+                else:
+                    _set(val)
+
+        elif isinstance(self.parse_dates, dict):
+            for val in self.parse_dates.values():
+                if isinstance(val, list):
+                    for k in val:
+                        _set(k)
+                else:
+                    _set(val)
+        return noconvert_columns
+
     def _make_reader(self, f):
         sep = self.delimiter
 
@@ -1500,7 +1541,6 @@ def _next_line(self):
             line = next(self.data)
 
         line = self._check_comments([line])[0]
-        line = self._check_thousands([line])[0]
 
         self.pos += 1
         self.buf.append(line)
@@ -1532,9 +1572,10 @@ def _check_thousands(self, lines):
         ret = []
         for l in lines:
             rl = []
-            for x in l:
+            for i, x in enumerate(l):
                 if (not isinstance(x, compat.string_types) or
                     self.thousands not in x or
+                    (self._no_thousands_columns and i in self._no_thousands_columns) or
                     nonnum.search(x.strip())):
                     rl.append(x)
                 else:
@@ -1608,7 +1649,6 @@ def _rows_to_cols(self, content):
             raise AssertionError()
 
         if col_len != zip_len and self.index_col is not False:
-            row_num = -1
             i = 0
             for (i, l) in enumerate(content):
                 if len(l) != col_len:
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index fb2b3fdd33bf1..6c32224dc7808 100644
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -233,6 +233,18 @@ def test_1000_sep_with_decimal(self):
         df = self.read_table(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',')
         tm.assert_frame_equal(df, expected)
 
+    def test_separator_date_conflict(self):
+        # Regression test for issue #4678: make sure thousands separator and
+        # date parsing do not conflict.
+        data = '06-02-2013;13:00;1-000.215'
+        expected = DataFrame(
+            [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]],
+            columns=['Date', 2]
+        )
+
+        df = self.read_csv(StringIO(data), sep=';', thousands='-', parse_dates={'Date': [0, 1]}, header=None)
+        tm.assert_frame_equal(df, expected)
+
     def test_squeeze(self):
         data = """\
 a,1
diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx
index e0bbc1a4e64c1..2a3f85b550a7c 100644
--- a/pandas/src/inference.pyx
+++ b/pandas/src/inference.pyx
@@ -708,20 +708,22 @@ def try_parse_datetime_components(ndarray[object] years,
         Py_ssize_t i, n
         ndarray[object] result
         int secs
+        double float_secs
         double micros
 
     from datetime import datetime
 
     n = len(years)
-    if (len(months) != n and len(days) != n and len(hours) != n and
-        len(minutes) != n and len(seconds) != n):
+    if (len(months) != n or len(days) != n or len(hours) != n or
+        len(minutes) != n or len(seconds) != n):
         raise ValueError('Length of all datetime components must be equal')
     result = np.empty(n, dtype='O')
 
     for i from 0 <= i < n:
-        secs = int(seconds[i])
+        float_secs = float(seconds[i])
+        secs = int(float_secs)
 
-        micros = seconds[i] - secs
+        micros = float_secs - secs
         if micros > 0:
             micros = micros * 1000000
 