diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index e218fdce98380..a46ad2a58e1e4 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -24,6 +24,9 @@ def next(x):
 
 from pandas.util.decorators import Appender
 
+class DateConversionError(Exception):
+    pass
+
 _parser_params = """Also supports optionally iterating or breaking of the file
 into chunks.
 
@@ -155,7 +158,8 @@ def _read(cls, filepath_or_buffer, kwds):
         f = com._get_handle(filepath_or_buffer, 'r', encoding=encoding)
 
     if kwds.get('date_parser', None) is not None:
-        kwds['parse_dates'] = True
+        if isinstance(kwds['parse_dates'], bool):
+            kwds['parse_dates'] = True
 
     # Extract some of the arguments (pass chunksize on).
     kwds.pop('filepath_or_buffer')
@@ -362,8 +366,8 @@ class TextParser(object):
     def __init__(self, f, delimiter=None, names=None, header=0,
                  index_col=None, na_values=None, thousands=None,
                  comment=None, parse_dates=False,
-                 date_parser=None, dayfirst=False, chunksize=None,
-                 skiprows=None, skip_footer=0, converters=None,
+                 date_parser=None, dayfirst=False,
+                 chunksize=None, skiprows=None, skip_footer=0, converters=None,
                  verbose=False, encoding=None):
         """
         Workhorse function for processing nested list into DataFrame
@@ -672,7 +676,6 @@ def get_chunk(self, rows=None):
 
         zipped_content = list(lib.to_object_array(content).T)
 
-        # no index column specified, so infer that's what is wanted
         if self.index_col is not None:
             if np.isscalar(self.index_col):
                 index = zipped_content.pop(self.index_col)
@@ -686,9 +689,8 @@ def get_chunk(self, rows=None):
                     zipped_content.pop(i)
 
             if np.isscalar(self.index_col):
-                if self._should_parse_dates(0):
-                    index = lib.try_parse_dates(index, parser=self.date_parser,
-                                                dayfirst=self.dayfirst)
+                if self._should_parse_dates(self.index_col):
+                    index = self._conv_date(index)
                 index, na_count = _convert_types(index, self.na_values)
                 index = Index(index, name=self.index_name)
                 if self.verbose and na_count:
@@ -696,9 +698,8 @@ def get_chunk(self, rows=None):
             else:
                 arrays = []
                 for i, arr in enumerate(index):
-                    if self._should_parse_dates(i):
-                        arr = lib.try_parse_dates(arr, parser=self.date_parser,
-                                                  dayfirst=self.dayfirst)
+                    if self._should_parse_dates(self.index_col[i]):
+                        arr = self._conv_date(arr)
                     arr, _ = _convert_types(arr, self.na_values)
                     arrays.append(arr)
                 index = MultiIndex.from_arrays(arrays, names=self.index_name)
@@ -736,18 +737,13 @@ def get_chunk(self, rows=None):
                     col = self.columns[col]
                 data[col] = lib.map_infer(data[col], f)
 
-        if not isinstance(self.parse_dates, bool):
-            for x in self.parse_dates:
-                if isinstance(x, int) and x not in data:
-                    x = self.orig_columns[x]
-                if x in self.index_col or x in self.index_name:
-                    continue
-                data[x] = lib.try_parse_dates(data[x], parser=self.date_parser,
-                                              dayfirst=self.dayfirst)
+        columns = self.columns
+        if self.parse_dates is not None:
+            data, columns = self._process_date_conversion(data)
 
         data = _convert_to_ndarrays(data, self.na_values, self.verbose)
 
-        return DataFrame(data=data, columns=self.columns, index=index)
+        return DataFrame(data=data, columns=columns, index=index)
 
     def _find_line_number(self, exp_len, chunk_len, chunk_i):
         if exp_len is None:
@@ -778,6 +774,68 @@ def _should_parse_dates(self, i):
             name = self.index_name[i]
         return i in to_parse or name in to_parse
 
+    def _conv_date(self, *date_cols):
+        if self.date_parser is None:
+            return lib.try_parse_dates(_concat_date_cols(date_cols),
+                                       dayfirst=self.dayfirst)
+        else:
+            try:
+                return self.date_parser(*date_cols)
+            except Exception:
+                return lib.try_parse_dates(_concat_date_cols(date_cols),
+                                           parser=self.date_parser,
+                                           dayfirst=self.dayfirst)
+
+    def _process_date_conversion(self, data_dict):
+        new_cols = []
+        new_data = {}
+        columns = self.columns
+
+        if self.parse_dates is None or isinstance(self.parse_dates, bool):
+            return data_dict, columns
+
+        if isinstance(self.parse_dates, list):
+            # list of column lists
+            for colspec in self.parse_dates:
+                if np.isscalar(colspec):
+                    if isinstance(colspec, int) and colspec not in data_dict:
+                        colspec = self.orig_columns[colspec]
+                    if self._isindex(colspec):
+                        continue
+                    data_dict[colspec] = self._conv_date(data_dict[colspec])
+                else:
+                    new_name, col = _try_convert_dates(self._conv_date, colspec,
+                                                       data_dict,
+                                                       self.orig_columns)
+                    if new_name in data_dict:
+                        raise ValueError('New date column already in dict %s' %
+                                         new_name)
+                    new_data[new_name] = col
+                    new_cols.append(new_name)
+
+        elif isinstance(self.parse_dates, dict):
+            # dict of new name to column list
+            for new_name, colspec in self.parse_dates.iteritems():
+                if new_name in data_dict:
+                    raise ValueError('Date column %s already in dict' %
+                                     new_name)
+
+                _, col = _try_convert_dates(self._conv_date, colspec,
+                                            data_dict, self.orig_columns)
+                new_data[new_name] = col
+                new_cols.append(new_name)
+
+        data_dict.update(new_data)
+        new_cols.extend(columns)
+        return data_dict, new_cols
+
+    def _isindex(self, colspec):
+        return (colspec == self.index_col or
+                (isinstance(self.index_col, list) and
+                 colspec in self.index_col) or
+                (colspec == self.index_name or
+                 (isinstance(self.index_name, list) and
+                  colspec in self.index_name)))
+
     def _get_lines(self, rows=None):
         source = self.data
         lines = self.buf
@@ -860,6 +918,33 @@ def _convert_types(values, na_values):
 
     return result, na_count
 
+def _get_col_names(colspec, columns):
+    colset = set(columns)
+    colnames = []
+    for c in colspec:
+        if c in colset:
+            colnames.append(str(c))
+        elif isinstance(c, int):
+            colnames.append(str(columns[c]))
+    return colnames
+
+def _try_convert_dates(parser, colspec, data_dict, columns):
+    colspec = _get_col_names(colspec, columns)
+    new_name = '_'.join(colspec)
+
+    to_parse = [data_dict[c] for c in colspec if c in data_dict]
+    try:
+        new_col = parser(*to_parse)
+    except DateConversionError:
+        new_col = _concat_date_cols(to_parse)
+    return new_name, new_col
+
+def _concat_date_cols(date_cols):
+    if len(date_cols) == 1:
+        return date_cols[0]
+    concat = lambda x: ' '.join(x)
+    return np.array(np.apply_along_axis(concat, 0, np.vstack(date_cols)),
+                    dtype=object)
+
 class FixedWidthReader(object):
     """
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 02fc25329e7bc..1a8ad3f13ad4e 100644
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -12,6 +12,7 @@
 import numpy as np
 
 from pandas import DataFrame, Index, isnull
+import pandas.io.parsers as parsers
 from pandas.io.parsers import (read_csv, read_table, read_fwf, ExcelFile,
                                TextParser)
 from pandas.util.testing import assert_almost_equal, assert_frame_equal, network
@@ -90,6 +91,58 @@ def test_comment_fwf(self):
                            comment='#')
         assert_almost_equal(df.values, expected)
 
+    def test_multiple_date_col(self):
+        # Can use multiple date parsers
+        data = """\
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+        def func(*date_cols):
+            return lib.try_parse_dates(parsers._concat_date_cols(date_cols))
+
+        df = read_table(StringIO(data), sep=',', header=None,
+                        date_parser=func,
+                        parse_dates={'nominal': [1, 2],
+                                     'actual': [1, 3]})
+        self.assert_('nominal' in df)
+        self.assert_('actual' in df)
+        from datetime import datetime
+        d = datetime(1999, 1, 27, 19, 0)
+        self.assert_(df.ix[0, 'nominal'] == d)
+
+        data = """\
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+        df = read_table(StringIO(data), sep=',', header=None,
+                        parse_dates=[[1, 2], [1, 3]])
+        self.assert_('X.2_X.3' in df)
+        self.assert_('X.2_X.4' in df)
+        d = datetime(1999, 1, 27, 19, 0)
+        self.assert_(df.ix[0, 'X.2_X.3'] == d)
+
+        data = '''\
+KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+'''
+        df = read_table(StringIO(data), sep=',', header=None,
+                        parse_dates=[1], index_col=1)
+        d = datetime(1999, 1, 27, 19, 0)
+        self.assert_(df.index[0] == d)
+
     def test_malformed(self):
         # all
         data = """ignore
diff --git a/vb_suite/parser.py b/vb_suite/parser.py
index 7c2754ca7da07..946e1327578c0 100644
--- a/vb_suite/parser.py
+++ b/vb_suite/parser.py
@@ -50,3 +50,42 @@
                              setup, cleanup="os.remove('test.csv')",
                              start_date=datetime(2012, 5, 7))
+
+setup = common_setup + """
+from pandas import read_table
+from cStringIO import StringIO
+import os
+N = 10000
+K = 8
+data = '''\
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+'''
+data = data * 2000
+"""
+cmd = ("read_table(StringIO(data), sep=',', header=None, "
+       "parse_dates=[[1, 2], [1, 3]])")
+sdate = datetime(2012, 5, 7)
+read_table_multiple_date = Benchmark(cmd, setup, start_date=sdate)
+
+setup = common_setup + """
+from pandas import read_table
+from cStringIO import StringIO
+import os
+N = 10000
+K = 8
+data = '''\
+KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+'''
+data = data * 2000
+"""
+cmd = "read_table(StringIO(data), sep=',', header=None, parse_dates=[1])"
+sdate = datetime(2012, 5, 7)
+read_table_multiple_date_baseline = Benchmark(cmd, setup, start_date=sdate)
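
Usage note (not part of the patch): with this change, the dict form of parse_dates maps a new column name to the source columns that should be combined, and a user-supplied date_parser is called with one array per source column. A minimal sketch against the API as it stands in this patch; the parser name parse_date_time and the trimmed two-row sample are illustrative only:

# Illustrative sketch of the dict form of parse_dates added by this patch.
# parse_date_time is a hypothetical user-defined parser, not part of pandas.
from cStringIO import StringIO
from datetime import datetime

import numpy as np
from pandas.io.parsers import read_table

data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100
KORD,19990127, 20:00:00, 19:56:00, 0.0100
"""

def parse_date_time(dates, times):
    # Receives one array per source column. The time field keeps the
    # leading space left over from the ',' split, which supplies the
    # space in the strptime format below.
    return np.array([datetime.strptime(d + t, '%Y%m%d %H:%M:%S')
                     for d, t in zip(dates, times)], dtype=object)

df = read_table(StringIO(data), sep=',', header=None,
                date_parser=parse_date_time,
                parse_dates={'nominal': [1, 2]})
print df['nominal'][0]  # -> datetime(1999, 1, 27, 19, 0)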
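
When no date_parser is supplied (or the supplied one raises), _conv_date falls back to stitching the raw column values together with spaces and handing the result to lib.try_parse_dates. A standalone sketch of that combination step, mirroring _concat_date_cols above rather than calling any public API:

# Standalone sketch of the default column-combination step.
import numpy as np

dates = np.array(['19990127', '19990127'], dtype=object)
times = np.array(['19:00:00', '20:00:00'], dtype=object)

concat = lambda x: ' '.join(x)
combined = np.array(np.apply_along_axis(concat, 0, np.vstack([dates, times])),
                    dtype=object)
# combined is now ['19990127 19:00:00', '19990127 20:00:00'], a single
# column of strings that lib.try_parse_dates can convert to datetimes.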