
Commit af25751

Chang She authored and wesm committed
ENH: convert multiple text file columns to a single date column #1186
1 parent 0325989 commit af25751
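
For orientation, here is a minimal usage sketch of the date_conversion keyword this commit adds, pieced together from the diff and its test below. The sample rows and the result name 'nominal' come from the test; treat this as a sketch of the API at this revision, not of current pandas.

# Sketch only: combine text columns 1 (date) and 2 (time) into a single
# datetime column named 'nominal'; new columns are prepended to the output.
from StringIO import StringIO            # Python 2, as in the test suite
from pandas.io.parsers import read_table

data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100
KORD,19990127, 20:00:00, 19:56:00, 0.0100
"""
df = read_table(StringIO(data), sep=',', header=None,
                date_conversion={'nominal': [1, 2]})   # dict: result name -> source columns
assert 'nominal' in df.columns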


2 files changed: +127 -3 lines changed


pandas/io/parsers.py

Lines changed: 87 additions & 3 deletions
@@ -24,6 +24,9 @@ def next(x):
 
 from pandas.util.decorators import Appender
 
+class DateConversionError(Exception):
+    pass
+
 _parser_params = """Also supports optionally iterating or breaking of the file
 into chunks.
 
@@ -51,6 +54,9 @@ def next(x):
 date_parser : function
     Function to use for converting dates to strings. Defaults to
     dateutil.parser
+date_conversion : list or dict, default None
+    Can combine multiple columns in date-time specification
+    Newly created columns are prepended to the output
 dayfirst : boolean, default False
     DD/MM format dates, international and European format
 thousands : str, default None
@@ -186,6 +192,7 @@ def read_csv(filepath_or_buffer,
              parse_dates=False,
              dayfirst=False,
              date_parser=None,
+             date_conversion=None,
              nrows=None,
              iterator=False,
              chunksize=None,
@@ -216,6 +223,7 @@ def read_table(filepath_or_buffer,
                parse_dates=False,
                dayfirst=False,
                date_parser=None,
+               date_conversion=None,
                nrows=None,
                iterator=False,
                chunksize=None,
@@ -250,6 +258,7 @@ def read_fwf(filepath_or_buffer,
              parse_dates=False,
              dayfirst=False,
              date_parser=None,
+             date_conversion=None,
              nrows=None,
              iterator=False,
              chunksize=None,
@@ -351,6 +360,7 @@ class TextParser(object):
         Comment out remainder of line
     parse_dates : boolean, default False
     date_parser : function, default None
+    date_conversion : list or dict, default None
     skiprows : list of integers
         Row numbers to skip
     skip_footer : int
@@ -362,8 +372,8 @@ class TextParser(object):
     def __init__(self, f, delimiter=None, names=None, header=0,
                  index_col=None, na_values=None, thousands=None,
                  comment=None, parse_dates=False,
-                 date_parser=None, dayfirst=False, chunksize=None,
-                 skiprows=None, skip_footer=0, converters=None,
+                 date_parser=None, date_conversion=None, dayfirst=False,
+                 chunksize=None, skiprows=None, skip_footer=0, converters=None,
                  verbose=False, encoding=None):
         """
         Workhorse function for processing nested list into DataFrame
@@ -382,6 +392,7 @@ def __init__(self, f, delimiter=None, names=None, header=0,
 
         self.parse_dates = parse_dates
         self.date_parser = date_parser
+        self.date_conversion = date_conversion
         self.dayfirst = dayfirst
 
         if com.is_integer(skiprows):
@@ -745,9 +756,11 @@ def get_chunk(self, rows=None):
                 data[x] = lib.try_parse_dates(data[x], parser=self.date_parser,
                                               dayfirst=self.dayfirst)
 
+        data, columns = self._process_date_conversion(data, self.columns)
+
         data = _convert_to_ndarrays(data, self.na_values, self.verbose)
 
-        return DataFrame(data=data, columns=self.columns, index=index)
+        return DataFrame(data=data, columns=columns, index=index)
 
     def _find_line_number(self, exp_len, chunk_len, chunk_i):
         if exp_len is None:
@@ -778,6 +791,52 @@ def _should_parse_dates(self, i):
                 name = self.index_name[i]
             return i in to_parse or name in to_parse
 
+    def _process_date_conversion(self, data_dict, columns):
+        if self.date_conversion is None:
+            return data_dict, columns
+
+        new_cols = []
+        new_data = {}
+
+        def date_converter(*date_cols):
+            if self.date_parser is None:
+                return lib.try_parse_dates(_concat_date_cols(date_cols),
+                                           dayfirst=self.dayfirst)
+            else:
+                try:
+                    return self.date_parser(date_cols)
+                except:
+                    return lib.try_parse_dates(_concat_date_cols(date_cols),
+                                               parser=self.date_parser,
+                                               dayfirst=self.dayfirst)
+
+        if isinstance(self.date_conversion, list):
+            # list of column lists
+            for colspec in self.date_conversion:
+                new_name, col = _try_convert_dates(date_converter, colspec,
+                                                   data_dict, columns)
+                if new_name in data_dict:
+                    raise ValueError('Result date column already in dict %s' %
+                                     new_name)
+                new_data[new_name] = col
+                new_cols.append(new_name)
+
+        elif isinstance(self.date_conversion, dict):
+            # dict of new name to column list
+            for new_name, colspec in self.date_conversion.iteritems():
+                if new_name in data_dict:
+                    raise ValueError('Date column %s already in dict' %
+                                     new_name)
+
+                _, col = _try_convert_dates(date_converter, colspec, data_dict,
+                                            columns)
+                new_data[new_name] = col
+                new_cols.append(new_name)
+
+        data_dict.update(new_data)
+        new_cols.extend(columns)
+        return data_dict, new_cols
+
     def _get_lines(self, rows=None):
         source = self.data
         lines = self.buf
@@ -860,6 +919,31 @@ def _convert_types(values, na_values):
 
     return result, na_count
 
+def _get_col_names(colspec, columns):
+    colset = set(columns)
+    colnames = []
+    for c in colspec:
+        if c in colset:
+            colnames.append(str(c))
+        elif isinstance(c, int):
+            colnames.append(str(columns[c]))
+    return colnames
+
+def _try_convert_dates(parser, colspec, data_dict, columns):
+    colspec = _get_col_names(colspec, columns)
+    new_name = '_'.join(colspec)
+
+    to_parse = [data_dict[c] for c in colspec if c in data_dict]
+    try:
+        new_col = parser(*to_parse)
+    except DateConversionError:
+        new_col = _concat_date_cols(to_parse)
+    return new_name, new_col
+
+def _concat_date_cols(date_cols):
+    concat = lambda x: ' '.join(x)
+    return np.array(np.apply_along_axis(concat, 0, np.vstack(date_cols)),
+                    dtype=object)
 
 class FixedWidthReader(object):
     """

pandas/io/tests/test_parsers.py

Lines changed: 40 additions & 0 deletions
@@ -12,6 +12,7 @@
 import numpy as np
 
 from pandas import DataFrame, Index, isnull
+import pandas.io.parsers as parsers
 from pandas.io.parsers import (read_csv, read_table, read_fwf,
                                ExcelFile, TextParser)
 from pandas.util.testing import assert_almost_equal, assert_frame_equal, network
@@ -90,6 +91,45 @@ def test_comment_fwf(self):
                       comment='#')
         assert_almost_equal(df.values, expected)
 
+    def test_multiple_date_col(self):
+        # Can use multiple date parsers
+        data = """\
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+        def func(*date_cols):
+            return lib.try_parse_dates(parsers._concat_date_cols(date_cols))
+
+        df = read_table(StringIO(data), sep=',', header=None,
+                        date_parser=func,
+                        date_conversion={'nominal' : [1, 2],
+                                         'actual' : [1,3]})
+        self.assert_('nominal' in df)
+        self.assert_('actual' in df)
+        from datetime import datetime
+        d = datetime(1999, 1, 27, 19, 0)
+        self.assert_(df.ix[0, 'nominal'] == d)
+
+        data = """\
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+        df = read_table(StringIO(data), sep=',', header=None,
+                        date_conversion=[[1, 2], [1,3]])
+        self.assert_('X.2_X.3' in df)
+        self.assert_('X.2_X.4' in df)
+        from datetime import datetime
+        d = datetime(1999, 1, 27, 19, 0)
+        self.assert_(df.ix[0, 'X.2_X.3'] == d)
+
     def test_malformed(self):
         # all
         data = """ignore
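
In the second, list-form case no result names are supplied, so each new column is named by joining its source column names with underscores (see _try_convert_dates above); with header=None this parser auto-names columns 'X.1', 'X.2', ..., which is where 'X.2_X.3' and 'X.2_X.4' come from. A tiny illustrative sketch of that naming step:

columns = ['X.1', 'X.2', 'X.3', 'X.4']          # default names when header=None
colspec = [1, 2]                                # integer indices into `columns`
colnames = [str(columns[c]) for c in colspec]   # -> ['X.2', 'X.3']
new_name = '_'.join(colnames)                   # -> 'X.2_X.3'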
