Skip to content

Commit fc56b64

Browse files
committed
ENH: can pass list of columns to parse_dates (closes pandas-dev#853); add dayfirst argument for European dates (closes pandas-dev#854)
1 parent 141df57 commit fc56b64

File tree

8 files changed

+111
-31
lines changed

8 files changed

+111
-31
lines changed

pandas/core/datetools.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def parser(x):
6666
data = p_ufunc(arr)
6767
return np.array(data, dtype='M8[us]')
6868

69-
def to_datetime(arg, errors='ignore'):
69+
def to_datetime(arg, errors='ignore', dayfirst=False):
7070
"""
7171
Convert argument to datetime
7272
@@ -87,14 +87,16 @@ def to_datetime(arg, errors='ignore'):
8787
return arg
8888
elif isinstance(arg, Series):
8989
values = lib.string_to_datetime(com._ensure_object(arg.values),
90-
raise_=errors == 'raise')
90+
raise_=errors == 'raise',
91+
dayfirst=dayfirst)
9192
return Series(values, index=arg.index, name=arg.name)
9293
elif isinstance(arg, np.ndarray):
9394
return lib.string_to_datetime(com._ensure_object(arg),
94-
raise_=errors == 'raise')
95+
raise_=errors == 'raise',
96+
dayfirst=dayfirst)
9597

9698
try:
97-
return parser.parse(arg)
99+
return parser.parse(arg, dayfirst=dayfirst)
98100
except Exception:
99101
if errors == 'raise':
100102
raise

pandas/core/index.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -118,12 +118,18 @@ def astype(self, dtype):
118118
return Index(self.values.astype(dtype), name=self.name,
119119
dtype=dtype)
120120

121-
def to_datetime(self):
121+
def to_datetime(self, dayfirst=False):
122122
"""
123123
For an Index containing strings or datetime.datetime objects, attempt
124124
conversion to DatetimeIndex
125125
"""
126-
return DatetimeIndex(self.values)
126+
if self.inferred_type == 'string':
127+
from dateutil.parser import parse
128+
parser = lambda x: parse(x, dayfirst=dayfirst)
129+
parsed = lib.try_parse_dates(self.values, parser=parser)
130+
return DatetimeIndex(parsed)
131+
else:
132+
return DatetimeIndex(self.values)
127133

128134
@property
129135
def dtype(self):

pandas/io/parsers.py

+54-19
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,13 @@
3939
na_values : list-like or dict, default None
4040
Additional strings to recognize as NA/NaN. If dict passed, specific
4141
per-column NA values
42-
parse_dates : boolean, default False
43-
Attempt to parse dates in the index column(s)
42+
parse_dates : boolean or list of column numbers/names, default False
43+
Attempt to parse dates in the indicated columns
4444
date_parser : function
4545
Function to use for converting strings to dates. Defaults to
4646
dateutil.parser
47+
dayfirst : boolean, default False
48+
Parse ambiguous dates day-first (DD/MM), the international and European format
4749
nrows : int, default None
4850
Number of rows of file to read. Useful for reading pieces of large files
4951
iterator : boolean, default False
@@ -168,6 +170,7 @@ def read_csv(filepath_or_buffer,
168170
skiprows=None,
169171
na_values=None,
170172
parse_dates=False,
173+
dayfirst=False,
171174
date_parser=None,
172175
nrows=None,
173176
iterator=False,
@@ -195,6 +198,7 @@ def read_table(filepath_or_buffer,
195198
skiprows=None,
196199
na_values=None,
197200
parse_dates=False,
201+
dayfirst=False,
198202
date_parser=None,
199203
nrows=None,
200204
iterator=False,
@@ -226,6 +230,7 @@ def read_fwf(filepath_or_buffer,
226230
skiprows=None,
227231
na_values=None,
228232
parse_dates=False,
233+
dayfirst=False,
229234
date_parser=None,
230235
nrows=None,
231236
iterator=False,
@@ -242,7 +247,8 @@ def read_fwf(filepath_or_buffer,
242247
colspecs = kwds.get('colspecs', None)
243248
widths = kwds.pop('widths', None)
244249
if bool(colspecs is None) == bool(widths is None):
245-
raise ValueError("You must specify only one of 'widths' and 'colspecs'")
250+
raise ValueError("You must specify only one of 'widths' and "
251+
"'colspecs'")
246252

247253
# Compute 'colspec' from 'widths', if specified.
248254
if widths is not None:
@@ -258,8 +264,8 @@ def read_fwf(filepath_or_buffer,
258264

259265
def read_clipboard(**kwargs): # pragma: no cover
260266
"""
261-
Read text from clipboard and pass to read_table. See read_table for the full
262-
argument list
267+
Read text from clipboard and pass to read_table. See read_table for the
268+
full argument list
263269
264270
Returns
265271
-------
@@ -334,9 +340,9 @@ class TextParser(object):
334340

335341
def __init__(self, f, delimiter=None, names=None, header=0,
336342
index_col=None, na_values=None, parse_dates=False,
337-
date_parser=None, chunksize=None, skiprows=None,
338-
skip_footer=0, converters=None, verbose=False,
339-
encoding=None):
343+
date_parser=None, dayfirst=False, chunksize=None,
344+
skiprows=None, skip_footer=0, converters=None,
345+
verbose=False, encoding=None):
340346
"""
341347
Workhorse function for processing nested list into DataFrame
342348
@@ -348,12 +354,14 @@ def __init__(self, f, delimiter=None, names=None, header=0,
348354
self.names = list(names) if names is not None else names
349355
self.header = header
350356
self.index_col = index_col
351-
self.parse_dates = parse_dates
352-
self.date_parser = date_parser
353357
self.chunksize = chunksize
354358
self.passed_names = names is not None
355359
self.encoding = encoding
356360

361+
self.parse_dates = parse_dates
362+
self.date_parser = date_parser
363+
self.dayfirst = dayfirst
364+
357365
if com.is_integer(skiprows):
358366
skiprows = range(skiprows)
359367
self.skiprows = set() if skiprows is None else set(skiprows)
@@ -382,6 +390,10 @@ def __init__(self, f, delimiter=None, names=None, header=0,
382390
else:
383391
self.data = f
384392
self.columns = self._infer_columns()
393+
394+
# keep a copy of the full column list; columns get popped off for the index
395+
self.orig_columns = list(self.columns)
396+
385397
self.index_name = self._get_index_name()
386398
self._first_chunk = True
387399

@@ -588,17 +600,19 @@ def get_chunk(self, rows=None):
588600
zipped_content.pop(i)
589601

590602
if np.isscalar(self.index_col):
591-
if self.parse_dates:
592-
index = lib.try_parse_dates(index, parser=self.date_parser)
603+
if self._should_parse_dates(0):
604+
index = lib.try_parse_dates(index, parser=self.date_parser,
605+
dayfirst=self.dayfirst)
593606
index, na_count = _convert_types(index, self.na_values)
594607
index = Index(index, name=self.index_name)
595608
if self.verbose and na_count:
596609
print 'Found %d NA values in the index' % na_count
597610
else:
598611
arrays = []
599-
for arr in index:
600-
if self.parse_dates:
601-
arr = lib.try_parse_dates(arr, parser=self.date_parser)
612+
for i, arr in enumerate(index):
613+
if self._should_parse_dates(i):
614+
arr = lib.try_parse_dates(arr, parser=self.date_parser,
615+
dayfirst=self.dayfirst)
602616
arr, _ = _convert_types(arr, self.na_values)
603617
arrays.append(arr)
604618
index = MultiIndex.from_arrays(arrays, names=self.index_name)
@@ -623,10 +637,30 @@ def get_chunk(self, rows=None):
623637
col = self.columns[col]
624638
data[col] = lib.map_infer(data[col], f)
625639

640+
if not isinstance(self.parse_dates, bool):
641+
for x in self.parse_dates:
642+
if isinstance(x, int) and x not in data:
643+
x = self.orig_columns[x]
644+
if x in self.index_col or x in self.index_name:
645+
continue
646+
data[x] = lib.try_parse_dates(data[x], parser=self.date_parser,
647+
dayfirst=self.dayfirst)
648+
626649
data = _convert_to_ndarrays(data, self.na_values, self.verbose)
627650

628651
return DataFrame(data=data, columns=self.columns, index=index)
629652

653+
def _should_parse_dates(self, i):
654+
if isinstance(self.parse_dates, bool):
655+
return self.parse_dates
656+
else:
657+
to_parse = self.parse_dates
658+
if np.isscalar(self.index_col):
659+
name = self.index_name
660+
else:
661+
name = self.index_name[i]
662+
return i in to_parse or name in to_parse
663+
630664
def _get_lines(self, rows=None):
631665
source = self.data
632666
lines = self.buf
@@ -725,7 +759,8 @@ def __init__(self, f, colspecs, filler):
725759
def next(self):
726760
line = self.f.next()
727761
# Note: 'colspecs' is a sequence of half-open intervals.
728-
return [line[fromm:to].strip(self.filler or ' ') for (fromm, to) in self.colspecs]
762+
return [line[fromm:to].strip(self.filler or ' ')
763+
for (fromm, to) in self.colspecs]
729764

730765

731766
class FixedWidthFieldParser(TextParser):
@@ -743,7 +778,7 @@ def _make_reader(self, f):
743778
self.data = FixedWidthReader(f, self.colspecs, self.delimiter)
744779

745780

746-
#-------------------------------------------------------------------------------
781+
#----------------------------------------------------------------------
747782
# ExcelFile class
748783

749784
_openpyxl_msg = ("\nFor parsing .xlsx files 'openpyxl' is required.\n"
@@ -795,8 +830,8 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=None,
795830
skiprows : list-like
796831
Row numbers to skip (0-indexed)
797832
index_col : int, default None
798-
Column to use as the row labels of the DataFrame. Pass None if there
799-
is no such column
833+
Column to use as the row labels of the DataFrame. Pass None if
834+
there is no such column
800835
na_values : list-like, default None
801836
List of additional strings to recognize as NA/NaN
802837

pandas/io/tests/test_parsers.py

+25
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,31 @@ def test_parse_dates_implicit_first_col(self):
175175
self.assert_(isinstance(df.index[0], (datetime, np.datetime64, Timestamp)))
176176
assert_frame_equal(df, expected)
177177

178+
def test_parse_dates_column_list(self):
179+
from pandas.core.datetools import to_datetime
180+
181+
data = '''date;destination;ventilationcode;unitcode;units;aux_date
182+
01/01/2010;P;P;50;1;12/1/2011
183+
01/01/2010;P;R;50;1;13/1/2011
184+
15/01/2010;P;P;50;1;14/1/2011
185+
01/05/2010;P;P;50;1;15/1/2011'''
186+
187+
expected = read_csv(StringIO(data), sep=";", index_col=range(4))
188+
189+
lev = expected.index.levels[0]
190+
expected.index.levels[0] = lev.to_datetime(dayfirst=True)
191+
expected['aux_date'] = to_datetime(expected['aux_date'],
192+
dayfirst=True).astype('O')
193+
self.assert_(isinstance(expected['aux_date'][0], datetime))
194+
195+
df = read_csv(StringIO(data), sep=";", index_col = range(4),
196+
parse_dates=[0, 5], dayfirst=True)
197+
assert_frame_equal(df, expected)
198+
199+
df = read_csv(StringIO(data), sep=";", index_col = range(4),
200+
parse_dates=['date', 'aux_date'], dayfirst=True)
201+
assert_frame_equal(df, expected)
202+
178203
def test_no_header(self):
179204
data = """1,2,3,4,5
180205
6,7,8,9,10

pandas/src/datetime.pyx

+3-3
Original file line numberDiff line numberDiff line change
@@ -613,7 +613,7 @@ cdef class DayOffset(_Offset):
613613
# offset.next()
614614
# return i
615615

616-
def string_to_datetime(ndarray[object] strings, raise_=False):
616+
def string_to_datetime(ndarray[object] strings, raise_=False, dayfirst=False):
617617
cdef:
618618
Py_ssize_t i, n = len(strings)
619619
object val
@@ -634,7 +634,7 @@ def string_to_datetime(ndarray[object] strings, raise_=False):
634634
result[i] = val
635635
else:
636636
try:
637-
result[i] = parse(val)
637+
result[i] = parse(val, dayfirst=dayfirst)
638638
except Exception:
639639
raise TypeError
640640
return result
@@ -647,7 +647,7 @@ def string_to_datetime(ndarray[object] strings, raise_=False):
647647
oresult[i] = val
648648
else:
649649
try:
650-
oresult[i] = parse(val)
650+
oresult[i] = parse(val, dayfirst=dayfirst)
651651
except Exception:
652652
if raise_:
653653
raise

pandas/src/inference.pyx

+4-3
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
377377
def convert_sql_column(x):
378378
return maybe_convert_objects(x, try_float=1)
379379

380-
def try_parse_dates(ndarray[object] values, parser=None):
380+
def try_parse_dates(ndarray[object] values, parser=None,
381+
dayfirst=False):
381382
cdef:
382383
Py_ssize_t i, n
383384
ndarray[object] result
@@ -389,8 +390,8 @@ def try_parse_dates(ndarray[object] values, parser=None):
389390

390391
if parser is None:
391392
try:
392-
from dateutil import parser
393-
parse_date = parser.parse
393+
from dateutil.parser import parse
394+
parse_date = lambda x: parse(x, dayfirst=dayfirst)
394395
except ImportError: # pragma: no cover
395396
def parse_date(s):
396397
try:

pandas/tests/test_tseries.py

+10
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,16 @@ def _ohlc(group):
433433
expected[0] = np.nan
434434
assert_almost_equal(out, expected)
435435

436+
def test_try_parse_dates():
437+
from dateutil.parser import parse
438+
439+
arr = np.array(['5/1/2000', '6/1/2000', '7/1/2000'], dtype=object)
440+
441+
result = lib.try_parse_dates(arr, dayfirst=True)
442+
expected = [parse(d, dayfirst=True) for d in arr]
443+
assert(np.array_equal(result, expected))
444+
445+
436446
class TestTypeInference(unittest.TestCase):
437447

438448
def test_length_zero(self):

scripts/count_code.sh

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
cloc pandas --force-lang=Python,pyx --not-match-f="tseries.c|sandbox.c|engines.c|sparse.c"

0 commit comments

Comments
 (0)