ENH: can pass list of columns to parse_dates (closes pandas-dev#853); add dayfirst argument for European dates (closes pandas-dev#854)

wesm · wesm · commit fc56b64c1985 · 2012-04-14T21:35:44.000-04:00
diff --git a/pandas/core/datetools.py b/pandas/core/datetools.py
@@ -66,7 +66,7 @@ def parser(x):
     data = p_ufunc(arr)
     return np.array(data, dtype='M8[us]')
 
-def to_datetime(arg, errors='ignore'):
+def to_datetime(arg, errors='ignore', dayfirst=False):
     """
     Convert argument to datetime
 
@@ -87,14 +87,16 @@ def to_datetime(arg, errors='ignore'):
         return arg
     elif isinstance(arg, Series):
         values = lib.string_to_datetime(com._ensure_object(arg.values),
-                                        raise_=errors == 'raise')
+                                        raise_=errors == 'raise',
+                                        dayfirst=dayfirst)
         return Series(values, index=arg.index, name=arg.name)
     elif isinstance(arg, np.ndarray):
         return lib.string_to_datetime(com._ensure_object(arg),
-                                      raise_=errors == 'raise')
+                                      raise_=errors == 'raise',
+                                      dayfirst=dayfirst)
 
     try:
-        return parser.parse(arg)
+        return parser.parse(arg, dayfirst=dayfirst)
     except Exception:
         if errors == 'raise':
             raise
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -118,12 +118,18 @@ def astype(self, dtype):
         return Index(self.values.astype(dtype), name=self.name,
                      dtype=dtype)
 
-    def to_datetime(self):
+    def to_datetime(self, dayfirst=False):
         """
         For an Index containing strings or datetime.datetime objects, attempt
         conversion to DatetimeIndex
         """
-        return DatetimeIndex(self.values)
+        if self.inferred_type == 'string':
+            from dateutil.parser import parse
+            parser = lambda x: parse(x, dayfirst=dayfirst)
+            parsed = lib.try_parse_dates(self.values, parser=parser)
+            return DatetimeIndex(parsed)
+        else:
+            return DatetimeIndex(self.values)
 
     @property
     def dtype(self):
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -39,11 +39,13 @@
 na_values : list-like or dict, default None
     Additional strings to recognize as NA/NaN. If dict passed, specific
     per-column NA values
-parse_dates : boolean, default False
-    Attempt to parse dates in the index column(s)
+parse_dates : boolean or list of column numbers/names, default False
+    Attempt to parse dates in the indicated columns
 date_parser : function
     Function to use for converting dates to strings. Defaults to
     dateutil.parser
+dayfirst : boolean, default False
+    Interpret ambiguous dates as DD/MM (international and European format)
 nrows : int, default None
     Number of rows of file to read. Useful for reading pieces of large files
 iterator : boolean, default False
@@ -168,6 +170,7 @@ def read_csv(filepath_or_buffer,
              skiprows=None,
              na_values=None,
              parse_dates=False,
+             dayfirst=False,
              date_parser=None,
              nrows=None,
              iterator=False,
@@ -195,6 +198,7 @@ def read_table(filepath_or_buffer,
                skiprows=None,
                na_values=None,
                parse_dates=False,
+               dayfirst=False,
                date_parser=None,
                nrows=None,
                iterator=False,
@@ -226,6 +230,7 @@ def read_fwf(filepath_or_buffer,
              skiprows=None,
              na_values=None,
              parse_dates=False,
+             dayfirst=False,
              date_parser=None,
              nrows=None,
              iterator=False,
@@ -242,7 +247,8 @@ def read_fwf(filepath_or_buffer,
     colspecs = kwds.get('colspecs', None)
     widths = kwds.pop('widths', None)
     if bool(colspecs is None) == bool(widths is None):
-        raise ValueError("You must specify only one of 'widths' and 'colspecs'")
+        raise ValueError("You must specify only one of 'widths' and "
+                         "'colspecs'")
 
     # Compute 'colspec' from 'widths', if specified.
     if widths is not None:
@@ -258,8 +264,8 @@ def read_fwf(filepath_or_buffer,
 
 def read_clipboard(**kwargs):  # pragma: no cover
     """
-    Read text from clipboard and pass to read_table. See read_table for the full
-    argument list
+    Read text from clipboard and pass to read_table. See read_table for the
+    full argument list
 
     Returns
     -------
@@ -334,9 +340,9 @@ class TextParser(object):
 
     def __init__(self, f, delimiter=None, names=None, header=0,
                  index_col=None, na_values=None, parse_dates=False,
-                 date_parser=None, chunksize=None, skiprows=None,
-                 skip_footer=0, converters=None, verbose=False,
-                 encoding=None):
+                 date_parser=None, dayfirst=False, chunksize=None,
+                 skiprows=None, skip_footer=0, converters=None,
+                 verbose=False, encoding=None):
         """
         Workhorse function for processing nested list into DataFrame
 
@@ -348,12 +354,14 @@ def __init__(self, f, delimiter=None, names=None, header=0,
         self.names = list(names) if names is not None else names
         self.header = header
         self.index_col = index_col
-        self.parse_dates = parse_dates
-        self.date_parser = date_parser
         self.chunksize = chunksize
         self.passed_names = names is not None
         self.encoding = encoding
 
+        self.parse_dates = parse_dates
+        self.date_parser = date_parser
+        self.dayfirst = dayfirst
+
         if com.is_integer(skiprows):
             skiprows = range(skiprows)
         self.skiprows = set() if skiprows is None else set(skiprows)
@@ -382,6 +390,10 @@ def __init__(self, f, delimiter=None, names=None, header=0,
         else:
             self.data = f
         self.columns = self._infer_columns()
+
+        # keep a copy: index columns get popped off self.columns
+        self.orig_columns = list(self.columns)
+
         self.index_name = self._get_index_name()
         self._first_chunk = True
 
@@ -588,17 +600,19 @@ def get_chunk(self, rows=None):
                     zipped_content.pop(i)
 
             if np.isscalar(self.index_col):
-                if self.parse_dates:
-                    index = lib.try_parse_dates(index, parser=self.date_parser)
+                if self._should_parse_dates(0):
+                    index = lib.try_parse_dates(index, parser=self.date_parser,
+                                                dayfirst=self.dayfirst)
                 index, na_count = _convert_types(index, self.na_values)
                 index = Index(index, name=self.index_name)
                 if self.verbose and na_count:
                     print 'Found %d NA values in the index' % na_count
             else:
                 arrays = []
-                for arr in index:
-                    if self.parse_dates:
-                        arr = lib.try_parse_dates(arr, parser=self.date_parser)
+                for i, arr in enumerate(index):
+                    if self._should_parse_dates(i):
+                        arr = lib.try_parse_dates(arr, parser=self.date_parser,
+                                                  dayfirst=self.dayfirst)
                     arr, _ = _convert_types(arr, self.na_values)
                     arrays.append(arr)
                 index = MultiIndex.from_arrays(arrays, names=self.index_name)
@@ -623,10 +637,30 @@ def get_chunk(self, rows=None):
                 col = self.columns[col]
             data[col] = lib.map_infer(data[col], f)
 
+        if not isinstance(self.parse_dates, bool):
+            for x in self.parse_dates:
+                if isinstance(x, int) and x not in data:
+                    x = self.orig_columns[x]
+                if x in self.index_col or x in self.index_name:
+                    continue
+                data[x] = lib.try_parse_dates(data[x], parser=self.date_parser,
+                                              dayfirst=self.dayfirst)
+
         data = _convert_to_ndarrays(data, self.na_values, self.verbose)
 
         return DataFrame(data=data, columns=self.columns, index=index)
 
+    def _should_parse_dates(self, i):
+        if isinstance(self.parse_dates, bool):
+            return self.parse_dates
+        else:
+            to_parse = self.parse_dates
+            if np.isscalar(self.index_col):
+                name = self.index_name
+            else:
+                name = self.index_name[i]
+            return i in to_parse or name in to_parse
+
     def _get_lines(self, rows=None):
         source = self.data
         lines = self.buf
@@ -725,7 +759,8 @@ def __init__(self, f, colspecs, filler):
     def next(self):
         line = self.f.next()
         # Note: 'colspecs' is a sequence of half-open intervals.
-        return [line[fromm:to].strip(self.filler or ' ') for (fromm, to) in self.colspecs]
+        return [line[fromm:to].strip(self.filler or ' ')
+                for (fromm, to) in self.colspecs]
 
 
 class FixedWidthFieldParser(TextParser):
@@ -743,7 +778,7 @@ def _make_reader(self, f):
         self.data = FixedWidthReader(f, self.colspecs, self.delimiter)
 
 
-#-------------------------------------------------------------------------------
+#----------------------------------------------------------------------
 # ExcelFile class
 
 _openpyxl_msg = ("\nFor parsing .xlsx files 'openpyxl' is required.\n"
@@ -795,8 +830,8 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=None,
         skiprows : list-like
             Row numbers to skip (0-indexed)
         index_col : int, default None
-            Column to use as the row labels of the DataFrame. Pass None if there
-            is no such column
+            Column to use as the row labels of the DataFrame. Pass None if
+            there is no such column
         na_values : list-like, default None
             List of additional strings to recognize as NA/NaN
 
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -175,6 +175,31 @@ def test_parse_dates_implicit_first_col(self):
         self.assert_(isinstance(df.index[0], (datetime, np.datetime64, Timestamp)))
         assert_frame_equal(df, expected)
 
+    def test_parse_dates_column_list(self):
+        from pandas.core.datetools import to_datetime
+
+        data = '''date;destination;ventilationcode;unitcode;units;aux_date
+01/01/2010;P;P;50;1;12/1/2011
+01/01/2010;P;R;50;1;13/1/2011
+15/01/2010;P;P;50;1;14/1/2011
+01/05/2010;P;P;50;1;15/1/2011'''
+
+        expected = read_csv(StringIO(data), sep=";", index_col=range(4))
+
+        lev = expected.index.levels[0]
+        expected.index.levels[0] = lev.to_datetime(dayfirst=True)
+        expected['aux_date'] = to_datetime(expected['aux_date'],
+                                           dayfirst=True).astype('O')
+        self.assert_(isinstance(expected['aux_date'][0], datetime))
+
+        df = read_csv(StringIO(data), sep=";", index_col = range(4),
+                      parse_dates=[0, 5], dayfirst=True)
+        assert_frame_equal(df, expected)
+
+        df = read_csv(StringIO(data), sep=";", index_col = range(4),
+                      parse_dates=['date', 'aux_date'], dayfirst=True)
+        assert_frame_equal(df, expected)
+
     def test_no_header(self):
         data = """1,2,3,4,5
 6,7,8,9,10
diff --git a/pandas/src/datetime.pyx b/pandas/src/datetime.pyx
@@ -613,7 +613,7 @@ cdef class DayOffset(_Offset):
 #        offset.next()
 #    return i
 
-def string_to_datetime(ndarray[object] strings, raise_=False):
+def string_to_datetime(ndarray[object] strings, raise_=False, dayfirst=False):
     cdef:
         Py_ssize_t i, n = len(strings)
         object val
@@ -634,7 +634,7 @@ def string_to_datetime(ndarray[object] strings, raise_=False):
                 result[i] = val
             else:
                 try:
-                    result[i] = parse(val)
+                    result[i] = parse(val, dayfirst=dayfirst)
                 except Exception:
                     raise TypeError
         return result
@@ -647,7 +647,7 @@ def string_to_datetime(ndarray[object] strings, raise_=False):
                 oresult[i] = val
             else:
                 try:
-                    oresult[i] = parse(val)
+                    oresult[i] = parse(val, dayfirst=dayfirst)
                 except Exception:
                     if raise_:
                         raise
diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx
@@ -377,7 +377,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
 def convert_sql_column(x):
     return maybe_convert_objects(x, try_float=1)
 
-def try_parse_dates(ndarray[object] values, parser=None):
+def try_parse_dates(ndarray[object] values, parser=None,
+                    dayfirst=False):
     cdef:
         Py_ssize_t i, n
         ndarray[object] result
@@ -389,8 +390,8 @@ def try_parse_dates(ndarray[object] values, parser=None):
 
     if parser is None:
         try:
-            from dateutil import parser
-            parse_date = parser.parse
+            from dateutil.parser import parse
+            parse_date = lambda x: parse(x, dayfirst=dayfirst)
         except ImportError: # pragma: no cover
             def parse_date(s):
                 try:
diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py
@@ -433,6 +433,16 @@ def _ohlc(group):
     expected[0] = np.nan
     assert_almost_equal(out, expected)
 
+def test_try_parse_dates():
+    from dateutil.parser import parse
+
+    arr = np.array(['5/1/2000', '6/1/2000', '7/1/2000'], dtype=object)
+
+    result = lib.try_parse_dates(arr, dayfirst=True)
+    expected = [parse(d, dayfirst=True) for d in arr]
+    assert(np.array_equal(result, expected))
+
+
 class TestTypeInference(unittest.TestCase):
 
     def test_length_zero(self):
diff --git a/scripts/count_code.sh b/scripts/count_code.sh
@@ -0,0 +1 @@
+cloc pandas --force-lang=Python,pyx --not-match-f="tseries.c|sandbox.c|engines.c|sparse.c"

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+cloc pandas --force-lang=Python,pyx --not-match-f="tseries.c\|sandbox.c\|engines.c\|sparse.c"`