BUG: each date parsing funcs results differently

sinhrks · sinhrks · commit c2ea0d4d1f73 · 2015-07-12T06:30:51.000+09:00
diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst
@@ -71,37 +71,69 @@ Resample:
    ts.resample('D', how='mean')
 
 
+.. _timeseries.overview:
+
+Overview
+--------
+
+The following table shows the types of time-related classes pandas can handle
+and how to create them.
+
+=================  ============================== ==================================================
+Class              Remarks                        How to create
+=================  ============================== ==================================================
+``Timestamp``      Represents a single time stamp ``to_datetime``, ``Timestamp``
+``DatetimeIndex``  Index of ``Timestamps``        ``to_datetime``, ``date_range``, ``DatetimeIndex``
+``Period``         Represents a single time span  ``Period``
+``PeriodIndex``    Index of ``Period``            ``period_range``, ``PeriodIndex``
+=================  ============================== ==================================================
+
 .. _timeseries.representation:
 
 Time Stamps vs. Time Spans
 --------------------------
 
 Time-stamped data is the most basic type of timeseries data that associates
 values with points in time. For pandas objects it means using the points in
-time to create the index
+time.
 
 .. ipython:: python
 
-   dates = [datetime(2012, 5, 1), datetime(2012, 5, 2), datetime(2012, 5, 3)]
-   ts = Series(np.random.randn(3), dates)
-
-   type(ts.index)
-
-   ts
+   Timestamp(datetime(2012, 5, 1))
+   Timestamp('2012-05-01')
 
 However, in many cases it is more natural to associate things like change
-variables with a time span instead.
+variables with a time span instead. The span represented by ``Period`` can be
+specified explicitly, or inferred from the datetime string format.
 
 For example:
 
 .. ipython:: python
 
-   periods = PeriodIndex([Period('2012-01'), Period('2012-02'),
-                          Period('2012-03')])
+   Period('2011-01')
+
+   Period('2012-05', freq='D')
+
+``Timestamp`` and ``Period`` can be the index. Lists of ``Timestamp`` and
+``Period`` are automatically coerced to ``DatetimeIndex`` and ``PeriodIndex``
+respectively.
+
+.. ipython:: python
+
+   dates = [Timestamp('2012-05-01'), Timestamp('2012-05-02'), Timestamp('2012-05-03')]
+   ts = Series(np.random.randn(3), dates)
+
+   type(ts.index)
+   ts.index
+
+   ts
+
+   periods = [Period('2012-01'), Period('2012-02'), Period('2012-03')]
 
    ts = Series(np.random.randn(3), periods)
 
    type(ts.index)
+   ts.index
 
    ts
 
@@ -150,6 +182,17 @@ you can pass the ``dayfirst`` flag:
    considerably and on versions later then 0.13.0 explicitly specifying
    a format string of '%Y%m%d' takes a faster path still.
 
+If you pass a single string to ``to_datetime``, it returns a single ``Timestamp``.
+Also, ``Timestamp`` can accept the string input.
+Note that ``Timestamp`` doesn't accept string parsing options like ``dayfirst``
+or ``format``; use ``to_datetime`` if these are required.
+
+.. ipython:: python
+
+    to_datetime('2010/11/12')
+
+    Timestamp('2010/11/12')
+
 
 Invalid Data
 ~~~~~~~~~~~~
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -33,6 +33,45 @@ New features
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
 
+- ``DatetimeIndex`` can be instantiated using strings containing ``NaT`` (:issue:`7599`)
+- The string parsing of ``to_datetime``, ``Timestamp`` and ``DatetimeIndex`` has been made consistent (:issue:`7599`)
+
+  Prior to v0.17.0, ``Timestamp`` and ``to_datetime`` may parse a year-only datetime string incorrectly using today's date, whereas ``DatetimeIndex`` uses the beginning of the year.
+  ``Timestamp`` and ``to_datetime`` may raise ``ValueError`` for some types of datetime strings which ``DatetimeIndex`` can parse, such as quarterly strings.
+
+  Previous Behavior
+
+  .. code-block:: python
+
+     In [1]: Timestamp('2012Q2')
+     Traceback
+        ...
+     ValueError: Unable to parse 2012Q2
+
+     # Results in today's date.
+     In [2]: Timestamp('2014')
+     Out [2]: 2014-08-12 00:00:00
+
+  v0.17.0 can parse them as below. It works on ``DatetimeIndex`` also.
+
+  New Behavior
+
+  .. ipython:: python
+
+     Timestamp('2012Q2')
+     Timestamp('2014')
+     DatetimeIndex(['2012Q2', '2014'])
+
+  .. note:: If you want to perform calculations based on today's date, use ``Timestamp.now()`` and ``pandas.tseries.offsets``.
+
+  .. ipython:: python
+
+     import pandas.tseries.offsets as offsets
+     Timestamp.now()
+     Timestamp.now() + offsets.DateOffset(years=1)
+
+- ``to_datetime`` can now accept ``yearfirst`` keyword (:issue:`7599`)
+
 - ``.as_blocks`` will now take a ``copy`` optional argument to return a copy of the data, default is to copy (no change in behavior from prior versions), (:issue:`9607`)
 
 - ``regex`` argument to ``DataFrame.filter`` now handles numeric column names instead of raising ``ValueError`` (:issue:`10384`).
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -2047,8 +2047,9 @@ def _make_date_converter(date_parser=None, dayfirst=False,
     def converter(*date_cols):
         if date_parser is None:
             strs = _concat_date_cols(date_cols)
+
             try:
-                return tools.to_datetime(
+                return tools._to_datetime(
                     com._ensure_object(strs),
                     utc=None,
                     box=False,
diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py
@@ -314,14 +314,12 @@ def _get_freq_str(base, mult=1):
 }
 
 need_suffix = ['QS', 'BQ', 'BQS', 'AS', 'BA', 'BAS']
-_months = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP',
-           'OCT', 'NOV', 'DEC']
 for __prefix in need_suffix:
-    for _m in _months:
+    for _m in tslib._MONTHS:
         _offset_to_period_map['%s-%s' % (__prefix, _m)] = \
             _offset_to_period_map[__prefix]
 for __prefix in ['A', 'Q']:
-    for _m in _months:
+    for _m in tslib._MONTHS:
         _alias = '%s-%s' % (__prefix, _m)
         _offset_to_period_map[_alias] = _alias
 
@@ -1188,12 +1186,7 @@ def is_superperiod(source, target):
         return target in ['N']
 
 
-def _get_rule_month(source, default='DEC'):
-    source = source.upper()
-    if '-' not in source:
-        return default
-    else:
-        return source.split('-')[1]
+_get_rule_month = tslib._get_rule_month
 
 
 def _is_annual(rule):
@@ -1224,15 +1217,10 @@ def _is_weekly(rule):
 
 DAYS = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN']
 
-MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL',
-          'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
-
-_month_numbers = dict((k, i) for i, k in enumerate(MONTHS))
-
-
+MONTHS = tslib._MONTHS
+_month_numbers = tslib._MONTH_NUMBERS
+_month_aliases = tslib._MONTH_ALIASES
 _weekday_rule_aliases = dict((k, v) for k, v in enumerate(DAYS))
-_month_aliases = dict((k + 1, v) for k, v in enumerate(MONTHS))
-
 
 def _is_multiple(us, mult):
     return us % mult == 0
diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py
@@ -239,8 +239,9 @@ def __new__(cls, data=None,
 
             # try a few ways to make it datetime64
             if lib.is_string_array(data):
-                data = _str_to_dt_array(data, freq, dayfirst=dayfirst,
-                                        yearfirst=yearfirst)
+                data = tslib.parse_str_array_to_datetime(data, freq=freq,
+                                                         dayfirst=dayfirst,
+                                                         yearfirst=yearfirst)
             else:
                 data = tools.to_datetime(data, errors='raise')
                 data.offset = freq
@@ -254,8 +255,9 @@ def __new__(cls, data=None,
                     return data
 
         if issubclass(data.dtype.type, compat.string_types):
-            data = _str_to_dt_array(data, freq, dayfirst=dayfirst,
-                                      yearfirst=yearfirst)
+            data = tslib.parse_str_array_to_datetime(data, freq=freq,
+                                                     dayfirst=dayfirst,
+                                                     yearfirst=yearfirst)
 
         if issubclass(data.dtype.type, np.datetime64):
             if isinstance(data, ABCSeries):
@@ -288,8 +290,9 @@ def __new__(cls, data=None,
                 values = data
 
             if lib.is_string_array(values):
-                subarr = _str_to_dt_array(values, freq, dayfirst=dayfirst,
-                                        yearfirst=yearfirst)
+                subarr = tslib.parse_str_array_to_datetime(values, freq=freq, dayfirst=dayfirst,
+                                                     yearfirst=yearfirst)
+
             else:
                 try:
                     subarr = tools.to_datetime(data, box=False)
@@ -298,11 +301,11 @@ def __new__(cls, data=None,
                     if isinstance(subarr, ABCSeries):
                         subarr = subarr.values
                         if subarr.dtype == np.object_:
-                            subarr = tools.to_datetime(subarr, box=False)
+                            subarr = tools._to_datetime(subarr, box=False)
 
                 except ValueError:
                     # tz aware
-                    subarr = tools.to_datetime(data, box=False, utc=True)
+                    subarr = tools._to_datetime(data, box=False, utc=True)
 
                 if not np.issubdtype(subarr.dtype, np.datetime64):
                     raise ValueError('Unable to convert %s to datetime dtype'
@@ -332,7 +335,7 @@ def __new__(cls, data=None,
                 if inferred != freq.freqstr:
                     on_freq = cls._generate(subarr[0], None, len(subarr), None, freq, tz=tz)
                     if not np.array_equal(subarr.asi8, on_freq.asi8):
-                        raise ValueError('Inferred frequency {0} from passed dates does not'
+                        raise ValueError('Inferred frequency {0} from passed dates does not '
                                          'conform to passed frequency {1}'.format(inferred, freq.freqstr))
 
         if freq_infer:
@@ -534,7 +537,7 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None,
             xdr = generate_range(offset=offset, start=_CACHE_START,
                                  end=_CACHE_END)
 
-            arr = tools.to_datetime(list(xdr), box=False)
+            arr = tools._to_datetime(list(xdr), box=False)
 
             cachedRange = DatetimeIndex._simple_new(arr)
             cachedRange.offset = offset
@@ -1926,17 +1929,6 @@ def _to_m8(key, tz=None):
     return np.int64(tslib.pydt_to_i8(key)).view(_NS_DTYPE)
 
 
-def _str_to_dt_array(arr, offset=None, dayfirst=None, yearfirst=None):
-    def parser(x):
-        result = parse_time_string(x, offset, dayfirst=dayfirst,
-                                   yearfirst=yearfirst)
-        return result[0]
-
-    arr = np.asarray(arr, dtype=object)
-    data = _algos.arrmap_object(arr, parser)
-    return tools.to_datetime(data)
-
-
 _CACHE_START = Timestamp(datetime(1950, 1, 1))
 _CACHE_END = Timestamp(datetime(2030, 1, 1))
 
diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py
@@ -1508,22 +1508,7 @@ def onOffset(self, dt):
         modMonth = (dt.month - self.startingMonth) % 3
         return BMonthEnd().onOffset(dt) and modMonth == 0
 
-
-_int_to_month = {
-    1: 'JAN',
-    2: 'FEB',
-    3: 'MAR',
-    4: 'APR',
-    5: 'MAY',
-    6: 'JUN',
-    7: 'JUL',
-    8: 'AUG',
-    9: 'SEP',
-    10: 'OCT',
-    11: 'NOV',
-    12: 'DEC'
-}
-
+_int_to_month = tslib._MONTH_ALIASES
 _month_to_int = dict((v, k) for k, v in _int_to_month.items())
 
 
diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
@@ -1432,6 +1432,25 @@ def test_dti_constructor_preserve_dti_freq(self):
         rng2 = DatetimeIndex(rng)
         self.assertEqual(rng.freq, rng2.freq)
 
+    def test_dti_constructor_years_only(self):
+        # GH 6961
+        for tz in [None, 'UTC', 'Asia/Tokyo', 'dateutil/US/Pacific']:
+            rng1 = date_range('2014', '2015', freq='M', tz=tz)
+            expected1 = date_range('2014-01-31', '2014-12-31', freq='M', tz=tz)
+
+            rng2 = date_range('2014', '2015', freq='MS', tz=tz)
+            expected2 = date_range('2014-01-01', '2015-01-01', freq='MS', tz=tz)
+
+            rng3 = date_range('2014', '2020', freq='A', tz=tz)
+            expected3 = date_range('2014-12-31', '2019-12-31', freq='A', tz=tz)
+
+            rng4 = date_range('2014', '2020', freq='AS', tz=tz)
+            expected4 = date_range('2014-01-01', '2020-01-01', freq='AS', tz=tz)
+
+            for rng, expected in [(rng1, expected1), (rng2, expected2),
+                                  (rng3, expected3), (rng4, expected4)]:
+                tm.assert_index_equal(rng, expected)
+
     def test_normalize(self):
         rng = date_range('1/1/2000 9:30', periods=10, freq='D')
 
@@ -2146,6 +2165,15 @@ def test_constructor_coverage(self):
         from_ints = DatetimeIndex(expected.asi8)
         self.assertTrue(from_ints.equals(expected))
 
+        # string with NaT
+        strings = np.array(['2000-01-01', '2000-01-02', 'NaT'])
+        result = DatetimeIndex(strings)
+        expected = DatetimeIndex(strings.astype('O'))
+        self.assertTrue(result.equals(expected))
+
+        from_ints = DatetimeIndex(expected.asi8)
+        self.assertTrue(from_ints.equals(expected))
+
         # non-conforming
         self.assertRaises(ValueError, DatetimeIndex,
                           ['2000-01-01', '2000-01-02', '2000-01-04'],
diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py
diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx