diff --git a/RELEASE.rst b/RELEASE.rst index 8c0d56666f4e1..aeaebd88c5ee7 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -122,6 +122,8 @@ pandas 0.11.1 - Fix ``read_csv`` to correctly encode identical na_values, e.g. ``na_values=[-999.0,-999]`` was failing (GH3611_) - Fix indexing issue in ndim >= 3 with ``iloc`` (GH3617_) + - Correctly parse date columns with embedded (nan/NaT) into datetime64[ns] dtype in ``read_csv`` + when ``parse_dates`` is specified (GH3062_) .. _GH3164: https://github.com/pydata/pandas/issues/3164 .. _GH2786: https://github.com/pydata/pandas/issues/2786 @@ -172,6 +174,7 @@ pandas 0.11.1 .. _GH3617: https://github.com/pydata/pandas/issues/3617 .. _GH3435: https://github.com/pydata/pandas/issues/3435 .. _GH3611: https://github.com/pydata/pandas/issues/3611 +.. _GH3062: https://github.com/pydata/pandas/issues/3062 .. _GH1512: https://github.com/pydata/pandas/issues/1512 diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 4a9004b7068ba..38a31c042d120 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -531,6 +531,28 @@ def test_custom_na_values(self): skiprows=[1]) assert_almost_equal(df3.values, expected) + def test_nat_parse(self): + + # GH 3062 + df = DataFrame(dict({ + 'A' : np.asarray(range(10),dtype='float64'), + 'B' : pd.Timestamp('20010101') })) + df.iloc[3:6,:] = np.nan + + with ensure_clean('__nat_parse_.csv') as path: + df.to_csv(path) + result = read_csv(path,index_col=0,parse_dates=['B']) + tm.assert_frame_equal(result,df) + + expected = Series(dict( A = 'float64',B = 'datetime64[ns]')) + tm.assert_series_equal(expected,result.dtypes) + + # test with NaT for the nan_rep + # we don't have a method to specify the Datetime na_rep (it defaults to '') + df.to_csv(path) + result = read_csv(path,index_col=0,parse_dates=['B']) + tm.assert_frame_equal(result,df) + def test_skiprows_bug(self): # GH #505 text = """#foo,a,b,c diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 
f9c1b2329c16d..a633b9482da06 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -318,8 +318,10 @@ class Timestamp(_Timestamp): ts.dts.us, ts.tzinfo) +_nat_strings = set(['NaT','nat','NAT','nan','NaN','NAN']) class NaTType(_NaT): """(N)ot-(A)-(T)ime, the time equivalent of NaN""" + def __new__(cls): cdef _NaT base @@ -647,8 +649,11 @@ cdef convert_to_tsobject(object ts, object tz): obj.value = ts pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) elif util.is_string_object(ts): - _string_to_dts(ts, &obj.dts) - obj.value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &obj.dts) + if ts in _nat_strings: + obj.value = NPY_NAT + else: + _string_to_dts(ts, &obj.dts) + obj.value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &obj.dts) elif PyDateTime_Check(ts): if tz is not None: # sort of a temporary hack @@ -862,6 +867,10 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, iresult[i] = iNaT continue + elif val in _nat_strings: + iresult[i] = iNaT + continue + _string_to_dts(val, &dts) iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts)