diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index fad4c7e3d5d0a..60fd950bfe725 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -522,6 +522,7 @@ Bug Fixes - Bug in not treating ``NaT`` as a missing value in datetimelikes when factorizing & with ``Categoricals`` (:issue:`12077`) - Bug in getitem when the values of a ``Series`` were tz-aware (:issue:`12089`) - Bug in ``Series.str.get_dummies`` when one of the variables was 'name' (:issue:`12180`) +- Bug in ``pd.concat`` while concatenating tz-aware NaT series (:issue:`11693`, :issue:`11755`) diff --git a/pandas/core/common.py b/pandas/core/common.py index 508765896e275..da20fd75ceb35 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1643,7 +1643,13 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'): raise TypeError("cannot convert datetimelike to " "dtype [%s]" % dtype) elif is_datetime64tz: - pass + + # our NaT doesn't support tz's + # this will coerce to DatetimeIndex with + # a matching dtype below + if lib.isscalar(value) and isnull(value): + value = [value] + elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE): if dtype.name == 'timedelta64[ns]': dtype = _TD_DTYPE @@ -1651,7 +1657,7 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'): raise TypeError("cannot convert timedeltalike to " "dtype [%s]" % dtype) - if np.isscalar(value): + if lib.isscalar(value): if value == tslib.iNaT or isnull(value): value = tslib.iNaT else: diff --git a/pandas/core/series.py b/pandas/core/series.py index 49182951c0e9d..68ae58737916b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2903,7 +2903,7 @@ def create_from_value(value, index, dtype): # return a new empty value suitable for the dtype if is_datetimetz(dtype): - subarr = DatetimeIndex([value] * len(index)) + subarr = DatetimeIndex([value] * len(index), dtype=dtype) else: if not isinstance(dtype, (np.dtype, type(np.dtype))): dtype = 
dtype.dtype @@ -2937,7 +2937,8 @@ def create_from_value(value, index, dtype): # a 1-element ndarray if len(subarr) != len(index) and len(subarr) == 1: - subarr = create_from_value(subarr[0], index, subarr) + subarr = create_from_value(subarr[0], index, + subarr.dtype) elif subarr.ndim > 1: if isinstance(data, np.ndarray): diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index de505b93da241..b0ca07e84f7ce 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -108,10 +108,6 @@ def test_construction_with_alt(self): expected = i.tz_localize(None).tz_localize('UTC') self.assert_index_equal(i2, expected) - i2 = DatetimeIndex(i, tz='UTC') - expected = i.tz_convert('UTC') - self.assert_index_equal(i2, expected) - # incompat tz/dtype self.assertRaises(ValueError, lambda: DatetimeIndex( i.tz_localize(None).asi8, dtype=i.dtype, tz='US/Pacific')) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 9133cc2c5a020..6ae24bbccfa74 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -473,6 +473,11 @@ def test_constructor_with_datetime_tz(self): self.assertTrue(s.dtype == 'object') self.assertTrue(lib.infer_dtype(s) == 'datetime') + # with all NaT + s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]') + expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern')) + assert_series_equal(s, expected) + def test_constructor_periodindex(self): # GH7932 # converting a PeriodIndex when put in a Series diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 753bbccf850e4..30c2621cd64ef 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3943,10 +3943,17 @@ def test_groupby_multi_timezone(self): result = df.groupby('tz').date.apply( lambda x: pd.to_datetime(x).dt.tz_localize(x.name)) - expected = 
pd.to_datetime(Series( - ['2000-01-28 22:47:00', '2000-01-29 22:48:00', - '2000-01-31 00:49:00', '2000-01-31 22:50:00', - '2000-01-01 21:50:00'])) + expected = Series([Timestamp('2000-01-28 16:47:00-0600', + tz='America/Chicago'), + Timestamp('2000-01-29 16:48:00-0600', + tz='America/Chicago'), + Timestamp('2000-01-30 16:49:00-0800', + tz='America/Los_Angeles'), + Timestamp('2000-01-31 16:50:00-0600', + tz='America/Chicago'), + Timestamp('2000-01-01 16:50:00-0500', + tz='America/New_York')], + dtype=object) assert_series_equal(result, expected) tz = 'America/Chicago' diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 8200989ff84d2..fdf38a0869a0b 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -1024,6 +1024,63 @@ def test_merge_on_datetime64tz(self): result = pd.merge(left, right, on='key', how='outer') assert_frame_equal(result, expected) + def test_concat_NaT_series(self): + # GH 11693 + # test for merging NaT series with datetime series. 
+ x = Series(date_range('20151124 08:00', '20151124 09:00', + freq='1h', tz='US/Eastern')) + y = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]') + expected = Series([x[0], x[1], pd.NaT, pd.NaT]) + + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # all NaT with tz + expected = Series(pd.NaT, index=range(4), + dtype='datetime64[ns, US/Eastern]') + result = pd.concat([y, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # without tz + x = pd.Series(pd.date_range('20151124 08:00', + '20151124 09:00', freq='1h')) + y = pd.Series(pd.date_range('20151124 10:00', + '20151124 11:00', freq='1h')) + y[:] = pd.NaT + expected = pd.Series([x[0], x[1], pd.NaT, pd.NaT]) + result = pd.concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # all NaT without tz + x[:] = pd.NaT + expected = pd.Series(pd.NaT, index=range(4), + dtype='datetime64[ns]') + result = pd.concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + def test_concat_tz_series(self): + # GH 11755 + # tz and no tz + x = Series(date_range('20151124 08:00', + '20151124 09:00', + freq='1h', tz='UTC')) + y = Series(date_range('2012-01-01', '2012-01-02')) + expected = Series([x[0], x[1], y[0], y[1]], + dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # GH 11887 + # concat tz and object + x = Series(date_range('20151124 08:00', + '20151124 09:00', + freq='1h', tz='UTC')) + y = Series(['a', 'b']) + expected = Series([x[0], x[1], y[0], y[1]], + dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + def test_indicator(self): # PR #10054. xref #7412 and closes #8790. 
df1 = DataFrame({'col1': [0, 1], 'col_left': [ diff --git a/pandas/tseries/common.py b/pandas/tseries/common.py index f9f90a9377f76..5c31d79dc6780 100644 --- a/pandas/tseries/common.py +++ b/pandas/tseries/common.py @@ -255,14 +255,16 @@ def _concat_compat(to_concat, axis=0): def convert_to_pydatetime(x, axis): # coerce to an object dtype - if x.dtype == _NS_DTYPE: - if hasattr(x, 'tz'): + # if dtype is of datetimetz or timezone + if x.dtype.kind == _NS_DTYPE.kind: + if getattr(x, 'tz', None) is not None: x = x.asobject + else: + shape = x.shape + x = tslib.ints_to_pydatetime(x.view(np.int64).ravel()) + x = x.reshape(shape) - shape = x.shape - x = tslib.ints_to_pydatetime(x.view(np.int64).ravel()) - x = x.reshape(shape) elif x.dtype == _TD_DTYPE: shape = x.shape x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel()) @@ -275,6 +277,12 @@ def convert_to_pydatetime(x, axis): # datetimetz if 'datetimetz' in typs: + # if to_concat has 'datetime' or 'object' + # then we need to coerce to object + if 'datetime' in typs or 'object' in typs: + to_concat = [convert_to_pydatetime(x, axis) for x in to_concat] + return np.concatenate(to_concat, axis=axis) + # we require ALL of the same tz for datetimetz tzs = set([getattr(x, 'tz', None) for x in to_concat]) - set([None]) if len(tzs) == 1: diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index f7223f803c41a..a632913fbe4fe 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -242,6 +242,19 @@ def __new__(cls, data=None, raise ValueError("Must provide freq argument if no data is " "supplied") + # if dtype has an embedded tz, capture it + if dtype is not None: + try: + dtype = DatetimeTZDtype.construct_from_string(dtype) + dtz = getattr(dtype, 'tz', None) + if dtz is not None: + if tz is not None and str(tz) != str(dtz): + raise ValueError("cannot supply both a tz and a dtype" + " with a tz") + tz = dtz + except TypeError: + pass + if data is None: return cls._generate(start, end, periods, name, 
freq, tz=tz, normalize=normalize, closed=closed, @@ -272,7 +285,15 @@ def __new__(cls, data=None, data.name = name if tz is not None: - return data.tz_localize(tz, ambiguous=ambiguous) + + # we might already be localized to this tz + # so passing the same tz is ok + # however any other tz is a no-no + if data.tz is None: + return data.tz_localize(tz, ambiguous=ambiguous) + elif str(tz) != str(data.tz): + raise TypeError("Already tz-aware, use tz_convert " + "to convert.") return data @@ -288,6 +309,12 @@ def __new__(cls, data=None, if tz is None: tz = data.tz + else: + # the tz's must match + if str(tz) != str(data.tz): + raise TypeError("Already tz-aware, use tz_convert " + "to convert.") + subarr = data.values if freq is None: diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index a011652b7f2e2..99cada26464cb 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -74,7 +74,7 @@ def test_index_unique(self): dups_local = self.dups.index.tz_localize('US/Eastern') dups_local.name = 'foo' result = dups_local.unique() - expected = DatetimeIndex(expected, tz='US/Eastern') + expected = DatetimeIndex(expected).tz_localize('US/Eastern') self.assertTrue(result.tz is not None) self.assertEqual(result.name, 'foo') self.assertTrue(result.equals(expected)) @@ -2473,6 +2473,40 @@ def test_constructor_datetime64_tzformat(self): tz='Asia/Tokyo') self.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) + def test_constructor_dtype(self): + + # passing a dtype with a tz should localize + idx = DatetimeIndex(['2013-01-01', + '2013-01-02'], + dtype='datetime64[ns, US/Eastern]') + expected = DatetimeIndex(['2013-01-01', '2013-01-02'] + ).tz_localize('US/Eastern') + self.assertTrue(idx.equals(expected)) + + idx = DatetimeIndex(['2013-01-01', + '2013-01-02'], + tz='US/Eastern') + self.assertTrue(idx.equals(expected)) + + # if we already have a tz and its not the same, then raise + idx = 
DatetimeIndex(['2013-01-01', '2013-01-02'], + dtype='datetime64[ns, US/Eastern]') + + self.assertRaises(ValueError, + lambda: DatetimeIndex(idx, + dtype='datetime64[ns]')) + + # this is effectively trying to convert tz's + self.assertRaises(TypeError, + lambda: DatetimeIndex(idx, + dtype='datetime64[ns, CET]')) + self.assertRaises(ValueError, + lambda: DatetimeIndex( + idx, tz='CET', + dtype='datetime64[ns, US/Eastern]')) + result = DatetimeIndex(idx, dtype='datetime64[ns, US/Eastern]') + self.assertTrue(idx.equals(result)) + def test_constructor_name(self): idx = DatetimeIndex(start='2000-01-01', periods=1, freq='A', name='TEST') diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index be1c5af74a95d..49b8f2c19700c 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -3554,6 +3554,10 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): trans, deltas, typ = _get_dst_info(tz2) trans_len = len(trans) + # if all NaT, return all NaT + if (utc_dates==NPY_NAT).all(): + return utc_dates + # use first non-NaT element # if all-NaT, return all-NaT if (result==NPY_NAT).all():