Skip to content

API: return Index instead of array from DatetimeIndex field accessors (GH15022) #15589

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

34 changes: 34 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,40 @@ New Behavior:

s.map(lambda x: x.hour)


Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a ref here

.. _whatsnew_0200.api_breaking.index_dt_field:

Accessing datetime fields of Index now returns Index
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The datetime-related attributes (see :ref:`here <timeseries.components>`
for an overview) of ``DatetimeIndex``, ``PeriodIndex`` and ``TimedeltaIndex`` previously
returned numpy arrays; they now return a new ``Index`` object (:issue:`15022`).
The only exception is boolean fields, for which the return value is still a boolean
array instead of an ``Index`` (to support boolean indexing).

Previous Behavior:

.. code-block:: ipython

In [1]: idx = pd.date_range("2015-01-01", periods=5, freq='10H')

In [2]: idx.hour
Out[2]: array([ 0, 10, 20, 6, 16], dtype=int32)

New Behavior:

.. ipython:: python

idx = pd.date_range("2015-01-01", periods=5, freq='10H')
idx.hour

This has the advantage that specific ``Index`` methods are still available on the
result. On the other hand, this change may break existing code: unlike numpy
arrays, ``Index`` objects are not mutable (values cannot be set by indexing).
To get the original result, convert to a numpy array explicitly using
``np.asarray(idx.hour)``.

.. _whatsnew_0200.api_breaking.s3:

S3 File Handling
Expand Down
33 changes: 31 additions & 2 deletions pandas/tests/indexes/datetimes/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ def test_normalize(self):
class TestDatetime64(tm.TestCase):

def test_datetimeindex_accessors(self):

dti_naive = DatetimeIndex(freq='D', start=datetime(1998, 1, 1),
periods=365)
# GH 13303
Expand Down Expand Up @@ -255,6 +256,34 @@ def test_datetimeindex_accessors(self):
self.assertEqual(len(dti.is_year_end), 365)
self.assertEqual(len(dti.weekday_name), 365)

dti.name = 'name'

# non boolean accessors -> return Index
for accessor in ['year', 'month', 'day', 'hour', 'minute',
'second', 'microsecond', 'nanosecond',
'dayofweek', 'dayofyear', 'weekofyear',
'quarter', 'weekday_name']:
res = getattr(dti, accessor)
assert len(res) == 365
assert isinstance(res, Index)
assert res.name == 'name'

# boolean accessors -> return array
for accessor in ['is_month_start', 'is_month_end',
'is_quarter_start', 'is_quarter_end',
'is_year_start', 'is_year_end']:
res = getattr(dti, accessor)
assert len(res) == 365
assert isinstance(res, np.ndarray)

# test boolean indexing
res = dti[dti.is_quarter_start]
exp = dti[[0, 90, 181, 273]]
tm.assert_index_equal(res, exp)
res = dti[dti.is_leap_year]
exp = DatetimeIndex([], freq='D', tz=dti.tz, name='name')
tm.assert_index_equal(res, exp)

dti = DatetimeIndex(freq='BQ-FEB', start=datetime(1998, 1, 1),
periods=4)

Expand Down Expand Up @@ -313,5 +342,5 @@ def test_datetimeindex_accessors(self):
def test_nanosecond_field(self):
dti = DatetimeIndex(np.arange(10))

self.assert_numpy_array_equal(dti.nanosecond,
np.arange(10, dtype=np.int32))
self.assert_index_equal(dti.nanosecond,
pd.Index(np.arange(10, dtype=np.int64)))
4 changes: 2 additions & 2 deletions pandas/tests/indexes/period/test_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,8 @@ def test_constructor_arrays_negative_year(self):

pindex = PeriodIndex(year=years, quarter=quarters)

self.assert_numpy_array_equal(pindex.year, years)
self.assert_numpy_array_equal(pindex.quarter, quarters)
self.assert_index_equal(pindex.year, pd.Index(years))
self.assert_index_equal(pindex.quarter, pd.Index(quarters))

def test_constructor_invalid_quarters(self):
self.assertRaises(ValueError, PeriodIndex, year=lrange(2000, 2004),
Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/indexes/period/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -658,12 +658,12 @@ def test_negative_ordinals(self):

def test_pindex_fieldaccessor_nat(self):
idx = PeriodIndex(['2011-01', '2011-02', 'NaT',
'2012-03', '2012-04'], freq='D')
'2012-03', '2012-04'], freq='D', name='name')

exp = np.array([2011, 2011, -1, 2012, 2012], dtype=np.int64)
self.assert_numpy_array_equal(idx.year, exp)
exp = np.array([1, 2, -1, 3, 4], dtype=np.int64)
self.assert_numpy_array_equal(idx.month, exp)
exp = Index([2011, 2011, -1, 2012, 2012], dtype=np.int64, name='name')
self.assert_index_equal(idx.year, exp)
exp = Index([1, 2, -1, 3, 4], dtype=np.int64, name='name')
self.assert_index_equal(idx.month, exp)

def test_pindex_qaccess(self):
pi = PeriodIndex(['2Q05', '3Q05', '4Q05', '1Q06', '2Q06'], freq='Q')
Expand Down
24 changes: 14 additions & 10 deletions pandas/tests/indexes/timedeltas/test_timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ def test_total_seconds(self):
freq='s')
expt = [1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456. / 1e9,
1 * 86400 + 10 * 3600 + 11 * 60 + 13 + 100123456. / 1e9]
tm.assert_almost_equal(rng.total_seconds(), np.array(expt))
tm.assert_almost_equal(rng.total_seconds(), Index(expt))

# test Series
s = Series(rng)
Expand Down Expand Up @@ -486,16 +486,16 @@ def test_append_numpy_bug_1681(self):
def test_fields(self):
rng = timedelta_range('1 days, 10:11:12.100123456', periods=2,
freq='s')
self.assert_numpy_array_equal(rng.days, np.array(
[1, 1], dtype='int64'))
self.assert_numpy_array_equal(
self.assert_index_equal(rng.days, Index([1, 1], dtype='int64'))
self.assert_index_equal(
rng.seconds,
np.array([10 * 3600 + 11 * 60 + 12, 10 * 3600 + 11 * 60 + 13],
dtype='int64'))
self.assert_numpy_array_equal(rng.microseconds, np.array(
[100 * 1000 + 123, 100 * 1000 + 123], dtype='int64'))
self.assert_numpy_array_equal(rng.nanoseconds, np.array(
[456, 456], dtype='int64'))
Index([10 * 3600 + 11 * 60 + 12, 10 * 3600 + 11 * 60 + 13],
dtype='int64'))
self.assert_index_equal(
rng.microseconds,
Index([100 * 1000 + 123, 100 * 1000 + 123], dtype='int64'))
self.assert_index_equal(rng.nanoseconds,
Index([456, 456], dtype='int64'))

self.assertRaises(AttributeError, lambda: rng.hours)
self.assertRaises(AttributeError, lambda: rng.minutes)
Expand All @@ -509,6 +509,10 @@ def test_fields(self):
tm.assert_series_equal(s.dt.seconds, Series(
[10 * 3600 + 11 * 60 + 12, np.nan], index=[0, 1]))

# preserve name (GH15589)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might be better to add something to

pandas/tests/indexes/datetimelike.py. These are inherited by all of the datetimelike test indexes.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only problem is that they don't have a common field attribute.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I now ensured I have a test for each of period, timedelta, datetime that checks the name preservation, but indeed, ideally would have a test in datetimelike.py for that.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only problem is that they don't have a common field attribute.

you should simply run it for index._datetimelike_ops which are defined per-class

but no big deal

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that would indeed be a possibility, and just checked and eg also freq is included in this list (which has a different return type). So would start skipping those, which would also not be that clean.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah prob should just define these as fixtures I think, then would make it really easy

https://github.com/pandas-dev/pandas/blob/master/pandas/tests/series/test_datetime_values.py#L30

rng.name = 'name'
assert rng.days.name == 'name'

def test_freq_conversion(self):

# doc example
Expand Down
13 changes: 12 additions & 1 deletion pandas/tests/scalar/test_timestamp.py
Original file line number Diff line number Diff line change
Expand Up @@ -597,9 +597,20 @@ def test_nat_fields(self):
def test_nat_vector_field_access(self):
idx = DatetimeIndex(['1/1/2000', None, None, '1/4/2000'])

# non boolean fields
fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute',
'second', 'microsecond', 'nanosecond', 'week', 'dayofyear',
'days_in_month', 'is_leap_year']
'days_in_month']

for field in fields:
result = getattr(idx, field)
expected = [getattr(x, field) for x in idx]
self.assert_index_equal(result, pd.Index(expected))

# boolean fields
fields = ['is_leap_year']
# other boolean fields like 'is_month_start' and 'is_month_end'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose let's make an issue for this NaT enhancement?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, will open an issue for that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

# not yet supported by NaT

for field in fields:
result = getattr(idx, field)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/tools/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -1367,7 +1367,7 @@ def test_daily(self):
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
annual = pivot_annual(ts, 'D')

doy = ts.index.dayofyear
doy = np.asarray(ts.index.dayofyear)

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
doy[(~isleapyear(ts.index.year)) & (doy >= 60)] += 1
Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/tools/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ def test_datetimeindex(self):
# make sure that the ordering on datetimeindex is consistent
x = date_range('2000-01-01', periods=2)
result1, result2 = [Index(y).day for y in cartesian_product([x, x])]
expected1 = np.array([1, 1, 2, 2], dtype=np.int32)
expected2 = np.array([1, 2, 1, 2], dtype=np.int32)
tm.assert_numpy_array_equal(result1, expected1)
tm.assert_numpy_array_equal(result2, expected2)
expected1 = Index([1, 1, 2, 2])
expected2 = Index([1, 2, 1, 2])
tm.assert_index_equal(result1, expected1)
tm.assert_index_equal(result2, expected2)

def test_empty(self):
# product of empty factors
Expand Down
70 changes: 32 additions & 38 deletions pandas/tests/tseries/test_timezones.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,8 +358,8 @@ def test_field_access_localize(self):
dr = date_range('2011-10-02 00:00', freq='h', periods=10,
tz=self.tzstr('America/Atikokan'))

expected = np.arange(10, dtype=np.int32)
self.assert_numpy_array_equal(dr.hour, expected)
expected = Index(np.arange(10, dtype=np.int64))
self.assert_index_equal(dr.hour, expected)

def test_with_tz(self):
tz = self.tz('US/Central')
Expand Down Expand Up @@ -947,35 +947,35 @@ def test_tz_convert_hour_overflow_dst(self):
'2009-05-12 09:50:32']
tt = to_datetime(ts).tz_localize('US/Eastern')
ut = tt.tz_convert('UTC')
expected = np.array([13, 14, 13], dtype=np.int32)
self.assert_numpy_array_equal(ut.hour, expected)
expected = Index([13, 14, 13])
self.assert_index_equal(ut.hour, expected)

# sorted case UTC -> US/Eastern
ts = ['2008-05-12 13:50:00',
'2008-12-12 14:50:35',
'2009-05-12 13:50:32']
tt = to_datetime(ts).tz_localize('UTC')
ut = tt.tz_convert('US/Eastern')
expected = np.array([9, 9, 9], dtype=np.int32)
self.assert_numpy_array_equal(ut.hour, expected)
expected = Index([9, 9, 9])
self.assert_index_equal(ut.hour, expected)

# unsorted case US/Eastern -> UTC
ts = ['2008-05-12 09:50:00',
'2008-12-12 09:50:35',
'2008-05-12 09:50:32']
tt = to_datetime(ts).tz_localize('US/Eastern')
ut = tt.tz_convert('UTC')
expected = np.array([13, 14, 13], dtype=np.int32)
self.assert_numpy_array_equal(ut.hour, expected)
expected = Index([13, 14, 13])
self.assert_index_equal(ut.hour, expected)

# unsorted case UTC -> US/Eastern
ts = ['2008-05-12 13:50:00',
'2008-12-12 14:50:35',
'2008-05-12 13:50:32']
tt = to_datetime(ts).tz_localize('UTC')
ut = tt.tz_convert('US/Eastern')
expected = np.array([9, 9, 9], dtype=np.int32)
self.assert_numpy_array_equal(ut.hour, expected)
expected = Index([9, 9, 9])
self.assert_index_equal(ut.hour, expected)

def test_tz_convert_hour_overflow_dst_timestamps(self):
# Regression test for:
Expand All @@ -989,35 +989,35 @@ def test_tz_convert_hour_overflow_dst_timestamps(self):
Timestamp('2009-05-12 09:50:32', tz=tz)]
tt = to_datetime(ts)
ut = tt.tz_convert('UTC')
expected = np.array([13, 14, 13], dtype=np.int32)
self.assert_numpy_array_equal(ut.hour, expected)
expected = Index([13, 14, 13])
self.assert_index_equal(ut.hour, expected)

# sorted case UTC -> US/Eastern
ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'),
Timestamp('2008-12-12 14:50:35', tz='UTC'),
Timestamp('2009-05-12 13:50:32', tz='UTC')]
tt = to_datetime(ts)
ut = tt.tz_convert('US/Eastern')
expected = np.array([9, 9, 9], dtype=np.int32)
self.assert_numpy_array_equal(ut.hour, expected)
expected = Index([9, 9, 9])
self.assert_index_equal(ut.hour, expected)

# unsorted case US/Eastern -> UTC
ts = [Timestamp('2008-05-12 09:50:00', tz=tz),
Timestamp('2008-12-12 09:50:35', tz=tz),
Timestamp('2008-05-12 09:50:32', tz=tz)]
tt = to_datetime(ts)
ut = tt.tz_convert('UTC')
expected = np.array([13, 14, 13], dtype=np.int32)
self.assert_numpy_array_equal(ut.hour, expected)
expected = Index([13, 14, 13])
self.assert_index_equal(ut.hour, expected)

# unsorted case UTC -> US/Eastern
ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'),
Timestamp('2008-12-12 14:50:35', tz='UTC'),
Timestamp('2008-05-12 13:50:32', tz='UTC')]
tt = to_datetime(ts)
ut = tt.tz_convert('US/Eastern')
expected = np.array([9, 9, 9], dtype=np.int32)
self.assert_numpy_array_equal(ut.hour, expected)
expected = Index([9, 9, 9])
self.assert_index_equal(ut.hour, expected)

def test_tslib_tz_convert_trans_pos_plus_1__bug(self):
# Regression test for tslib.tz_convert(vals, tz1, tz2).
Expand All @@ -1028,9 +1028,8 @@ def test_tslib_tz_convert_trans_pos_plus_1__bug(self):
idx = idx.tz_localize('UTC')
idx = idx.tz_convert('Europe/Moscow')

expected = np.repeat(np.array([3, 4, 5], dtype=np.int32),
np.array([n, n, 1]))
self.assert_numpy_array_equal(idx.hour, expected)
expected = np.repeat(np.array([3, 4, 5]), np.array([n, n, 1]))
self.assert_index_equal(idx.hour, Index(expected))

def test_tslib_tz_convert_dst(self):
for freq, n in [('H', 1), ('T', 60), ('S', 3600)]:
Expand All @@ -1039,62 +1038,57 @@ def test_tslib_tz_convert_dst(self):
tz='UTC')
idx = idx.tz_convert('US/Eastern')
expected = np.repeat(np.array([18, 19, 20, 21, 22, 23,
0, 1, 3, 4, 5], dtype=np.int32),
0, 1, 3, 4, 5]),
np.array([n, n, n, n, n, n, n, n, n, n, 1]))
self.assert_numpy_array_equal(idx.hour, expected)
self.assert_index_equal(idx.hour, Index(expected))

idx = date_range('2014-03-08 18:00', '2014-03-09 05:00', freq=freq,
tz='US/Eastern')
idx = idx.tz_convert('UTC')
expected = np.repeat(np.array([23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
dtype=np.int32),
expected = np.repeat(np.array([23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
np.array([n, n, n, n, n, n, n, n, n, n, 1]))
self.assert_numpy_array_equal(idx.hour, expected)
self.assert_index_equal(idx.hour, Index(expected))

# End DST
idx = date_range('2014-11-01 23:00', '2014-11-02 09:00', freq=freq,
tz='UTC')
idx = idx.tz_convert('US/Eastern')
expected = np.repeat(np.array([19, 20, 21, 22, 23,
0, 1, 1, 2, 3, 4], dtype=np.int32),
0, 1, 1, 2, 3, 4]),
np.array([n, n, n, n, n, n, n, n, n, n, 1]))
self.assert_numpy_array_equal(idx.hour, expected)
self.assert_index_equal(idx.hour, Index(expected))

idx = date_range('2014-11-01 18:00', '2014-11-02 05:00', freq=freq,
tz='US/Eastern')
idx = idx.tz_convert('UTC')
expected = np.repeat(np.array([22, 23, 0, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10], dtype=np.int32),
7, 8, 9, 10]),
np.array([n, n, n, n, n, n, n, n, n,
n, n, n, 1]))
self.assert_numpy_array_equal(idx.hour, expected)
self.assert_index_equal(idx.hour, Index(expected))

# daily
# Start DST
idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D',
tz='UTC')
idx = idx.tz_convert('US/Eastern')
self.assert_numpy_array_equal(idx.hour,
np.array([19, 19], dtype=np.int32))
self.assert_index_equal(idx.hour, Index([19, 19]))

idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D',
tz='US/Eastern')
idx = idx.tz_convert('UTC')
self.assert_numpy_array_equal(idx.hour,
np.array([5, 5], dtype=np.int32))
self.assert_index_equal(idx.hour, Index([5, 5]))

# End DST
idx = date_range('2014-11-01 00:00', '2014-11-02 00:00', freq='D',
tz='UTC')
idx = idx.tz_convert('US/Eastern')
self.assert_numpy_array_equal(idx.hour,
np.array([20, 20], dtype=np.int32))
self.assert_index_equal(idx.hour, Index([20, 20]))

idx = date_range('2014-11-01 00:00', '2014-11-02 000:00', freq='D',
tz='US/Eastern')
idx = idx.tz_convert('UTC')
self.assert_numpy_array_equal(idx.hour,
np.array([4, 4], dtype=np.int32))
self.assert_index_equal(idx.hour, Index([4, 4]))

def test_tzlocal(self):
# GH 13583
Expand Down
Loading