Skip to content

Commit 27b7b1e

Browse files
committed
API: add DatetimeBlockTZ pandas-dev#8260
fix scalar comparisons vs None generally fix NaT formatting in Series TST: skip postgresql test with tz's update for msgpack Conflicts: pandas/core/base.py pandas/core/categorical.py pandas/core/format.py pandas/tests/test_base.py pandas/util/testing.py full interop for tz-aware Series & timedeltas pandas-dev#10763
1 parent 8eb950b commit 27b7b1e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

60 files changed

+2518
-1088
lines changed

asv_bench/asv.conf.json

+3-3
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,15 @@
1818
// If missing or the empty string, the tool will be automatically
1919
// determined by looking for tools on the PATH environment
2020
// variable.
21-
"environment_type": "",
21+
"environment_type": "conda",
2222

2323
// the base URL to show a commit for the project.
2424
"show_commit_url": "https://github.com/pydata/pandas/commit/",
2525

2626
// The Pythons you'd like to test against. If not provided, defaults
2727
// to the current version of Python used to run `asv`.
2828
// "pythons": ["2.7", "3.4"],
29-
"pythons": ["2.7", "3.4"],
29+
"pythons": ["2.7"],
3030

3131
// The matrix of dependencies to test. Each key is the name of a
3232
// package (in PyPI) and the values are version numbers. An empty
@@ -41,7 +41,7 @@
4141
"sqlalchemy": [],
4242
"scipy": [],
4343
"numexpr": [],
44-
"tables": [],
44+
"pytables": [],
4545
"openpyxl": [],
4646
"xlrd": [],
4747
"xlwt": []

asv_bench/benchmarks/binary_ops.py

+32-7
Original file line numberDiff line numberDiff line change
@@ -203,34 +203,59 @@ def time_series_timestamp_compare(self):
203203

204204
class timestamp_ops_diff1(object):
205205
goal_time = 0.2
206+
N = 1000000
206207

207208
def setup(self):
208-
self.N = 1000000
209-
self.s = Series(date_range('20010101', periods=self.N, freq='s'))
209+
self.s = self.create()
210+
211+
def create(self):
212+
return Series(date_range('20010101', periods=self.N, freq='s'))
210213

211214
def time_timestamp_ops_diff1(self):
212215
self.s.diff()
213216

217+
class timestamp_tz_ops_diff1(timestamp_ops_diff1):
218+
N = 10000
219+
220+
def create(self):
221+
return Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern'))
214222

215223
class timestamp_ops_diff2(object):
216224
goal_time = 0.2
225+
N = 1000000
217226

218227
def setup(self):
219-
self.N = 1000000
220-
self.s = Series(date_range('20010101', periods=self.N, freq='s'))
228+
self.s = self.create()
229+
230+
def create(self):
231+
return Series(date_range('20010101', periods=self.N, freq='s'))
221232

222233
def time_timestamp_ops_diff2(self):
223234
(self.s - self.s.shift())
224235

236+
class timestamp_tz_ops_diff2(timestamp_ops_diff2):
237+
N = 10000
238+
239+
def create(self):
240+
return Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern'))
225241

226242
class timestamp_series_compare(object):
227243
goal_time = 0.2
244+
N = 1000000
228245

229246
def setup(self):
230-
self.N = 1000000
231247
self.halfway = ((self.N // 2) - 1)
232-
self.s = Series(date_range('20010101', periods=self.N, freq='T'))
248+
self.s = self.create()
233249
self.ts = self.s[self.halfway]
234250

251+
def create(self):
252+
return Series(date_range('20010101', periods=self.N, freq='T'))
253+
235254
def time_timestamp_series_compare(self):
236-
(self.ts >= self.s)
255+
(self.ts >= self.s)
256+
257+
class timestamp_tz_series_compare(timestamp_series_compare):
258+
N = 10000
259+
260+
def create(self):
261+
return Series(date_range('20010101', periods=self.N, freq='T', tz='US/Eastern'))

doc/source/basics.rst

+11-4
Original file line numberDiff line numberDiff line change
@@ -1590,9 +1590,10 @@ dtypes
15901590
------
15911591

15921592
The main types stored in pandas objects are ``float``, ``int``, ``bool``,
1593-
``datetime64[ns]``, ``timedelta[ns]`` and ``object``. In addition these dtypes
1594-
have item sizes, e.g. ``int64`` and ``int32``. A convenient :attr:`~DataFrame.dtypes``
1595-
attribute for DataFrames returns a Series with the data type of each column.
1593+
``datetime64[ns]`` and ``datetime64[ns, tz]`` (in >= 0.17.0), ``timedelta64[ns]``, ``category`` (in >= 0.15.0), and ``object``. In addition these dtypes
1594+
have item sizes, e.g. ``int64`` and ``int32``. See :ref:`Series with TZ <timeseries.timezone_series>` for more detail on ``datetime64[ns, tz]`` dtypes.
1595+
1596+
A convenient :attr:`~DataFrame.dtypes` attribute for DataFrames returns a Series with the data type of each column.
15961597

15971598
.. ipython:: python
15981599
@@ -1814,8 +1815,14 @@ dtypes:
18141815
df['tdeltas'] = df.dates.diff()
18151816
df['uint64'] = np.arange(3, 6).astype('u8')
18161817
df['other_dates'] = pd.date_range('20130101', periods=3).values
1818+
df['tz_aware_dates'] = pd.date_range('20130101', periods=3, tz='US/Eastern')
18171819
df
18181820
1821+
And the dtypes
1822+
1823+
.. ipython:: python
1824+
1825+
df.dtypes
18191826
18201827
:meth:`~DataFrame.select_dtypes` has two parameters ``include`` and ``exclude`` that allow you to
18211828
say "give me the columns WITH these dtypes" (``include``) and/or "give the
@@ -1868,7 +1875,7 @@ All numpy dtypes are subclasses of ``numpy.generic``:
18681875
18691876
.. note::
18701877

1871-
Pandas also defines an additional ``category`` dtype, which is not integrated into the normal
1878+
Pandas also defines the types ``category``, and ``datetime64[ns, tz]``, which are not integrated into the normal
18721879
numpy hierarchy and won't show up with the above function.
18731880

18741881
.. note::

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ Highlights include:
5050

5151
- Release the Global Interpreter Lock (GIL) on some cython operations, see :ref:`here <whatsnew_0170.gil>`
5252
- The sorting API has been revamped to remove some long-time inconsistencies, see :ref:`here <whatsnew_0170.api_breaking.sorting>`
53+
- Support for a ``datetime64[ns]`` with timezones as a first-class dtype, see :ref:`here <whatsnew_0170.tz>`
5354
- The default for ``to_datetime`` will now be to ``raise`` when presented with unparseable formats,
5455
previously this would return the original input, see :ref:`here <whatsnew_0170.api_breaking.to_datetime>`
5556
- The default for ``dropna`` in ``HDFStore`` has changed to ``False``, to store by default all rows even

doc/source/timeseries.rst

+27
Original file line numberDiff line numberDiff line change
@@ -1745,3 +1745,30 @@ constructor as well as ``tz_localize``.
17451745
17461746
# tz_convert(None) is identical with tz_convert('UTC').tz_localize(None)
17471747
didx.tz_convert('UCT').tz_localize(None)
1748+
1749+
.. _timeseries.timezone_series:
1750+
1751+
TZ aware Dtypes
1752+
~~~~~~~~~~~~~~~
1753+
1754+
.. versionadded:: 0.17.0
1755+
1756+
``Series/DatetimeIndex`` with timezone-naive values are represented with a dtype of ``datetime64[ns]``.
1757+
1758+
.. ipython:: python
1759+
1760+
dr = pd.date_range('20130101',periods=3)
1761+
dr
1762+
s = Series(dr)
1763+
s
1764+
1765+
``Series/DatetimeIndex`` with timezone-aware values are represented with a dtype of ``datetime64[ns, tz]``.
1766+
1767+
.. ipython:: python
1768+
1769+
dr = pd.date_range('20130101',periods=3,tz='US/Eastern')
1770+
dr
1771+
s = Series(dr)
1772+
s
1773+
1774+
Both of these ``Series`` can be manipulated via the ``.dt`` accessor, see the :ref:`docs <basics.dt_accessors>` as well.

doc/source/whatsnew/v0.17.0.txt

+56
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ Highlights include:
3030

3131
- Release the Global Interpreter Lock (GIL) on some cython operations, see :ref:`here <whatsnew_0170.gil>`
3232
- The sorting API has been revamped to remove some long-time inconsistencies, see :ref:`here <whatsnew_0170.api_breaking.sorting>`
33+
- Support for a ``datetime64[ns]`` with timezones as a first-class dtype, see :ref:`here <whatsnew_0170.tz>`
3334
- The default for ``to_datetime`` will now be to ``raise`` when presented with unparseable formats,
3435
previously this would return the original input, see :ref:`here <whatsnew_0170.api_breaking.to_datetime>`
3536
- The default for ``dropna`` in ``HDFStore`` has changed to ``False``, to store by default all rows even
@@ -417,6 +418,58 @@ To keep the previous behaviour, you can use ``errors='ignore'``:
417418
Furthermore, ``pd.to_timedelta`` has gained a similar API, of ``errors='raise'|'ignore'|'coerce'``, and the ``coerce`` keyword
418419
has been deprecated in favor of ``errors='coerce'``.
419420

421+
.. _whatsnew_0170.tz:
422+
423+
Datetime with TZ
424+
~~~~~~~~~~~~~~~~
425+
426+
We are adding an implementation that natively supports datetime with timezones. A ``Series`` or a ``DataFrame`` column previously
427+
*could* be assigned a datetime with timezones, and would work as an ``object`` dtype. This had performance issues with a large
428+
number of rows. (:issue:`8260`, :issue:`10763`)
429+
430+
The new implementation allows for having a single-timezone across all rows, and operating on it in a performant manner.
431+
432+
.. ipython:: python
433+
434+
df = DataFrame({'A' : date_range('20130101',periods=3),
435+
'B' : date_range('20130101',periods=3,tz='US/Eastern'),
436+
'C' : date_range('20130101',periods=3,tz='CET')})
437+
df
438+
df.dtypes
439+
440+
.. ipython:: python
441+
442+
df.B
443+
df.B.dt.tz_localize(None)
444+
445+
This also uses a new dtype representation that is very similar in look-and-feel to its numpy cousin, ``datetime64[ns]``:
446+
447+
.. ipython:: python
448+
449+
df['B'].dtype
450+
type(df['B'].dtype)
451+
452+
.. note::
453+
454+
There is a slightly different string repr for the underlying ``DatetimeIndex`` as a result of the dtype changes, but
455+
functionally these are the same.
456+
457+
.. code-block:: python
458+
459+
In [1]: pd.date_range('20130101',periods=3,tz='US/Eastern')
460+
Out[1]: DatetimeIndex(['2013-01-01 00:00:00-05:00', '2013-01-02 00:00:00-05:00',
461+
'2013-01-03 00:00:00-05:00'],
462+
dtype='datetime64[ns]', freq='D', tz='US/Eastern')
463+
464+
In [2]: pd.date_range('20130101',periods=3,tz='US/Eastern').dtype
465+
Out[2]: dtype('<M8[ns]')
466+
467+
.. ipython:: python
468+
469+
pd.date_range('20130101',periods=3,tz='US/Eastern')
470+
pd.date_range('20130101',periods=3,tz='US/Eastern').dtype
471+
472+
420473
.. _whatsnew_0170.api_breaking.convert_objects:
421474

422475
Changes to convert_objects
@@ -824,6 +877,9 @@ Bug Fixes
824877
- Bug in incorrect computation of ``.mean()`` on ``timedelta64[ns]`` because of overflow (:issue:`9442`)
825878
- Bug in ``DataFrame.to_html(index=False)`` renders unnecessary ``name`` row (:issue:`10344`)
826879
- Bug in ``DataFrame.to_latex()`` the ``column_format`` argument could not be passed (:issue:`9402`)
880+
- Bug in ``DatetimeIndex`` when localizing with ``NaT`` (:issue:`10477`)
881+
- Bug in ``Series.dt`` ops in preserving meta-data (:issue:`10477`)
882+
- Bug in preserving ``NaT`` when passed in an otherwise invalid ``to_datetime`` construction (:issue:`10477`)
827883
- Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`)
828884
- Bug in ``to_datetime`` with invalid dates and formats supplied (:issue:`10154`)
829885
- Bug in ``Index.drop_duplicates`` dropping name(s) (:issue:`10115`)

pandas/core/algorithms.py

+17-5
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
206206
"""
207207
from pandas.core.series import Series
208208
from pandas.tools.tile import cut
209-
from pandas.tseries.period import PeriodIndex
209+
from pandas import Index, PeriodIndex, DatetimeIndex
210210

211211
name = getattr(values, 'name', None)
212212
values = Series(values).values
@@ -225,11 +225,15 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
225225

226226
dtype = values.dtype
227227
is_period = com.is_period_arraylike(values)
228+
is_datetimetz = com.is_datetimetz(values)
228229

229-
if com.is_datetime_or_timedelta_dtype(dtype) or is_period:
230+
if com.is_datetime_or_timedelta_dtype(dtype) or is_period or is_datetimetz:
230231

231232
if is_period:
232-
values = PeriodIndex(values, name=name)
233+
values = PeriodIndex(values)
234+
elif is_datetimetz:
235+
tz = getattr(values, 'tz', None)
236+
values = DatetimeIndex(values).tz_localize(None)
233237

234238
values = values.view(np.int64)
235239
keys, counts = htable.value_count_scalar64(values, dropna)
@@ -239,8 +243,14 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
239243
msk = keys != iNaT
240244
keys, counts = keys[msk], counts[msk]
241245

246+
# localize to the original tz if necessary
247+
if is_datetimetz:
248+
keys = DatetimeIndex(keys).tz_localize(tz)
249+
242250
# convert the keys back to the dtype we came in
243-
keys = keys.astype(dtype)
251+
else:
252+
keys = keys.astype(dtype)
253+
244254

245255
elif com.is_integer_dtype(dtype):
246256
values = com._ensure_int64(values)
@@ -257,7 +267,9 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
257267
keys = np.insert(keys, 0, np.NaN)
258268
counts = np.insert(counts, 0, mask.sum())
259269

260-
result = Series(counts, index=com._values_from_object(keys), name=name)
270+
if not isinstance(keys, Index):
271+
keys = Index(keys)
272+
result = Series(counts, index=keys, name=name)
261273

262274
if bins is not None:
263275
# TODO: This next line should be more efficient

pandas/core/base.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,11 @@ def base(self):
364364
""" return the base object if the memory of the underlying data is shared """
365365
return self.values.base
366366

367+
@property
368+
def _values(self):
369+
""" the internal implementation """
370+
return self.values
371+
367372
def max(self):
368373
""" The maximum value of the object """
369374
return nanops.nanmax(self.values)
@@ -397,6 +402,14 @@ def hasnans(self):
397402
""" return if I have any nans; enables various perf speedups """
398403
return com.isnull(self).any()
399404

405+
def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
406+
filter_type=None, **kwds):
407+
""" perform the reduction type operation if we can """
408+
func = getattr(self,name,None)
409+
if func is None:
410+
raise TypeError("{klass} cannot perform the operation {op}".format(klass=self.__class__.__name__,op=name))
411+
return func(**kwds)
412+
400413
def value_counts(self, normalize=False, sort=True, ascending=False,
401414
bins=None, dropna=True):
402415
"""
@@ -586,7 +599,7 @@ def drop_duplicates(self, keep='first', inplace=False):
586599
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
587600
@Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs)
588601
def duplicated(self, keep='first'):
589-
keys = com._ensure_object(self.values)
602+
keys = com._values_from_object(com._ensure_object(self.values))
590603
duplicated = lib.duplicated(keys, keep=keep)
591604
try:
592605
return self._constructor(duplicated,

pandas/core/categorical.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,14 @@
1212
import pandas.core.common as com
1313
from pandas.util.decorators import cache_readonly, deprecate_kwarg
1414

15-
from pandas.core.common import (CategoricalDtype, ABCSeries, ABCIndexClass, ABCCategoricalIndex,
15+
from pandas.core.common import (ABCSeries, ABCIndexClass, ABCPeriodIndex, ABCCategoricalIndex,
1616
isnull, notnull, is_dtype_equal,
1717
is_categorical_dtype, is_integer_dtype, is_object_dtype,
1818
_possibly_infer_to_datetimelike, get_dtype_kinds,
1919
is_list_like, is_sequence, is_null_slice, is_bool,
2020
_ensure_platform_int, _ensure_object, _ensure_int64,
2121
_coerce_indexer_dtype, take_1d)
22+
from pandas.core.dtypes import CategoricalDtype
2223
from pandas.util.terminal import get_terminal_size
2324
from pandas.core.config import get_option
2425

@@ -85,7 +86,7 @@ def f(self, other):
8586
def maybe_to_categorical(array):
8687
""" coerce to a categorical if a series is given """
8788
if isinstance(array, (ABCSeries, ABCCategoricalIndex)):
88-
return array.values
89+
return array._values
8990
return array
9091

9192
_codes_doc = """The category codes of this categorical.
@@ -231,7 +232,7 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F
231232

232233
# we are either a Series or a CategoricalIndex
233234
if isinstance(values, (ABCSeries, ABCCategoricalIndex)):
234-
values = values.values
235+
values = values._values
235236

236237
if ordered is None:
237238
ordered = values.ordered

0 commit comments

Comments
 (0)