Skip to content

Commit 42cc66d

Browse files
sinhrks authored and jreback committed
PERF/BUG: improve factorize for datetimetz (#13750)
1 parent 964b7bb commit 42cc66d

File tree

6 files changed

+79
-13
lines changed

6 files changed

+79
-13
lines changed

asv_bench/benchmarks/algorithms.py

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import numpy as np
2+
import pandas as pd
3+
4+
5+
class algorithm(object):
    """asv benchmarks timing ``factorize`` on an integer and a float index."""

    goal_time = 0.2

    def setup(self):
        """Build the indexes to factorize; every element is repeated 5 times."""
        N = 100000
        self.int = pd.Int64Index(np.arange(N).repeat(5))
        self.float = pd.Float64Index(np.random.randn(N).repeat(5))

    def time_int_factorize(self):
        """Time ``factorize`` on the integer index built in ``setup``."""
        self.int.factorize()

    def time_float_factorize(self):
        """Time ``factorize`` on the float index built in ``setup``."""
        # Must target ``self.float``: calling ``self.int.factorize()`` here
        # would just duplicate the integer benchmark and leave the float
        # index unmeasured.
        self.float.factorize()

asv_bench/benchmarks/period.py

+16
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,22 @@ def time_period_index(self):
99
PeriodIndex(date_range('1985', periods=1000).to_pydatetime(), freq='D')
1010

1111

12+
class period_setitem(object):
13+
goal_time = 0.2
14+
15+
def setup(self):
16+
self.N = 100000
17+
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
18+
if hasattr(Series, 'convert'):
19+
Series.resample = Series.convert
20+
self.ts = Series(np.random.randn(self.N), index=self.rng)
21+
self.rng = period_range(start='1/1/1990', freq='S', periods=20000)
22+
self.df = DataFrame(index=range(len(self.rng)))
23+
24+
def time_period_setitem(self):
25+
self.df['col'] = self.rng
26+
27+
1228
class period_algorithm(object):
1329
goal_time = 0.2
1430

asv_bench/benchmarks/timeseries.py

+10-10
Original file line numberDiff line numberDiff line change
@@ -218,20 +218,20 @@ def time_dti_reset_index_tz(self):
218218
self.df.reset_index()
219219

220220

221-
class period_setitem(object):
221+
class datetime_algorithm(object):
222222
goal_time = 0.2
223223

224224
def setup(self):
225-
self.N = 100000
226-
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
227-
if hasattr(Series, 'convert'):
228-
Series.resample = Series.convert
229-
self.ts = Series(np.random.randn(self.N), index=self.rng)
230-
self.rng = period_range(start='1/1/1990', freq='S', periods=20000)
231-
self.df = DataFrame(index=range(len(self.rng)))
225+
N = 100000
226+
self.dti = pd.date_range('2011-01-01', freq='H', periods=N).repeat(5)
227+
self.dti_tz = pd.date_range('2011-01-01', freq='H', periods=N,
228+
tz='Asia/Tokyo').repeat(5)
229+
230+
def time_dti_factorize(self):
231+
self.dti.factorize()
232232

233-
def time_period_setitem(self):
234-
self.df['col'] = self.rng
233+
def time_dti_tz_factorize(self):
234+
self.dti_tz.factorize()
235235

236236

237237
class timeseries_1min_5min_mean(object):

doc/source/whatsnew/v0.19.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -652,6 +652,8 @@ Performance Improvements
652652
- Improved performance of ``Index.difference`` (:issue:`12044`)
653653
- Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`)
654654
- Improved performance of hashing ``Period`` (:issue:`12817`)
655+
- Improved performance of ``factorize`` of datetime with timezone (:issue:`13750`)
656+
655657

656658

657659
.. _whatsnew_0190.bug_fixes:
@@ -738,6 +740,7 @@ Bug Fixes
738740
- Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`)
739741
- Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`)
740742
- Clean some compile time warnings in datetime parsing (:issue:`13607`)
743+
- Bug in ``factorize`` where ``AmbiguousTimeError`` was raised if the data contains a datetime near a DST boundary (:issue:`13750`)
741744

742745
- Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`)
743746
- Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`)

pandas/core/algorithms.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
293293
is_datetimetz_type = is_datetimetz(values)
294294
if is_datetimetz_type:
295295
values = DatetimeIndex(values)
296-
vals = values.tz_localize(None)
296+
vals = values.asi8
297297

298298
is_datetime = is_datetime64_dtype(vals)
299299
is_timedelta = is_timedelta64_dtype(vals)
@@ -313,8 +313,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
313313

314314
if is_datetimetz_type:
315315
# reset tz
316-
uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize(
317-
values.tz)
316+
uniques = values._shallow_copy(uniques)
318317
elif is_datetime:
319318
uniques = uniques.astype('M8[ns]')
320319
elif is_timedelta:

pandas/tseries/tests/test_timeseries.py

+31
Original file line numberDiff line numberDiff line change
@@ -3826,6 +3826,37 @@ def test_factorize(self):
38263826
self.assert_numpy_array_equal(arr, exp_arr)
38273827
tm.assert_index_equal(idx, idx3)
38283828

3829+
def test_factorize_tz(self):
3830+
# GH 13750
3831+
for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']:
3832+
base = pd.date_range('2016-11-05', freq='H', periods=100, tz=tz)
3833+
idx = base.repeat(5)
3834+
3835+
exp_arr = np.arange(100).repeat(5)
3836+
3837+
for obj in [idx, pd.Series(idx)]:
3838+
arr, res = obj.factorize()
3839+
self.assert_numpy_array_equal(arr, exp_arr)
3840+
tm.assert_index_equal(res, base)
3841+
3842+
def test_factorize_dst(self):
3843+
# GH 13750
3844+
idx = pd.date_range('2016-11-06', freq='H', periods=12,
3845+
tz='US/Eastern')
3846+
3847+
for obj in [idx, pd.Series(idx)]:
3848+
arr, res = obj.factorize()
3849+
self.assert_numpy_array_equal(arr, np.arange(12))
3850+
tm.assert_index_equal(res, idx)
3851+
3852+
idx = pd.date_range('2016-06-13', freq='H', periods=12,
3853+
tz='US/Eastern')
3854+
3855+
for obj in [idx, pd.Series(idx)]:
3856+
arr, res = obj.factorize()
3857+
self.assert_numpy_array_equal(arr, np.arange(12))
3858+
tm.assert_index_equal(res, idx)
3859+
38293860
def test_slice_with_negative_step(self):
38303861
ts = Series(np.arange(20),
38313862
date_range('2014-01-01', periods=20, freq='MS'))

0 commit comments

Comments
 (0)