Skip to content

Commit 1152d78

Browse files
committed
BUG: fix major performance issue in DatetimeIndex.union affecting join performance on irregular indexes, remedying #1046
1 parent 51250cc commit 1152d78

File tree

3 files changed

+26
-12
lines changed

3 files changed

+26
-12
lines changed

pandas/core/generic.py

+8-6
Original file line number | Diff line number | Diff line change
@@ -132,7 +132,8 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True):
132132
return groupby(self, by, axis=axis, level=level, as_index=as_index,
133133
sort=sort)
134134

135-
def convert(self, rule, method='pad', how='last', axis=0, as_index=True):
135+
def convert(self, rule, method='pad', how='last', axis=0, as_index=True,
136+
closed='right', label='right'):
136137
"""
137138
Convenience method for frequency conversion and resampling of regular
138139
time-series data.
@@ -143,6 +144,10 @@ def convert(self, rule, method='pad', how='last', axis=0, as_index=True):
143144
how : string, method for down- or re-sampling, default 'last'
144145
method : string, method for upsampling, default 'pad'
145146
axis : int, optional, default 0
147+
closed : {'right', 'left'}, default 'right'
148+
Which side of bin interval is closed
149+
label : {'right', 'left'}, default 'right'
150+
Which bin edge label to label bucket with
146151
as_index : see synonymous argument of groupby
147152
"""
148153
from pandas.core.groupby import TimeGrouper, translate_grouping
@@ -154,14 +159,11 @@ def convert(self, rule, method='pad', how='last', axis=0, as_index=True):
154159
if not isinstance(idx, DatetimeIndex):
155160
raise ValueError("Cannot call convert with non-DatetimeIndex")
156161

157-
if idx.offset is None:
158-
raise ValueError("Cannot call convert with non-regular index")
159-
160162
if not isinstance(rule, datetools.DateOffset):
161163
raise ValueError("Rule not a recognized offset")
162164

163-
interval = TimeGrouper(rule, label='right',
164-
closed='right', _obj=self)
165+
interval = TimeGrouper(rule, label=label,
166+
closed=closed, _obj=self)
165167

166168
currfreq = len(idx)
167169
targfreq = len(interval.binner) - 2 # since binner extends endpoints

pandas/core/index.py

+1-2
Original file line number | Diff line number | Diff line change
@@ -457,8 +457,7 @@ def union(self, other):
457457

458458
if len(indexer) > 0:
459459
other_diff = other.values.take(indexer)
460-
result = list(self) + list(other_diff)
461-
# timsort wins
460+
result = np.concatenate((self.values, other_diff))
462461
try:
463462
result.sort()
464463
except Exception:

vb_suite/timeseries.py

+17-4
Original file line number | Diff line number | Diff line change
@@ -2,13 +2,14 @@
22
from datetime import datetime
33

44
common_setup = """from pandas_vb_common import *
5+
N = 100000
6+
57
try:
6-
rng = date_range('1/1/2000', periods=100000, freq='min')
8+
rng = date_range('1/1/2000', periods=N, freq='min')
79
except NameError:
8-
rng = DateRange('1/1/2000', periods=100000,
9-
offset=datetools.Minute())
10+
rng = DateRange('1/1/2000', periods=N, offset=datetools.Minute())
1011
11-
ts = Series(np.random.randn(100000), index=rng)
12+
ts = Series(np.random.randn(N), index=rng)
1213
"""
1314

1415
#----------------------------------------------------------------------
@@ -28,3 +29,15 @@
2829

2930
timeseries_1min_5min_mean = Benchmark("ts[:10000].convert('5min', how='mean')",
3031
common_setup)
32+
33+
#----------------------------------------------------------------------
34+
# Irregular alignment
35+
36+
setup = common_setup + """
37+
lindex = np.random.permutation(N)[:N // 2]
38+
rindex = np.random.permutation(N)[:N // 2]
39+
left = Series(ts.values.take(lindex), index=ts.index.take(lindex))
40+
right = Series(ts.values.take(rindex), index=ts.index.take(rindex))
41+
"""
42+
43+
timeseries_add_irregular = Benchmark('left + right', setup)

0 commit comments

Comments
 (0)