Skip to content

Commit 0b3f24b

Browse files
mroeschke authored and jorisvandenbossche committed
CLN: ASV HDFStore benchmark (#18641)
* Add IO base class
1 parent 13f6267 commit 0b3f24b

File tree

3 files changed

+68
-72
lines changed

3 files changed

+68
-72
lines changed

asv_bench/benchmarks/hdfstore_bench.py

+39 -47
Original file line number | Diff line number | Diff line change
@@ -1,34 +1,40 @@
1-
from .pandas_vb_common import *
2-
import os
1+
import numpy as np
2+
from pandas import DataFrame, Panel, date_range, HDFStore
3+
import pandas.util.testing as tm
34

5+
from .pandas_vb_common import BaseIO, setup # noqa
46

5-
class HDF5(object):
6-
goal_time = 0.2
7-
8-
def setup(self):
9-
self.index = tm.makeStringIndex(25000)
10-
self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000),},
11-
index=self.index)
127

13-
self.df_mixed = DataFrame(
14-
{'float1': randn(25000), 'float2': randn(25000),
15-
'string1': (['foo'] * 25000),
16-
'bool1': ([True] * 25000),
17-
'int1': np.random.randint(0, 250000, size=25000),},
18-
index=self.index)
8+
class HDF5(BaseIO):
199

20-
self.df_wide = DataFrame(np.random.randn(25000, 100))
21-
22-
self.df2 = DataFrame({'float1': randn(25000), 'float2': randn(25000)},
23-
index=date_range('1/1/2000', periods=25000))
24-
self.df_wide2 = DataFrame(np.random.randn(25000, 100),
25-
index=date_range('1/1/2000', periods=25000))
10+
goal_time = 0.2
2611

27-
self.df_dc = DataFrame(np.random.randn(10000, 10),
28-
columns=[('C%03d' % i) for i in range(10)])
12+
def setup(self):
13+
N = 25000
14+
index = tm.makeStringIndex(N)
15+
self.df = DataFrame({'float1': np.random.randn(N),
16+
'float2': np.random.randn(N)},
17+
index=index)
18+
self.df_mixed = DataFrame({'float1': np.random.randn(N),
19+
'float2': np.random.randn(N),
20+
'string1': ['foo'] * N,
21+
'bool1': [True] * N,
22+
'int1': np.random.randint(0, N, size=N)},
23+
index=index)
24+
self.df_wide = DataFrame(np.random.randn(N, 100))
25+
self.start_wide = self.df_wide.index[10000]
26+
self.stop_wide = self.df_wide.index[15000]
27+
self.df2 = DataFrame({'float1': np.random.randn(N),
28+
'float2': np.random.randn(N)},
29+
index=date_range('1/1/2000', periods=N))
30+
self.start = self.df2.index[10000]
31+
self.stop = self.df2.index[15000]
32+
self.df_wide2 = DataFrame(np.random.randn(N, 100),
33+
index=date_range('1/1/2000', periods=N))
34+
self.df_dc = DataFrame(np.random.randn(N, 10),
35+
columns=['C%03d' % i for i in range(10)])
2936

3037
self.f = '__test__.h5'
31-
self.remove(self.f)
3238

3339
self.store = HDFStore(self.f)
3440
self.store.put('fixed', self.df)
@@ -42,12 +48,6 @@ def teardown(self):
4248
self.store.close()
4349
self.remove(self.f)
4450

45-
def remove(self, f):
46-
try:
47-
os.remove(f)
48-
except:
49-
pass
50-
5151
def time_read_store(self):
5252
self.store.get('fixed')
5353

@@ -82,14 +82,12 @@ def time_write_store_table_dc(self):
8282
self.store.append('table_dc_write', self.df_dc, data_columns=True)
8383

8484
def time_query_store_table_wide(self):
85-
start = self.df_wide2.index[10000]
86-
stop = self.df_wide2.index[15000]
87-
self.store.select('table_wide', where="index > start and index < stop")
85+
self.store.select('table_wide', where="index > self.start_wide and "
86+
"index < self.stop_wide")
8887

8988
def time_query_store_table(self):
90-
start = self.df2.index[10000]
91-
stop = self.df2.index[15000]
92-
self.store.select('table', where="index > start and index < stop")
89+
self.store.select('table', where="index > self.start and "
90+
"index < self.stop")
9391

9492
def time_store_repr(self):
9593
repr(self.store)
@@ -101,29 +99,23 @@ def time_store_info(self):
10199
self.store.info()
102100

103101

104-
class HDF5Panel(object):
102+
class HDF5Panel(BaseIO):
103+
105104
goal_time = 0.2
106105

107106
def setup(self):
108107
self.f = '__test__.h5'
109-
self.p = Panel(randn(20, 1000, 25),
110-
items=[('Item%03d' % i) for i in range(20)],
108+
self.p = Panel(np.random.randn(20, 1000, 25),
109+
items=['Item%03d' % i for i in range(20)],
111110
major_axis=date_range('1/1/2000', periods=1000),
112-
minor_axis=[('E%03d' % i) for i in range(25)])
113-
self.remove(self.f)
111+
minor_axis=['E%03d' % i for i in range(25)])
114112
self.store = HDFStore(self.f)
115113
self.store.append('p1', self.p)
116114

117115
def teardown(self):
118116
self.store.close()
119117
self.remove(self.f)
120118

121-
def remove(self, f):
122-
try:
123-
os.remove(f)
124-
except:
125-
pass
126-
127119
def time_read_store_table_panel(self):
128120
self.store.select('p1')
129121

asv_bench/benchmarks/io_bench.py

+9 -25
Original file line number | Diff line number | Diff line change
@@ -8,23 +8,7 @@
88
import timeit
99

1010

11-
class _BenchTeardown(object):
12-
"""
13-
base class for teardown method implementation
14-
"""
15-
fname = None
16-
17-
def remove(self, f):
18-
try:
19-
os.remove(f)
20-
except:
21-
pass
22-
23-
def teardown(self):
24-
self.remove(self.fname)
25-
26-
27-
class frame_to_csv(_BenchTeardown):
11+
class frame_to_csv(BaseIO):
2812
goal_time = 0.2
2913
fname = '__test__.csv'
3014

@@ -35,7 +19,7 @@ def time_frame_to_csv(self):
3519
self.df.to_csv(self.fname)
3620

3721

38-
class frame_to_csv2(_BenchTeardown):
22+
class frame_to_csv2(BaseIO):
3923
goal_time = 0.2
4024
fname = '__test__.csv'
4125

@@ -49,7 +33,7 @@ def time_frame_to_csv2(self):
4933
self.df.to_csv(self.fname)
5034

5135

52-
class frame_to_csv_date_formatting(_BenchTeardown):
36+
class frame_to_csv_date_formatting(BaseIO):
5337
goal_time = 0.2
5438
fname = '__test__.csv'
5539

@@ -61,7 +45,7 @@ def time_frame_to_csv_date_formatting(self):
6145
self.data.to_csv(self.fname, date_format='%Y%m%d')
6246

6347

64-
class frame_to_csv_mixed(_BenchTeardown):
48+
class frame_to_csv_mixed(BaseIO):
6549
goal_time = 0.2
6650
fname = '__test__.csv'
6751

@@ -114,7 +98,7 @@ def time_read_csv_infer_datetime_format_ymd(self):
11498
read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True)
11599

116100

117-
class read_csv_skiprows(_BenchTeardown):
101+
class read_csv_skiprows(BaseIO):
118102
goal_time = 0.2
119103
fname = '__test__.csv'
120104

@@ -127,7 +111,7 @@ def time_read_csv_skiprows(self):
127111
read_csv(self.fname, skiprows=10000)
128112

129113

130-
class read_csv_standard(_BenchTeardown):
114+
class read_csv_standard(BaseIO):
131115
goal_time = 0.2
132116
fname = '__test__.csv'
133117

@@ -174,7 +158,7 @@ def time_read_uint64_na_values(self):
174158
read_csv(StringIO(self.data1), header=None, na_values=self.na_values)
175159

176160

177-
class write_csv_standard(_BenchTeardown):
161+
class write_csv_standard(BaseIO):
178162
goal_time = 0.2
179163
fname = '__test__.csv'
180164

@@ -218,14 +202,14 @@ def time_read_nrows(self, compression, engine):
218202
compression=compression, engine=engine)
219203

220204

221-
class read_json_lines(_BenchTeardown):
205+
class read_json_lines(BaseIO):
222206
goal_time = 0.2
223207
fname = "__test__.json"
224208

225209
def setup(self):
226210
self.N = 100000
227211
self.C = 5
228-
self.df = DataFrame({('float{0}'.format(i), randn(self.N)) for i in range(self.C)})
212+
self.df = DataFrame({'float{0}'.format(i): randn(self.N) for i in range(self.C)})
229213
self.df.to_json(self.fname,orient="records",lines=True)
230214

231215
def time_read_json_lines(self):

asv_bench/benchmarks/pandas_vb_common.py

+20
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
from pandas import *
23
import pandas as pd
34
from numpy.random import randn
@@ -19,6 +20,25 @@
1920
def setup(*args, **kwargs):
2021
np.random.seed(1234)
2122

23+
24+
class BaseIO(object):
25+
"""
26+
Base class for IO benchmarks
27+
"""
28+
fname = None
29+
30+
def remove(self, f):
31+
"""Remove created files"""
32+
try:
33+
os.remove(f)
34+
except:
35+
# On Windows, attempting to remove a file that is in use
36+
# causes an exception to be raised
37+
pass
38+
39+
def teardown(self):
40+
self.remove(self.fname)
41+
2242
# try em until it works!
2343
for imp in ['pandas._libs.lib', 'pandas.lib', 'pandas_tseries']:
2444
try:

0 commit comments

Comments
 (0)