Skip to content

Commit b2a02bd

Browse files
mroeschke authored and jreback committed
CLN: ASV io_bench, parser_vb (#18815)
1 parent 26c6c19 commit b2a02bd

File tree

6 files changed

+376
-408
lines changed

6 files changed

+376
-408
lines changed

asv_bench/benchmarks/io/__init__.py

Whitespace-only changes.

asv_bench/benchmarks/io/csv.py

+249
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
import random
2+
import timeit
3+
import string
4+
5+
import numpy as np
6+
import pandas.util.testing as tm
7+
from pandas import DataFrame, Categorical, date_range, read_csv
8+
from pandas.compat import PY2
9+
from pandas.compat import cStringIO as StringIO
10+
11+
from ..pandas_vb_common import setup, BaseIO # noqa
12+
13+
14+
class ToCSV(BaseIO):
    """Benchmark ``DataFrame.to_csv`` for wide, long and mixed-dtype frames."""

    goal_time = 0.2
    fname = '__test__.csv'
    params = ['wide', 'long', 'mixed']
    param_names = ['kind']

    def setup(self, kind):
        # Build one frame per benchmark flavour, then pick the requested one.
        frames = {}
        frames['wide'] = DataFrame(np.random.randn(3000, 30))
        frames['long'] = DataFrame({'A': np.arange(50000),
                                    'B': np.arange(50000) + 1.,
                                    'C': np.arange(50000) + 2.,
                                    'D': np.arange(50000) + 3.})
        mixed = DataFrame({'float': np.random.randn(5000),
                           'int': np.random.randn(5000).astype(int),
                           'bool': (np.arange(5000) % 2) == 0,
                           'datetime': date_range('2001', freq='s',
                                                  periods=5000),
                           'object': ['foo'] * 5000})
        # Sprinkle in missing values so NaN formatting is exercised too.
        mixed.loc[30:500, 'float'] = np.nan
        frames['mixed'] = mixed
        self.df = frames[kind]

    def time_frame(self, kind):
        self.df.to_csv(self.fname)
42+
43+
44+
class ToCSVDatetime(BaseIO):
    """Benchmark ``to_csv`` date formatting for a datetime-valued frame."""

    goal_time = 0.2
    fname = '__test__.csv'

    def setup(self):
        # Both the values and the index are timestamps, so the formatter
        # runs for every cell written.
        dates = date_range('1/1/2000', periods=1000)
        self.data = DataFrame(dates, index=dates)

    def time_frame_date_formatting(self):
        self.data.to_csv(self.fname, date_format='%Y%m%d')
55+
56+
57+
class ReadCSVDInferDatetimeFormat(object):
    """Benchmark ``read_csv`` datetime parsing with and without
    ``infer_datetime_format`` across several date formats."""

    goal_time = 0.2
    params = ([True, False], ['custom', 'iso8601', 'ymd'])
    param_names = ['infer_datetime_format', 'format']

    def setup(self, infer_datetime_format, format):
        # One rendered timestamp per line, in the requested format.
        fmt = {'custom': '%m/%d/%Y %H:%M:%S.%f',
               'iso8601': '%Y-%m-%d %H:%M:%S',
               'ymd': '%Y%m%d'}[format]
        timestamps = date_range('1/1/2000', periods=1000)
        self.data = StringIO('\n'.join(timestamps.strftime(fmt).tolist()))

    def time_read_csv(self, infer_datetime_format, format):
        read_csv(self.data, header=None, names=['foo'], parse_dates=['foo'],
                 infer_datetime_format=infer_datetime_format)
74+
75+
76+
class ReadCSVSkipRows(BaseIO):
    """Benchmark ``read_csv`` with and without the ``skiprows`` option."""

    goal_time = 0.2
    fname = '__test__.csv'
    params = [None, 10000]
    param_names = ['skiprows']

    def setup(self, skiprows):
        nrows = 20000
        idx = tm.makeStringIndex(nrows)
        frame = DataFrame({'float1': np.random.randn(nrows),
                           'float2': np.random.randn(nrows),
                           'string1': ['foo'] * nrows,
                           'bool1': [True] * nrows,
                           'int1': np.random.randint(0, nrows, size=nrows)},
                          index=idx)
        frame.to_csv(self.fname)

    def time_skipprows(self, skiprows):
        read_csv(self.fname, skiprows=skiprows)
96+
97+
98+
class ReadUint64Integers(object):
    """Benchmark parsing of uint64 values, including NA substitution and a
    negative entry that forces the object fallback path."""

    goal_time = 0.2

    def setup(self):
        self.na_values = [2**63 + 500]
        # All values sit above the int64 range.
        values = np.arange(10000).astype('uint64') + 2**63
        self.data1 = StringIO('\n'.join(values.astype(str).tolist()))
        # Same data with one negative value mixed in.
        values = values.astype(object)
        values[500] = -1
        self.data2 = StringIO('\n'.join(values.astype(str).tolist()))

    def time_read_uint64(self):
        read_csv(self.data1, header=None, names=['foo'])

    def time_read_uint64_neg_values(self):
        read_csv(self.data2, header=None, names=['foo'])

    def time_read_uint64_na_values(self):
        read_csv(self.data1, header=None, names=['foo'],
                 na_values=self.na_values)
119+
120+
121+
class S3(object):
    # Verify that a handful of rows can be read from S3 without pulling
    # down the whole file.  Wall time (timeit.default_timer) is measured
    # instead of CPU time because the download dominates.
    timer = timeit.default_timer
    params = ([None, "gzip", "bz2"], ["python", "c"])
    param_names = ["compression", "engine"]

    def setup(self, compression, engine):
        if compression == "bz2" and engine == "c" and PY2:
            # The Python 2 C parser can't read bz2 from open files.
            raise NotImplementedError
        try:
            import s3fs
        except ImportError:
            # Skip these benchmarks when s3fs is not installed.
            raise NotImplementedError

        # Map the compression flavour onto the stored file's extension.
        extensions = {"gzip": ".gz", "bz2": ".bz2"}
        self.big_fname = ("s3://pandas-test/large_random.csv" +
                          extensions.get(compression, ""))

    def time_read_csv_10_rows(self, compression, engine):
        # Read a small number of rows from a huge (100,000 x 50) table.
        read_csv(self.big_fname, nrows=10, compression=compression,
                 engine=engine)
151+
152+
153+
class ReadCSVThousands(BaseIO):
    """Benchmark ``read_csv`` with a thousands separator in numeric columns."""

    goal_time = 0.2
    fname = '__test__.csv'
    params = ([',', '|'], [None, ','])
    param_names = ['sep', 'thousands']

    def setup(self, sep, thousands):
        nrows, ncols = 10000, 8
        values = np.random.randn(nrows, ncols) * np.random.randint(
            100, 10000, (nrows, ncols))
        frame = DataFrame(values)
        if thousands is not None:
            # Render every cell with the requested grouping character.
            template = '{' + ':{}'.format(thousands) + '}'
            frame = frame.applymap(lambda v: template.format(v))
        frame.to_csv(self.fname, sep=sep)

    def time_thousands(self, sep, thousands):
        read_csv(self.fname, sep=sep, thousands=thousands)
173+
174+
175+
class ReadCSVComment(object):
    """Benchmark ``read_csv`` when every data row carries a trailing comment."""

    goal_time = 0.2

    def setup(self):
        rows = ['A,B,C'] + ['1,2,3 # comment'] * 100000
        self.s_data = StringIO('\n'.join(rows))

    def time_comment(self):
        read_csv(self.s_data, comment='#', header=None, names=list('abc'))
185+
186+
187+
class ReadCSVFloatPrecision(object):
    """Benchmark float parsing under the different ``float_precision``
    settings, for both the C and the Python engines."""

    goal_time = 0.2
    params = ([',', ';'], ['.', '_'], [None, 'high', 'round_trip'])
    param_names = ['sep', 'decimal', 'float_precision']

    def setup(self, sep, decimal, float_precision):
        # Fifteen random 28-digit strings used as fractional parts.
        fractions = [''.join(random.choice(string.digits)
                             for _ in range(28))
                     for _ in range(15)]
        row = sep.join(['0{}'.format(decimal) + '{}'] * 3) + '\n'
        block = (row * 5).format(*fractions)
        self.s_data = StringIO(block * 200)  # 1000 x 3 strings csv

    def time_read_csv(self, sep, decimal, float_precision):
        read_csv(self.s_data, sep=sep, header=None, names=list('abc'),
                 float_precision=float_precision)

    def time_read_csv_python_engine(self, sep, decimal, float_precision):
        read_csv(self.s_data, sep=sep, header=None, engine='python',
                 float_precision=None, names=list('abc'))
208+
209+
210+
class ReadCSVCategorical(BaseIO):
    """Compare converting to categorical after the read versus asking
    ``read_csv`` for ``dtype='category'`` directly."""

    goal_time = 0.2
    fname = '__test__.csv'

    def setup(self):
        nrows = 100000
        choices = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']
        frame = DataFrame(np.random.choice(choices, (nrows, 3)),
                          columns=list('abc'))
        frame.to_csv(self.fname, index=False)

    def time_convert_post(self):
        read_csv(self.fname).apply(Categorical)

    def time_convert_direct(self):
        read_csv(self.fname, dtype='category')
226+
227+
228+
class ReadCSVParseDates(object):
    """Benchmark ``read_csv`` date parsing from one and from two columns."""

    goal_time = 0.2

    def setup(self):
        template = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
{},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
{},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n
{},21:00:00,21:18:00,-0.9900,2.0100,3.6000,0.0000,270.0000\n
{},22:00:00,21:56:00,-0.5900,1.7100,5.1000,0.0000,290.0000\n
"""
        # Every row starts with the same station/date pair.
        self.s_data = StringIO(template.format(*(['KORD,19990127'] * 5)))

    def time_multiple_date(self):
        read_csv(self.s_data, sep=',', header=None,
                 names=list(string.digits[:9]), parse_dates=[[1, 2], [1, 3]])

    def time_baseline(self):
        read_csv(self.s_data, sep=',', header=None, parse_dates=[1],
                 names=list(string.digits[:9]))

asv_bench/benchmarks/io/json.py

+127
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import numpy as np
2+
import pandas.util.testing as tm
3+
from pandas import DataFrame, date_range, timedelta_range, concat, read_json
4+
5+
from ..pandas_vb_common import setup, BaseIO # noqa
6+
7+
8+
class ReadJSON(BaseIO):
    """Benchmark ``read_json`` across orients and index types."""

    goal_time = 0.2
    fname = "__test__.json"
    params = (['split', 'index', 'records'], ['int', 'datetime'])
    param_names = ['orient', 'index']

    def setup(self, orient, index):
        nrows = 100000
        index_map = {'int': np.arange(nrows),
                     'datetime': date_range('20000101', periods=nrows,
                                            freq='H')}
        frame = DataFrame(np.random.randn(nrows, 5),
                          columns=['float_{}'.format(i) for i in range(5)],
                          index=index_map[index])
        frame.to_json(self.fname, orient=orient)

    def time_read_json(self, orient, index):
        read_json(self.fname, orient=orient)
26+
27+
28+
class ReadJSONLines(BaseIO):
    """Benchmark line-delimited ``read_json``, whole-file and chunked,
    tracking peak memory for both variants."""

    goal_time = 0.2
    fname = "__test_lines__.json"
    params = ['int', 'datetime']
    param_names = ['index']

    def setup(self, index):
        nrows = 100000
        index_map = {'int': np.arange(nrows),
                     'datetime': date_range('20000101', periods=nrows,
                                            freq='H')}
        frame = DataFrame(np.random.randn(nrows, 5),
                          columns=['float_{}'.format(i) for i in range(5)],
                          index=index_map[index])
        frame.to_json(self.fname, orient='records', lines=True)

    def time_read_json_lines(self, index):
        read_json(self.fname, orient='records', lines=True)

    def time_read_json_lines_concat(self, index):
        concat(read_json(self.fname, orient='records', lines=True,
                         chunksize=25000))

    def peakmem_read_json_lines(self, index):
        read_json(self.fname, orient='records', lines=True)

    def peakmem_read_json_lines_concat(self, index):
        concat(read_json(self.fname, orient='records', lines=True,
                         chunksize=25000))
57+
58+
59+
class ToJSON(BaseIO):
    """Benchmark ``DataFrame.to_json`` for several frame layouts, both with
    the parameterized ``orient`` and with ``orient='records', lines=True``."""

    goal_time = 0.2
    fname = "__test__.json"
    params = ['split', 'columns', 'index']
    param_names = ['orient']

    def setup(self, orient):
        # FIX: the parameter was named ``lines_orient`` although asv passes
        # the value declared in ``param_names`` ('orient'); renamed so the
        # signature is consistent with every timed method below.
        N = 10**5
        ncols = 5
        index = date_range('20000101', periods=N, freq='H')
        timedeltas = timedelta_range(start=1, periods=N, freq='s')
        datetimes = date_range(start=1, periods=N, freq='s')
        ints = np.random.randint(100000000, size=N)
        floats = np.random.randn(N)
        strings = tm.makeStringIndex(N)
        # Plain float frame with a default integer index.
        self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
        # Same floats, indexed by timestamps.
        self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index)
        # Mixed timedelta / int / timestamp columns.
        self.df_td_int_ts = DataFrame({'td_1': timedeltas,
                                       'td_2': timedeltas,
                                       'int_1': ints,
                                       'int_2': ints,
                                       'ts_1': datetimes,
                                       'ts_2': datetimes},
                                      index=index)
        # Mixed int / float columns.
        self.df_int_floats = DataFrame({'int_1': ints,
                                        'int_2': ints,
                                        'int_3': ints,
                                        'float_1': floats,
                                        'float_2': floats,
                                        'float_3': floats},
                                       index=index)
        # Mixed int / float / string columns.
        self.df_int_float_str = DataFrame({'int_1': ints,
                                           'int_2': ints,
                                           'float_1': floats,
                                           'float_2': floats,
                                           'str_1': strings,
                                           'str_2': strings},
                                          index=index)

    def time_floats_with_int_index(self, orient):
        self.df.to_json(self.fname, orient=orient)

    def time_floats_with_dt_index(self, orient):
        self.df_date_idx.to_json(self.fname, orient=orient)

    def time_delta_int_tstamp(self, orient):
        self.df_td_int_ts.to_json(self.fname, orient=orient)

    def time_float_int(self, orient):
        self.df_int_floats.to_json(self.fname, orient=orient)

    def time_float_int_str(self, orient):
        self.df_int_float_str.to_json(self.fname, orient=orient)

    # The *_lines variants always write 'records'/lines=True regardless of
    # the ``orient`` parameter; it is accepted only to satisfy asv's
    # parameterized-benchmark calling convention.
    def time_floats_with_int_idex_lines(self, orient):
        self.df.to_json(self.fname, orient='records', lines=True)

    def time_floats_with_dt_index_lines(self, orient):
        self.df_date_idx.to_json(self.fname, orient='records', lines=True)

    def time_delta_int_tstamp_lines(self, orient):
        self.df_td_int_ts.to_json(self.fname, orient='records', lines=True)

    def time_float_int_lines(self, orient):
        self.df_int_floats.to_json(self.fname, orient='records', lines=True)

    def time_float_int_str_lines(self, orient):
        self.df_int_float_str.to_json(self.fname, orient='records',
                                      lines=True)

0 commit comments

Comments
 (0)