Skip to content

Commit 0b7a08b

Browse files
tylerjereddy authored and jreback committed
BENCH: asv csv reading benchmarks no longer read StringIO objects off the end (#21807)
* Benchmarks for read_csv() now rewind their StringIO objects before reading them. Previously, every iteration after the first in an asv timing repeat read no data, because the StringIO object was left pointing at its end once the first iteration had consumed it; setup() runs only between repeats, not between the iterations within a repeat of timeit.
1 parent 848b69c commit 0b7a08b

File tree

1 file changed

+32
-20
lines changed

1 file changed

+32
-20
lines changed

asv_bench/benchmarks/io/csv.py

+32-20
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,14 @@ def time_frame_date_formatting(self):
5454
self.data.to_csv(self.fname, date_format='%Y%m%d')
5555

5656

57-
class ReadCSVDInferDatetimeFormat(object):
57+
class StringIORewind(object):
58+
59+
def data(self, stringio_object):
60+
stringio_object.seek(0)
61+
return stringio_object
62+
63+
64+
class ReadCSVDInferDatetimeFormat(StringIORewind):
5865

5966
goal_time = 0.2
6067
params = ([True, False], ['custom', 'iso8601', 'ymd'])
@@ -66,10 +73,12 @@ def setup(self, infer_datetime_format, format):
6673
'iso8601': '%Y-%m-%d %H:%M:%S',
6774
'ymd': '%Y%m%d'}
6875
dt_format = formats[format]
69-
self.data = StringIO('\n'.join(rng.strftime(dt_format).tolist()))
76+
self.StringIO_input = StringIO('\n'.join(
77+
rng.strftime(dt_format).tolist()))
7078

7179
def time_read_csv(self, infer_datetime_format, format):
72-
read_csv(self.data, header=None, names=['foo'], parse_dates=['foo'],
80+
read_csv(self.data(self.StringIO_input),
81+
header=None, names=['foo'], parse_dates=['foo'],
7382
infer_datetime_format=infer_datetime_format)
7483

7584

@@ -95,7 +104,7 @@ def time_skipprows(self, skiprows):
95104
read_csv(self.fname, skiprows=skiprows)
96105

97106

98-
class ReadUint64Integers(object):
107+
class ReadUint64Integers(StringIORewind):
99108

100109
goal_time = 0.2
101110

@@ -108,13 +117,13 @@ def setup(self):
108117
self.data2 = StringIO('\n'.join(arr.astype(str).tolist()))
109118

110119
def time_read_uint64(self):
111-
read_csv(self.data1, header=None, names=['foo'])
120+
read_csv(self.data(self.data1), header=None, names=['foo'])
112121

113122
def time_read_uint64_neg_values(self):
114-
read_csv(self.data2, header=None, names=['foo'])
123+
read_csv(self.data(self.data2), header=None, names=['foo'])
115124

116125
def time_read_uint64_na_values(self):
117-
read_csv(self.data1, header=None, names=['foo'],
126+
read_csv(self.data(self.data1), header=None, names=['foo'],
118127
na_values=self.na_values)
119128

120129

@@ -140,19 +149,20 @@ def time_thousands(self, sep, thousands):
140149
read_csv(self.fname, sep=sep, thousands=thousands)
141150

142151

143-
class ReadCSVComment(object):
152+
class ReadCSVComment(StringIORewind):
144153

145154
goal_time = 0.2
146155

147156
def setup(self):
148157
data = ['A,B,C'] + (['1,2,3 # comment'] * 100000)
149-
self.s_data = StringIO('\n'.join(data))
158+
self.StringIO_input = StringIO('\n'.join(data))
150159

151160
def time_comment(self):
152-
read_csv(self.s_data, comment='#', header=None, names=list('abc'))
161+
read_csv(self.data(self.StringIO_input), comment='#',
162+
header=None, names=list('abc'))
153163

154164

155-
class ReadCSVFloatPrecision(object):
165+
class ReadCSVFloatPrecision(StringIORewind):
156166

157167
goal_time = 0.2
158168
params = ([',', ';'], ['.', '_'], [None, 'high', 'round_trip'])
@@ -164,14 +174,14 @@ def setup(self, sep, decimal, float_precision):
164174
rows = sep.join(['0{}'.format(decimal) + '{}'] * 3) + '\n'
165175
data = rows * 5
166176
data = data.format(*floats) * 200 # 1000 x 3 strings csv
167-
self.s_data = StringIO(data)
177+
self.StringIO_input = StringIO(data)
168178

169179
def time_read_csv(self, sep, decimal, float_precision):
170-
read_csv(self.s_data, sep=sep, header=None, names=list('abc'),
171-
float_precision=float_precision)
180+
read_csv(self.data(self.StringIO_input), sep=sep, header=None,
181+
names=list('abc'), float_precision=float_precision)
172182

173183
def time_read_csv_python_engine(self, sep, decimal, float_precision):
174-
read_csv(self.s_data, sep=sep, header=None, engine='python',
184+
read_csv(self.data(self.StringIO_input), sep=sep, header=None, engine='python',
175185
float_precision=None, names=list('abc'))
176186

177187

@@ -193,7 +203,7 @@ def time_convert_direct(self):
193203
read_csv(self.fname, dtype='category')
194204

195205

196-
class ReadCSVParseDates(object):
206+
class ReadCSVParseDates(StringIORewind):
197207

198208
goal_time = 0.2
199209

@@ -206,12 +216,14 @@ def setup(self):
206216
"""
207217
two_cols = ['KORD,19990127'] * 5
208218
data = data.format(*two_cols)
209-
self.s_data = StringIO(data)
219+
self.StringIO_input = StringIO(data)
210220

211221
def time_multiple_date(self):
212-
read_csv(self.s_data, sep=',', header=None,
213-
names=list(string.digits[:9]), parse_dates=[[1, 2], [1, 3]])
222+
read_csv(self.data(self.StringIO_input), sep=',', header=None,
223+
names=list(string.digits[:9]),
224+
parse_dates=[[1, 2], [1, 3]])
214225

215226
def time_baseline(self):
216-
read_csv(self.s_data, sep=',', header=None, parse_dates=[1],
227+
read_csv(self.data(self.StringIO_input), sep=',', header=None,
228+
parse_dates=[1],
217229
names=list(string.digits[:9]))

0 commit comments

Comments
 (0)