CLN: ASV io_bench, parser_vb #18815

Merged: 5 commits, Dec 19, 2017
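
The benchmark modules below follow the standard airspeed velocity (asv) conventions: each class groups related cases, `setup` builds fixtures outside the measurement, asv runs every combination of `params` (labeled by `param_names`), and methods prefixed `time_` are timed while `peakmem_` methods track peak memory. A minimal sketch of the pattern, with hypothetical names (`ExampleBench` and its contents do not appear in the diff):

import numpy as np
from pandas import DataFrame


class ExampleBench(object):

    goal_time = 0.2              # target duration per timed sample
    params = ['narrow', 'wide']  # asv benchmarks each value separately
    param_names = ['shape']

    def setup(self, shape):
        # Runs before the timed call, so fixture construction is
        # excluded from the measurement.
        ncols = 3 if shape == 'narrow' else 300
        self.df = DataFrame(np.random.randn(1000, ncols))

    def time_to_csv(self, shape):
        # Any method named time_* is picked up and timed by asv.
        self.df.to_csv('__example__.csv')

The suite can be exercised locally with something like `asv dev -b ^io` or compared across commits with `asv continuous`, as described in the pandas contributing guide.
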
249 changes: 249 additions & 0 deletions asv_bench/benchmarks/io/csv.py
@@ -0,0 +1,249 @@
import random
import string
import timeit

import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame, Categorical, date_range, read_csv
from pandas.compat import PY2
from pandas.compat import cStringIO as StringIO

from ..pandas_vb_common import setup, BaseIO # noqa


class ToCSV(BaseIO):

    goal_time = 0.2
    fname = '__test__.csv'
    params = ['wide', 'long', 'mixed']
    param_names = ['kind']

    def setup(self, kind):
        wide_frame = DataFrame(np.random.randn(3000, 30))
        long_frame = DataFrame({'A': np.arange(50000),
                                'B': np.arange(50000) + 1.,
                                'C': np.arange(50000) + 2.,
                                'D': np.arange(50000) + 3.})
        mixed_frame = DataFrame({'float': np.random.randn(5000),
                                 'int': np.random.randn(5000).astype(int),
                                 'bool': (np.arange(5000) % 2) == 0,
                                 'datetime': date_range('2001',
                                                        freq='s',
                                                        periods=5000),
                                 'object': ['foo'] * 5000})
        mixed_frame.loc[30:500, 'float'] = np.nan
        data = {'wide': wide_frame,
                'long': long_frame,
                'mixed': mixed_frame}
        self.df = data[kind]

    def time_frame(self, kind):
        self.df.to_csv(self.fname)


class ToCSVDatetime(BaseIO):

    goal_time = 0.2
    fname = '__test__.csv'

    def setup(self):
        rng = date_range('1/1/2000', periods=1000)
        self.data = DataFrame(rng, index=rng)

    def time_frame_date_formatting(self):
        self.data.to_csv(self.fname, date_format='%Y%m%d')


class ReadCSVInferDatetimeFormat(object):

    goal_time = 0.2
    params = ([True, False], ['custom', 'iso8601', 'ymd'])
    param_names = ['infer_datetime_format', 'format']

    def setup(self, infer_datetime_format, format):
        rng = date_range('1/1/2000', periods=1000)
        formats = {'custom': '%m/%d/%Y %H:%M:%S.%f',
                   'iso8601': '%Y-%m-%d %H:%M:%S',
                   'ymd': '%Y%m%d'}
        dt_format = formats[format]
        self.data = StringIO('\n'.join(rng.strftime(dt_format).tolist()))

    def time_read_csv(self, infer_datetime_format, format):
        read_csv(self.data, header=None, names=['foo'], parse_dates=['foo'],
                 infer_datetime_format=infer_datetime_format)
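
# Context for the flag exercised above: with infer_datetime_format=True,
# pandas guesses the datetime format from the first value and, when the
# guess holds, parses the remaining values on a fast fixed-format path
# instead of parsing each value independently. Illustrative sketch (not
# part of the benchmark):
#
#   read_csv(data, header=None, names=['foo'], parse_dates=['foo'],
#            infer_datetime_format=True)   # guess once, then fast path
#   read_csv(data, header=None, names=['foo'], parse_dates=['foo'],
#            infer_datetime_format=False)  # general per-value parsing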


class ReadCSVSkipRows(BaseIO):

    goal_time = 0.2
    fname = '__test__.csv'
    params = [None, 10000]
    param_names = ['skiprows']

    def setup(self, skiprows):
        N = 20000
        index = tm.makeStringIndex(N)
        df = DataFrame({'float1': np.random.randn(N),
                        'float2': np.random.randn(N),
                        'string1': ['foo'] * N,
                        'bool1': [True] * N,
                        'int1': np.random.randint(0, N, size=N)},
                       index=index)
        df.to_csv(self.fname)

    def time_skiprows(self, skiprows):
        read_csv(self.fname, skiprows=skiprows)


class ReadUint64Integers(object):

    goal_time = 0.2

    def setup(self):
        self.na_values = [2**63 + 500]
        arr = np.arange(10000).astype('uint64') + 2**63
        self.data1 = StringIO('\n'.join(arr.astype(str).tolist()))
        arr = arr.astype(object)
        arr[500] = -1
        self.data2 = StringIO('\n'.join(arr.astype(str).tolist()))

    def time_read_uint64(self):
        read_csv(self.data1, header=None, names=['foo'])

    def time_read_uint64_neg_values(self):
        read_csv(self.data2, header=None, names=['foo'])

    def time_read_uint64_na_values(self):
        read_csv(self.data1, header=None, names=['foo'],
                 na_values=self.na_values)
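
# Context for the cases above: 2**63 is one past the int64 maximum, so
# these values only fit in an unsigned 64-bit column; mixing in a
# negative value leaves no integer dtype that can hold the data, and
# the parser falls back to object dtype. Illustrative sketch (values
# chosen for exposition, not from the benchmark):
#
#   read_csv(StringIO('9223372036854775808\n1'), header=None)   # uint64
#   read_csv(StringIO('9223372036854775808\n-1'), header=None)  # object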


class S3(object):
    # Make sure that we can read part of a file from S3 without
    # needing to download the entire thing. Use timeit.default_timer
    # to measure wall time instead of CPU time -- we want to see
    # how long it takes to download the data.
    timer = timeit.default_timer
    params = ([None, "gzip", "bz2"], ["python", "c"])
    param_names = ["compression", "engine"]

    def setup(self, compression, engine):
        if compression == "bz2" and engine == "c" and PY2:
            # The Python 2 C parser can't read bz2 from open files.
            raise NotImplementedError
        try:
            import s3fs  # noqa
        except ImportError:
            # Skip these benchmarks if `s3fs` is not installed.
            raise NotImplementedError

        ext = ""
        if compression == "gzip":
            ext = ".gz"
        elif compression == "bz2":
            ext = ".bz2"
        self.big_fname = "s3://pandas-test/large_random.csv" + ext

    def time_read_csv_10_rows(self, compression, engine):
        # Read a small number of rows from a huge (100,000 x 50) table.
        read_csv(self.big_fname, nrows=10, compression=compression,
                 engine=engine)
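
# Why the custom timer above: the default asv timer measures CPU time,
# which would exclude time spent blocked on the network, while
# timeit.default_timer is a wall clock that includes it. A Python 3
# sketch of the distinction (not part of the benchmark):
#
#   import time, timeit
#   wall, cpu = timeit.default_timer(), time.process_time()
#   time.sleep(1)                      # I/O-style wait, no CPU work
#   timeit.default_timer() - wall      # ~1.0 -- wall clock advanced
#   time.process_time() - cpu          # ~0.0 -- CPU clock did not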


class ReadCSVThousands(BaseIO):

    goal_time = 0.2
    fname = '__test__.csv'
    params = ([',', '|'], [None, ','])
    param_names = ['sep', 'thousands']

    def setup(self, sep, thousands):
        N = 10000
        K = 8
        data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))
        df = DataFrame(data)
        if thousands is not None:
            fmt = ':{}'.format(thousands)
            fmt = '{' + fmt + '}'
            df = df.applymap(lambda x: fmt.format(x))
        df.to_csv(self.fname, sep=sep)

    def time_thousands(self, sep, thousands):
        read_csv(self.fname, sep=sep, thousands=thousands)
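
# What the thousands parameter does above: digit-group separators are
# stripped while parsing, so a quoted field like "1,234,567" becomes
# the integer 1234567 rather than a string. Illustrative sketch:
#
#   read_csv(StringIO('a\n"1,234,567"\n'), thousands=',')  # int64 column
#   read_csv(StringIO('a\n"1,234,567"\n'))                 # object column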


class ReadCSVComment(object):

    goal_time = 0.2

    def setup(self):
        data = ['A,B,C'] + (['1,2,3 # comment'] * 100000)
        self.s_data = StringIO('\n'.join(data))

    def time_comment(self):
        read_csv(self.s_data, comment='#', header=None, names=list('abc'))


class ReadCSVFloatPrecision(object):

    goal_time = 0.2
    params = ([',', ';'], ['.', '_'], [None, 'high', 'round_trip'])
    param_names = ['sep', 'decimal', 'float_precision']

    def setup(self, sep, decimal, float_precision):
        floats = [''.join(random.choice(string.digits) for _ in range(28))
                  for _ in range(15)]
        rows = sep.join(['0{}'.format(decimal) + '{}'] * 3) + '\n'
        data = rows * 5
        data = data.format(*floats) * 200  # 1000 x 3 strings csv
        self.s_data = StringIO(data)

    def time_read_csv(self, sep, decimal, float_precision):
        read_csv(self.s_data, sep=sep, header=None, names=list('abc'),
                 float_precision=float_precision)

    def time_read_csv_python_engine(self, sep, decimal, float_precision):
        read_csv(self.s_data, sep=sep, header=None, engine='python',
                 float_precision=None, names=list('abc'))


class ReadCSVCategorical(BaseIO):

    goal_time = 0.2
    fname = '__test__.csv'

    def setup(self):
        N = 100000
        group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']
        df = DataFrame(np.random.choice(group1, (N, 3)), columns=list('abc'))
        df.to_csv(self.fname, index=False)

    def time_convert_post(self):
        read_csv(self.fname).apply(Categorical)

    def time_convert_direct(self):
        read_csv(self.fname, dtype='category')
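
# The two methods above contrast converting to Categorical after the
# parse with asking the parser for category dtype directly; the direct
# path avoids materializing the intermediate object columns. The same
# comparison as one-off calls (sketch, assuming the file written in
# setup):
#
#   read_csv(fname).apply(Categorical)   # parse as object, convert after
#   read_csv(fname, dtype='category')    # build categories during parse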


class ReadCSVParseDates(object):

    goal_time = 0.2

    def setup(self):
        data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
{},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
{},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n
{},21:00:00,21:18:00,-0.9900,2.0100,3.6000,0.0000,270.0000\n
{},22:00:00,21:56:00,-0.5900,1.7100,5.1000,0.0000,290.0000\n
"""
        two_cols = ['KORD,19990127'] * 5
        data = data.format(*two_cols)
        self.s_data = StringIO(data)

    def time_multiple_date(self):
        read_csv(self.s_data, sep=',', header=None,
                 names=list(string.digits[:9]), parse_dates=[[1, 2], [1, 3]])

    def time_baseline(self):
        read_csv(self.s_data, sep=',', header=None, parse_dates=[1],
                 names=list(string.digits[:9]))
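
For context on the last benchmark above: `parse_dates=[[1, 2], [1, 3]]` tells the parser to join each group of columns with a space and parse the result as a single datetime column, whereas `parse_dates=[1]` in the baseline parses one column as-is. A minimal sketch of the same call pattern (the inline data is hypothetical and far smaller than the benchmark's):

from pandas import read_csv
from pandas.compat import cStringIO as StringIO

data = StringIO('KORD,19990127,19:00:00\nKORD,19990127,20:00:00\n')
# Columns 1 and 2 are concatenated ('19990127 19:00:00') and parsed
# into a single datetime column named '1_2'.
df = read_csv(data, header=None, parse_dates=[[1, 2]])
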
127 changes: 127 additions & 0 deletions asv_bench/benchmarks/io/json.py
@@ -0,0 +1,127 @@
import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame, date_range, timedelta_range, concat, read_json

from ..pandas_vb_common import setup, BaseIO # noqa


class ReadJSON(BaseIO):

    goal_time = 0.2
    fname = "__test__.json"
    params = (['split', 'index', 'records'], ['int', 'datetime'])
    param_names = ['orient', 'index']

    def setup(self, orient, index):
        N = 100000
        indexes = {'int': np.arange(N),
                   'datetime': date_range('20000101', periods=N, freq='H')}
        df = DataFrame(np.random.randn(N, 5),
                       columns=['float_{}'.format(i) for i in range(5)],
                       index=indexes[index])
        df.to_json(self.fname, orient=orient)

    def time_read_json(self, orient, index):
        read_json(self.fname, orient=orient)


class ReadJSONLines(BaseIO):

    goal_time = 0.2
    fname = "__test_lines__.json"
    params = ['int', 'datetime']
    param_names = ['index']

    def setup(self, index):
        N = 100000
        indexes = {'int': np.arange(N),
                   'datetime': date_range('20000101', periods=N, freq='H')}
        df = DataFrame(np.random.randn(N, 5),
                       columns=['float_{}'.format(i) for i in range(5)],
                       index=indexes[index])
        df.to_json(self.fname, orient='records', lines=True)

    def time_read_json_lines(self, index):
        read_json(self.fname, orient='records', lines=True)

    def time_read_json_lines_concat(self, index):
        concat(read_json(self.fname, orient='records', lines=True,
                         chunksize=25000))

    def peakmem_read_json_lines(self, index):
        read_json(self.fname, orient='records', lines=True)

    def peakmem_read_json_lines_concat(self, index):
        concat(read_json(self.fname, orient='records', lines=True,
                         chunksize=25000))
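
# Note on the chunked variants above: with lines=True and a chunksize,
# read_json returns an iterator of DataFrames instead of a single
# frame, so concat() is needed to rebuild the full result; peak memory
# should stay lower because only one chunk is materialized at a time.
# Sketch of the call pattern:
#
#   reader = read_json(fname, orient='records', lines=True,
#                      chunksize=25000)
#   df = concat(reader)   # same result as reading in one shot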


class ToJSON(BaseIO):

    goal_time = 0.2
    fname = "__test__.json"
    params = ['split', 'columns', 'index']
    param_names = ['orient']

    def setup(self, orient):
        N = 10**5
        ncols = 5
        index = date_range('20000101', periods=N, freq='H')
        timedeltas = timedelta_range(start=1, periods=N, freq='s')
        datetimes = date_range(start=1, periods=N, freq='s')
        ints = np.random.randint(100000000, size=N)
        floats = np.random.randn(N)
        strings = tm.makeStringIndex(N)
        self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
        self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index)
        self.df_td_int_ts = DataFrame({'td_1': timedeltas,
                                       'td_2': timedeltas,
                                       'int_1': ints,
                                       'int_2': ints,
                                       'ts_1': datetimes,
                                       'ts_2': datetimes},
                                      index=index)
        self.df_int_floats = DataFrame({'int_1': ints,
                                        'int_2': ints,
                                        'int_3': ints,
                                        'float_1': floats,
                                        'float_2': floats,
                                        'float_3': floats},
                                       index=index)
        self.df_int_float_str = DataFrame({'int_1': ints,
                                           'int_2': ints,
                                           'float_1': floats,
                                           'float_2': floats,
                                           'str_1': strings,
                                           'str_2': strings},
                                          index=index)

    def time_floats_with_int_index(self, orient):
        self.df.to_json(self.fname, orient=orient)

    def time_floats_with_dt_index(self, orient):
        self.df_date_idx.to_json(self.fname, orient=orient)

    def time_delta_int_tstamp(self, orient):
        self.df_td_int_ts.to_json(self.fname, orient=orient)

    def time_float_int(self, orient):
        self.df_int_floats.to_json(self.fname, orient=orient)

    def time_float_int_str(self, orient):
        self.df_int_float_str.to_json(self.fname, orient=orient)

    def time_floats_with_int_index_lines(self, orient):
        self.df.to_json(self.fname, orient='records', lines=True)

    def time_floats_with_dt_index_lines(self, orient):
        self.df_date_idx.to_json(self.fname, orient='records', lines=True)

    def time_delta_int_tstamp_lines(self, orient):
        self.df_td_int_ts.to_json(self.fname, orient='records', lines=True)

    def time_float_int_lines(self, orient):
        self.df_int_floats.to_json(self.fname, orient='records', lines=True)

    def time_float_int_str_lines(self, orient):
        self.df_int_float_str.to_json(self.fname, orient='records', lines=True)
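
As a closing note on the `orient` parameter that ToJSON benchmarks: each orient changes the JSON layout, trading file size against round-trip fidelity. A small sketch (illustrative two-row frame) of the layouts involved:

import pandas as pd

df = pd.DataFrame({'a': [1, 2]}, index=['x', 'y'])
df.to_json(orient='split')    # {"columns":["a"],"index":["x","y"],"data":[[1],[2]]}
df.to_json(orient='columns')  # {"a":{"x":1,"y":2}}
df.to_json(orient='index')    # {"x":{"a":1},"y":{"a":2}}
df.to_json(orient='records')  # [{"a":1},{"a":2}]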