From c3b81e94ef199b1625b58f6916a114edc15dd342 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 20 Dec 2017 23:14:25 -0800 Subject: [PATCH 1/4] CLN: ASV io bencmarks --- asv_bench/benchmarks/hdfstore_bench.py | 123 ------------------- asv_bench/benchmarks/io/excel.py | 37 ++++++ asv_bench/benchmarks/io/hdf.py | 146 ++++++++++++++++++++++ asv_bench/benchmarks/io/msgpack.py | 26 ++++ asv_bench/benchmarks/io/pickle.py | 26 ++++ asv_bench/benchmarks/io/sas.py | 21 ++++ asv_bench/benchmarks/io/sql.py | 84 +++++++++++++ asv_bench/benchmarks/io/stata.py | 37 ++++++ asv_bench/benchmarks/io_sql.py | 104 ---------------- asv_bench/benchmarks/packers.py | 163 ------------------------- 10 files changed, 377 insertions(+), 390 deletions(-) create mode 100644 asv_bench/benchmarks/io/excel.py create mode 100644 asv_bench/benchmarks/io/hdf.py create mode 100644 asv_bench/benchmarks/io/msgpack.py create mode 100644 asv_bench/benchmarks/io/pickle.py create mode 100644 asv_bench/benchmarks/io/sas.py create mode 100644 asv_bench/benchmarks/io/sql.py create mode 100644 asv_bench/benchmarks/io/stata.py diff --git a/asv_bench/benchmarks/hdfstore_bench.py b/asv_bench/benchmarks/hdfstore_bench.py index d7b3be25a18b9..e69de29bb2d1d 100644 --- a/asv_bench/benchmarks/hdfstore_bench.py +++ b/asv_bench/benchmarks/hdfstore_bench.py @@ -1,123 +0,0 @@ -import numpy as np -from pandas import DataFrame, Panel, date_range, HDFStore -import pandas.util.testing as tm - -from .pandas_vb_common import BaseIO, setup # noqa - - -class HDF5(BaseIO): - - goal_time = 0.2 - - def setup(self): - N = 25000 - index = tm.makeStringIndex(N) - self.df = DataFrame({'float1': np.random.randn(N), - 'float2': np.random.randn(N)}, - index=index) - self.df_mixed = DataFrame({'float1': np.random.randn(N), - 'float2': np.random.randn(N), - 'string1': ['foo'] * N, - 'bool1': [True] * N, - 'int1': np.random.randint(0, N, size=N)}, - index=index) - self.df_wide = DataFrame(np.random.randn(N, 100)) - self.start_wide = self.df_wide.index[10000] - self.stop_wide = self.df_wide.index[15000] - self.df2 = DataFrame({'float1': np.random.randn(N), - 'float2': np.random.randn(N)}, - index=date_range('1/1/2000', periods=N)) - self.start = self.df2.index[10000] - self.stop = self.df2.index[15000] - self.df_wide2 = DataFrame(np.random.randn(N, 100), - index=date_range('1/1/2000', periods=N)) - self.df_dc = DataFrame(np.random.randn(N, 10), - columns=['C%03d' % i for i in range(10)]) - - self.f = '__test__.h5' - - self.store = HDFStore(self.f) - self.store.put('fixed', self.df) - self.store.put('fixed_mixed', self.df_mixed) - self.store.append('table', self.df2) - self.store.append('table_mixed', self.df_mixed) - self.store.append('table_wide', self.df_wide) - self.store.append('table_wide2', self.df_wide2) - - def teardown(self): - self.store.close() - self.remove(self.f) - - def time_read_store(self): - self.store.get('fixed') - - def time_read_store_mixed(self): - self.store.get('fixed_mixed') - - def time_write_store(self): - self.store.put('fixed_write', self.df) - - def time_write_store_mixed(self): - self.store.put('fixed_mixed_write', self.df_mixed) - - def time_read_store_table_mixed(self): - self.store.select('table_mixed') - - def time_write_store_table_mixed(self): - self.store.append('table_mixed_write', self.df_mixed) - - def time_read_store_table(self): - self.store.select('table') - - def time_write_store_table(self): - self.store.append('table_write', self.df) - - def time_read_store_table_wide(self): - self.store.select('table_wide') - - def time_write_store_table_wide(self): - self.store.append('table_wide_write', self.df_wide) - - def time_write_store_table_dc(self): - self.store.append('table_dc_write', self.df_dc, data_columns=True) - - def time_query_store_table_wide(self): - self.store.select('table_wide', where="index > self.start_wide and " - "index < self.stop_wide") - - def time_query_store_table(self): - self.store.select('table', where="index > self.start and " - "index < self.stop") - - def time_store_repr(self): - repr(self.store) - - def time_store_str(self): - str(self.store) - - def time_store_info(self): - self.store.info() - - -class HDF5Panel(BaseIO): - - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.p = Panel(np.random.randn(20, 1000, 25), - items=['Item%03d' % i for i in range(20)], - major_axis=date_range('1/1/2000', periods=1000), - minor_axis=['E%03d' % i for i in range(25)]) - self.store = HDFStore(self.f) - self.store.append('p1', self.p) - - def teardown(self): - self.store.close() - self.remove(self.f) - - def time_read_store_table_panel(self): - self.store.select('p1') - - def time_write_store_table_panel(self): - self.store.append('p2', self.p) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py new file mode 100644 index 0000000000000..d2e7ae1cb539e --- /dev/null +++ b/asv_bench/benchmarks/io/excel.py @@ -0,0 +1,37 @@ +import numpy as np +from pandas import DataFrame, date_range, ExcelWriter, read_excel +from pandas.compat import BytesIO +import pandas.util.testing as tm + +from ..pandas_vb_common import BaseIO, setup # noqa + + +class Excel(object): + + goal_time = 0.2 + params = ['openpyxl', 'xlsxwriter', 'xlwt'] + param_names = ['engine'] + + def setup(self, engine): + N = 2000 + C = 5 + self.df = DataFrame(np.random.randn(N, C), + columns=['float{}'.format(i) for i in range(C)], + index=date_range('20000101', periods=N, freq='H')) + self.df['object'] = tm.makeStringIndex(N) + self.bio_read = BytesIO() + self.writer_read = ExcelWriter(self.bio_read, engine=engine) + self.df.to_excel(self.writer_read, sheet_name='Sheet1') + self.writer_read.save() + self.bio_read.seek(0) + + self.bio_write = BytesIO() + self.writer_write = ExcelWriter(self.bio_write, engine=engine) + self.bio_write.seek(0) + + def time_read_excel(self, engine): + read_excel(self.bio_read) + + def time_write_excel(self, engine): + self.df.to_excel(self.bio_write, sheet_name='Sheet1') + self.writer_write.save() diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py new file mode 100644 index 0000000000000..5c0e9586c1cb5 --- /dev/null +++ b/asv_bench/benchmarks/io/hdf.py @@ -0,0 +1,146 @@ +import numpy as np +from pandas import DataFrame, Panel, date_range, HDFStore, read_hdf +import pandas.util.testing as tm + +from ..pandas_vb_common import BaseIO, setup # noqa + + +class HDFStoreDataFrame(BaseIO): + + goal_time = 0.2 + + def setup(self): + N = 25000 + index = tm.makeStringIndex(N) + self.df = DataFrame({'float1': np.random.randn(N), + 'float2': np.random.randn(N)}, + index=index) + self.df_mixed = DataFrame({'float1': np.random.randn(N), + 'float2': np.random.randn(N), + 'string1': ['foo'] * N, + 'bool1': [True] * N, + 'int1': np.random.randint(0, N, size=N)}, + index=index) + self.df_wide = DataFrame(np.random.randn(N, 100)) + self.start_wide = self.df_wide.index[10000] + self.stop_wide = self.df_wide.index[15000] + self.df2 = DataFrame({'float1': np.random.randn(N), + 'float2': np.random.randn(N)}, + index=date_range('1/1/2000', periods=N)) + self.start = self.df2.index[10000] + self.stop = self.df2.index[15000] + self.df_wide2 = DataFrame(np.random.randn(N, 100), + index=date_range('1/1/2000', periods=N)) + self.df_dc = DataFrame(np.random.randn(N, 10), + columns=['C%03d' % i for i in range(10)]) + + self.fname = '__test__.h5' + + self.store = HDFStore(self.fname) + self.store.put('fixed', self.df) + self.store.put('fixed_mixed', self.df_mixed) + self.store.append('table', self.df2) + self.store.append('table_mixed', self.df_mixed) + self.store.append('table_wide', self.df_wide) + self.store.append('table_wide2', self.df_wide2) + + def teardown(self): + self.store.close() + self.remove(self.fname) + + def time_read_store(self): + self.store.get('fixed') + + def time_read_store_mixed(self): + self.store.get('fixed_mixed') + + def time_write_store(self): + self.store.put('fixed_write', self.df) + + def time_write_store_mixed(self): + self.store.put('fixed_mixed_write', self.df_mixed) + + def time_read_store_table_mixed(self): + self.store.select('table_mixed') + + def time_write_store_table_mixed(self): + self.store.append('table_mixed_write', self.df_mixed) + + def time_read_store_table(self): + self.store.select('table') + + def time_write_store_table(self): + self.store.append('table_write', self.df) + + def time_read_store_table_wide(self): + self.store.select('table_wide') + + def time_write_store_table_wide(self): + self.store.append('table_wide_write', self.df_wide) + + def time_write_store_table_dc(self): + self.store.append('table_dc_write', self.df_dc, data_columns=True) + + def time_query_store_table_wide(self): + self.store.select('table_wide', where="index > self.start_wide and " + "index < self.stop_wide") + + def time_query_store_table(self): + self.store.select('table', where="index > self.start and " + "index < self.stop") + + def time_store_repr(self): + repr(self.store) + + def time_store_str(self): + str(self.store) + + def time_store_info(self): + self.store.info() + + +class HDFStorePanel(BaseIO): + + goal_time = 0.2 + + def setup(self): + self.fname = '__test__.h5' + self.p = Panel(np.random.randn(20, 1000, 25), + items=['Item%03d' % i for i in range(20)], + major_axis=date_range('1/1/2000', periods=1000), + minor_axis=['E%03d' % i for i in range(25)]) + self.store = HDFStore(self.fname) + self.store.append('p1', self.p) + + def teardown(self): + self.store.close() + self.remove(self.fname) + + def time_read_store_table_panel(self): + self.store.select('p1') + + def time_write_store_table_panel(self): + self.store.append('p2', self.p) + + +class HDF(BaseIO): + + goal_time = 0.2 + params = ['table', 'fixed'] + param_names = ['format'] + + def setup(self, format): + self.fname = '__test__.h5' + N = 100000 + C = 5 + self.df = DataFrame(np.random.randn(N, C), + columns=['float{}'.format(i) for i in range(C)], + index=date_range('20000101', periods=N, freq='H')) + self.df['object'] = tm.makeStringIndex(N) + self.df.to_hdf(self.fname, 'df', format=format) + + def time_read_hdf(self, format): + read_hdf(self.fname, 'df') + + def time_write_hdf(self, format): + self.df.to_hdf(self.fname, 'df', format=format) diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py new file mode 100644 index 0000000000000..8ccce01117ca4 --- /dev/null +++ b/asv_bench/benchmarks/io/msgpack.py @@ -0,0 +1,26 @@ +import numpy as np +from pandas import DataFrame, date_range, read_msgpack +import pandas.util.testing as tm + +from ..pandas_vb_common import BaseIO, setup # noqa + + +class MSGPack(BaseIO): + + goal_time = 0.2 + + def setup(self): + self.fname = '__test__.msg' + N = 100000 + C = 5 + self.df = DataFrame(np.random.randn(N, C), + columns=['float{}'.format(i) for i in range(C)], + index=date_range('20000101', periods=N, freq='H')) + self.df['object'] = tm.makeStringIndex(N) + self.df.to_msgpack(self.fname) + + def time_read_msgpack(self): + read_msgpack(self.fname) + + def time_write_msgpack(self): + self.df.to_msgpack(self.fname) diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py new file mode 100644 index 0000000000000..2ad0fcca6eb26 --- /dev/null +++ b/asv_bench/benchmarks/io/pickle.py @@ -0,0 +1,26 @@ +import numpy as np +from pandas import DataFrame, date_range, read_pickle +import pandas.util.testing as tm + +from ..pandas_vb_common import BaseIO, setup # noqa + + +class Pickle(BaseIO): + + goal_time = 0.2 + + def setup(self): + self.fname = '__test__.pkl' + N = 100000 + C = 5 + self.df = DataFrame(np.random.randn(N, C), + columns=['float{}'.format(i) for i in range(C)], + index=date_range('20000101', periods=N, freq='H')) + self.df['object'] = tm.makeStringIndex(N) + self.df.to_pickle(self.fname) + + def time_read_pickle(self): + read_pickle(self.fname) + + def time_write_pickle(self): + self.df.to_pickle(self.fname) diff --git a/asv_bench/benchmarks/io/sas.py b/asv_bench/benchmarks/io/sas.py new file mode 100644 index 0000000000000..526c524de7fff --- /dev/null +++ b/asv_bench/benchmarks/io/sas.py @@ -0,0 +1,21 @@ +import os + +from pandas import read_sas + + +class SAS(object): + + goal_time = 0.2 + params = ['sas7bdat', 'xport'] + param_names = ['format'] + + def setup(self, format): + # Read files that are located in 'pandas/io/tests/sas/data' + files = {'sas7bdat': 'test1.sas7bdat', 'xport': 'paxraw_d_short.xpt'} + file = files[format] + paths = [os.path.dirname(__file__), '..', '..', '..', 'pandas', + 'tests', 'io', 'sas', 'data', file] + self.f = os.path.join(*paths) + + def time_read_msgpack(self, format): + read_sas(self.f, format=format) diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py new file mode 100644 index 0000000000000..4d4fdd81fe6e7 --- /dev/null +++ b/asv_bench/benchmarks/io/sql.py @@ -0,0 +1,84 @@ +import sqlite3 + +import numpy as np +import pandas.util.testing as tm +from pandas import DataFrame, date_range, read_sql_query, read_sql_table +from sqlalchemy import create_engine + +from ..pandas_vb_common import setup # noqa + + +class SQL(object): + + goal_time = 0.2 + params = (['sqlalchemy', 'sqlite'], + ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime']) + param_names = ['connection', 'dtype'] + + def setup(self, connection, dtype): + N = 10000 + con = {'sqlalchemy': create_engine('sqlite:///:memory:'), + 'sqlite': sqlite3.connect(':memory:')} + self.table_name = 'test_type' + self.query_all = 'SELECT * FROM {}'.format(self.table_name) + self.query_col = 'SELECT {} FROM {}'.format(dtype, self.table_name) + self.con = con[connection] + self.df = DataFrame({'float': np.random.randn(N), + 'float_with_nan': np.random.randn(N), + 'string': ['foo'] * N, + 'bool': [True] * N, + 'int': np.random.randint(0, N, size=N), + 'datetime': date_range('2000-01-01', + periods=N, + freq='s')}, + index=tm.makeStringIndex(N)) + self.df.loc[1000:3000, 'float_with_nan'] = np.nan + self.df['datetime_string'] = self.df['datetime'].astype(str) + self.df.to_sql(self.table_name, self.con, if_exists='replace') + + def time_to_sql_dataframe_full(self, connection, dtype): + self.df.to_sql('test1', self.con, if_exists='replace') + + def time_to_sql_dataframe_colums(self, connection, dtype): + self.df[[dtype]].to_sql('test1', self.con, if_exists='replace') + + def time_read_sql_query_select_all(self, connection, dtype): + read_sql_query(self.query_all, self.con) + + def time_read_sql_query_select_column(self, connection, dtype): + read_sql_query(self.query_all, self.con) + + +class ReadSQLTable(object): + + goal_time = 0.2 + + params = ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime'] + param_names = ['dtype'] + + def setup(self, dtype): + N = 10000 + self.table_name = 'test' + self.con = create_engine('sqlite:///:memory:') + self.df = DataFrame({'float': np.random.randn(N), + 'float_with_nan': np.random.randn(N), + 'string': ['foo'] * N, + 'bool': [True] * N, + 'int': np.random.randint(0, N, size=N), + 'datetime': date_range('2000-01-01', + periods=N, + freq='s')}, + index=tm.makeStringIndex(N)) + self.df.loc[1000:3000, 'float_with_nan'] = np.nan + self.df['datetime_string'] = self.df['datetime'].astype(str) + self.df.to_sql(self.table_name, self.con, if_exists='replace') + + def time_read_sql_table_all(self, dtype): + read_sql_table(self.table_name, self.con) + + def time_read_sql_table_column(self, dtype): + read_sql_table(self.table_name, self.con, columns=[dtype]) + + def time_read_sql_table_parse_dates(self, dtype): + read_sql_table(self.table_name, self.con, columns=['datetime_string'], + parse_dates=['datetime_string']) diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py new file mode 100644 index 0000000000000..e0f5752ca930f --- /dev/null +++ b/asv_bench/benchmarks/io/stata.py @@ -0,0 +1,37 @@ +import numpy as np +from pandas import DataFrame, date_range, read_stata +import pandas.util.testing as tm + +from ..pandas_vb_common import BaseIO, setup # noqa + + +class Stata(BaseIO): + + goal_time = 0.2 + params = ['tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'] + param_names = ['convert_dates'] + + def setup(self, convert_dates): + self.fname = '__test__.dta' + N = 100000 + C = 5 + self.df = DataFrame(np.random.randn(N, C), + columns=['float{}'.format(i) for i in range(C)], + index=date_range('20000101', periods=N, freq='H')) + self.df['object'] = tm.makeStringIndex(N) + self.df['int8_'] = np.random.randint(np.iinfo(np.int8).min, + np.iinfo(np.int8).max - 27, N) + self.df['int16_'] = np.random.randint(np.iinfo(np.int16).min, + np.iinfo(np.int16).max - 27, N) + self.df['int32_'] = np.random.randint(np.iinfo(np.int32).min, + np.iinfo(np.int32).max - 27, N) + self.df['float32_'] = np.array(np.random.randn(N), + dtype=np.float32) + self.convert_dates = {'index': convert_dates} + self.df.to_stata(self.fname, self.convert_dates) + + def time_read_stata(self, convert_dates): + read_stata(self.fname) + + def time_write_stata(self, convert_dates): + self.df.to_stata(self.fname, self.convert_dates) diff --git a/asv_bench/benchmarks/io_sql.py b/asv_bench/benchmarks/io_sql.py index ec855e5d33525..8b137891791fe 100644 --- a/asv_bench/benchmarks/io_sql.py +++ b/asv_bench/benchmarks/io_sql.py @@ -1,105 +1 @@ -import sqlalchemy -from .pandas_vb_common import * -import sqlite3 -from sqlalchemy import create_engine - -#------------------------------------------------------------------------------- -# to_sql - -class WriteSQL(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - - def time_fallback(self): - self.df.to_sql('test1', self.con, if_exists='replace') - - def time_sqlalchemy(self): - self.df.to_sql('test1', self.engine, if_exists='replace') - - -#------------------------------------------------------------------------------- -# read_sql - -class ReadSQL(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - self.df.to_sql('test2', self.engine, if_exists='replace') - self.df.to_sql('test2', self.con, if_exists='replace') - - def time_read_query_fallback(self): - read_sql_query('SELECT * FROM test2', self.con) - - def time_read_query_sqlalchemy(self): - read_sql_query('SELECT * FROM test2', self.engine) - - def time_read_table_sqlalchemy(self): - read_sql_table('test2', self.engine) - - -#------------------------------------------------------------------------------- -# type specific write - -class WriteSQLTypes(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df.loc[1000:3000, 'float'] = np.nan - - def time_string_fallback(self): - self.df[['string']].to_sql('test_string', self.con, if_exists='replace') - - def time_string_sqlalchemy(self): - self.df[['string']].to_sql('test_string', self.engine, if_exists='replace') - - def time_float_fallback(self): - self.df[['float']].to_sql('test_float', self.con, if_exists='replace') - - def time_float_sqlalchemy(self): - self.df[['float']].to_sql('test_float', self.engine, if_exists='replace') - - def time_datetime_sqlalchemy(self): - self.df[['datetime']].to_sql('test_datetime', self.engine, if_exists='replace') - - -#------------------------------------------------------------------------------- -# type specific read - -class ReadSQLTypes(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df['datetime_string'] = self.df['datetime'].map(str) - self.df.to_sql('test_type', self.engine, if_exists='replace') - self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') - - def time_datetime_read_and_parse_sqlalchemy(self): - read_sql_table('test_type', self.engine, columns=['datetime_string'], parse_dates=['datetime_string']) - - def time_datetime_read_as_native_sqlalchemy(self): - read_sql_table('test_type', self.engine, columns=['datetime']) - - def time_float_read_query_fallback(self): - read_sql_query('SELECT float FROM test_type', self.con) - - def time_float_read_query_sqlalchemy(self): - read_sql_query('SELECT float FROM test_type', self.engine) - - def time_float_read_table_sqlalchemy(self): - read_sql_table('test_type', self.engine, columns=['float']) diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index 7b6cefc56f0da..0853bd17e49cd 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -3,9 +3,7 @@ import pandas as pd from collections import OrderedDict from pandas.compat import BytesIO -import sqlite3 import os -from sqlalchemy import create_engine import numpy as np from random import randrange @@ -57,109 +55,6 @@ def time_packers_read_excel(self): pd.read_excel(self.bio) -class packers_read_hdf_store(_Packers): - - def setup(self): - self._setup() - self.df2.to_hdf(self.f, 'df') - - def time_packers_read_hdf_store(self): - pd.read_hdf(self.f, 'df') - - -class packers_read_hdf_table(_Packers): - - def setup(self): - self._setup() - self.df2.to_hdf(self.f, 'df', format='table') - - def time_packers_read_hdf_table(self): - pd.read_hdf(self.f, 'df') - - -class packers_read_pack(_Packers): - - def setup(self): - self._setup() - self.df2.to_msgpack(self.f) - - def time_packers_read_pack(self): - pd.read_msgpack(self.f) - - -class packers_read_pickle(_Packers): - - def setup(self): - self._setup() - self.df2.to_pickle(self.f) - - def time_packers_read_pickle(self): - pd.read_pickle(self.f) - - -class packers_read_sql(_Packers): - - def setup(self): - self._setup() - self.engine = create_engine('sqlite:///:memory:') - self.df2.to_sql('table', self.engine, if_exists='replace') - - def time_packers_read_sql(self): - pd.read_sql_table('table', self.engine) - - -class packers_read_stata(_Packers): - - def setup(self): - self._setup() - self.df.to_stata(self.f, {'index': 'tc', }) - - def time_packers_read_stata(self): - pd.read_stata(self.f) - - -class packers_read_stata_with_validation(_Packers): - - def setup(self): - self._setup() - self.df['int8_'] = [randint(np.iinfo(np.int8).min, (np.iinfo(np.int8).max - 27)) for _ in range(self.N)] - self.df['int16_'] = [randint(np.iinfo(np.int16).min, (np.iinfo(np.int16).max - 27)) for _ in range(self.N)] - self.df['int32_'] = [randint(np.iinfo(np.int32).min, (np.iinfo(np.int32).max - 27)) for _ in range(self.N)] - self.df['float32_'] = np.array(randn(self.N), dtype=np.float32) - self.df.to_stata(self.f, {'index': 'tc', }) - - def time_packers_read_stata_with_validation(self): - pd.read_stata(self.f) - - -class packers_read_sas(_Packers): - - def setup(self): - - testdir = os.path.join(os.path.dirname(__file__), '..', '..', - 'pandas', 'tests', 'io', 'sas') - if not os.path.exists(testdir): - testdir = os.path.join(os.path.dirname(__file__), '..', '..', - 'pandas', 'io', 'tests', 'sas') - self.f = os.path.join(testdir, 'data', 'test1.sas7bdat') - self.f2 = os.path.join(testdir, 'data', 'paxraw_d_short.xpt') - - def time_read_sas7bdat(self): - pd.read_sas(self.f, format='sas7bdat') - - def time_read_xport(self): - pd.read_sas(self.f2, format='xport') - - -class CSV(_Packers): - - def setup(self): - self._setup() - - def time_write_csv(self): - self.df.to_csv(self.f) - - class Excel(_Packers): def setup(self): @@ -183,61 +78,3 @@ def time_write_excel_xlwt(self): self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlwt') self.df[:2000].to_excel(self.writer) self.writer.save() - - -class HDF(_Packers): - - def setup(self): - self._setup() - - def time_write_hdf_store(self): - self.df2.to_hdf(self.f, 'df') - - def time_write_hdf_table(self): - self.df2.to_hdf(self.f, 'df', table=True) - - -class MsgPack(_Packers): - - def setup(self): - self._setup() - - def time_write_msgpack(self): - self.df2.to_msgpack(self.f) - - -class Pickle(_Packers): - - def setup(self): - self._setup() - - def time_write_pickle(self): - self.df2.to_pickle(self.f) - - -class SQL(_Packers): - - def setup(self): - self._setup() - self.engine = create_engine('sqlite:///:memory:') - - def time_write_sql(self): - self.df2.to_sql('table', self.engine, if_exists='replace') - - -class STATA(_Packers): - - def setup(self): - self._setup() - - self.df3=self.df.copy() - self.df3['int8_'] = [randint(np.iinfo(np.int8).min, (np.iinfo(np.int8).max - 27)) for _ in range(self.N)] - self.df3['int16_'] = [randint(np.iinfo(np.int16).min, (np.iinfo(np.int16).max - 27)) for _ in range(self.N)] - self.df3['int32_'] = [randint(np.iinfo(np.int32).min, (np.iinfo(np.int32).max - 27)) for _ in range(self.N)] - self.df3['float32_'] = np.array(randn(self.N), dtype=np.float32) - - def time_write_stata(self): - self.df.to_stata(self.f, {'index': 'tc', }) - - def time_write_stata_with_validation(self): - self.df3.to_stata(self.f, {'index': 'tc', }) From 42455285e359683a76ba6c9fd33ab1ed9143c9e2 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 21 Dec 2017 22:55:33 -0800 Subject: [PATCH 2/4] Fix typo --- asv_bench/benchmarks/io/excel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index d2e7ae1cb539e..a7c6c43d15026 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -26,12 +26,12 @@ def setup(self, engine): self.bio_read.seek(0) self.bio_write = BytesIO() - self.writer_write = ExcelWriter(self.bio_write, engine=engine) self.bio_write.seek(0) + self.writer_write = ExcelWriter(self.bio_write, engine=engine) def time_read_excel(self, engine): read_excel(self.bio_read) def time_write_excel(self, engine): - self.df.to_excel(self.bio_write, sheet_name='Sheet1') + self.df.to_excel(self.writer_write, sheet_name='Sheet1') self.writer_write.save() From 9531f05527f5d15c06845b335226ba0403dc70a2 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 21 Dec 2017 22:58:43 -0800 Subject: [PATCH 3/4] Remove unused files --- asv_bench/benchmarks/hdfstore_bench.py | 0 asv_bench/benchmarks/io_sql.py | 1 - asv_bench/benchmarks/packers.py | 80 -------------------------- 3 files changed, 81 deletions(-) delete mode 100644 asv_bench/benchmarks/hdfstore_bench.py delete mode 100644 asv_bench/benchmarks/io_sql.py delete mode 100644 asv_bench/benchmarks/packers.py diff --git a/asv_bench/benchmarks/hdfstore_bench.py b/asv_bench/benchmarks/hdfstore_bench.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/asv_bench/benchmarks/io_sql.py b/asv_bench/benchmarks/io_sql.py deleted file mode 100644 index 8b137891791fe..0000000000000 --- a/asv_bench/benchmarks/io_sql.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py deleted file mode 100644 index 0853bd17e49cd..0000000000000 --- a/asv_bench/benchmarks/packers.py +++ /dev/null @@ -1,80 +0,0 @@ -from .pandas_vb_common import * -from numpy.random import randint -import pandas as pd -from collections import OrderedDict -from pandas.compat import BytesIO -import os -import numpy as np -from random import randrange - - -class _Packers(object): - goal_time = 0.2 - - def _setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame({'float{0}'.format(i): randn(self.N) for i in range(self.C)}, index=self.index) - self.df2 = self.df.copy() - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - - def remove(self, f): - try: - os.remove(f) - except: - pass - - def teardown(self): - self.remove(self.f) - - -class Packers(_Packers): - - def setup(self): - self._setup() - self.df.to_csv(self.f) - - def time_packers_read_csv(self): - pd.read_csv(self.f) - - -class packers_read_excel(_Packers): - - def setup(self): - self._setup() - self.bio = BytesIO() - self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter') - self.df[:2000].to_excel(self.writer) - self.writer.save() - - def time_packers_read_excel(self): - self.bio.seek(0) - pd.read_excel(self.bio) - - -class Excel(_Packers): - - def setup(self): - self._setup() - self.bio = BytesIO() - - def time_write_excel_openpyxl(self): - self.bio.seek(0) - self.writer = pd.io.excel.ExcelWriter(self.bio, engine='openpyxl') - self.df[:2000].to_excel(self.writer) - self.writer.save() - - def time_write_excel_xlsxwriter(self): - self.bio.seek(0) - self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter') - self.df[:2000].to_excel(self.writer) - self.writer.save() - - def time_write_excel_xlwt(self): - self.bio.seek(0) - self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlwt') - self.df[:2000].to_excel(self.writer) - self.writer.save() From d24bc6b31b0f304ad7c22bb63c3a64b286131619 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Fri, 22 Dec 2017 17:01:35 -0800 Subject: [PATCH 4/4] Avoid redundant benchmarks --- asv_bench/benchmarks/io/sql.py | 80 +++++++++++++++++++++++++++------- 1 file changed, 64 insertions(+), 16 deletions(-) diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index 4d4fdd81fe6e7..ef4e501e5f3b9 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -10,6 +10,39 @@ class SQL(object): + goal_time = 0.2 + params = ['sqlalchemy', 'sqlite'] + param_names = ['connection'] + + def setup(self, connection): + N = 10000 + con = {'sqlalchemy': create_engine('sqlite:///:memory:'), + 'sqlite': sqlite3.connect(':memory:')} + self.table_name = 'test_type' + self.query_all = 'SELECT * FROM {}'.format(self.table_name) + self.con = con[connection] + self.df = DataFrame({'float': np.random.randn(N), + 'float_with_nan': np.random.randn(N), + 'string': ['foo'] * N, + 'bool': [True] * N, + 'int': np.random.randint(0, N, size=N), + 'datetime': date_range('2000-01-01', + periods=N, + freq='s')}, + index=tm.makeStringIndex(N)) + self.df.loc[1000:3000, 'float_with_nan'] = np.nan + self.df['datetime_string'] = self.df['datetime'].astype(str) + self.df.to_sql(self.table_name, self.con, if_exists='replace') + + def time_to_sql_dataframe(self, connection): + self.df.to_sql('test1', self.con, if_exists='replace') + + def time_read_sql_query(self, connection): + read_sql_query(self.query_all, self.con) + + +class WriteSQLDtypes(object): + goal_time = 0.2 params = (['sqlalchemy', 'sqlite'], ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime']) @@ -20,7 +53,6 @@ def setup(self, connection, dtype): con = {'sqlalchemy': create_engine('sqlite:///:memory:'), 'sqlite': sqlite3.connect(':memory:')} self.table_name = 'test_type' - self.query_all = 'SELECT * FROM {}'.format(self.table_name) self.query_col = 'SELECT {} FROM {}'.format(dtype, self.table_name) self.con = con[connection] self.df = DataFrame({'float': np.random.randn(N), @@ -36,23 +68,46 @@ def setup(self, connection, dtype): self.df['datetime_string'] = self.df['datetime'].astype(str) self.df.to_sql(self.table_name, self.con, if_exists='replace') - def time_to_sql_dataframe_full(self, connection, dtype): - self.df.to_sql('test1', self.con, if_exists='replace') - - def time_to_sql_dataframe_colums(self, connection, dtype): + def time_to_sql_dataframe_column(self, connection, dtype): self.df[[dtype]].to_sql('test1', self.con, if_exists='replace') - def time_read_sql_query_select_all(self, connection, dtype): - read_sql_query(self.query_all, self.con) - def time_read_sql_query_select_column(self, connection, dtype): - read_sql_query(self.query_all, self.con) + read_sql_query(self.query_col, self.con) class ReadSQLTable(object): goal_time = 0.2 + def setup(self): + N = 10000 + self.table_name = 'test' + self.con = create_engine('sqlite:///:memory:') + self.df = DataFrame({'float': np.random.randn(N), + 'float_with_nan': np.random.randn(N), + 'string': ['foo'] * N, + 'bool': [True] * N, + 'int': np.random.randint(0, N, size=N), + 'datetime': date_range('2000-01-01', + periods=N, + freq='s')}, + index=tm.makeStringIndex(N)) + self.df.loc[1000:3000, 'float_with_nan'] = np.nan + self.df['datetime_string'] = self.df['datetime'].astype(str) + self.df.to_sql(self.table_name, self.con, if_exists='replace') + + def time_read_sql_table_all(self): + read_sql_table(self.table_name, self.con) + + def time_read_sql_table_parse_dates(self): + read_sql_table(self.table_name, self.con, columns=['datetime_string'], + parse_dates=['datetime_string']) + + +class ReadSQLTableDtypes(object): + + goal_time = 0.2 + params = ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime'] param_names = ['dtype'] @@ -73,12 +128,5 @@ def setup(self, dtype): self.df['datetime_string'] = self.df['datetime'].astype(str) self.df.to_sql(self.table_name, self.con, if_exists='replace') - def time_read_sql_table_all(self, dtype): - read_sql_table(self.table_name, self.con) - def time_read_sql_table_column(self, dtype): read_sql_table(self.table_name, self.con, columns=[dtype]) - - def time_read_sql_table_parse_dates(self, dtype): - read_sql_table(self.table_name, self.con, columns=['datetime_string'], - parse_dates=['datetime_string'])