CLN: ASV io_bench, parser_vb #18815

Merged: 5 commits, Dec 19, 2017
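
The benchmark modules below follow the standard airspeed velocity (asv) conventions: each class groups related cases, `setup` builds fixtures outside the measurement, asv runs every combination of `params` (labeled by `param_names`), and methods prefixed `time_` are timed while `peakmem_` methods track peak memory. A minimal sketch of the pattern, with hypothetical names (`ExampleBench` and its contents do not appear in the diff):

import numpy as np
from pandas import DataFrame


class ExampleBench(object):

    goal_time = 0.2              # target duration per timed sample
    params = ['narrow', 'wide']  # asv benchmarks each value separately
    param_names = ['shape']

    def setup(self, shape):
        # Runs before the timed call, so fixture construction is
        # excluded from the measurement.
        ncols = 3 if shape == 'narrow' else 300
        self.df = DataFrame(np.random.randn(1000, ncols))

    def time_to_csv(self, shape):
        # Any method named time_* is picked up and timed by asv.
        self.df.to_csv('__example__.csv')

The suite can be exercised locally with something like `asv dev -b ^io` or compared across commits with `asv continuous`, as described in the pandas contributing guide.
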
249 changes: 249 additions & 0 deletions asv_bench/benchmarks/io/csv.py
@@ -0,0 +1,249 @@
import random
import string
import timeit

import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame, Categorical, date_range, read_csv
from pandas.compat import PY2
from pandas.compat import cStringIO as StringIO

from ..pandas_vb_common import setup, BaseIO # noqa


class ToCSV(BaseIO):

    goal_time = 0.2
    fname = '__test__.csv'
    params = ['wide', 'long', 'mixed']
    param_names = ['kind']

    def setup(self, kind):
        wide_frame = DataFrame(np.random.randn(3000, 30))
        long_frame = DataFrame({'A': np.arange(50000),
                                'B': np.arange(50000) + 1.,
                                'C': np.arange(50000) + 2.,
                                'D': np.arange(50000) + 3.})
        mixed_frame = DataFrame({'float': np.random.randn(5000),
                                 'int': np.random.randn(5000).astype(int),
                                 'bool': (np.arange(5000) % 2) == 0,
                                 'datetime': date_range('2001',
                                                        freq='s',
                                                        periods=5000),
                                 'object': ['foo'] * 5000})
        mixed_frame.loc[30:500, 'float'] = np.nan
        data = {'wide': wide_frame,
                'long': long_frame,
                'mixed': mixed_frame}
        self.df = data[kind]

    def time_frame(self, kind):
        self.df.to_csv(self.fname)


class ToCSVDatetime(BaseIO):

    goal_time = 0.2
    fname = '__test__.csv'

    def setup(self):
        rng = date_range('1/1/2000', periods=1000)
        self.data = DataFrame(rng, index=rng)

    def time_frame_date_formatting(self):
        self.data.to_csv(self.fname, date_format='%Y%m%d')


class ReadCSVInferDatetimeFormat(object):

    goal_time = 0.2
    params = ([True, False], ['custom', 'iso8601', 'ymd'])
    param_names = ['infer_datetime_format', 'format']

    def setup(self, infer_datetime_format, format):
        rng = date_range('1/1/2000', periods=1000)
        formats = {'custom': '%m/%d/%Y %H:%M:%S.%f',
                   'iso8601': '%Y-%m-%d %H:%M:%S',
                   'ymd': '%Y%m%d'}
        dt_format = formats[format]
        self.data = StringIO('\n'.join(rng.strftime(dt_format).tolist()))

    def time_read_csv(self, infer_datetime_format, format):
        read_csv(self.data, header=None, names=['foo'], parse_dates=['foo'],
                 infer_datetime_format=infer_datetime_format)
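
# Context for the flag exercised above: with infer_datetime_format=True,
# pandas guesses the datetime format from the first value and, when the
# guess holds, parses the remaining values on a fast fixed-format path
# instead of parsing each value independently. Illustrative sketch (not
# part of the benchmark):
#
#   read_csv(data, header=None, names=['foo'], parse_dates=['foo'],
#            infer_datetime_format=True)   # guess once, then fast path
#   read_csv(data, header=None, names=['foo'], parse_dates=['foo'],
#            infer_datetime_format=False)  # general per-value parsing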


class ReadCSVSkipRows(BaseIO):

    goal_time = 0.2
    fname = '__test__.csv'
    params = [None, 10000]
    param_names = ['skiprows']

    def setup(self, skiprows):
        N = 20000
        index = tm.makeStringIndex(N)
        df = DataFrame({'float1': np.random.randn(N),
                        'float2': np.random.randn(N),
                        'string1': ['foo'] * N,
                        'bool1': [True] * N,
                        'int1': np.random.randint(0, N, size=N)},
                       index=index)
        df.to_csv(self.fname)

    def time_skiprows(self, skiprows):
        read_csv(self.fname, skiprows=skiprows)


class ReadUint64Integers(object):

    goal_time = 0.2

    def setup(self):
        self.na_values = [2**63 + 500]
        arr = np.arange(10000).astype('uint64') + 2**63
        self.data1 = StringIO('\n'.join(arr.astype(str).tolist()))
        arr = arr.astype(object)
        arr[500] = -1
        self.data2 = StringIO('\n'.join(arr.astype(str).tolist()))

    def time_read_uint64(self):
        read_csv(self.data1, header=None, names=['foo'])

    def time_read_uint64_neg_values(self):
        read_csv(self.data2, header=None, names=['foo'])

    def time_read_uint64_na_values(self):
        read_csv(self.data1, header=None, names=['foo'],
                 na_values=self.na_values)
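
# Context for the cases above: 2**63 is one past the int64 maximum, so
# these values only fit in an unsigned 64-bit column; mixing in a
# negative value leaves no integer dtype that can hold the data, and
# the parser falls back to object dtype. Illustrative sketch (values
# chosen for exposition, not from the benchmark):
#
#   read_csv(StringIO('9223372036854775808\n1'), header=None)   # uint64
#   read_csv(StringIO('9223372036854775808\n-1'), header=None)  # object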


class S3(object):
    # Make sure that we can read part of a file from S3 without
    # needing to download the entire thing. Use timeit.default_timer
    # to measure wall time instead of CPU time -- we want to see
    # how long it takes to download the data.
    timer = timeit.default_timer
    params = ([None, "gzip", "bz2"], ["python", "c"])
    param_names = ["compression", "engine"]

    def setup(self, compression, engine):
        if compression == "bz2" and engine == "c" and PY2:
            # The Python 2 C parser can't read bz2 from open files.
            raise NotImplementedError
        try:
            import s3fs  # noqa
        except ImportError:
            # Skip these benchmarks if `s3fs` is not installed.
            raise NotImplementedError

        ext = ""
        if compression == "gzip":
            ext = ".gz"
        elif compression == "bz2":
            ext = ".bz2"
        self.big_fname = "s3://pandas-test/large_random.csv" + ext

    def time_read_csv_10_rows(self, compression, engine):
        # Read a small number of rows from a huge (100,000 x 50) table.
        read_csv(self.big_fname, nrows=10, compression=compression,
                 engine=engine)
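
# Why the custom timer above: the default asv timer measures CPU time,
# which would exclude time spent blocked on the network, while
# timeit.default_timer is a wall clock that includes it. A Python 3
# sketch of the distinction (not part of the benchmark):
#
#   import time, timeit
#   wall, cpu = timeit.default_timer(), time.process_time()
#   time.sleep(1)                      # I/O-style wait, no CPU work
#   timeit.default_timer() - wall      # ~1.0 -- wall clock advanced
#   time.process_time() - cpu          # ~0.0 -- CPU clock did not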


class ReadCSVThousands(BaseIO):

    goal_time = 0.2
    fname = '__test__.csv'
    params = ([',', '|'], [None, ','])
    param_names = ['sep', 'thousands']

    def setup(self, sep, thousands):
        N = 10000
        K = 8
        data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))
        df = DataFrame(data)
        if thousands is not None:
            fmt = ':{}'.format(thousands)
            fmt = '{' + fmt + '}'
            df = df.applymap(lambda x: fmt.format(x))
        df.to_csv(self.fname, sep=sep)

    def time_thousands(self, sep, thousands):
        read_csv(self.fname, sep=sep, thousands=thousands)
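
# What the thousands parameter does above: digit-group separators are
# stripped while parsing, so a quoted field like "1,234,567" becomes
# the integer 1234567 rather than a string. Illustrative sketch:
#
#   read_csv(StringIO('a\n"1,234,567"\n'), thousands=',')  # int64 column
#   read_csv(StringIO('a\n"1,234,567"\n'))                 # object column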


class ReadCSVComment(object):

    goal_time = 0.2

    def setup(self):
        data = ['A,B,C'] + (['1,2,3 # comment'] * 100000)
        self.s_data = StringIO('\n'.join(data))

    def time_comment(self):
        read_csv(self.s_data, comment='#', header=None, names=list('abc'))


class ReadCSVFloatPrecision(object):

    goal_time = 0.2
    params = ([',', ';'], ['.', '_'], [None, 'high', 'round_trip'])
    param_names = ['sep', 'decimal', 'float_precision']

    def setup(self, sep, decimal, float_precision):
        floats = [''.join(random.choice(string.digits) for _ in range(28))
                  for _ in range(15)]
        rows = sep.join(['0{}'.format(decimal) + '{}'] * 3) + '\n'
        data = rows * 5
        data = data.format(*floats) * 200  # 1000 x 3 strings csv
        self.s_data = StringIO(data)

    def time_read_csv(self, sep, decimal, float_precision):
        read_csv(self.s_data, sep=sep, header=None, names=list('abc'),
                 float_precision=float_precision)

    def time_read_csv_python_engine(self, sep, decimal, float_precision):
        read_csv(self.s_data, sep=sep, header=None, engine='python',
                 float_precision=None, names=list('abc'))


class ReadCSVCategorical(BaseIO):

    goal_time = 0.2
    fname = '__test__.csv'

    def setup(self):
        N = 100000
        group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']
        df = DataFrame(np.random.choice(group1, (N, 3)), columns=list('abc'))
        df.to_csv(self.fname, index=False)

    def time_convert_post(self):
        read_csv(self.fname).apply(Categorical)

    def time_convert_direct(self):
        read_csv(self.fname, dtype='category')
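
# The two methods above contrast converting to Categorical after the
# parse with asking the parser for category dtype directly; the direct
# path avoids materializing the intermediate object columns. The same
# comparison as one-off calls (sketch, assuming the file written in
# setup):
#
#   read_csv(fname).apply(Categorical)   # parse as object, convert after
#   read_csv(fname, dtype='category')    # build categories during parse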


class ReadCSVParseDates(object):

    goal_time = 0.2

    def setup(self):
        data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
{},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
{},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n
{},21:00:00,21:18:00,-0.9900,2.0100,3.6000,0.0000,270.0000\n
{},22:00:00,21:56:00,-0.5900,1.7100,5.1000,0.0000,290.0000\n
"""
        two_cols = ['KORD,19990127'] * 5
        data = data.format(*two_cols)
        self.s_data = StringIO(data)

    def time_multiple_date(self):
        read_csv(self.s_data, sep=',', header=None,
                 names=list(string.digits[:9]), parse_dates=[[1, 2], [1, 3]])

    def time_baseline(self):
        read_csv(self.s_data, sep=',', header=None, parse_dates=[1],
                 names=list(string.digits[:9]))
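
For context on the last benchmark above: `parse_dates=[[1, 2], [1, 3]]` tells the parser to join each group of columns with a space and parse the result as a single datetime column, whereas `parse_dates=[1]` in the baseline parses one column as-is. A minimal sketch of the same call pattern (the inline data is hypothetical and far smaller than the benchmark's):

from pandas import read_csv
from pandas.compat import cStringIO as StringIO

data = StringIO('KORD,19990127,19:00:00\nKORD,19990127,20:00:00\n')
# Columns 1 and 2 are concatenated ('19990127 19:00:00') and parsed
# into a single datetime column named '1_2'.
df = read_csv(data, header=None, parse_dates=[[1, 2]])
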
127 changes: 127 additions & 0 deletions asv_bench/benchmarks/io/json.py
@@ -0,0 +1,127 @@
import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame, date_range, timedelta_range, concat, read_json

from ..pandas_vb_common import setup, BaseIO # noqa


class ReadJSON(BaseIO):

    goal_time = 0.2
    fname = "__test__.json"
    params = (['split', 'index', 'records'], ['int', 'datetime'])
    param_names = ['orient', 'index']

    def setup(self, orient, index):
        N = 100000
        indexes = {'int': np.arange(N),
                   'datetime': date_range('20000101', periods=N, freq='H')}
        df = DataFrame(np.random.randn(N, 5),
                       columns=['float_{}'.format(i) for i in range(5)],
                       index=indexes[index])
        df.to_json(self.fname, orient=orient)

    def time_read_json(self, orient, index):
        read_json(self.fname, orient=orient)


class ReadJSONLines(BaseIO):

    goal_time = 0.2
    fname = "__test_lines__.json"
    params = ['int', 'datetime']
    param_names = ['index']

    def setup(self, index):
        N = 100000
        indexes = {'int': np.arange(N),
                   'datetime': date_range('20000101', periods=N, freq='H')}
        df = DataFrame(np.random.randn(N, 5),
                       columns=['float_{}'.format(i) for i in range(5)],
                       index=indexes[index])
        df.to_json(self.fname, orient='records', lines=True)

    def time_read_json_lines(self, index):
        read_json(self.fname, orient='records', lines=True)

    def time_read_json_lines_concat(self, index):
        concat(read_json(self.fname, orient='records', lines=True,
                         chunksize=25000))

    def peakmem_read_json_lines(self, index):
        read_json(self.fname, orient='records', lines=True)

    def peakmem_read_json_lines_concat(self, index):
        concat(read_json(self.fname, orient='records', lines=True,
                         chunksize=25000))
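
# Note on the chunked variants above: with lines=True and a chunksize,
# read_json returns an iterator of DataFrames instead of a single
# frame, so concat() is needed to rebuild the full result; peak memory
# should stay lower because only one chunk is materialized at a time.
# Sketch of the call pattern:
#
#   reader = read_json(fname, orient='records', lines=True,
#                      chunksize=25000)
#   df = concat(reader)   # same result as reading in one shot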


class ToJSON(BaseIO):

    goal_time = 0.2
    fname = "__test__.json"
    params = ['split', 'columns', 'index']
    param_names = ['orient']

    def setup(self, orient):
        N = 10**5
        ncols = 5
        index = date_range('20000101', periods=N, freq='H')
        timedeltas = timedelta_range(start=1, periods=N, freq='s')
        datetimes = date_range(start=1, periods=N, freq='s')
        ints = np.random.randint(100000000, size=N)
        floats = np.random.randn(N)
        strings = tm.makeStringIndex(N)
        self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
        self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index)
        self.df_td_int_ts = DataFrame({'td_1': timedeltas,
                                       'td_2': timedeltas,
                                       'int_1': ints,
                                       'int_2': ints,
                                       'ts_1': datetimes,
                                       'ts_2': datetimes},
                                      index=index)
        self.df_int_floats = DataFrame({'int_1': ints,
                                        'int_2': ints,
                                        'int_3': ints,
                                        'float_1': floats,
                                        'float_2': floats,
                                        'float_3': floats},
                                       index=index)
        self.df_int_float_str = DataFrame({'int_1': ints,
                                           'int_2': ints,
                                           'float_1': floats,
                                           'float_2': floats,
                                           'str_1': strings,
                                           'str_2': strings},
                                          index=index)

    def time_floats_with_int_index(self, orient):
        self.df.to_json(self.fname, orient=orient)

    def time_floats_with_dt_index(self, orient):
        self.df_date_idx.to_json(self.fname, orient=orient)

    def time_delta_int_tstamp(self, orient):
        self.df_td_int_ts.to_json(self.fname, orient=orient)

    def time_float_int(self, orient):
        self.df_int_floats.to_json(self.fname, orient=orient)

    def time_float_int_str(self, orient):
        self.df_int_float_str.to_json(self.fname, orient=orient)

    def time_floats_with_int_index_lines(self, orient):
        self.df.to_json(self.fname, orient='records', lines=True)

    def time_floats_with_dt_index_lines(self, orient):
        self.df_date_idx.to_json(self.fname, orient='records', lines=True)

    def time_delta_int_tstamp_lines(self, orient):
        self.df_td_int_ts.to_json(self.fname, orient='records', lines=True)

    def time_float_int_lines(self, orient):
        self.df_int_floats.to_json(self.fname, orient='records', lines=True)

    def time_float_int_str_lines(self, orient):
        self.df_int_float_str.to_json(self.fname, orient='records', lines=True)
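
As a closing note on the `orient` parameter that ToJSON benchmarks: each orient changes the JSON layout, trading file size against round-trip fidelity. A small sketch (illustrative two-row frame) of the layouts involved:

import pandas as pd

df = pd.DataFrame({'a': [1, 2]}, index=['x', 'y'])
df.to_json(orient='split')    # {"columns":["a"],"index":["x","y"],"data":[[1],[2]]}
df.to_json(orient='columns')  # {"a":{"x":1,"y":2}}
df.to_json(orient='index')    # {"x":{"a":1},"y":{"a":2}}
df.to_json(orient='records')  # [{"a":1},{"a":2}]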