Add perf tests for table scans

Brian Hulette · Brian Hulette · commit a1edac2095b4 · 2018-01-12T10:03:36.000-05:00
diff --git a/js/generate.py b/js/generate.py
@@ -0,0 +1,36 @@
+import pyarrow as pa
+import random
+import numpy as np
+import pandas as pd
+
+# Pool of city names that generate_batch samples for the 'origin' and 'destination' columns.
+cities = [u'Charlottesville', u'New York', u'San Francisco', u'Seattle', u'Terre Haute', u'Washington, DC']
+
+def generate_batch(batch_len):
+    return pa.RecordBatch.from_arrays([
+        pa.Array.from_pandas(pd.Series(np.random.uniform(-90,90,batch_len), dtype="float32")),
+        pa.Array.from_pandas(pd.Series(np.random.uniform(-180,180,batch_len), dtype="float32")),
+        pa.Array.from_pandas(pd.Categorical((random.choice(cities) for i in range(batch_len)), cities)),
+        pa.Array.from_pandas(pd.Categorical((random.choice(cities) for i in range(batch_len)), cities))
+    ], ['lat', 'lng', 'origin', 'destination'])
+
+def write_record_batches(fd, batch_len, num_batches):
+    writer = pa.ipc.RecordBatchStreamWriter(fd, generate_batch(1).schema)
+    for batch in range(num_batches):
+        writer.write_batch(generate_batch(batch_len))
+
+    writer.close()
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('filename', help='number of batches')
+    parser.add_argument('-n', '--num-batches', help='number of batches', type=int, default=10)
+    parser.add_argument('-b', '--batch-size', help='size of each batch', type=int, default=100000)
+
+    args = parser.parse_args()
+
+    print "Writing {} {}-element batches to '{}'".format(args.num_batches, args.batch_size, args.filename)
+    with open(args.filename, 'w') as fd:
+        write_record_batches(fd, args.batch_size, args.num_batches)
diff --git a/js/perf/index.js b/js/perf/index.js
@@ -41,6 +41,21 @@ for (let { name, buffers} of config) {
     suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite);
 }
 
+for (let {name, buffers, tests} of require('./table_config')) {
+    const tableIterateSuite = new Benchmark.Suite(`Table Iterate ${name}`, { async: true });
+    const tableCountBySuite = new Benchmark.Suite(`Table Count By ${name}`, { async: true });
+    const vectorCountBySuite = new Benchmark.Suite(`Vector Count By ${name}`, { async: true });
+    const table = Table.from(buffers);
+
+    tableIterateSuite.add(createTableIterateTest(table));
+    for (test of tests) {
+        tableCountBySuite.add(createTableCountByTest(table, test.col, test.test, test.value))
+        vectorCountBySuite.add(createVectorCountByTest(table.columns[test.col], test.test, test.value))
+    }
+
+    suites.push(tableIterateSuite, tableCountBySuite, vectorCountBySuite)
+}
+
 console.log('Running apache-arrow performance tests...\n');
 
 run();
@@ -109,3 +124,66 @@ function createGetByIndexTest(vector) {
         }
     };
 }
+
+function createVectorCountByTest(vector, test, value) {
+    let op;
+    if (test == 'gteq') {
+        op = function () {
+            sum = 0;
+            for (cell of vector) {
+                sum += (cell >= value)
+            }
+        }
+    } else if (test == 'eq') {
+        op = function () {
+            sum = 0;
+            for (cell of vector) {
+                sum += (cell == value)
+            }
+        }
+    } else {
+        throw new Error(`Unrecognized test "$test"`);
+    }
+
+    return {
+        async: true,
+        name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}, test: ${test}, value: ${value}`,
+        fn: op
+    };
+}
+
+function createTableIterateTest(table) {
+    let row;
+    return {
+        async: true,
+        name: `length: ${table.length}`,
+        fn() { for (row of table) {} }
+    };
+}
+
+function createTableCountByTest(table, column, test, value) {
+    let op;
+    if (test == 'gteq') {
+        op = function () {
+            sum = 0;
+            for (row of table) {
+                sum += (row.get(column) >= value)
+            }
+        }
+    } else if (test == 'eq') {
+        op = function() {
+            sum = 0;
+            for (row of table) {
+                sum += (row.get(column) == value)
+            }
+        }
+    } else {
+        throw new Error(`Unrecognized test "${test}"`);
+    }
+
+    return {
+        async: true,
+        name: `name: '${table.columns[column].name}', length: ${table.length}, type: ${table.columns[column].type}, test: ${test}, value: ${value}`,
+        fn: op
+    };
+}
diff --git a/js/perf/table_config.js b/js/perf/table_config.js
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+const fs = require('fs');
+const path = require('path');
+const glob = require('glob');
+
+const config = [];
+const filenames = glob.sync(path.resolve(__dirname, `../test/data/tables/`, `*.arrow`));
+
+tests = [
+    {col: 0, test: 'gteq', value: 0        },
+    {col: 1, test: 'gteq', value: 0        },
+    {col: 2, test:   'eq', value: 'Seattle'},
+]
+
+for (const filename of filenames) {
+    const { name } = path.parse(filename);
+    config.push({ name, buffers: [fs.readFileSync(filename)], tests });
+}
+
+module.exports = config;