Skip to content

Commit 6719147

Browse files
author
Brian Hulette
committed
Add DataFrame.countBy operation
1 parent 2f4a349 commit 6719147

File tree

3 files changed

+101
-8
lines changed

3 files changed

+101
-8
lines changed

js/perf/index.js

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,17 +41,23 @@ for (let { name, buffers} of config) {
4141
suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite);
4242
}
4343

44-
for (let {name, buffers, tests} of require('./table_config')) {
44+
for (let {name, buffers, countBys, counts} of require('./table_config')) {
45+
const table = Table.from(buffers);
46+
47+
const dfCountBySuite = new Benchmark.Suite(`DataFrame Count By "${name}"`, { async: true });
48+
for (countBy of countBys) {
49+
dfCountBySuite.add(createDataFrameCountByTest(table, countBy));
50+
}
51+
4552
const dfFilterCountSuite = new Benchmark.Suite(`DataFrame Filter-Scan Count "${name}"`, { async: true });
4653
const dfDirectCountSuite = new Benchmark.Suite(`DataFrame Direct Count "${name}"`, { async: true });
47-
const table = Table.from(buffers);
4854

49-
for (test of tests) {
55+
for (test of counts) {
5056
dfFilterCountSuite.add(createDataFrameFilterCountTest(table, test.col, test.test, test.value))
5157
dfDirectCountSuite.add(createDataFrameDirectCountTest(table, test.col, test.test, test.value))
5258
}
5359

54-
suites.push(dfFilterCountSuite, dfDirectCountSuite)
60+
suites.push(dfCountBySuite, dfFilterCountSuite, dfDirectCountSuite)
5561
}
5662

5763
console.log('Running apache-arrow performance tests...\n');
@@ -167,6 +173,18 @@ function createDataFrameDirectCountTest(table, column, test, value) {
167173
};
168174
}
169175

176+
/**
 * Build a benchmark test descriptor that runs `table.countBy` on one column.
 *
 * @param table  Table to benchmark; its `columns`, `length` and `countBy` are used.
 * @param column Name of the column to count by.
 * @returns An object with `async`, `name` and `fn` fields, suitable for `Suite.add`.
 */
function createDataFrameCountByTest(table, column) {
    // Locate the column by name so its type can be reported in the test name.
    const columnIndex = table.columns.findIndex((candidate) => candidate.name === column);

    const runCountBy = function () {
        table.countBy(col(column));
    };

    return {
        async: true,
        name: `name: '${column}', length: ${table.length}, type: ${table.columns[columnIndex].type}`,
        fn: runCountBy
    };
}
187+
170188
function createDataFrameFilterCountTest(table, column, test, value) {
171189
let colidx = table.columns.findIndex((c)=>c.name === column);
172190
let df;

js/perf/table_config.js

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@ const glob = require('glob');
2222
const config = [];
2323
const filenames = glob.sync(path.resolve(__dirname, `../test/data/tables/`, `*.arrow`));
2424

25-
tests = {
25+
// Columns to run countBy benchmarks against, keyed by table (file) name.
// Declared with `const` so it does not leak as an implicit global.
const countBys = {
    "tracks": ['origin', 'destination']
};
28+
counts = {
2629
"tracks": [
2730
{col: 'lat', test: 'gteq', value: 0 },
2831
{col: 'lng', test: 'gteq', value: 0 },
@@ -32,11 +35,12 @@ tests = {
3235

3336
for (const filename of filenames) {
3437
const { name } = path.parse(filename);
35-
if (name in tests) {
38+
if (name in counts) {
3639
config.push({
3740
name,
3841
buffers: [fs.readFileSync(filename)],
39-
tests: tests[name]
42+
countBys: countBys[name],
43+
counts: counts[name],
4044
});
4145
}
4246
}

js/src/table.ts

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@
1616
// under the License.
1717

1818
import { Vector } from './vector/vector';
19+
import { DictionaryVector } from './vector/dictionary';
20+
import { Uint32Vector } from './vector/numeric';
1921
import { read, readAsync } from './reader/arrow';
20-
import { Predicate } from './predicate';
22+
import { Col, Predicate } from './predicate';
2123

2224
export type NextFunc = (idx: number, cols: Vector[]) => void;
2325

@@ -40,6 +42,7 @@ export interface DataFrame {
4042
filter(predicate: Predicate): DataFrame;
4143
scan(next: NextFunc): void;
4244
count(): number;
45+
countBy(col: (Col|string)): Table;
4346
}
4447

4548
function columnsFromBatches(batches: Vector[][]) {
@@ -111,6 +114,40 @@ export class Table implements DataFrame {
111114
count(): number {
112115
return this.lengths.reduce((acc, val) => acc + val);
113116
}
117+
/**
 * Count rows grouped by the values of a dictionary-encoded column.
 *
 * @param count_by Column to group by: either a `Col`, or a column name,
 *                 which is wrapped in a `Col`.
 * @returns A single-batch Table holding the dictionary values and a
 *          Uint32Vector with the number of rows seen for each dictionary key.
 * @throws Error if the bound column is not a DictionaryVector.
 */
countBy(count_by: (Col|string)): Table {
    // `instanceof String` is false for primitive strings, so use `typeof`
    // to detect a column name that needs wrapping.
    const column: Col = typeof count_by === 'string' ? new Col(count_by) : count_by;

    // the last batch is expected to have the most complete dictionary, use
    // its data vector as our count by keys
    column.bind(this.batches[this.batches.length - 1]);
    if (!(column.vector instanceof DictionaryVector)) {
        throw new Error("countBy currently only supports dictionary-encoded columns");
    }

    const keys: Vector = (column.vector as DictionaryVector<any>).data;
    // TODO: Adjust array byte width based on overall length
    // (e.g. if this.length <= 255 use Uint8Array, etc...)
    const counts: Uint32Array = new Uint32Array(keys.length);

    for (let batch = -1; ++batch < this.lengths.length;) {
        const length = this.lengths[batch];

        // load batches
        const columns = this.batches[batch];
        column.bind(columns);
        // the bound vector does not change within a batch, so read it once
        const vector = column.vector as DictionaryVector<any>;

        // tally the dictionary key of every row; null keys are skipped
        for (let idx = -1; ++idx < length;) {
            const key = vector.getKey(idx);
            if (key !== null) { counts[key]++; }
        }
    }

    return new Table({batches: [[keys, new Uint32Vector({data: counts})]]});
}
114151
*[Symbol.iterator]() {
115152
for (let batch = -1; ++batch < this.lengths.length;) {
116153
const length = this.lengths[batch];
@@ -177,4 +214,38 @@ class FilteredDataFrame implements DataFrame {
177214
this.predicate.and(predicate)
178215
);
179216
}
217+
218+
/**
 * Count the rows that pass this frame's predicate, grouped by the values of
 * a dictionary-encoded column of the parent table.
 *
 * @param count_by Column to group by: either a `Col`, or a column name,
 *                 which is wrapped in a `Col`.
 * @returns A single-batch Table holding the dictionary values and a
 *          Uint32Vector with the number of matching rows for each dictionary key.
 * @throws Error if the bound column is not a DictionaryVector.
 */
countBy(count_by: (Col|string)): Table {
    // `instanceof String` is false for primitive strings, so use `typeof`
    // to detect a column name that needs wrapping.
    const column: Col = typeof count_by === 'string' ? new Col(count_by) : count_by;

    // the last batch is expected to have the most complete dictionary, use
    // its data vector as our count by keys
    column.bind(this.parent.batches[this.parent.batches.length - 1]);
    if (!(column.vector instanceof DictionaryVector)) {
        throw new Error("countBy currently only supports dictionary-encoded columns");
    }

    const keys: Vector = (column.vector as DictionaryVector<any>).data;
    const counts: Uint32Array = new Uint32Array(keys.length);

    for (let batch = -1; ++batch < this.parent.lengths.length;) {
        const length = this.parent.lengths[batch];

        // load batches; bind the predicate first, then the count-by column
        const columns = this.parent.batches[batch];
        const predicate = this.predicate.bind(columns);
        column.bind(columns);
        // the bound vector does not change within a batch, so read it once
        const vector = column.vector as DictionaryVector<any>;

        // tally the key of every row with a non-null key that passes the predicate
        for (let idx = -1; ++idx < length;) {
            const key = vector.getKey(idx);
            if (key !== null && predicate(idx, columns)) { counts[key]++; }
        }
    }

    return new Table({batches: [[keys, new Uint32Vector({data: counts})]]});
}
180251
}

0 commit comments

Comments
 (0)