Skip to content

Commit 796f45d

Browse files
author
Brian Hulette
committed
add DataFrame filter and count ops
1 parent 30f0330 commit 796f45d

File tree

2 files changed

+105
-16
lines changed

2 files changed

+105
-16
lines changed

js/perf/index.js

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ for (let {name, buffers, tests} of require('./table_config')) {
4848
const dfIteratorCountSuite = new Benchmark.Suite(`DataFrame Iterator Count "${name}"`, { async: true });
4949
const dfDirectCountSuite = new Benchmark.Suite(`DataFrame Direct Count "${name}"`, { async: true });
5050
const dfScanCountSuite = new Benchmark.Suite(`DataFrame Scan Count "${name}"`, { async: true });
51+
const dfFilterCountSuite = new Benchmark.Suite(`DataFrame Filter Scan Count "${name}"`, { async: true });
5152
const vectorCountSuite = new Benchmark.Suite(`Vector Count "${name}"`, { async: true });
5253
const table = Table.from(buffers);
5354

@@ -58,10 +59,11 @@ for (let {name, buffers, tests} of require('./table_config')) {
5859
dfIteratorCountSuite.add(createDataFrameIteratorCountTest(table, test.col, test.test, test.value))
5960
dfDirectCountSuite.add(createDataFrameDirectCountTest(table, test.col, test.test, test.value))
6061
dfScanCountSuite.add(createDataFrameScanCountTest(table, test.col, test.test, test.value))
62+
dfFilterCountSuite.add(createDataFrameFilterCountTest(table, test.col, test.test, test.value))
6163
vectorCountSuite.add(createVectorCountTest(table.columns[test.col], test.test, test.value))
6264
}
6365

64-
suites.push(tableIteratorSuite, tableCountSuite, dfIteratorSuite, dfIteratorCountSuite, dfDirectCountSuite, dfScanCountSuite, vectorCountSuite)
66+
suites.push(tableIteratorSuite, tableCountSuite, dfIteratorSuite, dfIteratorCountSuite, dfDirectCountSuite, dfScanCountSuite, dfFilterCountSuite, vectorCountSuite)
6567
}
6668

6769
console.log('Running apache-arrow performance tests...\n');
@@ -275,6 +277,25 @@ function createDataFrameScanCountTest(table, column, test, value) {
275277
};
276278
}
277279

280+
function createDataFrameFilterCountTest(table, column, test, value) {
281+
let df = DataFrame.from(table);
282+
if (test == 'gteq') {
283+
df = df.filter((idx, cols)=>cols[column].get(idx) >= value);
284+
} else if (test == 'eq') {
285+
df = df.filter((idx, cols)=>cols[column].get(idx) == value);
286+
} else {
287+
throw new Error(`Unrecognized test "${test}"`);
288+
}
289+
290+
return {
291+
async: true,
292+
name: `name: '${table.columns[column].name}', length: ${table.length}, type: ${table.columns[column].type}, test: ${test}, value: ${value}`,
293+
fn() {
294+
df.count();
295+
}
296+
};
297+
}
298+
278299
function createDataFrameIteratorCountTest(table, column, test, value) {
279300
let df = DataFrame.from(table);
280301

js/src/dataframe/dataframe.ts

Lines changed: 83 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,18 @@ import { Vector } from "../vector/vector";
22
import { StructVector } from "../vector/struct";
33
import { VirtualVector } from "../vector/virtual";
44

5+
export type NextFunc = (idx: number, cols: Vector[]) => void;
6+
export type PredicateFunc = (idx: number, cols: Vector[]) => boolean;
7+
58
export abstract class DataFrame {
9+
constructor(readonly lengths: Uint32Array) {}
610
public abstract columns: Vector<any>[];
711
public abstract getBatch(batch: number): Vector[];
8-
public abstract scan(next: (idx: number, cols: Vector[])=>void): void;
12+
public abstract scan(next: NextFunc): void;
13+
public filter(predicate: PredicateFunc): DataFrame {
14+
return new FilteredDataFrame(this, predicate);
15+
}
16+
917
static from(table: Vector<any>): DataFrame {
1018
// There are two types of Vectors we might want to make into
1119
// a ChunkedDataFrame:
@@ -31,23 +39,26 @@ export abstract class DataFrame {
3139
return new SimpleDataFrame([table]);
3240
}
3341
}
42+
43+
count(): number {
44+
return this.lengths.reduce((acc, val) => acc + val);
45+
}
3446
}
3547

3648
class SimpleDataFrame extends DataFrame {
3749
readonly lengths: Uint32Array;
3850
constructor(public columns: Vector<any>[]) {
39-
super();
51+
super(new Uint32Array([0, columns[0].length]));
4052
if (!this.columns.slice(1).every((v) => v.length === this.columns[0].length)) {
4153
throw new Error("Attempted to create a DataFrame with un-aligned vectors");
4254
}
43-
this.lengths = new Uint32Array([0, this.columns[0].length]);
4455
}
4556

4657
public getBatch() {
4758
return this.columns;
4859
}
4960

50-
public scan(next: (idx: number, cols: Vector[])=>void) {
61+
public scan(next: NextFunc) {
5162
for (let idx = -1; ++idx < this.lengths[1];) {
5263
next(idx, this.columns)
5364
}
@@ -62,24 +73,16 @@ class SimpleDataFrame extends DataFrame {
6273

6374
class ChunkedDataFrame extends DataFrame {
6475
public columns: Vector<any>[];
65-
readonly lengths: Uint32Array;
6676
constructor(private virtuals: VirtualVector<any>[]) {
67-
super();
68-
const offsets = virtuals[0].offsets;
69-
if (!this.virtuals.slice(1).every((v) => v.aligned(virtuals[0]))) {
70-
throw new Error("Attempted to create a DataFrame with un-aligned vectors");
71-
}
72-
this.lengths = new Uint32Array(offsets.length);
73-
offsets.forEach((offset, i) => {
74-
this.lengths[i] = offsets[i+1] - offset;;
75-
});
77+
super(ChunkedDataFrame.getLengths(virtuals));
78+
this.virtuals = virtuals;
7679
}
7780

7881
getBatch(batch: number): Vector[] {
7982
return this.virtuals.map((virt) => virt.vectors[batch]);
8083
}
8184

82-
scan(next: (idx: number, cols: Vector[])=>void) {
85+
scan(next: NextFunc) {
8386
for (let batch = -1; ++batch < this.lengths.length;) {
8487
const length = this.lengths[batch];
8588

@@ -106,4 +109,69 @@ class ChunkedDataFrame extends DataFrame {
106109
}
107110
}
108111
}
112+
113+
private static getLengths(virtuals: VirtualVector<any>[]): Uint32Array {
114+
if (!virtuals.slice(1).every((v) => v.aligned(virtuals[0]))) {
115+
throw new Error("Attempted to create a DataFrame with un-aligned vectors");
116+
}
117+
return new Uint32Array(virtuals[0].vectors.map((v)=>v.length));
118+
}
119+
}
120+
121+
class FilteredDataFrame extends DataFrame {
122+
public columns: Vector<any>[];
123+
constructor (readonly parent: DataFrame, private predicate: PredicateFunc) {
124+
super(parent.lengths);
125+
}
126+
127+
getBatch(batch: number): Vector[] {
128+
return this.parent.getBatch(batch);
129+
};
130+
131+
scan(next: NextFunc) {
132+
// inlined version of this:
133+
// this.parent.scan((idx, columns) => {
134+
// if (this.predicate(idx, columns)) next(idx, columns);
135+
// });
136+
for (let batch = -1; ++batch < this.parent.lengths.length;) {
137+
const length = this.parent.lengths[batch];
138+
139+
// load batches
140+
const columns = this.parent.getBatch(batch);
141+
142+
// yield all indices
143+
for (let idx = -1; ++idx < length;) {
144+
if (this.predicate(idx, columns)) next(idx, columns);
145+
}
146+
}
147+
}
148+
149+
count(): number {
150+
// inlined version of this:
151+
// let sum = 0;
152+
// this.parent.scan((idx, columns) => {
153+
// if (this.predicate(idx, columns)) ++sum;
154+
// });
155+
// return sum;
156+
let sum = 0;
157+
for (let batch = -1; ++batch < this.parent.lengths.length;) {
158+
const length = this.parent.lengths[batch];
159+
160+
// load batches
161+
const columns = this.parent.getBatch(batch);
162+
163+
// yield all indices
164+
for (let idx = -1; ++idx < length;) {
165+
if (this.predicate(idx, columns)) ++sum;
166+
}
167+
}
168+
return sum;
169+
}
170+
171+
filter(predicate: PredicateFunc): DataFrame {
172+
return new FilteredDataFrame(
173+
this.parent,
174+
(idx, cols) => this.predicate(idx, cols) && predicate(idx, cols)
175+
);
176+
}
109177
}

0 commit comments

Comments
 (0)