Skip to content

Commit 4d9e8c0

Browse files
author
Brian Hulette
committed
Add concept of predicates for filtering dataframes
1 parent 796f45d commit 4d9e8c0

File tree

4 files changed

+192
-13
lines changed

4 files changed

+192
-13
lines changed

js/perf/index.js

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -16,10 +16,10 @@
1616
// under the License.
1717

1818
// Use the ES5 UMD target as perf baseline
19-
// const { DataFrame, Table, readVectors } = require('../targets/es5/umd');
20-
// const { DataFrame, Table, readVectors } = require('../targets/es5/cjs');
21-
// const { DataFrame, Table, readVectors } = require('../targets/es2015/umd');
22-
const { DataFrame, Table, readVectors } = require('../targets/es2015/cjs');
19+
// const { lit, col, DataFrame, Table, readVectors } = require('../targets/es5/umd');
20+
// const { lit, col, DataFrame, Table, readVectors } = require('../targets/es5/cjs');
21+
// const { lit, col, DataFrame, Table, readVectors } = require('../targets/es2015/umd');
22+
const { lit, col, DataFrame, Table, readVectors } = require('../targets/es2015/cjs');
2323

2424
const config = require('./config');
2525
const Benchmark = require('benchmark');
@@ -280,9 +280,9 @@ function createDataFrameScanCountTest(table, column, test, value) {
280280
function createDataFrameFilterCountTest(table, column, test, value) {
281281
let df = DataFrame.from(table);
282282
if (test == 'gteq') {
283-
df = df.filter((idx, cols)=>cols[column].get(idx) >= value);
283+
df = df.filter(col(table.columns[column].name).gteq(value));
284284
} else if (test == 'eq') {
285-
df = df.filter((idx, cols)=>cols[column].get(idx) == value);
285+
df = df.filter(col(table.columns[column].name).eq(value));
286286
} else {
287287
throw new Error(`Unrecognized test "${test}"`);
288288
}

js/src/Arrow.ts

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,7 @@ import {
4646
} from './vector/numeric';
4747

4848
import { DataFrame } from './dataframe/dataframe';
49+
import { lit, col } from './dataframe/predicate';
4950

5051
// closure compiler always erases static method names:
5152
// https://github.com/google/closure-compiler/issues/1776
@@ -88,12 +89,16 @@ export {
8889
};
8990

9091
export { DataFrame } from './dataframe/dataframe';
92+
export { lit, col } from './dataframe/predicate';
93+
9194

9295
/* These exports are needed for the closure umd targets */
9396
try {
9497
const Arrow = eval('exports');
9598
if (typeof Arrow === 'object') {
9699
// string indexers tell closure compiler not to rename these properties
100+
Arrow['lit'] = lit;
101+
Arrow['col'] = col;
97102
Arrow['read'] = read;
98103
Arrow['readAsync'] = readAsync;
99104
Arrow['Table'] = Table;

js/src/dataframe/dataframe.ts

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2,15 +2,16 @@ import { Vector } from "../vector/vector";
22
import { StructVector } from "../vector/struct";
33
import { VirtualVector } from "../vector/virtual";
44

5+
import { Predicate } from "./predicate"
6+
57
export type NextFunc = (idx: number, cols: Vector[]) => void;
6-
export type PredicateFunc = (idx: number, cols: Vector[]) => boolean;
78

89
export abstract class DataFrame {
910
constructor(readonly lengths: Uint32Array) {}
1011
public abstract columns: Vector<any>[];
1112
public abstract getBatch(batch: number): Vector[];
1213
public abstract scan(next: NextFunc): void;
13-
public filter(predicate: PredicateFunc): DataFrame {
14+
public filter(predicate: Predicate): DataFrame {
1415
return new FilteredDataFrame(this, predicate);
1516
}
1617

@@ -120,7 +121,7 @@ class ChunkedDataFrame extends DataFrame {
120121

121122
class FilteredDataFrame extends DataFrame {
122123
public columns: Vector<any>[];
123-
constructor (readonly parent: DataFrame, private predicate: PredicateFunc) {
124+
constructor (readonly parent: DataFrame, private predicate: Predicate) {
124125
super(parent.lengths);
125126
}
126127

@@ -138,10 +139,11 @@ class FilteredDataFrame extends DataFrame {
138139

139140
// load batches
140141
const columns = this.parent.getBatch(batch);
142+
const predicate = this.predicate.bind(columns);
141143

142144
// yield all indices
143145
for (let idx = -1; ++idx < length;) {
144-
if (this.predicate(idx, columns)) next(idx, columns);
146+
if (predicate(idx, columns)) next(idx, columns);
145147
}
146148
}
147149
}
@@ -159,19 +161,20 @@ class FilteredDataFrame extends DataFrame {
159161

160162
// load batches
161163
const columns = this.parent.getBatch(batch);
164+
const predicate = this.predicate.bind(columns);
162165

163166
// yield all indices
164167
for (let idx = -1; ++idx < length;) {
165-
if (this.predicate(idx, columns)) ++sum;
168+
if (predicate(idx, columns)) ++sum;
166169
}
167170
}
168171
return sum;
169172
}
170173

171-
filter(predicate: PredicateFunc): DataFrame {
174+
filter(predicate: Predicate): DataFrame {
172175
return new FilteredDataFrame(
173176
this.parent,
174-
(idx, cols) => this.predicate(idx, cols) && predicate(idx, cols)
177+
this.predicate.and(predicate)
175178
);
176179
}
177180
}

js/src/dataframe/predicate.ts

Lines changed: 171 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,171 @@
1+
import { Vector } from "../vector/vector";
2+
3+
/** Row accessor: returns the value found at row `idx` of a batch, or null. */
export type ValueFunc<T> = (idx: number, cols: Vector[]) => T|null;
/** Row test: returns true when row `idx` of the batch `cols` should be kept. */
export type PredicateFunc = (idx: number, cols: Vector[]) => boolean;

/**
 * An operand of a comparison: either a constant (`Literal`) or a reference
 * to a named column (`Col`). The comparison methods build a `Predicate`
 * without evaluating anything; evaluation happens once the predicate has
 * been bound to a batch of columns.
 */
export abstract class Value<T> {
    /** Builds a predicate that holds when this value equals `other`. */
    eq(other: Value<T>|T): Predicate {
        return new Equals(this, asValue(other));
    }
    /** Builds a predicate that holds when this value is `<=` `other`. */
    lteq(other: Value<T>|T): Predicate {
        return new LTeq(this, asValue(other));
    }
    /** Builds a predicate that holds when this value is `>=` `other`. */
    gteq(other: Value<T>|T): Predicate {
        return new GTeq(this, asValue(other));
    }
}

/** A constant operand. */
class Literal<T=any> extends Value<T> {
    constructor(public v: T) { super(); }
}

/** Wraps a bare operand in a `Literal`; operands that are already a `Value` pass through. */
function asValue<T>(operand: Value<T>|T): Value<T> {
    return operand instanceof Value ? operand : new Literal<T>(operand);
}

/** An operand that reads from the column called `name`. */
class Col<T=any> extends Value<T> {
    /** Vector selected by the most recent successful `bind`. */
    vector: Vector<T> | undefined;
    /** Index of the column found by the most recent successful `bind`; -1 until then. */
    colidx: number = -1;

    constructor(public name: string) { super(); }

    /**
     * Locates this column in `cols` and returns an accessor for its values.
     *
     * The index found is cached so later batches with the same layout skip
     * the search. The cache is only trusted while the vector stored at that
     * index still carries this column's name; otherwise the column is
     * looked up again.
     *
     * @param cols the vectors of the batch about to be scanned
     * @returns the matching vector's `get`, bound to that vector
     * @throws Error when no vector in `cols` is named `name`
     */
    bind(cols: Vector[]): ValueFunc<T> {
        let found = this.colidx;
        const cached = found >= 0 && found < cols.length ? cols[found] : undefined;
        if (cached === undefined || cached.name !== this.name) {
            found = -1;
            for (let idx = -1; ++idx < cols.length;) {
                if (cols[idx].name === this.name) {
                    found = idx;
                    break;
                }
            }
            if (found < 0) throw new Error(`Failed to bind Col "${this.name}"`);
            // Only a successful lookup is remembered, so a failed bind
            // cannot poison the cache for the next call.
            this.colidx = found;
        }
        const vector = cols[found];
        this.vector = vector;
        return vector.get.bind(vector);
    }

    /** Source text of an expression that reads this column at row `idx`. */
    emitString() { return `cols[${this.colidx}].get(idx)`; }
}

/**
 * A boolean expression over the rows of a batch. `bind` specialises the
 * expression for one batch of columns and returns the function that tests
 * a single row of it.
 */
export abstract class Predicate {
    /** Specialises this predicate for the batch `cols`. */
    abstract bind(cols: Vector[]): PredicateFunc;
    /** Predicate that holds when both this and `expr` hold. */
    and(expr: Predicate): Predicate { return new And(this, expr); }
    /** Predicate that holds when this or `expr` holds. */
    or(expr: Predicate): Predicate { return new Or(this, expr); }
    /** The operands of the top-level chain of `and`s; a lone predicate yields itself. */
    ands(): Predicate[] { return [this]; }
}

/**
 * Base class for binary comparisons between two operands. `bind` inspects
 * which operands are literals and dispatches to a specialised binder so the
 * per-row closure does no more work than necessary. Any operand that is not
 * a `Literal` is treated as a `Col`.
 */
abstract class ComparisonPredicate<T=any> extends Predicate {
    constructor(public readonly left: Value<T>, public readonly right: Value<T>) {
        super();
    }

    bind(cols: Vector<any>[]): PredicateFunc {
        if (this.left instanceof Literal) {
            if (this.right instanceof Literal) {
                return this._bindLitLit(cols, this.left, this.right);
            }
            // Operand order matters for <= and >=, so a literal on the left
            // gets its own binder instead of being swapped into _bindColLit.
            return this._bindLitCol(cols, this.left, this.right as Col);
        }
        if (this.right instanceof Literal) {
            return this._bindColLit(cols, this.left as Col, this.right);
        }
        return this._bindColCol(cols, this.left as Col, this.right as Col);
    }

    /** Binds `left OP right` where both operands are constants. */
    protected abstract _bindLitLit(cols: Vector<any>[], left: Literal, right: Literal): PredicateFunc;
    /** Binds `left OP right` where both operands are columns. */
    protected abstract _bindColCol(cols: Vector<any>[], left: Col, right: Col): PredicateFunc;
    /** Binds `col OP lit`. */
    protected abstract _bindColLit(cols: Vector<any>[], col: Col, lit: Literal): PredicateFunc;
    /** Binds `lit OP col`. */
    protected abstract _bindLitCol(cols: Vector<any>[], lit: Literal, col: Col): PredicateFunc;
}

/** Base class for predicates that combine two other predicates. */
abstract class CombinationPredicate extends Predicate {
    constructor(public readonly left: Predicate, public readonly right: Predicate) {
        super();
    }
}

/** Logical AND; the right operand is only evaluated when the left one holds. */
class And extends CombinationPredicate {
    bind(cols: Vector[]): PredicateFunc {
        const left = this.left.bind(cols);
        const right = this.right.bind(cols);
        return (idx: number, cols: Vector[]) => left(idx, cols) && right(idx, cols);
    }
    ands(): Predicate[] { return this.left.ands().concat(this.right.ands()); }
}

/** Logical OR; the right operand is only evaluated when the left one fails. */
class Or extends CombinationPredicate {
    bind(cols: Vector[]): PredicateFunc {
        const left = this.left.bind(cols);
        const right = this.right.bind(cols);
        return (idx: number, cols: Vector[]) => left(idx, cols) || right(idx, cols);
    }
}

/** `left == right`. Uses JavaScript's loose equality. */
class Equals extends ComparisonPredicate {
    protected _bindLitLit(_: Vector<any>[], left: Literal, right: Literal): PredicateFunc {
        // Two constants: the answer is the same for every row.
        const rtrn: boolean = left.v == right.v;
        return () => rtrn;
    }

    protected _bindColCol(cols: Vector<any>[], left: Col, right: Col): PredicateFunc {
        const left_func = left.bind(cols);
        const right_func = right.bind(cols);
        return (idx: number, cols: Vector[]) => left_func(idx, cols) == right_func(idx, cols);
    }

    protected _bindColLit(cols: Vector<any>[], col: Col, lit: Literal): PredicateFunc {
        const col_func = col.bind(cols);
        return (idx: number, cols: Vector[]) => col_func(idx, cols) == lit.v;
    }

    protected _bindLitCol(cols: Vector<any>[], lit: Literal, col: Col): PredicateFunc {
        const col_func = col.bind(cols);
        return (idx: number, cols: Vector[]) => lit.v == col_func(idx, cols);
    }
}

/** `left <= right`. */
class LTeq extends ComparisonPredicate {
    protected _bindLitLit(_: Vector<any>[], left: Literal, right: Literal): PredicateFunc {
        // Two constants: the answer is the same for every row.
        const rtrn: boolean = left.v <= right.v;
        return () => rtrn;
    }

    protected _bindColCol(cols: Vector<any>[], left: Col, right: Col): PredicateFunc {
        const left_func = left.bind(cols);
        const right_func = right.bind(cols);
        return (idx: number, cols: Vector[]) => left_func(idx, cols) <= right_func(idx, cols);
    }

    protected _bindColLit(cols: Vector<any>[], col: Col, lit: Literal): PredicateFunc {
        const col_func = col.bind(cols);
        return (idx: number, cols: Vector[]) => col_func(idx, cols) <= lit.v;
    }

    protected _bindLitCol(cols: Vector<any>[], lit: Literal, col: Col): PredicateFunc {
        const col_func = col.bind(cols);
        return (idx: number, cols: Vector[]) => lit.v <= col_func(idx, cols);
    }
}

/** `left >= right`. */
class GTeq extends ComparisonPredicate {
    protected _bindLitLit(_: Vector<any>[], left: Literal, right: Literal): PredicateFunc {
        // Two constants: the answer is the same for every row.
        const rtrn: boolean = left.v >= right.v;
        return () => rtrn;
    }

    protected _bindColCol(cols: Vector<any>[], left: Col, right: Col): PredicateFunc {
        const left_func = left.bind(cols);
        const right_func = right.bind(cols);
        return (idx: number, cols: Vector[]) => left_func(idx, cols) >= right_func(idx, cols);
    }

    protected _bindColLit(cols: Vector<any>[], col: Col, lit: Literal): PredicateFunc {
        const col_func = col.bind(cols);
        return (idx: number, cols: Vector[]) => col_func(idx, cols) >= lit.v;
    }

    protected _bindLitCol(cols: Vector<any>[], lit: Literal, col: Col): PredicateFunc {
        const col_func = col.bind(cols);
        return (idx: number, cols: Vector[]) => lit.v >= col_func(idx, cols);
    }
}

/**
 * Creates a constant operand. Any value the comparison operators accept may
 * be used; `Value.eq`/`lteq`/`gteq` wrap bare operands the same way.
 */
export function lit(n: any): Value<any> { return new Literal(n); }
/** Creates an operand that reads the column named `n`. */
export function col(n: string): Value<any> { return new Col(n); }

0 commit comments

Comments
 (0)