@@ -54,14 +54,17 @@ function columnsFromBatches(batches: Vector[][]) {
54
54
55
55
export class Table implements DataFrame {
56
56
/**
 * Builds a Table from an optional set of input sources.
 *
 * When `sources` is truthy, every batch produced by `read(sources)` is
 * collected, in iteration order, into `batches`; otherwise `batches` keeps
 * its initial value. The result is then passed to `new Table({ batches })`.
 *
 * This span is a diff: lines marked `-` are the previous implementation and
 * lines marked `+` are its replacement.
 *
 * @param sources Optional input handed unchanged to `read`. `read` is
 *   defined outside this view, so the accepted encodings are not visible
 *   here (presumably Arrow buffers or file contents; confirm against `read`).
 * @returns A new `Table` wrapping the collected batches.
 */
static from ( sources ?: Iterable < Uint8Array | Buffer | string > | object | string ) {
57
// Previous default: a single empty batch (`[ [ ] ]`).
- let batches : Vector < any > [ ] [ ] = [ [ ] ] ;
57
// New default: no batches at all when `sources` is omitted.
+ let batches : Vector [ ] [ ] = [ ] ;
58
58
if ( sources ) {
59
// Previous form: materialize the iterable returned by `read` in one call.
- batches = Array . from ( read ( sources ) ) ;
59
// NOTE(review): `batches` is already `[ ]` from its declaration above, so this
// reassignment looks redundant. It mirrors the shape of `fromAsync`.
+ batches = [ ] ;
60
// New form: append each batch yielded by `read` explicitly.
+ for ( let batch of read ( sources ) ) {
61
+ batches . push ( batch ) ;
62
+ }
60
63
}
61
64
return new Table ( { batches } ) ;
62
65
}
63
66
static async fromAsync ( sources ?: AsyncIterable < Uint8Array | Buffer | string > ) {
64
- let batches : Vector < any > [ ] [ ] = [ [ ] ] ;
67
+ let batches : Vector [ ] [ ] = [ ] ;
65
68
if ( sources ) {
66
69
batches = [ ] ;
67
70
for await ( let batch of readAsync ( sources ) ) {
@@ -119,34 +122,34 @@ export class Table implements DataFrame {
119
122
count_by = new Col ( count_by ) ;
120
123
}
121
124
122
- // the last batch will have the most complete dictionary, use it's data
123
- // vector as our count by keys
125
+ // Assume that all dictionary batches are deltas, which means that the
126
+ // last record batch has the most complete dictionary
124
127
count_by . bind ( this . batches [ this . batches . length - 1 ] ) ;
125
128
if ( ! ( count_by . vector instanceof DictionaryVector ) ) {
126
- throw new Error ( " countBy currently only supports dictionary-encoded columns" ) ;
129
+ throw new Error ( ' countBy currently only supports dictionary-encoded columns' ) ;
127
130
}
128
131
129
- let keys : Vector = ( count_by . vector as DictionaryVector < any > ) . data ;
132
+ let data : Vector = ( count_by . vector as DictionaryVector < any > ) . data ;
130
133
// TODO: Adjust array byte width based on overall length
131
134
// (e.g. if this.length <= 255 use Uint8Array, etc...)
132
- let counts : Uint32Array = new Uint32Array ( keys . length ) ;
133
-
135
+ let counts : Uint32Array = new Uint32Array ( data . length ) ;
134
136
135
137
for ( let batch = - 1 ; ++ batch < this . lengths . length ; ) {
136
138
const length = this . lengths [ batch ] ;
137
139
138
140
// load batches
139
141
const columns = this . batches [ batch ] ;
140
142
count_by . bind ( columns ) ;
143
+ const keys : Vector = ( count_by . vector as DictionaryVector < any > ) . keys ;
141
144
142
145
// yield all indices
143
146
for ( let idx = - 1 ; ++ idx < length ; ) {
144
- let key = ( count_by . vector as DictionaryVector < any > ) . getKey ( idx )
147
+ let key = keys . get ( idx ) ;
145
148
if ( key !== null ) { counts [ key ] ++ ; }
146
149
}
147
150
}
148
151
149
- return new CountByResult ( keys , new Uint32Vector ( { data : counts } ) )
152
+ return new CountByResult ( data , new Uint32Vector ( { data : counts } ) ) ;
150
153
}
151
154
* [ Symbol . iterator ] ( ) {
152
155
for ( let batch = - 1 ; ++ batch < this . lengths . length ; ) {
@@ -220,16 +223,17 @@ class FilteredDataFrame implements DataFrame {
220
223
count_by = new Col ( count_by ) ;
221
224
}
222
225
223
- // the last batch will have the most complete dictionary, use it's data
224
- // vector as our count by keys
226
+ // Assume that all dictionary batches are deltas, which means that the
227
+ // last record batch has the most complete dictionary
225
228
count_by . bind ( this . parent . batches [ this . parent . batches . length - 1 ] ) ;
226
229
if ( ! ( count_by . vector instanceof DictionaryVector ) ) {
227
- throw new Error ( " countBy currently only supports dictionary-encoded columns" ) ;
230
+ throw new Error ( ' countBy currently only supports dictionary-encoded columns' ) ;
228
231
}
229
232
230
- let keys : Vector = ( count_by . vector as DictionaryVector < any > ) . data ;
231
- let counts : Uint32Array = new Uint32Array ( keys . length ) ;
232
-
233
+ const data : Vector = ( count_by . vector as DictionaryVector < any > ) . data ;
234
+ // TODO: Adjust array byte width based on overall length
235
+ // (e.g. if this.length <= 255 use Uint8Array, etc...)
236
+ const counts : Uint32Array = new Uint32Array ( data . length ) ;
233
237
234
238
for ( let batch = - 1 ; ++ batch < this . parent . lengths . length ; ) {
235
239
const length = this . parent . lengths [ batch ] ;
@@ -238,28 +242,29 @@ class FilteredDataFrame implements DataFrame {
238
242
const columns = this . parent . batches [ batch ] ;
239
243
const predicate = this . predicate . bind ( columns ) ;
240
244
count_by . bind ( columns ) ;
245
+ const keys : Vector = ( count_by . vector as DictionaryVector < any > ) . keys ;
241
246
242
247
// yield all indices
243
248
for ( let idx = - 1 ; ++ idx < length ; ) {
244
- let key = ( count_by . vector as DictionaryVector < any > ) . getKey ( idx )
249
+ let key = keys . get ( idx ) ;
245
250
if ( key !== null && predicate ( idx , columns ) ) { counts [ key ] ++ ; }
246
251
}
247
252
}
248
253
249
- return new CountByResult ( keys , new Uint32Vector ( { data : counts } ) )
254
+ return new CountByResult ( data , new Uint32Vector ( { data : counts } ) ) ;
250
255
}
251
256
}
252
257
253
258
export class CountByResult extends Table implements DataFrame {
254
- constructor ( readonly keys : Vector , readonly counts : Vector < number | null > ) {
255
- super ( { batches : [ [ keys , counts ] ] } ) ;
259
+ constructor ( readonly values : Vector , readonly counts : Vector < number | null > ) {
260
+ super ( { batches : [ [ values , counts ] ] } ) ;
256
261
}
257
262
258
263
asJSON ( ) : Object {
259
264
let result : { [ key : string ] : number | null } = { } ;
260
265
261
266
for ( let i = - 1 ; ++ i < this . length ; ) {
262
- result [ this . keys . get ( i ) ] = this . counts . get ( i ) ;
267
+ result [ this . values . get ( i ) ] = this . counts . get ( i ) ;
263
268
}
264
269
265
270
return result ;
0 commit comments