16
16
// under the License.
17
17
18
18
import { Vector } from './vector/vector' ;
19
+ import { DictionaryVector } from './vector/dictionary' ;
20
+ import { Uint32Vector } from './vector/numeric' ;
19
21
import { read , readAsync } from './reader/arrow' ;
20
- import { Predicate } from './predicate' ;
22
+ import { Col , Predicate } from './predicate' ;
21
23
22
24
/** Callback signature accepted by `DataFrame.scan`; it is handed a numeric index and an array of vectors. */
export type NextFunc = ( idx : number , cols : Vector [ ] ) => void ;
23
25
@@ -40,6 +42,7 @@ export interface DataFrame {
40
42
filter ( predicate : Predicate ) : DataFrame ;
41
43
scan ( next : NextFunc ) : void ;
42
44
count ( ) : number ;
45
+ countBy ( col : ( Col | string ) ) : Table ;
43
46
}
44
47
45
48
function columnsFromBatches ( batches : Vector [ ] [ ] ) {
@@ -111,6 +114,40 @@ export class Table implements DataFrame {
111
114
/**
 * Returns the sum of the per-batch lengths stored in `this.lengths`.
 *
 * @returns The summed length, or `0` when there are no batches.
 */
count(): number {
    // Seed the fold with 0: `reduce` without an initial value throws a
    // TypeError when called on an empty array.
    return this.lengths.reduce((acc, val) => acc + val, 0);
}
117
/**
 * Counts how many rows fall under each dictionary key of a
 * dictionary-encoded column.
 *
 * @param count_by The column to group on, given either as a `Col` or as a
 *     column name (which is wrapped in a `Col`).
 * @returns A single-batch `Table` holding two vectors: the dictionary's
 *     `data` vector and a `Uint32Vector` of the matching counts.
 * @throws Error if the bound column is not a `DictionaryVector`.
 */
countBy(count_by: (Col | string)): Table {
    // `instanceof String` only matches boxed `new String(...)` objects, so
    // primitive column names must be detected with `typeof`.
    const col: Col = typeof count_by === 'string' ? new Col(count_by) : count_by;

    // the last batch will have the most complete dictionary, use its data
    // vector as our count by keys
    col.bind(this.batches[this.batches.length - 1]);
    if (!(col.vector instanceof DictionaryVector)) {
        throw new Error("countBy currently only supports dictionary-encoded columns");
    }

    const keys: Vector = (col.vector as DictionaryVector<any>).data;
    // TODO: Adjust array byte width based on overall length
    // (e.g. if this.length <= 255 use Uint8Array, etc...)
    const counts: Uint32Array = new Uint32Array(keys.length);

    for (let batch = -1; ++batch < this.lengths.length;) {
        const length = this.lengths[batch];

        // load batches
        const columns = this.batches[batch];
        col.bind(columns);
        // The bound vector does not change while scanning a batch, so the
        // cast is done once per batch rather than once per row.
        const vector = col.vector as DictionaryVector<any>;

        // tally every row whose key is not null
        for (let idx = -1; ++idx < length;) {
            const key = vector.getKey(idx);
            if (key !== null) { counts[key]++; }
        }
    }

    return new Table({ batches: [[keys, new Uint32Vector({ data: counts })]] });
}
114
151
* [ Symbol . iterator ] ( ) {
115
152
for ( let batch = - 1 ; ++ batch < this . lengths . length ; ) {
116
153
const length = this . lengths [ batch ] ;
@@ -177,4 +214,38 @@ class FilteredDataFrame implements DataFrame {
177
214
this . predicate . and ( predicate )
178
215
) ;
179
216
}
217
+
218
/**
 * Counts how many rows that satisfy this frame's predicate fall under each
 * dictionary key of a dictionary-encoded column.
 *
 * @param count_by The column to group on, given either as a `Col` or as a
 *     column name (which is wrapped in a `Col`).
 * @returns A single-batch `Table` holding two vectors: the dictionary's
 *     `data` vector and a `Uint32Vector` of the matching counts.
 * @throws Error if the bound column is not a `DictionaryVector`.
 */
countBy(count_by: (Col | string)): Table {
    // `instanceof String` only matches boxed `new String(...)` objects, so
    // primitive column names must be detected with `typeof`.
    const col: Col = typeof count_by === 'string' ? new Col(count_by) : count_by;

    // the last batch will have the most complete dictionary, use its data
    // vector as our count by keys
    col.bind(this.parent.batches[this.parent.batches.length - 1]);
    if (!(col.vector instanceof DictionaryVector)) {
        throw new Error("countBy currently only supports dictionary-encoded columns");
    }

    const keys: Vector = (col.vector as DictionaryVector<any>).data;
    const counts: Uint32Array = new Uint32Array(keys.length);

    for (let batch = -1; ++batch < this.parent.lengths.length;) {
        const length = this.parent.lengths[batch];

        // load batches
        const columns = this.parent.batches[batch];
        const predicate = this.predicate.bind(columns);
        col.bind(columns);
        // The bound vector does not change while scanning a batch, so the
        // cast is done once per batch rather than once per row.
        const vector = col.vector as DictionaryVector<any>;

        // tally every row whose key is not null and that passes the predicate
        for (let idx = -1; ++idx < length;) {
            const key = vector.getKey(idx);
            if (key !== null && predicate(idx, columns)) { counts[key]++; }
        }
    }

    return new Table({ batches: [[keys, new Uint32Vector({ data: counts })]] });
}
180
251
}
0 commit comments