Skip to content

Commit 790095a

Browse files
authored
ARROW-16 BSON array support (#119)
1 parent 139d8ed commit 790095a

File tree

9 files changed

+177
-26
lines changed

9 files changed

+177
-26
lines changed

bindings/python/docs/source/changelog.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ Changelog
44
Changes in Version 0.7.0
55
------------------------
66
- Added support for BSON Embedded Document type.
7+
- Added support for BSON Array type.
8+
79

810
Changes in Version 0.6.3
911
------------------------

bindings/python/docs/source/quickstart.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,10 @@ Nested data (embedded documents) are also supported::
7676
from pymongoarrow.api import Schema
7777
schema = Schema({'_id': int, 'amount': float, 'account': { 'name': str, 'account_number': int}})
7878

79+
Arrays (and nested arrays) are also supported::
80+
81+
from pyarrow import int32, list_
from pymongoarrow.api import Schema
82+
schema = Schema({'_id': int, 'amount': float, 'account': list_(int32())})
7983

8084
.. note::
8185

@@ -112,6 +116,12 @@ Nested data (embedded documents) are also supported::
112116
schema = Schema({'_id': int, 'amount': float, 'account': { 'name': str, 'account_number': int}})
113117
arrow_table = client.db.data.find_arrow_all({'amount': {'$gt': 0}}, schema=schema)
114118

119+
Arrays (and nested arrays) are also supported::
120+
121+
from pyarrow import int32, list_
from pymongoarrow.api import Schema
122+
schema = Schema({'_id': int, 'amount': float, 'account': list_(int32())})
123+
arrow_table = client.db.data.find_arrow_all({'amount': {'$gt': 0}}, schema=schema)
124+
115125
Aggregate operations
116126
--------------------
117127
Running ``aggregate`` operations is similar to ``find``. Here is an example of

bindings/python/docs/source/supported_types.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ Support for additional types will be added in subsequent releases.
1919
- :class:`py.str`, an instance of :class:`pyarrow.string`
2020
* - Embedded document
2121
- :class:`py.dict`, an instance of :class:`pyarrow.struct`
22+
* - Embedded array
23+
- :class:`py.list`, an instance of :class:`pyarrow.list_`
2224
* - ObjectId
2325
- :class:`py.bytes`, :class:`bson.ObjectId`, an instance of :class:`pymongoarrow.types.ObjectIdType`, an instance of :class:`pyarrow.FixedSizeBinaryScalar`
2426
* - Boolean

bindings/python/pymongoarrow/context.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
DoubleBuilder,
2121
Int32Builder,
2222
Int64Builder,
23+
ListBuilder,
2324
ObjectIdBuilder,
2425
StringBuilder,
2526
)
@@ -35,6 +36,7 @@
3536
_BsonArrowTypes.string: StringBuilder,
3637
_BsonArrowTypes.bool: BoolBuilder,
3738
_BsonArrowTypes.document: DocumentBuilder,
39+
_BsonArrowTypes.array: ListBuilder,
3840
}
3941

4042

@@ -71,7 +73,6 @@ def from_schema(cls, schema, codec_options=DEFAULT_CODEC_OPTIONS):
7173

7274
builder_map = {}
7375
tzinfo = codec_options.tzinfo
74-
7576
str_type_map = _get_internal_typemap(schema.typemap)
7677
for fname, ftype in str_type_map.items():
7778
builder_cls = _TYPE_TO_BUILDER_CLS[ftype]
@@ -86,6 +87,10 @@ def from_schema(cls, schema, codec_options=DEFAULT_CODEC_OPTIONS):
8687
elif builder_cls == DocumentBuilder:
8788
arrow_type = schema.typemap[fname]
8889
builder_map[encoded_fname] = DocumentBuilder(arrow_type, tzinfo)
90+
elif builder_cls == ListBuilder:
91+
arrow_type = schema.typemap[fname]
92+
builder_map[encoded_fname] = ListBuilder(arrow_type, tzinfo)
93+
8994
else:
9095
builder_map[encoded_fname] = builder_cls()
9196
return cls(schema, builder_map)

bindings/python/pymongoarrow/lib.pyx

Lines changed: 115 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,9 @@ import bson
2626
import numpy as np
2727
from pyarrow import timestamp, struct, field
2828
from pyarrow.lib import (
29-
tobytes, StructType, int32, int64, float64, string, bool_
29+
tobytes, StructType, int32, int64, float64, string, bool_, list_
3030
)
31+
3132
from pymongoarrow.errors import InvalidBSON, PyMongoArrowError
3233
from pymongoarrow.context import PyMongoArrowContext
3334
from pymongoarrow.types import _BsonArrowTypes, _atypes, ObjectIdType, Decimal128StringType
@@ -65,7 +66,8 @@ _builder_type_map = {
6566
BSON_TYPE_UTF8: StringBuilder,
6667
BSON_TYPE_BOOL: BoolBuilder,
6768
BSON_TYPE_DOCUMENT: DocumentBuilder,
68-
BSON_TYPE_DECIMAL128: StringBuilder
69+
BSON_TYPE_DECIMAL128: StringBuilder,
70+
BSON_TYPE_ARRAY: ListBuilder,
6971
}
7072

7173
_field_type_map = {
@@ -75,9 +77,26 @@ _field_type_map = {
7577
BSON_TYPE_OID: ObjectIdType(),
7678
BSON_TYPE_UTF8: string(),
7779
BSON_TYPE_BOOL: bool_(),
78-
BSON_TYPE_DECIMAL128: Decimal128StringType()
80+
BSON_TYPE_DECIMAL128: Decimal128StringType(),
7981
}
8082

83+
cdef extract_field_dtype(bson_iter_t * doc_iter, bson_iter_t * child_iter, bson_type_t value_t, context):
    """Get the appropriate Arrow data type for a specific BSON field.

    :param doc_iter: iterator positioned on the field being inspected.
    :param child_iter: scratch iterator used to descend into arrays and
        sub documents.
    :param value_t: BSON type of the field ``doc_iter`` points at.
    :param context: object whose ``tzinfo`` attribute is used for
        datetime fields.
    :returns: the Arrow data type to use for the field.
    :raises PyMongoArrowError: if ``value_t`` is not a supported type.
    """
    # Types with a fixed Arrow equivalent are resolved by table lookup.
    if value_t in _field_type_map:
        return _field_type_map[value_t]

    # Arrays: descend and wrap the inferred element type in a list type.
    if value_t == BSON_TYPE_ARRAY:
        bson_iter_recurse(doc_iter, child_iter)
        return list_(extract_array_dtype(child_iter, context))

    # Sub documents: descend and build a struct type from their fields.
    if value_t == BSON_TYPE_DOCUMENT:
        bson_iter_recurse(doc_iter, child_iter)
        return extract_document_dtype(child_iter, context)

    # Datetimes are millisecond timestamps in the context's time zone.
    if value_t == BSON_TYPE_DATE_TIME:
        return timestamp('ms', tz=context.tzinfo)

    raise PyMongoArrowError('unknown value type {}'.format(value_t))
99+
81100

82101
cdef extract_document_dtype(bson_iter_t * doc_iter, context):
83102
"""Get the appropropriate data type for a sub document"""
@@ -88,19 +107,21 @@ cdef extract_document_dtype(bson_iter_t * doc_iter, context):
88107
while bson_iter_next(doc_iter):
89108
key = bson_iter_key(doc_iter)
90109
value_t = bson_iter_type(doc_iter)
91-
if value_t in _field_type_map:
92-
field_type = _field_type_map[value_t]
93-
elif value_t == BSON_TYPE_DOCUMENT:
94-
bson_iter_recurse(doc_iter, &child_iter)
95-
field_type = extract_document_dtype(&child_iter, context)
96-
elif value_t == BSON_TYPE_DATE_TIME:
97-
field_type = timestamp('ms', tz=context.tzinfo)
98-
110+
field_type = extract_field_dtype(doc_iter, &child_iter, value_t, context)
99111
fields.append(field(key.decode('utf-8'), field_type))
100112
return struct(fields)
101113

114+
cdef extract_array_dtype(bson_iter_t * doc_iter, context):
    """Get the appropriate data type for the elements of a sub array.

    The element type is inferred from the first element of the array only.

    :param doc_iter: iterator that has been recursed into the array but
        not yet advanced to its first element.
    :param context: passed through to ``extract_field_dtype``.
    :returns: the Arrow data type of the array's first element.
    :raises PyMongoArrowError: if the array is empty, or if the first
        element has an unsupported type.
    """
    cdef bson_type_t value_t
    cdef bson_iter_t child_iter
    # bson_iter_next reports whether the iterator actually moved onto an
    # element; the type may only be queried when it did. An empty array
    # therefore has no element from which to infer a type.
    if not bson_iter_next(doc_iter):
        raise PyMongoArrowError('cannot infer the element type of an empty array')
    value_t = bson_iter_type(doc_iter)
    return extract_field_dtype(doc_iter, &child_iter, value_t, context)
102123

103-
def process_bson_stream(bson_stream, context):
124+
def process_bson_stream(bson_stream, context, arr_value_builder=None):
104125
"""Process a bson byte stream using a PyMongoArrowContext"""
105126
cdef const uint8_t* docstream = <const uint8_t *>bson_stream
106127
cdef size_t length = <size_t>PyBytes_Size(bson_stream)
@@ -110,6 +131,8 @@ def process_bson_stream(bson_stream, context):
110131
cdef uint32_t str_len
111132
cdef const uint8_t *doc_buf = NULL
112133
cdef uint32_t doc_buf_len = 0;
134+
cdef const uint8_t *arr_buf = NULL
135+
cdef uint32_t arr_buf_len = 0;
113136
cdef bson_decimal128_t dec128
114137
cdef bson_type_t value_t
115138
cdef const char * bson_str
@@ -131,12 +154,13 @@ def process_bson_stream(bson_stream, context):
131154
t_string = _BsonArrowTypes.string
132155
t_bool = _BsonArrowTypes.bool
133156
t_document = _BsonArrowTypes.document
157+
t_array = _BsonArrowTypes.array
158+
134159

135160
# initialize count to current length of builders
136161
for _, builder in builder_map.items():
137162
count = len(builder)
138163
break
139-
140164
try:
141165
while True:
142166
doc = bson_reader_read_safe(stream_reader)
@@ -146,7 +170,10 @@ def process_bson_stream(bson_stream, context):
146170
raise InvalidBSON("Could not read BSON document")
147171
while bson_iter_next(&doc_iter):
148172
key = bson_iter_key(&doc_iter)
149-
builder = builder_map.get(key)
173+
if arr_value_builder is not None:
174+
builder = arr_value_builder
175+
else:
176+
builder = builder_map.get(key)
150177
if builder is None:
151178
builder = builder_map.get(key)
152179
if builder is None and context.schema is None:
@@ -165,10 +192,15 @@ def process_bson_stream(bson_stream, context):
165192
bson_iter_recurse(&doc_iter, &child_iter)
166193
struct_dtype = extract_document_dtype(&child_iter, context)
167194
builder = DocumentBuilder(struct_dtype, context.tzinfo)
195+
elif builder_type == ListBuilder:
196+
bson_iter_recurse(&doc_iter, &child_iter)
197+
list_dtype = extract_array_dtype(&child_iter, context)
198+
list_dtype = list_(list_dtype)
199+
builder = ListBuilder(list_dtype, context.tzinfo, value_builder=arr_value_builder)
168200
else:
169201
builder = builder_type()
170-
171-
builder_map[key] = builder
202+
if arr_value_builder is None:
203+
builder_map[key] = builder
172204
for _ in range(count):
173205
builder.append_null()
174206

@@ -231,6 +263,14 @@ def process_bson_stream(bson_stream, context):
231263
builder.append(<bytes>doc_buf[:doc_buf_len])
232264
else:
233265
builder.append_null()
266+
elif ftype == t_array:
267+
if value_t == BSON_TYPE_ARRAY:
268+
bson_iter_array(&doc_iter, &doc_buf_len, &doc_buf)
269+
if doc_buf_len <= 0:
270+
raise ValueError("Subarray is invalid")
271+
builder.append(<bytes>doc_buf[:doc_buf_len])
272+
else:
273+
builder.append_null()
234274
else:
235275
raise PyMongoArrowError('unknown ftype {}'.format(ftype))
236276
count += 1
@@ -467,7 +507,11 @@ cdef class BoolBuilder(_ArrayBuilderBase):
467507
cdef object get_field_builder(field, tzinfo):
468508
""""Find the appropriate field builder given a pyarrow field"""
469509
cdef object field_builder
470-
field_type = field.type
510+
cdef DataType field_type
511+
if isinstance(field, DataType):
512+
field_type = field
513+
else:
514+
field_type = field.type
471515
if _atypes.is_int32(field_type):
472516
field_builder = Int32Builder()
473517
elif _atypes.is_int64(field_type):
@@ -484,6 +528,8 @@ cdef object get_field_builder(field, tzinfo):
484528
field_builder = BoolBuilder()
485529
elif _atypes.is_struct(field_type):
486530
field_builder = DocumentBuilder(field_type, tzinfo)
531+
elif _atypes.is_list(field_type):
532+
field_builder = ListBuilder(field_type, tzinfo)
487533
elif getattr(field_type, '_type_marker') == _BsonArrowTypes.objectid:
488534
field_builder = ObjectIdBuilder()
489535
elif getattr(field_type, '_type_marker') == _BsonArrowTypes.decimal128_str:
@@ -549,3 +595,55 @@ cdef class DocumentBuilder(_ArrayBuilderBase):
549595

550596
cdef shared_ptr[CStructBuilder] unwrap(self):
551597
return self.builder
598+
599+
cdef class ListBuilder(_ArrayBuilderBase):
    """Array builder for BSON arrays, backed by an Arrow list builder.

    Each appended value is a BSON-encoded array. Its elements are fed
    through ``process_bson_stream`` into a single child builder whose type
    is the list's ``value_type``.
    """
    type_marker = _BsonArrowTypes.array

    cdef:
        shared_ptr[CListBuilder] builder
        _ArrayBuilderBase child_builder
        object dtype
        object context

    def __cinit__(self, DataType dtype, tzinfo=None, MemoryPool memory_pool=None, value_builder=None):
        # NOTE(review): ``value_builder`` is accepted but never read here.
        cdef StringBuilder element_builder
        cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
        cdef shared_ptr[CArrayBuilder] values_ptr
        self.dtype = dtype
        if not _atypes.is_list(dtype):
            raise ValueError("dtype must be a list_()")

        # Private, schema-less context used when decoding array contents.
        self.context = PyMongoArrowContext(None, {})
        self.context.tzinfo = tzinfo

        # NOTE(review): the element builder is cast to StringBuilder whatever
        # the element type is, so that its ``builder`` member can be read;
        # confirm this is safe for the other builder classes.
        element_builder = <StringBuilder>get_field_builder(self.dtype.value_type, tzinfo)
        values_ptr = <shared_ptr[CArrayBuilder]>element_builder.builder
        self.child_builder = element_builder
        self.builder.reset(new CListBuilder(pool, values_ptr, pyarrow_unwrap_data_type(dtype)))

    @property
    def dtype(self):
        """The Arrow list data type this builder was created with."""
        return self.dtype

    cpdef append_null(self):
        """Append a null list entry."""
        self.builder.get().AppendNull()

    def __len__(self):
        """Return the current length reported by the list builder."""
        return self.builder.get().length()

    cpdef append(self, value):
        """Append one array, given as BSON bytes or an encodable object."""
        # Anything that is not already raw bytes is BSON-encoded first.
        raw = value if isinstance(value, bytes) else bson.encode(value)
        # Open a new list entry; process_bson_stream then appends the
        # individual elements to the child builder.
        self.builder.get().Append(True)
        process_bson_stream(raw, self.context, arr_value_builder=self.child_builder)

    cpdef finish(self):
        """Finish the builder and return the resulting pyarrow array."""
        cdef shared_ptr[CArray] result
        with nogil:
            self.builder.get().Finish(&result)
        return pyarrow_wrap_array(result)

    cdef shared_ptr[CListBuilder] unwrap(self):
        """Return the underlying shared pointer to the C++ list builder."""
        return self.builder

bindings/python/pymongoarrow/libarrow.pxd

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,14 @@ cdef extern from "arrow/builder.h" namespace "arrow" nogil:
3737
int32_t num_fields()
3838
shared_ptr[CDataType] type()
3939

40+
cdef cppclass CListBuilder" arrow::ListBuilder"(CArrayBuilder):
    # Cython view of arrow::ListBuilder, declaring the members used by
    # pymongoarrow's ListBuilder wrapper.
    CListBuilder(CMemoryPool* pool,
                 shared_ptr[CArrayBuilder] value_builder, shared_ptr[CDataType] dtype)
    # Starts a new list slot; a non-zero is_valid marks the slot as non-null.
    CStatus Append(uint8_t is_valid)
    # A list builder has exactly one child builder, so the accessor takes
    # no index (unlike StructBuilder's per-field accessor).
    CArrayBuilder* value_builder()
    # NOTE(review): confirm that arrow::ListBuilder actually exposes
    # num_values(); it is not referenced by the wrapper class.
    int32_t num_values()
    shared_ptr[CDataType] type()
47+
4048

4149
cdef extern from "arrow/type_fwd.h" namespace "arrow" nogil:
4250
shared_ptr[CDataType] fixed_size_binary(int32_t byte_width)

bindings/python/pymongoarrow/libbson.pxd

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,11 @@ cdef extern from "<bson/bson.h>":
133133
uint32_t *document_len, # OUT
134134
const uint8_t **document) # OUT
135135

136+
# Fetch the raw buffer (pointer and length) of the BSON array the
# iterator is currently positioned on.
void bson_iter_array (const bson_iter_t *iter, # IN
137+
uint32_t *array_len, # OUT
138+
const uint8_t **array) # OUT
139+
140+
136141
# bson_reader_t API
137142
cdef extern from "<bson/bson.h>":
138143
bson_reader_t * bson_reader_new_from_data(const uint8_t *data, size_t length)

bindings/python/pymongoarrow/types.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
bool_,
2727
float64,
2828
int64,
29+
list_,
2930
string,
3031
struct,
3132
timestamp,
@@ -42,6 +43,7 @@ class _BsonArrowTypes(enum.Enum):
4243
bool = 7
4344
decimal128_str = 8
4445
document = 9
46+
array = 10
4547

4648

4749
# Custom Extension Types.
@@ -141,6 +143,7 @@ def get_numpy_type(type):
141143
_atypes.is_string: _BsonArrowTypes.string,
142144
_atypes.is_boolean: _BsonArrowTypes.bool,
143145
_atypes.is_struct: _BsonArrowTypes.document,
146+
_atypes.is_list: _BsonArrowTypes.array,
144147
}
145148

146149

@@ -156,6 +159,8 @@ def _normalize_typeid(typeid, field_name):
156159
for sub_field_name, sub_typeid in typeid.items():
157160
fields.append((sub_field_name, _normalize_typeid(sub_typeid, sub_field_name)))
158161
return struct(fields)
162+
elif isinstance(typeid, list):
163+
return list_(_normalize_typeid(type(typeid[0]), "0"))
159164
elif _is_typeid_supported(typeid):
160165
normalizer = _TYPE_NORMALIZER_FACTORY[typeid]
161166
return normalizer(typeid)

0 commit comments

Comments
 (0)