Skip to content

Commit 4c2e717

Browse files
authored
PYTHON-4204 Optimize JSON decoding using lookup table to find $ keys (#1512)
1 parent 296a44d commit 4c2e717

File tree

3 files changed: 76 additions (+76) and 61 deletions (-61)

.evergreen/run-tests.sh

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -246,7 +246,9 @@ fi
246246
PIP_QUIET=0 python -m pip list
247247

248248
if [ -z "$GREEN_FRAMEWORK" ]; then
249-
python -m pytest -v --durations=5 --maxfail=10 $TEST_ARGS
249+
# Use --capture=tee-sys so pytest prints test output inline:
250+
# https://docs.pytest.org/en/stable/how-to/capture-stdout-stderr.html
251+
python -m pytest -v --capture=tee-sys --durations=5 --maxfail=10 $TEST_ARGS
250252
else
251253
python green_framework_test.py $GREEN_FRAMEWORK -v $TEST_ARGS
252254
fi

bson/json_util.py

Lines changed: 62 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -526,54 +526,17 @@ def object_pairs_hook(
526526

527527

528528
def object_hook(dct: Mapping[str, Any], json_options: JSONOptions = DEFAULT_JSON_OPTIONS) -> Any:
529-
if "$oid" in dct:
530-
return _parse_canonical_oid(dct)
531-
if (
532-
isinstance(dct.get("$ref"), str)
533-
and "$id" in dct
534-
and isinstance(dct.get("$db"), (str, type(None)))
535-
):
536-
return _parse_canonical_dbref(dct)
537-
if "$date" in dct:
538-
return _parse_canonical_datetime(dct, json_options)
539-
if "$regex" in dct:
540-
return _parse_legacy_regex(dct)
541-
if "$minKey" in dct:
542-
return _parse_canonical_minkey(dct)
543-
if "$maxKey" in dct:
544-
return _parse_canonical_maxkey(dct)
545-
if "$binary" in dct:
546-
if "$type" in dct:
547-
return _parse_legacy_binary(dct, json_options)
548-
else:
549-
return _parse_canonical_binary(dct, json_options)
550-
if "$code" in dct:
551-
return _parse_canonical_code(dct)
552-
if "$uuid" in dct:
553-
return _parse_legacy_uuid(dct, json_options)
554-
if "$undefined" in dct:
555-
return None
556-
if "$numberLong" in dct:
557-
return _parse_canonical_int64(dct)
558-
if "$timestamp" in dct:
559-
tsp = dct["$timestamp"]
560-
return Timestamp(tsp["t"], tsp["i"])
561-
if "$numberDecimal" in dct:
562-
return _parse_canonical_decimal128(dct)
563-
if "$dbPointer" in dct:
564-
return _parse_canonical_dbpointer(dct)
565-
if "$regularExpression" in dct:
566-
return _parse_canonical_regex(dct)
567-
if "$symbol" in dct:
568-
return _parse_canonical_symbol(dct)
569-
if "$numberInt" in dct:
570-
return _parse_canonical_int32(dct)
571-
if "$numberDouble" in dct:
572-
return _parse_canonical_double(dct)
529+
match = None
530+
for k in dct:
531+
if k in _PARSERS_SET:
532+
match = k
533+
break
534+
if match:
535+
return _PARSERS[match](dct, json_options)
573536
return dct
574537

575538

576-
def _parse_legacy_regex(doc: Any) -> Any:
539+
def _parse_legacy_regex(doc: Any, dummy0: Any) -> Any:
577540
pattern = doc["$regex"]
578541
# Check if this is the $regex query operator.
579542
if not isinstance(pattern, (str, bytes)):
@@ -709,30 +672,30 @@ def _parse_canonical_datetime(
709672
return _millis_to_datetime(int(dtm), cast("CodecOptions[Any]", json_options))
710673

711674

712-
def _parse_canonical_oid(doc: Any) -> ObjectId:
675+
def _parse_canonical_oid(doc: Any, dummy0: Any) -> ObjectId:
713676
"""Decode a JSON ObjectId to bson.objectid.ObjectId."""
714677
if len(doc) != 1:
715678
raise TypeError(f"Bad $oid, extra field(s): {doc}")
716679
return ObjectId(doc["$oid"])
717680

718681

719-
def _parse_canonical_symbol(doc: Any) -> str:
682+
def _parse_canonical_symbol(doc: Any, dummy0: Any) -> str:
720683
"""Decode a JSON symbol to Python string."""
721684
symbol = doc["$symbol"]
722685
if len(doc) != 1:
723686
raise TypeError(f"Bad $symbol, extra field(s): {doc}")
724687
return str(symbol)
725688

726689

727-
def _parse_canonical_code(doc: Any) -> Code:
690+
def _parse_canonical_code(doc: Any, dummy0: Any) -> Code:
728691
"""Decode a JSON code to bson.code.Code."""
729692
for key in doc:
730693
if key not in ("$code", "$scope"):
731694
raise TypeError(f"Bad $code, extra field(s): {doc}")
732695
return Code(doc["$code"], scope=doc.get("$scope"))
733696

734697

735-
def _parse_canonical_regex(doc: Any) -> Regex[str]:
698+
def _parse_canonical_regex(doc: Any, dummy0: Any) -> Regex[str]:
736699
"""Decode a JSON regex to bson.regex.Regex."""
737700
regex = doc["$regularExpression"]
738701
if len(doc) != 1:
@@ -749,12 +712,18 @@ def _parse_canonical_regex(doc: Any) -> Regex[str]:
749712
return Regex(regex["pattern"], opts)
750713

751714

752-
def _parse_canonical_dbref(doc: Any) -> DBRef:
715+
def _parse_canonical_dbref(doc: Any, dummy0: Any) -> Any:
753716
"""Decode a JSON DBRef to bson.dbref.DBRef."""
754-
return DBRef(doc.pop("$ref"), doc.pop("$id"), database=doc.pop("$db", None), **doc)
717+
if (
718+
isinstance(doc.get("$ref"), str)
719+
and "$id" in doc
720+
and isinstance(doc.get("$db"), (str, type(None)))
721+
):
722+
return DBRef(doc.pop("$ref"), doc.pop("$id"), database=doc.pop("$db", None), **doc)
723+
return doc
755724

756725

757-
def _parse_canonical_dbpointer(doc: Any) -> Any:
726+
def _parse_canonical_dbpointer(doc: Any, dummy0: Any) -> Any:
758727
"""Decode a JSON (deprecated) DBPointer to bson.dbref.DBRef."""
759728
dbref = doc["$dbPointer"]
760729
if len(doc) != 1:
@@ -773,7 +742,7 @@ def _parse_canonical_dbpointer(doc: Any) -> Any:
773742
raise TypeError(f"Bad $dbPointer, expected a DBRef: {doc}")
774743

775744

776-
def _parse_canonical_int32(doc: Any) -> int:
745+
def _parse_canonical_int32(doc: Any, dummy0: Any) -> int:
777746
"""Decode a JSON int32 to python int."""
778747
i_str = doc["$numberInt"]
779748
if len(doc) != 1:
@@ -783,15 +752,15 @@ def _parse_canonical_int32(doc: Any) -> int:
783752
return int(i_str)
784753

785754

786-
def _parse_canonical_int64(doc: Any) -> Int64:
755+
def _parse_canonical_int64(doc: Any, dummy0: Any) -> Int64:
787756
"""Decode a JSON int64 to bson.int64.Int64."""
788757
l_str = doc["$numberLong"]
789758
if len(doc) != 1:
790759
raise TypeError(f"Bad $numberLong, extra field(s): {doc}")
791760
return Int64(l_str)
792761

793762

794-
def _parse_canonical_double(doc: Any) -> float:
763+
def _parse_canonical_double(doc: Any, dummy0: Any) -> float:
795764
"""Decode a JSON double to python float."""
796765
d_str = doc["$numberDouble"]
797766
if len(doc) != 1:
@@ -801,7 +770,7 @@ def _parse_canonical_double(doc: Any) -> float:
801770
return float(d_str)
802771

803772

804-
def _parse_canonical_decimal128(doc: Any) -> Decimal128:
773+
def _parse_canonical_decimal128(doc: Any, dummy0: Any) -> Decimal128:
805774
"""Decode a JSON decimal128 to bson.decimal128.Decimal128."""
806775
d_str = doc["$numberDecimal"]
807776
if len(doc) != 1:
@@ -811,7 +780,7 @@ def _parse_canonical_decimal128(doc: Any) -> Decimal128:
811780
return Decimal128(d_str)
812781

813782

814-
def _parse_canonical_minkey(doc: Any) -> MinKey:
783+
def _parse_canonical_minkey(doc: Any, dummy0: Any) -> MinKey:
815784
"""Decode a JSON MinKey to bson.min_key.MinKey."""
816785
if type(doc["$minKey"]) is not int or doc["$minKey"] != 1: # noqa: E721
817786
raise TypeError(f"$minKey value must be 1: {doc}")
@@ -820,7 +789,7 @@ def _parse_canonical_minkey(doc: Any) -> MinKey:
820789
return MinKey()
821790

822791

823-
def _parse_canonical_maxkey(doc: Any) -> MaxKey:
792+
def _parse_canonical_maxkey(doc: Any, dummy0: Any) -> MaxKey:
824793
"""Decode a JSON MaxKey to bson.max_key.MaxKey."""
825794
if type(doc["$maxKey"]) is not int or doc["$maxKey"] != 1: # noqa: E721
826795
raise TypeError("$maxKey value must be 1: %s", (doc,))
@@ -829,6 +798,41 @@ def _parse_canonical_maxkey(doc: Any) -> MaxKey:
829798
return MaxKey()
830799

831800

801+
def _parse_binary(doc: Any, json_options: JSONOptions) -> Union[Binary, uuid.UUID]:
802+
if "$type" in doc:
803+
return _parse_legacy_binary(doc, json_options)
804+
else:
805+
return _parse_canonical_binary(doc, json_options)
806+
807+
808+
def _parse_timestamp(doc: Any, dummy0: Any) -> Timestamp:
809+
tsp = doc["$timestamp"]
810+
return Timestamp(tsp["t"], tsp["i"])
811+
812+
813+
_PARSERS: dict[str, Callable[[Any, JSONOptions], Any]] = {
814+
"$oid": _parse_canonical_oid,
815+
"$ref": _parse_canonical_dbref,
816+
"$date": _parse_canonical_datetime,
817+
"$regex": _parse_legacy_regex,
818+
"$minKey": _parse_canonical_minkey,
819+
"$maxKey": _parse_canonical_maxkey,
820+
"$binary": _parse_binary,
821+
"$code": _parse_canonical_code,
822+
"$uuid": _parse_legacy_uuid,
823+
"$undefined": lambda _, _1: None,
824+
"$numberLong": _parse_canonical_int64,
825+
"$timestamp": _parse_timestamp,
826+
"$numberDecimal": _parse_canonical_decimal128,
827+
"$dbPointer": _parse_canonical_dbpointer,
828+
"$regularExpression": _parse_canonical_regex,
829+
"$symbol": _parse_canonical_symbol,
830+
"$numberInt": _parse_canonical_int32,
831+
"$numberDouble": _parse_canonical_double,
832+
}
833+
_PARSERS_SET = set(_PARSERS)
834+
835+
832836
def _encode_binary(data: bytes, subtype: int, json_options: JSONOptions) -> Any:
833837
if json_options.json_mode == JSONMode.LEGACY:
834838
return {"$binary": base64.b64encode(data).decode(), "$type": "%02x" % subtype}

test/performance/perf_test.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -78,15 +78,17 @@ def setUpClass(cls):
7878
client_context.init()
7979

8080
def setUp(self):
81-
pass
81+
self.setup_time = time.monotonic()
8282

8383
def tearDown(self):
84+
duration = time.monotonic() - self.setup_time
8485
# Remove "Test" so that TestFlatEncoding is reported as "FlatEncoding".
8586
name = self.__class__.__name__[4:]
8687
median = self.percentile(50)
8788
megabytes_per_sec = self.data_size / median / 1000000
8889
print(
89-
f"Running {self.__class__.__name__}. MB/s={megabytes_per_sec}, MEDIAN={self.percentile(50)}"
90+
f"Completed {self.__class__.__name__} {megabytes_per_sec:.3f} MB/s, MEDIAN={self.percentile(50):.3f}s, "
91+
f"total time={duration:.3f}s"
9092
)
9193
result_data.append(
9294
{
@@ -149,6 +151,7 @@ def mp_map(self, map_func, files):
149151

150152
class MicroTest(PerformanceTest):
151153
def setUp(self):
154+
super().setUp()
152155
# Location of test data.
153156
with open(os.path.join(TEST_PATH, os.path.join("extended_bson", self.dataset))) as data:
154157
self.file_data = data.read()
@@ -256,6 +259,7 @@ class TestRunCommand(PerformanceTest, unittest.TestCase):
256259
data_size = len(encode({"hello": True})) * NUM_DOCS
257260

258261
def setUp(self):
262+
super().setUp()
259263
self.client = client_context.client
260264
self.client.drop_database("perftest")
261265

@@ -267,6 +271,7 @@ def do_task(self):
267271

268272
class TestDocument(PerformanceTest):
269273
def setUp(self):
274+
super().setUp()
270275
# Location of test data.
271276
with open(
272277
os.path.join(TEST_PATH, os.path.join("single_and_multi_document", self.dataset))
@@ -458,6 +463,7 @@ def read_gridfs_file(filename):
458463

459464
class TestJsonMultiImport(PerformanceTest, unittest.TestCase):
460465
def setUp(self):
466+
super().setUp()
461467
self.client = client_context.client
462468
self.client.drop_database("perftest")
463469
ldjson_path = os.path.join(TEST_PATH, os.path.join("parallel", "ldjson_multi"))
@@ -481,6 +487,7 @@ def tearDown(self):
481487

482488
class TestJsonMultiExport(PerformanceTest, unittest.TestCase):
483489
def setUp(self):
490+
super().setUp()
484491
self.client = client_context.client
485492
self.client.drop_database("perftest")
486493
self.client.perfest.corpus.create_index("file")
@@ -501,6 +508,7 @@ def tearDown(self):
501508

502509
class TestGridFsMultiFileUpload(PerformanceTest, unittest.TestCase):
503510
def setUp(self):
511+
super().setUp()
504512
self.client = client_context.client
505513
self.client.drop_database("perftest")
506514
gridfs_path = os.path.join(TEST_PATH, os.path.join("parallel", "gridfs_multi"))
@@ -525,6 +533,7 @@ def tearDown(self):
525533

526534
class TestGridFsMultiFileDownload(PerformanceTest, unittest.TestCase):
527535
def setUp(self):
536+
super().setUp()
528537
self.client = client_context.client
529538
self.client.drop_database("perftest")
530539

Comments (0): no commit comments