Skip to content

Commit 29e0dc0

Browse files
authored
CDRIVER-4666 remove locks during single-threaded scanning (#1367)
* add a failing test for CDRIVER-4666 * assert single_threaded * remove locks during single-threaded scanning Fixes a possible recursive mutex lock in some error scenarios. Refer: CDRIVER-4666
1 parent 6c40c23 commit 29e0dc0

File tree

2 files changed

+133
-25
lines changed

2 files changed

+133
-25
lines changed

src/libmongoc/src/mongoc/mongoc-topology.c

Lines changed: 28 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ mongoc_topology_reconcile (const mongoc_topology_t *topology,
7373
mongoc_server_description_t *sd;
7474
mongoc_topology_scanner_node_t *ele, *tmp;
7575

76+
BSON_ASSERT (topology->single_threaded);
7677
servers = mc_tpld_servers (td);
7778
/* Add newly discovered nodes */
7879
for (size_t i = 0u; i < servers->items_len; i++) {
@@ -124,15 +125,16 @@ _mongoc_topology_scanner_setup_err_cb (uint32_t id,
124125
{
125126
mongoc_topology_t *topology = BSON_ASSERT_PTR_INLINE (data);
126127

128+
BSON_ASSERT (topology->single_threaded);
127129
if (_mongoc_topology_get_type (topology) == MONGOC_TOPOLOGY_LOAD_BALANCED) {
128130
/* In load balanced mode, scanning is only for connection establishment.
129131
* It must not modify the topology description. */
130132
} else {
131-
/* We need to update the topology description */
132-
mc_tpld_modification mod = mc_tpld_modify_begin (topology);
133+
// Use `mc_tpld_unsafe_get_mutable` to get a mutable topology description
134+
// without locking. This function only applies to single-threaded clients.
135+
mongoc_topology_description_t *td = mc_tpld_unsafe_get_mutable (topology);
133136
mongoc_topology_description_handle_hello (
134-
mod.new_td, id, NULL /* hello reply */, -1 /* rtt_msec */, error);
135-
mc_tpld_modify_commit (mod);
137+
td, id, NULL /* hello reply */, -1 /* rtt_msec */, error);
136138
}
137139
}
138140

@@ -159,51 +161,47 @@ _mongoc_topology_scanner_cb (uint32_t id,
159161
{
160162
mongoc_topology_t *const topology = BSON_ASSERT_PTR_INLINE (data);
161163
mongoc_server_description_t *sd;
162-
mc_tpld_modification tdmod;
164+
mongoc_topology_description_t *td;
163165

166+
BSON_ASSERT (topology->single_threaded);
164167
if (_mongoc_topology_get_type (topology) == MONGOC_TOPOLOGY_LOAD_BALANCED) {
165168
/* In load balanced mode, scanning is only for connection establishment.
166169
* It must not modify the topology description. */
167170
return;
168171
}
169172

170-
tdmod = mc_tpld_modify_begin (topology);
173+
// Use `mc_tpld_unsafe_get_mutable` to get a mutable topology description
174+
// without locking. This function only applies to single-threaded clients.
175+
td = mc_tpld_unsafe_get_mutable (topology);
171176

172-
sd = mongoc_topology_description_server_by_id (tdmod.new_td, id, NULL);
177+
sd = mongoc_topology_description_server_by_id (td, id, NULL);
173178

174179
if (!hello_response) {
175180
/* Server monitoring: When a server check fails due to a network error
176181
* (including a network timeout), the client MUST clear its connection
177182
* pool for the server */
178183
_mongoc_topology_description_clear_connection_pool (
179-
tdmod.new_td, id, &kZeroServiceId);
184+
td, id, &kZeroServiceId);
180185
}
181186

182187
/* Server Discovery and Monitoring Spec: "Once a server is connected, the
183188
* client MUST change its type to Unknown only after it has retried the
184189
* server once." */
185190
if (!hello_response && sd && sd->type != MONGOC_SERVER_UNKNOWN) {
186-
_mongoc_topology_update_no_lock (
187-
id, hello_response, rtt_msec, tdmod.new_td, error);
191+
_mongoc_topology_update_no_lock (id, hello_response, rtt_msec, td, error);
188192

189193
/* add another hello call to the current scan - the scan continues
190194
* until all commands are done */
191195
mongoc_topology_scanner_scan (topology->scanner, sd->id);
192196
} else {
193-
_mongoc_topology_update_no_lock (
194-
id, hello_response, rtt_msec, tdmod.new_td, error);
197+
_mongoc_topology_update_no_lock (id, hello_response, rtt_msec, td, error);
195198

196199
/* The processing of the hello results above may have added, changed, or
197200
* removed server descriptions. We need to reconcile that with our
198201
* monitoring agents
199202
*/
200-
mongoc_topology_reconcile (topology, tdmod.new_td);
201-
202-
mongoc_cond_broadcast (&topology->cond_client);
203+
mongoc_topology_reconcile (topology, td);
203204
}
204-
205-
206-
mc_tpld_modify_commit (tdmod);
207205
}
208206

209207
static void
@@ -881,7 +879,8 @@ mongoc_topology_rescan_srv (mongoc_topology_t *topology)
881879
static void
882880
mongoc_topology_scan_once (mongoc_topology_t *topology, bool obey_cooldown)
883881
{
884-
mc_tpld_modification tdmod;
882+
mongoc_topology_description_t *td;
883+
BSON_ASSERT (topology->single_threaded);
885884
if (mongoc_topology_should_rescan_srv (topology)) {
886885
/* Prior to scanning hosts, update the list of SRV hosts, if applicable.
887886
*/
@@ -892,9 +891,10 @@ mongoc_topology_scan_once (mongoc_topology_t *topology, bool obey_cooldown)
892891
* description based on hello responses in connection handshakes, see
893892
* _mongoc_topology_update_from_handshake. retire scanner nodes for removed
894893
* members and create scanner nodes for new ones. */
895-
tdmod = mc_tpld_modify_begin (topology);
896-
mongoc_topology_reconcile (topology, tdmod.new_td);
897-
mc_tpld_modify_commit (tdmod);
894+
// Use `mc_tpld_unsafe_get_mutable` to get a mutable topology description
895+
// without locking. This function only applies to single-threaded clients.
896+
td = mc_tpld_unsafe_get_mutable (topology);
897+
mongoc_topology_reconcile (topology, td);
898898

899899
mongoc_topology_scanner_start (topology->scanner, obey_cooldown);
900900
mongoc_topology_scanner_work (topology->scanner);
@@ -920,6 +920,7 @@ void
920920
_mongoc_topology_do_blocking_scan (mongoc_topology_t *topology,
921921
bson_error_t *error)
922922
{
923+
BSON_ASSERT (topology->single_threaded);
923924
_mongoc_handshake_freeze ();
924925

925926
mongoc_topology_scan_once (topology, true /* obey cooldown */);
@@ -1162,9 +1163,11 @@ mongoc_topology_select_server_id (mongoc_topology_t *topology,
11621163

11631164
if (topology->single_threaded) {
11641165
if (!td.ptr->opened) {
1165-
mc_tpld_modification tdmod = mc_tpld_modify_begin (topology);
1166-
_mongoc_topology_description_monitor_opening (tdmod.new_td);
1167-
mc_tpld_modify_commit (tdmod);
1166+
// Use `mc_tpld_unsafe_get_mutable` to get a mutable topology
1167+
// description without locking. This block only applies to
1168+
// single-threaded clients.
1169+
_mongoc_topology_description_monitor_opening (
1170+
mc_tpld_unsafe_get_mutable (topology));
11681171
mc_tpld_renew_ref (&td, topology);
11691172
}
11701173

src/libmongoc/tests/test-mongoc-topology.c

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2573,6 +2573,108 @@ test_hello_ok_pooled (void)
25732573
_test_hello_ok (true);
25742574
}
25752575

2576+
// initiator_fail is a stream initiator that always fails.
2577+
static mongoc_stream_t *
2578+
initiator_fail (const mongoc_uri_t *uri,
2579+
const mongoc_host_list_t *host,
2580+
void *user_data,
2581+
bson_error_t *error)
2582+
{
2583+
bson_set_error (error,
2584+
MONGOC_ERROR_STREAM,
2585+
MONGOC_ERROR_STREAM_CONNECT,
2586+
"failing in initiator");
2587+
printf ("failing in initiator\n");
2588+
return false;
2589+
}
2590+
2591+
// Test failure in `mongoc_topology_scanner_node_setup` during retry of scanning
2592+
// a known server. This is a regression test of CDRIVER-4666.
2593+
static void
2594+
test_failure_to_setup_after_retry (void)
2595+
{
2596+
mock_server_t *server;
2597+
mongoc_uri_t *uri;
2598+
mongoc_client_t *client;
2599+
future_t *future;
2600+
bson_error_t error;
2601+
2602+
server = mock_server_new ();
2603+
mock_server_run (server);
2604+
uri = mongoc_uri_copy (mock_server_get_uri (server));
2605+
client = mongoc_client_new_from_uri_with_error (uri, &error);
2606+
ASSERT_OR_PRINT (client, error);
2607+
2608+
// Override the heartbeatFrequencyMS (default 60 seconds) and
2609+
// minHeartbeatFrequencyMS (default 500ms) to speed up the test.
2610+
const int64_t overridden_heartbeat_ms = 1;
2611+
{
2612+
mc_tpld_modification tdmod = mc_tpld_modify_begin (client->topology);
2613+
tdmod.new_td->heartbeat_msec = overridden_heartbeat_ms;
2614+
mc_tpld_modify_commit (tdmod);
2615+
client->topology->min_heartbeat_frequency_msec = overridden_heartbeat_ms;
2616+
}
2617+
2618+
future = future_client_command_simple (
2619+
client, "test", tmp_bson ("{'ping': 1}"), NULL, NULL, &error);
2620+
2621+
// The first command starts the first topology scan.
2622+
// Expect legacy hello with handshake.
2623+
{
2624+
request_t *request = mock_server_receives_legacy_hello (server, "{}");
2625+
char *reply = bson_strdup_printf ("{'ok': 1,"
2626+
" 'minWireVersion': %d,"
2627+
" 'maxWireVersion': %d }",
2628+
WIRE_VERSION_MIN,
2629+
WIRE_VERSION_MAX);
2630+
2631+
reply_to_request_simple (request, reply);
2632+
bson_free (reply);
2633+
request_destroy (request);
2634+
}
2635+
2636+
// Expect "ping" command.
2637+
{
2638+
request_t *request = mock_server_receives_msg (
2639+
server, MONGOC_MSG_NONE, tmp_bson ("{'ping': 1}"));
2640+
reply_to_request_with_ok_and_destroy (request);
2641+
ASSERT_OR_PRINT (future_get_bool (future), error);
2642+
future_destroy (future);
2643+
}
2644+
2645+
// Wait until ready for next topology scan.
2646+
_mongoc_usleep (overridden_heartbeat_ms * 1000);
2647+
2648+
// Send another command.
2649+
future = future_client_command_simple (
2650+
client, "test", tmp_bson ("{'ping': 1}"), NULL, NULL, &error);
2651+
2652+
// Expect legacy hello with handshake.
2653+
{
2654+
request_t *request = mock_server_receives_legacy_hello (server, "{}");
2655+
// Set the initiator to fail.
2656+
mongoc_client_set_stream_initiator (client, initiator_fail, NULL);
2657+
// A network error on a previously known server triggers the retry.
2658+
reply_to_request_with_hang_up (request);
2659+
request_destroy (request);
2660+
}
2661+
2662+
// The initiator fails in `mongoc_topology_scanner_node_setup`. Causes a
2663+
// deadlock similar to that observed in CDRIVER-4666.
2664+
// Test fails on macOS due to deadlock with "future_get_bool timed out".
2665+
2666+
ASSERT (!future_get_bool (future));
2667+
future_destroy (future);
2668+
ASSERT_ERROR_CONTAINS (error,
2669+
MONGOC_ERROR_SERVER_SELECTION,
2670+
MONGOC_ERROR_SERVER_SELECTION_FAILURE,
2671+
"No suitable servers found");
2672+
2673+
mongoc_client_destroy (client);
2674+
mongoc_uri_destroy (uri);
2675+
mock_server_destroy (server);
2676+
}
2677+
25762678
void
25772679
test_topology_install (TestSuite *suite)
25782680
{
@@ -2737,4 +2839,7 @@ test_topology_install (TestSuite *suite)
27372839
suite, "/Topology/hello_ok/single", test_hello_ok_single);
27382840
TestSuite_AddMockServerTest (
27392841
suite, "/Topology/hello_ok/pooled", test_hello_ok_pooled);
2842+
TestSuite_AddMockServerTest (suite,
2843+
"/Topology/failure_to_setup_after_retry",
2844+
test_failure_to_setup_after_retry);
27402845
}

0 commit comments

Comments
 (0)