Skip to content

Commit a4bde35

Browse files
dumbbellmergify[bot]
authored andcommitted
rabbit_feature_flags: Use cluster members hint for cluster sync
[Why] During peer discovery, when the feature flags state is synchronized on a starting node that joins a cluster thanks to peer discovery, the list of nodes returned by `rabbit_nodes:list_running()` is incorrect because Mnesia is not initiliazed yet. Because of that, the synchronization works on the wrong inventory of feature flags. In the end, the states of feature flags is incorrect across the cluster. [How] `rabbit_mnesia` passes a list of nodes to `rabbit_feature_flags:sync_feature_flags_with_cluster/2`. We can use this list, as we did in feature flags V1. This makes sure that the synchronization works with a valid list of cluster members, in case the cluster state is not ready yet. V2: Filter the given list of nodes to only keep those where `rabbit` is running. This avoids trying to collect inventory from nodes which are stopped. (cherry picked from commit 7c53958)
1 parent 3cf1f34 commit a4bde35

File tree

3 files changed

+23
-9
lines changed

3 files changed

+23
-9
lines changed

deps/rabbit/src/rabbit_feature_flags.erl

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1149,8 +1149,13 @@ sync_feature_flags_with_cluster([] = _Nodes, true = _NodeIsVirgin) ->
11491149
rabbit_ff_controller:enable_default();
11501150
sync_feature_flags_with_cluster([] = _Nodes, false = _NodeIsVirgin) ->
11511151
ok;
1152-
sync_feature_flags_with_cluster(_Nodes, _NodeIsVirgin) ->
1153-
rabbit_ff_controller:sync_cluster().
1152+
sync_feature_flags_with_cluster(Nodes, _NodeIsVirgin) ->
1153+
%% We don't use `rabbit_nodes:filter_running()' here because the given
1154+
%% `Nodes' list may contain nodes which are not members yet (the cluster
1155+
%% could be being created or expanded).
1156+
Nodes1 = [N || N <- Nodes, rabbit:is_running(N)],
1157+
Nodes2 = lists:usort([node() | Nodes1]),
1158+
rabbit_ff_controller:sync_cluster(Nodes2).
11541159

11551160
-spec refresh_feature_flags_after_app_load() ->
11561161
ok | {error, any()} | no_return().

deps/rabbit/src/rabbit_ff_controller.erl

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
enable/1,
3636
enable_default/0,
3737
check_node_compatibility/1,
38-
sync_cluster/0,
38+
sync_cluster/1,
3939
refresh_after_app_load/0,
4040
get_forced_feature_flag_names/0]).
4141

@@ -134,21 +134,21 @@ check_node_compatibility(RemoteNode) ->
134134
%% feature flags.
135135
check_node_compatibility_task(ThisNode, RemoteNode).
136136

137-
sync_cluster() ->
137+
sync_cluster(Nodes) ->
138138
?LOG_DEBUG(
139139
"Feature flags: SYNCING FEATURE FLAGS in cluster...",
140140
[],
141141
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
142142
case erlang:whereis(?LOCAL_NAME) of
143143
Pid when is_pid(Pid) ->
144144
%% The function is called while `rabbit' is running.
145-
gen_statem:call(?LOCAL_NAME, sync_cluster);
145+
gen_statem:call(?LOCAL_NAME, {sync_cluster, Nodes});
146146
undefined ->
147147
%% The function is called while `rabbit' is stopped. We need to
148148
%% start a one-off controller, again to make sure concurrent
149149
%% changes are blocked.
150150
{ok, Pid} = start_link(),
151-
Ret = gen_statem:call(Pid, sync_cluster),
151+
Ret = gen_statem:call(Pid, {sync_cluster, Nodes}),
152152
gen_statem:stop(Pid),
153153
Ret
154154
end.
@@ -273,8 +273,8 @@ proceed_with_task({enable, FeatureNames}) ->
273273
enable_task(FeatureNames);
274274
proceed_with_task(enable_default) ->
275275
enable_default_task();
276-
proceed_with_task(sync_cluster) ->
277-
sync_cluster_task();
276+
proceed_with_task({sync_cluster, Nodes}) ->
277+
sync_cluster_task(Nodes);
278278
proceed_with_task(refresh_after_app_load) ->
279279
refresh_after_app_load_task().
280280

@@ -645,6 +645,15 @@ get_forced_feature_flag_names_from_config() ->
645645
Reason :: term().
646646

647647
sync_cluster_task() ->
648+
Nodes = running_nodes(),
649+
sync_cluster_task(Nodes).
650+
651+
-spec sync_cluster_task(Nodes) -> Ret when
652+
Nodes :: [node()],
653+
Ret :: ok | {error, Reason},
654+
Reason :: term().
655+
656+
sync_cluster_task(Nodes) ->
648657
%% We assume that a feature flag can only be enabled, not disabled.
649658
%% Therefore this synchronization searches for feature flags enabled on
650659
%% some nodes but not all, and make sure they are enabled everywhere.
@@ -657,7 +666,6 @@ sync_cluster_task() ->
657666
%% would make sure a feature flag isn't enabled while there is a network
658667
%% partition. On the other hand, this would require that all nodes are
659668
%% running before we can expand the cluster...
660-
Nodes = running_nodes(),
661669
?LOG_DEBUG(
662670
"Feature flags: synchronizing feature flags on nodes: ~tp",
663671
[Nodes],

deps/rabbit/test/feature_flags_v2_SUITE.erl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ setup_slave_node(Config) ->
178178
_ = rabbit_ff_registry_factory:initialize_registry(),
179179
ok = start_controller(),
180180
ok = rabbit_feature_flags:enable(feature_flags_v2),
181+
_ = catch rabbit_boot_state:set(ready),
181182
ok.
182183

183184
setup_logger() ->

0 commit comments

Comments
 (0)