7
7
from cachebox import BaseCacheImpl , LRUCache
8
8
from google .protobuf .json_format import MessageToDict
9
9
from google .protobuf .struct_pb2 import Struct
10
+ from grpc import ChannelConnectivity
10
11
11
12
from openfeature .evaluation_context import EvaluationContext
12
13
from openfeature .event import ProviderEventDetails
@@ -47,6 +48,7 @@ def __init__(
47
48
[ProviderEventDetails ], None
48
49
],
49
50
):
51
+ self .active = False
50
52
self .config = config
51
53
self .emit_provider_ready = emit_provider_ready
52
54
self .emit_provider_error = emit_provider_error
@@ -57,26 +59,30 @@ def __init__(
57
59
if self .config .cache == CacheType .LRU
58
60
else None
59
61
)
60
- self .stub , self .channel = self ._create_stub ()
61
- self .retry_backoff_seconds = config .retry_backoff_ms * 0.001
62
- self .retry_backoff_max_seconds = config .retry_backoff_max_ms * 0.001
63
- self .retry_grace_attempts = config .retry_grace_attempts
62
+
63
+ self .retry_grace_period = config .retry_grace_period
64
64
self .streamline_deadline_seconds = config .stream_deadline_ms * 0.001
65
65
self .deadline = config .deadline_ms * 0.001
66
66
self .connected = False
67
-
68
- def _create_stub (
69
- self ,
70
- ) -> typing .Tuple [evaluation_pb2_grpc .ServiceStub , grpc .Channel ]:
71
- config = self .config
72
67
channel_factory = grpc .secure_channel if config .tls else grpc .insecure_channel
73
- channel = channel_factory (
68
+
69
+ # Create the channel with the service config
70
+ options = [
71
+ ("grpc.keepalive_time_ms" , config .keep_alive_time ),
72
+ ("grpc.initial_reconnect_backoff_ms" , config .retry_backoff_ms ),
73
+ ("grpc.max_reconnect_backoff_ms" , config .retry_backoff_max_ms ),
74
+ ("grpc.min_reconnect_backoff_ms" , config .deadline_ms ),
75
+ ]
76
+ self .channel = channel_factory (
74
77
f"{ config .host } :{ config .port } " ,
75
- options = (( "grpc.keepalive_time_ms" , config . keep_alive_time ),) ,
78
+ options = options ,
76
79
)
77
- stub = evaluation_pb2_grpc .ServiceStub (channel )
80
+ self . stub = evaluation_pb2_grpc .ServiceStub (self . channel )
78
81
79
- return stub , channel
82
+ self .thread : typing .Optional [threading .Thread ] = None
83
+ self .timer : typing .Optional [threading .Timer ] = None
84
+
85
+ self .start_time = time .time ()
80
86
81
87
def initialize (self , evaluation_context : EvaluationContext ) -> None :
82
88
self .connect ()
@@ -89,11 +95,12 @@ def shutdown(self) -> None:
89
95
90
96
def connect (self ) -> None :
91
97
self .active = True
92
- self .thread = threading .Thread (
93
- target = self .listen , daemon = True , name = "FlagdGrpcServiceWorkerThread"
94
- )
95
- self .thread .start ()
96
98
99
+ # Run monitoring in a separate thread
100
+ self .monitor_thread = threading .Thread (
101
+ target = self .monitor , daemon = True , name = "FlagdGrpcServiceMonitorThread"
102
+ )
103
+ self .monitor_thread .start ()
97
104
## block until ready or deadline reached
98
105
timeout = self .deadline + time .time ()
99
106
while not self .connected and time .time () < timeout :
@@ -105,81 +112,87 @@ def connect(self) -> None:
105
112
"Blocking init finished before data synced. Consider increasing startup deadline to avoid inconsistent evaluations."
106
113
)
107
114
115
+ def monitor (self ) -> None :
116
+ self .channel .subscribe (self ._state_change_callback , try_to_connect = True )
117
+
118
+ def _state_change_callback (self , new_state : ChannelConnectivity ) -> None :
119
+ logger .debug (f"gRPC state change: { new_state } " )
120
+ if new_state == ChannelConnectivity .READY :
121
+ if not self .thread or not self .thread .is_alive ():
122
+ self .thread = threading .Thread (
123
+ target = self .listen ,
124
+ daemon = True ,
125
+ name = "FlagdGrpcServiceWorkerThread" ,
126
+ )
127
+ self .thread .start ()
128
+
129
+ if self .timer and self .timer .is_alive ():
130
+ logger .debug ("gRPC error timer expired" )
131
+ self .timer .cancel ()
132
+
133
+ elif new_state == ChannelConnectivity .TRANSIENT_FAILURE :
134
+ # this is the failed reconnect attempt so we are going into stale
135
+ self .emit_provider_stale (
136
+ ProviderEventDetails (
137
+ message = "gRPC sync disconnected, reconnecting" ,
138
+ )
139
+ )
140
+ self .start_time = time .time ()
141
+ # adding a timer, so we can emit the error event after time
142
+ self .timer = threading .Timer (self .retry_grace_period , self .emit_error )
143
+
144
+ logger .debug ("gRPC error timer started" )
145
+ self .timer .start ()
146
+ self .connected = False
147
+
148
+ def emit_error (self ) -> None :
149
+ logger .debug ("gRPC error emitted" )
150
+ if self .cache :
151
+ self .cache .clear ()
152
+ self .emit_provider_error (
153
+ ProviderEventDetails (
154
+ message = "gRPC sync disconnected, reconnecting" ,
155
+ error_code = ErrorCode .GENERAL ,
156
+ )
157
+ )
158
+
108
159
def listen (self ) -> None :
109
- retry_delay = self . retry_backoff_seconds
160
+ logger . debug ( "gRPC starting listener thread" )
110
161
call_args = (
111
162
{"timeout" : self .streamline_deadline_seconds }
112
163
if self .streamline_deadline_seconds > 0
113
164
else {}
114
165
)
115
- retry_counter = 0
116
- while self .active :
117
- request = evaluation_pb2 .EventStreamRequest ()
166
+ call_args ["wait_for_ready" ] = True
167
+ request = evaluation_pb2 .EventStreamRequest ()
118
168
169
+ # defining a never ending loop to recreate the stream
170
+ while self .active :
119
171
try :
120
172
logger .debug ("Setting up gRPC sync flags connection" )
121
173
for message in self .stub .EventStream (request , ** call_args ):
122
174
if message .type == "provider_ready" :
123
- if not self .connected :
124
- self .emit_provider_ready (
125
- ProviderEventDetails (
126
- message = "gRPC sync connection established"
127
- )
175
+ self .connected = True
176
+ self .emit_provider_ready (
177
+ ProviderEventDetails (
178
+ message = "gRPC sync connection established"
128
179
)
129
- self .connected = True
130
- retry_counter = 0
131
- # reset retry delay after successsful read
132
- retry_delay = self .retry_backoff_seconds
133
-
180
+ )
134
181
elif message .type == "configuration_change" :
135
182
data = MessageToDict (message )["data" ]
136
183
self .handle_changed_flags (data )
137
184
138
185
if not self .active :
139
186
logger .info ("Terminating gRPC sync thread" )
140
187
return
141
- except grpc .RpcError as e :
142
- logger .error (f"SyncFlags stream error, { e .code ()= } { e .details ()= } " )
143
- # re-create the stub if there's a connection issue - otherwise reconnect does not work as expected
144
- self .stub , self .channel = self ._create_stub ()
188
+ except grpc .RpcError as e : # noqa: PERF203
189
+ # although it seems like this error log is not interesting, without it, the retry is not working as expected
190
+ logger .debug (f"SyncFlags stream error, { e .code ()= } { e .details ()= } " )
145
191
except ParseError :
146
192
logger .exception (
147
193
f"Could not parse flag data using flagd syntax: { message = } "
148
194
)
149
195
150
- self .connected = False
151
- self .on_connection_error (retry_counter , retry_delay )
152
-
153
- retry_delay = self .handle_retry (retry_counter , retry_delay )
154
-
155
- retry_counter = retry_counter + 1
156
-
157
- def handle_retry (self , retry_counter : int , retry_delay : float ) -> float :
158
- if retry_counter == 0 :
159
- logger .info ("gRPC sync disconnected, reconnecting immediately" )
160
- else :
161
- logger .info (f"gRPC sync disconnected, reconnecting in { retry_delay } s" )
162
- time .sleep (retry_delay )
163
- retry_delay = min (1.1 * retry_delay , self .retry_backoff_max_seconds )
164
- return retry_delay
165
-
166
- def on_connection_error (self , retry_counter : int , retry_delay : float ) -> None :
167
- if retry_counter == self .retry_grace_attempts :
168
- if self .cache :
169
- self .cache .clear ()
170
- self .emit_provider_error (
171
- ProviderEventDetails (
172
- message = f"gRPC sync disconnected, reconnecting in { retry_delay } s" ,
173
- error_code = ErrorCode .GENERAL ,
174
- )
175
- )
176
- elif retry_counter == 1 :
177
- self .emit_provider_stale (
178
- ProviderEventDetails (
179
- message = f"gRPC sync disconnected, reconnecting in { retry_delay } s" ,
180
- )
181
- )
182
-
183
196
def handle_changed_flags (self , data : typing .Any ) -> None :
184
197
changed_flags = list (data ["flags" ].keys ())
185
198
0 commit comments