---
layout: pattern
title: Circuit Breaker
folder: circuit-breaker
permalink: /patterns/circuit-breaker/
categories: Behavioral
tags:
 - Performance
 - Decoupling
 - Cloud distributed
---

## Intent

Handle costly remote service calls in such a way that the failure of a single service/component cannot bring the whole application down, and we can reconnect to the service as soon as possible.

## Explanation

Real world example

> Imagine a web application that has both local files/images and remote services that are used for fetching data. These remote services may be healthy and responsive at times, or may become slow and unresponsive at some point due to a variety of reasons. So if one of the remote services is slow or fails to respond successfully, our application will try to fetch a response from it using multiple threads/processes, and soon all of them will hang (this is also called thread starvation), causing our entire web application to crash. We should be able to detect this situation and show the user an appropriate message so that they can explore the parts of the application that are unaffected by the remote service failure. Meanwhile, the other services that are working normally should keep functioning, unaffected by this failure.

In plain words

> Circuit Breaker allows graceful handling of failed remote services. It works especially well when all parts of our application are highly decoupled from each other, so that the failure of one component does not stop the other parts from working.

Wikipedia says

> Circuit breaker is a design pattern used in modern software development. It is used to detect failures and encapsulates the logic of preventing a failure from constantly recurring, during maintenance, temporary external system failure or unexpected system difficulties.

## Programmatic Example

So, how does this all come together? With the above example in mind, we will simulate the functionality in a simple example. A monitoring service mimics the web application and makes both local and remote calls.

The service architecture is as follows:


In terms of code, the end user application is:

```java
public class App {

  private static final Logger LOGGER = LoggerFactory.getLogger(App.class);

  /**
   * Program entry point.
   *
   * @param args command line args
   */
  public static void main(String[] args) {

    var serverStartTime = System.nanoTime();

    var delayedService = new DelayedRemoteService(serverStartTime, 5);
    var delayedServiceCircuitBreaker = new DefaultCircuitBreaker(delayedService, 3000, 2,
        2000 * 1000 * 1000);

    var quickService = new QuickRemoteService();
    var quickServiceCircuitBreaker = new DefaultCircuitBreaker(quickService, 3000, 2,
        2000 * 1000 * 1000);

    //Create an object of monitoring service which makes both local and remote calls
    var monitoringService = new MonitoringService(delayedServiceCircuitBreaker,
        quickServiceCircuitBreaker);

    //Fetch response from local resource
    LOGGER.info(monitoringService.localResourceResponse());

    //Fetch response from delayed service 2 times, to meet the failure threshold
    LOGGER.info(monitoringService.delayedServiceResponse());
    LOGGER.info(monitoringService.delayedServiceResponse());

    //Fetch current state of delayed service circuit breaker after crossing failure threshold limit
    //which is OPEN now
    LOGGER.info(delayedServiceCircuitBreaker.getState());

    //Meanwhile, the delayed service is down, fetch response from the healthy quick service
    LOGGER.info(monitoringService.quickServiceResponse());
    LOGGER.info(quickServiceCircuitBreaker.getState());

    //Wait for the delayed service to become responsive
    try {
      LOGGER.info("Waiting for delayed service to become responsive");
      Thread.sleep(5000);
    } catch (InterruptedException e) {
      e.printStackTrace();
    }
    //Check the state of delayed circuit breaker, should be HALF_OPEN
    LOGGER.info(delayedServiceCircuitBreaker.getState());

    //Fetch response from delayed service, which should be healthy by now
    LOGGER.info(monitoringService.delayedServiceResponse());
    //As successful response is fetched, it should be CLOSED again.
    LOGGER.info(delayedServiceCircuitBreaker.getState());
  }
}
```
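
The `App` class above also relies on a `RemoteService` abstraction and two implementations, `DelayedRemoteService` and `QuickRemoteService`, which are not reproduced in this excerpt. A rough, hypothetical sketch of what they might look like, inferred only from how the services are constructed and called here (the actual classes in the project may differ in detail):

```java
//Hypothetical sketch of the remote service abstraction used by App above.
//Only the shapes visible from this article (constructor arguments and the
//call() signature) are assumed; implementation details are illustrative.
public class RemoteServiceException extends Exception {
  public RemoteServiceException(String message) {
    super(message);
  }
}

public interface RemoteService {
  //Fetches a response from the remote service, or throws if it is unavailable.
  String call() throws RemoteServiceException;
}

//Simulates a service that stays unresponsive for a number of seconds after start-up.
public class DelayedRemoteService implements RemoteService {

  private final long serverStartTime;
  private final int delay;

  public DelayedRemoteService(long serverStartTime, int delay) {
    this.serverStartTime = serverStartTime;
    this.delay = delay;
  }

  @Override
  public String call() throws RemoteServiceException {
    var currentTime = System.nanoTime();
    //Fail while we are still inside the simulated start-up delay window.
    if ((currentTime - serverStartTime) / 1_000_000_000.0 < delay) {
      throw new RemoteServiceException("Delayed service is down");
    }
    return "Delayed service is working";
  }
}

//Simulates a service that is always healthy.
public class QuickRemoteService implements RemoteService {

  @Override
  public String call() {
    return "Quick service is working";
  }
}
```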

The monitoring service:

```java
public class MonitoringService {

  private final CircuitBreaker delayedService;

  private final CircuitBreaker quickService;

  public MonitoringService(CircuitBreaker delayedService, CircuitBreaker quickService) {
    this.delayedService = delayedService;
    this.quickService = quickService;
  }

  //Assumption: Local service won't fail, no need to wrap it in a circuit breaker logic
  public String localResourceResponse() {
    return "Local Service is working";
  }

  /**
   * Fetch response from the delayed service (with some simulated startup time).
   *
   * @return response string
   */
  public String delayedServiceResponse() {
    try {
      return this.delayedService.attemptRequest();
    } catch (RemoteServiceException e) {
      return e.getMessage();
    }
  }

  /**
   * Fetches response from a healthy service without any failure.
   *
   * @return response string
   */
  public String quickServiceResponse() {
    try {
      return this.quickService.attemptRequest();
    } catch (RemoteServiceException e) {
      return e.getMessage();
    }
  }
}
```

As can be seen, it makes the call to fetch local resources directly, but it wraps the call to the remote (costly) service in a circuit breaker object, which prevents faults as follows:

```java
public class DefaultCircuitBreaker implements CircuitBreaker {

  private final long timeout;
  private final long retryTimePeriod;
  private final RemoteService service;
  long lastFailureTime;
  private String lastFailureResponse;
  int failureCount;
  private final int failureThreshold;
  private State state;
  //Declared with a long literal to avoid silent int overflow of the constant
  private final long futureTime = 1000L * 1000 * 1000 * 1000;

  /**
   * Constructor to create an instance of Circuit Breaker.
   *
   * @param timeout          Timeout for the API request. Not necessary for this simple example
   * @param failureThreshold Number of failures we receive from the dependent service before
   *                         changing state to 'OPEN'
   * @param retryTimePeriod  Time period after which a new request is made to remote service for
   *                         status check.
   */
  DefaultCircuitBreaker(RemoteService serviceToCall, long timeout, int failureThreshold,
      long retryTimePeriod) {
    this.service = serviceToCall;
    // We start in a closed state hoping that everything is fine
    this.state = State.CLOSED;
    this.failureThreshold = failureThreshold;
    // Timeout for the API request.
    // Used to break the calls made to remote resource if it exceeds the limit
    this.timeout = timeout;
    this.retryTimePeriod = retryTimePeriod;
    //An absurd amount of time in future which basically indicates the last failure never happened
    this.lastFailureTime = System.nanoTime() + futureTime;
    this.failureCount = 0;
  }

  // Reset everything to defaults
  @Override
  public void recordSuccess() {
    this.failureCount = 0;
    this.lastFailureTime = System.nanoTime() + futureTime;
    this.state = State.CLOSED;
  }

  @Override
  public void recordFailure(String response) {
    failureCount = failureCount + 1;
    this.lastFailureTime = System.nanoTime();
    // Cache the failure response for returning on open state
    this.lastFailureResponse = response;
  }

  // Evaluate the current state based on failureThreshold, failureCount and lastFailureTime.
  protected void evaluateState() {
    if (failureCount >= failureThreshold) { //Then something is wrong with remote service
      if ((System.nanoTime() - lastFailureTime) > retryTimePeriod) {
        //We have waited long enough and should try checking if service is up
        state = State.HALF_OPEN;
      } else {
        //Service would still probably be down
        state = State.OPEN;
      }
    } else {
      //Everything is working fine
      state = State.CLOSED;
    }
  }

  @Override
  public String getState() {
    evaluateState();
    return state.name();
  }

  /**
   * Break the circuit beforehand if it is known that the service is down, or connect the circuit
   * manually if the service comes online before expected.
   *
   * @param state State at which circuit is in
   */
  @Override
  public void setState(State state) {
    this.state = state;
    switch (state) {
      case OPEN:
        this.failureCount = failureThreshold;
        this.lastFailureTime = System.nanoTime();
        break;
      case HALF_OPEN:
        this.failureCount = failureThreshold;
        this.lastFailureTime = System.nanoTime() - retryTimePeriod;
        break;
      default:
        this.failureCount = 0;
    }
  }

  /**
   * Executes service call.
   *
   * @return Value from the remote resource, stale response or a custom exception
   */
  @Override
  public String attemptRequest() throws RemoteServiceException {
    evaluateState();
    if (state == State.OPEN) {
      // return cached response if the circuit is in OPEN state
      return this.lastFailureResponse;
    } else {
      // Make the API request if the circuit is not OPEN
      try {
        //In a real application, this would be run in a thread and the timeout
        //parameter of the circuit breaker would be utilized to know if service
        //is working. Here, we simulate that based on server response itself
        var response = service.call();
        // Yay!! the API responded fine. Let's reset everything.
        recordSuccess();
        return response;
      } catch (RemoteServiceException ex) {
        recordFailure(ex.getMessage());
        throw ex;
      }
    }
  }
}
```
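
Note that `setState` also lets an operator trip or reset the breaker by hand, for example around a planned outage of the remote dependency. A small, hypothetical usage sketch, reusing `delayedServiceCircuitBreaker` from the `App` example above:

```java
//Hypothetical manual control of the breaker, not part of the demo's main flow.
//Trip the breaker ahead of a known outage so callers immediately get the
//cached failure response instead of hitting the dead service.
delayedServiceCircuitBreaker.setState(State.OPEN);

//...later, once the dependency is confirmed healthy again, close it manually.
delayedServiceCircuitBreaker.setState(State.CLOSED);
```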

How does the pattern above prevent failures? Let's understand it via the finite state machine it implements (a minimal sketch of the supporting `CircuitBreaker` interface and `State` enum follows the list below).


- We initialize the circuit breaker object with `timeout`, `failureThreshold` and `retryTimePeriod` parameters, which determine how resilient the API is.
- Initially, the circuit breaker is in the `closed` state and no remote calls to the API have occurred.
- Every time a call succeeds, we reset the state to what it was at the beginning.
- If the number of failures crosses a certain threshold (`failureThreshold`), the circuit breaker goes to the `open` state; it then acts just like an open electrical circuit, preventing remote service calls from being made and thus saving resources.
- Once we exceed the retry time period (`retryTimePeriod`), the circuit breaker moves to the `half-open` state and makes another call to the remote service to check whether it is healthy, so that we can serve fresh responses. A failure sets it back to the `open` state and another attempt is made after the retry time period, while a success takes it to the `closed` state so that everything starts working normally again.
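
For reference, `DefaultCircuitBreaker` implements a `CircuitBreaker` interface together with a `State` enum, neither of which is shown in this excerpt. A minimal sketch of those supporting types, inferred from the overridden methods above (the exact declarations in the project may differ slightly):

```java
//Hypothetical sketch of the supporting types, inferred from DefaultCircuitBreaker above.
public enum State {
  CLOSED,
  OPEN,
  HALF_OPEN
}

public interface CircuitBreaker {

  //Resets everything to defaults after a successful call.
  void recordSuccess();

  //Records a failed call and caches the failure response.
  void recordFailure(String response);

  //Returns the current state of the circuit breaker, re-evaluated on each call.
  String getState();

  //Manually sets the state, e.g. to trip the breaker ahead of a known outage.
  void setState(State state);

  //Attempts the remote call, honouring the current state of the breaker.
  String attemptRequest() throws RemoteServiceException;
}
```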

## Class diagram



## Applicability

Use the Circuit Breaker pattern when

- Building a highly available, fault-tolerant application where the failure of some services should not bring the whole application down.
- Building a continuously running (always-on) application, so that its components can be upgraded without shutting it down entirely.

## Related Patterns

- [Retry Pattern](https://github.com/iluwatar/java-design-patterns/tree/master/retry)

## Real world examples

* [Spring Circuit Breaker module](https://spring.io/guides/gs/circuit-breaker)
* [Netflix Hystrix API](https://github.com/Netflix/Hystrix)

## Credits

* [Understanding Circuit Breaker Pattern](https://itnext.io/understand-circuitbreaker-design-pattern-with-simple-practical-example-92a752615b42)
* [Martin Fowler on Circuit Breaker](https://martinfowler.com/bliki/CircuitBreaker.html)
* [Fault tolerance in a high volume, distributed system](https://medium.com/netflix-techblog/fault-tolerance-in-a-high-volume-distributed-system-91ab4faae74a)
* [Circuit Breaker pattern](https://docs.microsoft.com/en-us/azure/architecture/patterns/circuit-breaker)