Skip to content

Commit aac4200

Browse files
feature: support graceful shutdown
If connected to Tarantool 2.10 or newer and WatchersFeature is required, after this patch the connection supports server graceful shutdown [1]. In this case, the server will wait until all client requests are finished and the client disconnects before going down (the server may also go down by timeout). The client will reconnect if the connection options enable reconnect. Beware that graceful shutdown event initialization is asynchronous. [1] https://www.tarantool.io/en/doc/latest/dev_guide/internals/iproto/graceful_shutdown/ Closes #214
1 parent 2faaa7d commit aac4200

File tree

4 files changed

+703
-14
lines changed

4 files changed

+703
-14
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ Versioning](http://semver.org/spec/v2.0.0.html) except to the first release.
1515
- Error type support in MessagePack (#209)
1616
- Event subscription support (#119)
1717
- Session settings support (#215)
18+
- Support graceful shutdown (#214)
1819

1920
### Changed
2021

connection.go

+132-14
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,17 @@ const ignoreStreamId = 0
2525
const (
2626
connDisconnected = 0
2727
connConnected = 1
28-
connClosed = 2
28+
connShutdown = 2
29+
connClosed = 3
2930
)
3031

3132
const (
3233
connTransportNone = ""
3334
connTransportSsl = "ssl"
3435
)
3536

37+
const shutdownEventKey = "box.shutdown"
38+
3639
type ConnEventKind int
3740
type ConnLogKind int
3841

@@ -45,6 +48,8 @@ const (
4548
ReconnectFailed
4649
// Either reconnect attempts exhausted, or explicit Close is called.
4750
Closed
51+
// Shutdown signals that shutdown callback is processing.
52+
Shutdown
4853

4954
// LogReconnectFailed is logged when reconnect attempt failed.
5055
LogReconnectFailed ConnLogKind = iota + 1
@@ -134,10 +139,20 @@ func (d defaultLogger) Report(event ConnLogKind, conn *Connection, v ...interfac
134139
// always returns array of array (array of tuples for space related methods).
135140
// For Eval* and Call* Tarantool always returns array, but does not force
136141
// array of arrays.
142+
//
143+
// If connected to Tarantool 2.10 or newer and WatchersFeature is required,
144+
// connection supports server graceful shutdown. In this case, server will
145+
// wait until all client requests are finished and client disconnects
146+
// before going down (server also may go down by timeout). Client reconnect will
147+
// happen if connection options enable reconnect. Beware that graceful shutdown
148+
// event initialization is asynchronous.
149+
//
150+
// More on graceful shutdown: https://www.tarantool.io/en/doc/latest/dev_guide/internals/iproto/graceful_shutdown/
137151
type Connection struct {
138152
addr string
139153
c net.Conn
140154
mutex sync.Mutex
155+
cond *sync.Cond
141156
// Schema contains schema loaded on connection.
142157
Schema *Schema
143158
// requestId contains the last request ID for requests with nil context.
@@ -162,6 +177,11 @@ type Connection struct {
162177
serverProtocolInfo ProtocolInfo
163178
// watchMap is a map of key -> chan watchState.
164179
watchMap sync.Map
180+
181+
// shutdownWatcher is the "box.shutdown" event watcher.
182+
shutdownWatcher Watcher
183+
// requestCnt is a counter of active requests.
184+
requestCnt int64
165185
}
166186

167187
var _ = Connector(&Connection{}) // Check compatibility with connector interface.
@@ -385,6 +405,8 @@ func Connect(addr string, opts Opts) (conn *Connection, err error) {
385405
conn.opts.Logger = defaultLogger{}
386406
}
387407

408+
conn.cond = sync.NewCond(&conn.mutex)
409+
388410
if err = conn.createConnection(false); err != nil {
389411
ter, ok := err.(Error)
390412
if conn.opts.Reconnect <= 0 {
@@ -421,6 +443,16 @@ func Connect(addr string, opts Opts) (conn *Connection, err error) {
421443
}
422444
}
423445

446+
// Subscribe shutdown event to process graceful shutdown.
447+
if conn.isWatchersRequired() {
448+
watcher, werr := conn.NewWatcher(shutdownEventKey, shutdownEventCallback)
449+
if werr != nil {
450+
conn.closeConnection(werr, true)
451+
return nil, werr
452+
}
453+
conn.shutdownWatcher = watcher
454+
}
455+
424456
return conn, err
425457
}
426458

@@ -589,6 +621,7 @@ func (conn *Connection) dial() (err error) {
589621
conn.lockShards()
590622
conn.c = connection
591623
atomic.StoreUint32(&conn.state, connConnected)
624+
conn.cond.Broadcast()
592625
conn.unlockShards()
593626
go conn.writer(w, connection)
594627
go conn.reader(r, connection)
@@ -762,10 +795,17 @@ func (conn *Connection) closeConnection(neterr error, forever bool) (err error)
762795
if conn.state != connClosed {
763796
close(conn.control)
764797
atomic.StoreUint32(&conn.state, connClosed)
798+
conn.cond.Broadcast()
799+
// Free the resources.
800+
if conn.shutdownWatcher != nil {
801+
go conn.shutdownWatcher.Unregister()
802+
conn.shutdownWatcher = nil
803+
}
765804
conn.notify(Closed)
766805
}
767806
} else {
768807
atomic.StoreUint32(&conn.state, connDisconnected)
808+
conn.cond.Broadcast()
769809
conn.notify(Disconnected)
770810
}
771811
if conn.c != nil {
@@ -784,9 +824,7 @@ func (conn *Connection) closeConnection(neterr error, forever bool) (err error)
784824
return
785825
}
786826

787-
func (conn *Connection) reconnect(neterr error, c net.Conn) {
788-
conn.mutex.Lock()
789-
defer conn.mutex.Unlock()
827+
func (conn *Connection) reconnectImpl(neterr error, c net.Conn) {
790828
if conn.opts.Reconnect > 0 {
791829
if c == conn.c {
792830
conn.closeConnection(neterr, false)
@@ -799,6 +837,13 @@ func (conn *Connection) reconnect(neterr error, c net.Conn) {
799837
}
800838
}
801839

840+
func (conn *Connection) reconnect(neterr error, c net.Conn) {
841+
conn.mutex.Lock()
842+
defer conn.mutex.Unlock()
843+
conn.reconnectImpl(neterr, c)
844+
conn.cond.Broadcast()
845+
}
846+
802847
func (conn *Connection) lockShards() {
803848
for i := range conn.shard {
804849
conn.shard[i].rmut.Lock()
@@ -1026,6 +1071,15 @@ func (conn *Connection) newFuture(ctx context.Context) (fut *Future) {
10261071
fut.done = nil
10271072
shard.rmut.Unlock()
10281073
return
1074+
case connShutdown:
1075+
fut.err = ClientError{
1076+
ErrConnectionShutdown,
1077+
"server shutdown in progress",
1078+
}
1079+
fut.ready = nil
1080+
fut.done = nil
1081+
shard.rmut.Unlock()
1082+
return
10291083
}
10301084
pos := (fut.requestId / conn.opts.Concurrency) & (requestsMap - 1)
10311085
if ctx != nil {
@@ -1086,6 +1140,7 @@ func (conn *Connection) send(req Request, streamId uint64) *Future {
10861140
if fut.ready == nil {
10871141
return fut
10881142
}
1143+
10891144
if req.Ctx() != nil {
10901145
select {
10911146
case <-req.Ctx().Done():
@@ -1094,10 +1149,15 @@ func (conn *Connection) send(req Request, streamId uint64) *Future {
10941149
default:
10951150
}
10961151
}
1152+
1153+
atomic.AddInt64(&(conn.requestCnt), int64(1))
1154+
10971155
conn.putFuture(fut, req, streamId)
1156+
10981157
if req.Ctx() != nil {
10991158
go conn.contextWatchdog(fut, req.Ctx())
11001159
}
1160+
11011161
return fut
11021162
}
11031163

@@ -1164,6 +1224,10 @@ func (conn *Connection) markDone(fut *Future) {
11641224
if conn.rlimit != nil {
11651225
<-conn.rlimit
11661226
}
1227+
1228+
if atomic.AddInt64(&(conn.requestCnt), int64(-1)) == 0 {
1229+
conn.cond.Broadcast()
1230+
}
11671231
}
11681232

11691233
func (conn *Connection) peekFuture(reqid uint32) (fut *Future) {
@@ -1458,6 +1522,15 @@ func subscribeWatchChannel(conn *Connection, key string) (chan watchState, error
14581522
return st, nil
14591523
}
14601524

1525+
func (conn *Connection) isWatchersRequired() bool {
1526+
for _, feature := range conn.opts.RequiredProtocolInfo.Features {
1527+
if feature == WatchersFeature {
1528+
return true
1529+
}
1530+
}
1531+
return false
1532+
}
1533+
14611534
// NewWatcher creates a new Watcher object for the connection.
14621535
//
14631536
// You need to require WatchersFeature to use watchers, see examples for the
@@ -1496,15 +1569,7 @@ func (conn *Connection) NewWatcher(key string, callback WatchCallback) (Watcher,
14961569
// asynchronous. We do not expect any response from a Tarantool instance.
14971570
// That's why we can't just check the Tarantool response for an unsupported
14981571
// request error.
1499-
watchersRequired := false
1500-
for _, feature := range conn.opts.RequiredProtocolInfo.Features {
1501-
if feature == WatchersFeature {
1502-
watchersRequired = true
1503-
break
1504-
}
1505-
}
1506-
1507-
if !watchersRequired {
1572+
if !conn.isWatchersRequired() {
15081573
err := fmt.Errorf("the feature %s must be required by connection "+
15091574
"options to create a watcher", WatchersFeature)
15101575
return nil, err
@@ -1563,7 +1628,11 @@ func (conn *Connection) NewWatcher(key string, callback WatchCallback) (Watcher,
15631628

15641629
if state.cnt == 0 {
15651630
// The last one sends IPROTO_UNWATCH.
1566-
conn.Do(newUnwatchRequest(key)).Get()
1631+
if !conn.ClosedNow() {
1632+
// conn.ClosedNow() check is a workaround for calling
1633+
// Unregister from closeConnection().
1634+
conn.Do(newUnwatchRequest(key)).Get()
1635+
}
15671636
conn.watchMap.Delete(key)
15681637
close(state.unready)
15691638
}
@@ -1666,3 +1735,52 @@ func (conn *Connection) ServerProtocolInfo() ProtocolInfo {
16661735
func (conn *Connection) ClientProtocolInfo() ProtocolInfo {
16671736
return clientProtocolInfo.Clone()
16681737
}
1738+
1739+
func shutdownEventCallback(event WatchEvent) {
1740+
// Receives "true" on server shutdown.
1741+
// See https://www.tarantool.io/en/doc/latest/dev_guide/internals/iproto/graceful_shutdown/
1742+
// step 2.
1743+
val, ok := event.Value.(bool)
1744+
if ok && val {
1745+
go event.Conn.processShutdown()
1746+
}
1747+
}
1748+
1749+
func (conn *Connection) processShutdown() {
1750+
// Forbid state changes.
1751+
conn.mutex.Lock()
1752+
defer conn.mutex.Unlock()
1753+
1754+
if !atomic.CompareAndSwapUint32(&(conn.state), connConnected, connShutdown) {
1755+
return
1756+
}
1757+
conn.notify(Shutdown)
1758+
1759+
c := conn.c
1760+
for (atomic.LoadUint32(&(conn.state)) == connShutdown) &&
1761+
(atomic.LoadInt64(&(conn.requestCnt)) != 0) &&
1762+
(c == conn.c) {
1763+
// Use cond var on conn.mutex since request execution may
1764+
// call reconnect(). It is ok if state changes as part of
1765+
// reconnect since Tarantool server won't allow to reconnect
1766+
// in the middle of shutting down.
1767+
conn.cond.Wait()
1768+
}
1769+
// Do not unregister task explicitly here since connection teardown
1770+
// has the same effect. To clean up connection resources,
1771+
// unregister on full close.
1772+
1773+
if (atomic.LoadUint32(&(conn.state)) == connShutdown) &&
1774+
(c == conn.c) {
1775+
// Start to reconnect based on common rules, same as in net.box.
1776+
// Reconnect also closes the connection: server waits until all
1777+
// subscribed connections are terminated.
1778+
// See https://www.tarantool.io/en/doc/latest/dev_guide/internals/iproto/graceful_shutdown/
1779+
// step 3.
1780+
conn.reconnectImpl(
1781+
ClientError{
1782+
ErrConnectionClosed,
1783+
"connection closed after server shutdown",
1784+
}, conn.c)
1785+
}
1786+
}

errors.go

+1
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ const (
5555
ErrProtocolError = 0x4000 + iota
5656
ErrTimeouted = 0x4000 + iota
5757
ErrRateLimited = 0x4000 + iota
58+
ErrConnectionShutdown = 0x4000 + iota
5859
)
5960

6061
// Tarantool server error codes.

0 commit comments

Comments
 (0)