-
Notifications
You must be signed in to change notification settings - Fork 939
[Multi-Tab] Recover from unexpected primary lease loss #984
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
c40ff09
e51265b
a7c8497
f9c7872
34d52ae
692b789
3de1af4
c243b26
ee94d57
3da6855
cbd58e4
72aa4a9
7f685f9
b1c7e8d
6d3628d
794e9f7
e203796
5c56479
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -289,11 +289,11 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer { | |
|
||
if (!targetRemainsActive) { | ||
this.remoteStore.unlisten(queryView.targetId); | ||
await this.removeAndCleanupQuery(queryView); | ||
return this.localStore | ||
await this.localStore | ||
.releaseQuery(query, /*keepPersistedQueryData=*/ false) | ||
.then(() => this.removeAndCleanupQuery(queryView)) | ||
.then(() => this.localStore.collectGarbage()) | ||
.catch(err => this.tryRecoverClient(err)); | ||
.catch(err => this.tryRecoverFromPrimaryLeaseLoss(err)); | ||
} | ||
} else { | ||
await this.removeAndCleanupQuery(queryView); | ||
|
@@ -399,7 +399,7 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer { | |
.then(changes => { | ||
return this.emitNewSnapsAndNotifyLocalStore(changes, remoteEvent); | ||
}) | ||
.catch(err => this.tryRecoverClient(err)); | ||
.catch(err => this.tryRecoverFromPrimaryLeaseLoss(err)); | ||
} | ||
|
||
/** | ||
|
@@ -469,11 +469,10 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer { | |
} else { | ||
const queryView = this.queryViewsByTarget[targetId]; | ||
assert(!!queryView, 'Unknown targetId: ' + targetId); | ||
await this.removeAndCleanupQuery(queryView); | ||
await this.localStore.releaseQuery( | ||
queryView.query, | ||
/* keepPersistedQueryData */ false | ||
); | ||
await this.localStore | ||
.releaseQuery(queryView.query, /* keepPersistedQueryData */ false) | ||
.then(() => this.removeAndCleanupQuery(queryView)) | ||
.catch(err => this.tryRecoverFromPrimaryLeaseLoss(err)); | ||
this.errorHandler!(queryView.query, err); | ||
} | ||
} | ||
|
@@ -505,10 +504,16 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer { | |
// connection is disabled. | ||
await this.remoteStore.fillWritePipeline(); | ||
} else if (batchState === 'acknowledged' || batchState === 'rejected') { | ||
// If we receive a notification of an `acknowledged` or `rejected` batch | ||
// via Web Storage, we are either already secondary or another tab has | ||
// taken the primary lease. | ||
await this.applyPrimaryState(false); | ||
if (this.isPrimary) { | ||
// If we receive a notification of an `acknowledged` or `rejected` batch | ||
// via Web Storage, we are either already secondary or another tab has | ||
// taken the primary lease. | ||
log.debug( | ||
LOG_TAG, | ||
'Unexpectedly received mutation batch notification when already primary. Releasing primary lease.' | ||
); | ||
await this.applyPrimaryState(false); | ||
} | ||
|
||
// NOTE: Both these methods are no-ops for batches that originated from | ||
// other clients. | ||
|
@@ -542,7 +547,7 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer { | |
this.sharedClientState.removeLocalPendingMutation(batchId); | ||
return this.emitNewSnapsAndNotifyLocalStore(changes); | ||
}) | ||
.catch(err => this.tryRecoverClient(err)); | ||
.catch(err => this.tryRecoverFromPrimaryLeaseLoss(err)); | ||
} | ||
|
||
rejectFailedWrite(batchId: BatchId, error: FirestoreError): Promise<void> { | ||
|
@@ -561,7 +566,7 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer { | |
this.sharedClientState.removeLocalPendingMutation(batchId); | ||
return this.emitNewSnapsAndNotifyLocalStore(changes); | ||
}) | ||
.catch(err => this.tryRecoverClient(err)); | ||
.catch(err => this.tryRecoverFromPrimaryLeaseLoss(err)); | ||
} | ||
|
||
private addMutationCallback( | ||
|
@@ -738,22 +743,28 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer { | |
if (this.isPrimary) { | ||
await this.localStore | ||
.collectGarbage() | ||
.catch(err => this.tryRecoverClient(err)); | ||
.catch(err => this.tryRecoverFromPrimaryLeaseLoss(err)); | ||
} | ||
} | ||
|
||
/** | ||
* Marks the client as secondary if an IndexedDb operation fails because the | ||
* primary lease has been taken by another client. This can happen when the | ||
* client is temporarily CPU throttled and fails to renew its lease in time, | ||
* in which we treat the current client as secondary. We can always revert | ||
* back to primary status via the lease refresh in our persistence layer. | ||
* in which case we treat the current client as secondary. We can always | ||
* regain our primary lease via the lease refresh in our persistence layer. | ||
* | ||
* @param err An error returned by an IndexedDb operation. | ||
* @param err An error returned by a LocalStore operation. | ||
* @return A Promise that resolves after we recovered, or the original error. | ||
*/ | ||
private async tryRecoverClient(err: FirestoreError): Promise<void> { | ||
private async tryRecoverFromPrimaryLeaseLoss( | ||
err: FirestoreError | ||
): Promise<void> { | ||
if (err.code === Code.FAILED_PRECONDITION) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What would you think about adding an IndexedDbPersistence.isLostLeaseError() helper that checks both the code and the message? Also, it may be useful to add a log message here... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not too big of a fan of checking the error string and even if we were to re-use the FAILED_PRECONDITION code somewhere else in our persistence layer, flipping the primary state to secondary seems like a reasonable action. Without this, I think we can just do the check here inline. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We already re-use FAILED_PRECONDITION for the PRIMARY_LEASE_EXCLUSIVE_ERROR_MSG error, and I don't think it makes sense to fall back to secondary in that case. I'd rather pessimistically assume that this error handling should not apply to future unknown errors and only expand it if/when we have specific cases we want to let it handle. If we do keep this error handling fairly inclusive, we should definitely include the error in the log message. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I moved the error check to IndexedDb and included the error message. Another option would be to statically define the error and compare against the instance, but that messes with the stacktraces. |
||
log.debug( | ||
LOG_TAG, | ||
'Unexpectedly lost primary lease, reverting to secondary' | ||
); | ||
return this.applyPrimaryState(false); | ||
} else { | ||
throw err; | ||
|
@@ -830,6 +841,7 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer { | |
this.isPrimary = false; | ||
await this.remoteStore.disableNetwork(); | ||
objUtils.forEachNumber(this.queryViewsByTarget, targetId => { | ||
// TODO(multitab): Remove query views for non-local queries. | ||
this.remoteStore.unlisten(targetId); | ||
}); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hrm... When we become primary we're going to add queryViews for targets we don't have local listeners for. So when you transition non-primary=>primary=>non-primary, you'll end up with a bunch of extra queryViews. While perhaps not harmful, I'd prefer if this were more deterministic. So should we remove no-longer-needed queryViews here too? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's unfortunately no API yet that allows me to determine whether a view is "local". SharedClientState does have this information. But since this will require some plumbing, I turned this part of the cleanup into a TODO. |
||
} | ||
|
@@ -849,6 +861,10 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer { | |
if (this.isPrimary) { | ||
// If we receive a target state notification via Web Storage, we are | ||
// either already secondary or another tab has taken the primary lease. | ||
log.debug( | ||
LOG_TAG, | ||
'Unexpectedly received query state notification when already primary. Releasing primary lease.' | ||
); | ||
await this.applyPrimaryState(false); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, these checks worry me and I am curious what happens if we don't have them and wait for the authoritative lease loss notification to happen. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removing these lines breaks some of the new tests. This is certainly a test issue, but in general multi-tab is only safe if there is only one primary (we don't want two simultaneous WriteStreams for example). I prefer having no primary temporarily than two primaries at once. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd prefer that too, but I don't think we can guarantee it. The only thing we can guarantee is that only one tab can make IndexedDb writes as primary at a time (via transactions that require the primary lease). But we could have two tabs with WriteStreams open for some period of time. Ideally our design would allow for this. If it's infeasible, then I guess we should acknowledge that there are race conditions but convince ourselves that it'll be sufficiently unlikely and/or non-catastrophic or something... |
||
} | ||
|
||
|
@@ -912,11 +928,10 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer { | |
// removed if it has been rejected by the backend. | ||
if (queryView) { | ||
this.remoteStore.unlisten(targetId); | ||
await this.removeAndCleanupQuery(queryView); | ||
await this.localStore.releaseQuery( | ||
queryView.query, | ||
/*keepPersistedQueryData=*/ false | ||
); | ||
await this.localStore | ||
.releaseQuery(queryView.query, /*keepPersistedQueryData=*/ false) | ||
.then(() => this.removeAndCleanupQuery(queryView)) | ||
.catch(err => this.tryRecoverFromPrimaryLeaseLoss(err)); | ||
} | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -223,7 +223,13 @@ export class IndexedDbPersistence implements Persistence { | |
this.isPrimary = canActAsPrimary; | ||
// Always call the primary state listener, since SyncEngine may have | ||
// changed the primary state to 'false'. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FWIW this feels a hacky to me, but if we're going to do this and rely on it, you should document it on A cleaner (though perhaps more code?) approach would be to aggressively give up our primary lease in tryRecoverClient() so that we are definitely not the primary anymore. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree with you here, but I don't think the plumbing to make this work is necessarily a worthwhile investment. For now, I changed the method comment in It might even be possible to remove the primary flag from the persistence layer. It is currently only used at startup and at shutdown. |
||
this.queue.enqueue(() => this.primaryStateListener(this.isPrimary)); | ||
this.queue.enqueue(async () => { | ||
// Verify that `shutdown()` hasn't been called yet by the time | ||
// we invoke the `primaryStateListener`. | ||
if (this.started) { | ||
return this.primaryStateListener(this.isPrimary); | ||
} | ||
}); | ||
|
||
if (this.isPrimary) { | ||
return this.acquireOrExtendPrimaryLease(txn); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -565,18 +565,21 @@ export class RemoteStore implements TargetMetadataProvider { | |
this.writeStream.writeMutations(batch.mutations); | ||
} | ||
}, | ||
err => { | ||
if (err.code === Code.FAILED_PRECONDITION) { | ||
// We have temporarily lost our primary lease. If we recover, | ||
// Sync Engine will re-enable the network. | ||
return this.disableNetwork(); | ||
} else { | ||
return Promise.reject(err); | ||
} | ||
} | ||
err => this.tryRecoverFromPrimaryLeaseLoss(err) | ||
); | ||
} | ||
|
||
private tryRecoverFromPrimaryLeaseLoss(err: FirestoreError): void { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please rename this if you rename the other one. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
if (err.code === Code.FAILED_PRECONDITION) { | ||
// We have temporarily lost our primary lease. If we recover, | ||
// Sync Engine will re-enable the network. | ||
this.disableNetworkInternal(); | ||
this.onlineStateTracker.set(OnlineState.Unknown); | ||
} else { | ||
throw err; | ||
} | ||
} | ||
|
||
private onMutationResult( | ||
commitVersion: SnapshotVersion, | ||
results: MutationResult[] | ||
|
@@ -646,15 +649,7 @@ export class RemoteStore implements TargetMetadataProvider { | |
|
||
return this.localStore | ||
.setLastStreamToken(emptyByteString()) | ||
.catch(err => { | ||
if (err.code === Code.FAILED_PRECONDITION) { | ||
// We have temporarily lost our primary lease. If we recover, | ||
// Sync Engine will re-enable the network. | ||
return this.disableNetwork(); | ||
} else { | ||
return Promise.reject(err); | ||
} | ||
}); | ||
.catch(err => this.tryRecoverFromPrimaryLeaseLoss(err)); | ||
} else { | ||
// Some other error, don't reset stream token. Our stream logic will | ||
// just retry with exponential backoff. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we keep this logic, we should rephrase this comment since we're not actually releasing the primary lease. We're just pretending we don't have it. :-)