Skip to content

[Multi-Tab] Recover from unexpected primary lease loss #984

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Jul 25, 2018
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 40 additions & 25 deletions packages/firestore/src/core/sync_engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -289,11 +289,11 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer {

if (!targetRemainsActive) {
this.remoteStore.unlisten(queryView.targetId);
await this.removeAndCleanupQuery(queryView);
return this.localStore
await this.localStore
.releaseQuery(query, /*keepPersistedQueryData=*/ false)
.then(() => this.removeAndCleanupQuery(queryView))
.then(() => this.localStore.collectGarbage())
.catch(err => this.tryRecoverClient(err));
.catch(err => this.tryRecoverFromPrimaryLeaseLoss(err));
}
} else {
await this.removeAndCleanupQuery(queryView);
Expand Down Expand Up @@ -399,7 +399,7 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer {
.then(changes => {
return this.emitNewSnapsAndNotifyLocalStore(changes, remoteEvent);
})
.catch(err => this.tryRecoverClient(err));
.catch(err => this.tryRecoverFromPrimaryLeaseLoss(err));
}

/**
Expand Down Expand Up @@ -469,11 +469,10 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer {
} else {
const queryView = this.queryViewsByTarget[targetId];
assert(!!queryView, 'Unknown targetId: ' + targetId);
await this.removeAndCleanupQuery(queryView);
await this.localStore.releaseQuery(
queryView.query,
/* keepPersistedQueryData */ false
);
await this.localStore
.releaseQuery(queryView.query, /* keepPersistedQueryData */ false)
.then(() => this.removeAndCleanupQuery(queryView))
.catch(err => this.tryRecoverFromPrimaryLeaseLoss(err));
this.errorHandler!(queryView.query, err);
}
}
Expand Down Expand Up @@ -505,10 +504,16 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer {
// connection is disabled.
await this.remoteStore.fillWritePipeline();
} else if (batchState === 'acknowledged' || batchState === 'rejected') {
// If we receive a notification of an `acknowledged` or `rejected` batch
// via Web Storage, we are either already secondary or another tab has
// taken the primary lease.
await this.applyPrimaryState(false);
if (this.isPrimary) {
// If we receive a notification of an `acknowledged` or `rejected` batch
// via Web Storage, we are either already secondary or another tab has
// taken the primary lease.
log.debug(
LOG_TAG,
'Unexpectedly received mutation batch notification when already primary. Releasing primary lease.'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we keep this logic, we should rephrase this comment since we're not actually releasing the primary lease. We're just pretending we don't have it. :-)

);
await this.applyPrimaryState(false);
}

// NOTE: Both these methods are no-ops for batches that originated from
// other clients.
Expand Down Expand Up @@ -542,7 +547,7 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer {
this.sharedClientState.removeLocalPendingMutation(batchId);
return this.emitNewSnapsAndNotifyLocalStore(changes);
})
.catch(err => this.tryRecoverClient(err));
.catch(err => this.tryRecoverFromPrimaryLeaseLoss(err));
}

rejectFailedWrite(batchId: BatchId, error: FirestoreError): Promise<void> {
Expand All @@ -561,7 +566,7 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer {
this.sharedClientState.removeLocalPendingMutation(batchId);
return this.emitNewSnapsAndNotifyLocalStore(changes);
})
.catch(err => this.tryRecoverClient(err));
.catch(err => this.tryRecoverFromPrimaryLeaseLoss(err));
}

private addMutationCallback(
Expand Down Expand Up @@ -738,22 +743,28 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer {
if (this.isPrimary) {
await this.localStore
.collectGarbage()
.catch(err => this.tryRecoverClient(err));
.catch(err => this.tryRecoverFromPrimaryLeaseLoss(err));
}
}

/**
* Marks the client as secondary if an IndexedDb operation fails because the
* primary lease has been taken by another client. This can happen when the
* client is temporarily CPU throttled and fails to renew its lease in time,
* in which we treat the current client as secondary. We can always revert
* back to primary status via the lease refresh in our persistence layer.
* in which case we treat the current client as secondary. We can always
* regain our primary lease via the lease refresh in our persistence layer.
*
* @param err An error returned by an IndexedDb operation.
* @param err An error returned by a LocalStore operation.
* @return A Promise that resolves after we recovered, or the original error.
*/
private async tryRecoverClient(err: FirestoreError): Promise<void> {
private async tryRecoverFromPrimaryLeaseLoss(
err: FirestoreError
): Promise<void> {
if (err.code === Code.FAILED_PRECONDITION) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What would you think about adding an IndexedDbPersistence.isLostLeaseError() helper that checks both the code and the message?

Also, it may be useful to add a log message here...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not too big of a fan of checking the error string and even if we were to re-use the FAILED_PRECONDITION code somewhere else in our persistence layer, flipping the primary state to secondary seems like a reasonable action. Without this, I think we can just do the check here inline.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already re-use FAILED_PRECONDITION for the PRIMARY_LEASE_EXCLUSIVE_ERROR_MSG error, and I don't think it makes sense to fall back to secondary in that case.

I'd rather pessimistically assume that this error handling should not apply to future unknown errors and only expand it if/when we have specific cases we want to let it handle.

If we do keep this error handling fairly inclusive, we should definitely include the error in the log message.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I moved the error check to IndexedDb and included the error message. Another option would be to statically define the error and compare against the instance, but that messes with the stacktraces.

log.debug(
LOG_TAG,
'Unexpectedly lost primary lease, reverting to secondary'
);
return this.applyPrimaryState(false);
} else {
throw err;
Expand Down Expand Up @@ -830,6 +841,7 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer {
this.isPrimary = false;
await this.remoteStore.disableNetwork();
objUtils.forEachNumber(this.queryViewsByTarget, targetId => {
// TODO(multitab): Remove query views for non-local queries.
this.remoteStore.unlisten(targetId);
});
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hrm... When we become primary we're going to add queryViews for targets we don't have local listeners for.
Here, when we lose primary, we'll stop listening to those queries, but we'll keep the queryViews.

So when you transition non-primary=>primary=>non-primary, you'll end up with a bunch of extra queryViews. While perhaps not harmful, I'd prefer if this were more deterministic. So should we remove no-longer-needed queryViews here too?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's unfortunately no API yet that allows me to determine whether a view is "local". SharedClientState does have this information. But since this will require some plumbing, I turned this part of the cleanup into a TODO.

}
Expand All @@ -849,6 +861,10 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer {
if (this.isPrimary) {
// If we receive a target state notification via Web Storage, we are
// either already secondary or another tab has taken the primary lease.
log.debug(
LOG_TAG,
'Unexpectedly received query state notification when already primary. Releasing primary lease.'
);
await this.applyPrimaryState(false);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, these checks worry me and I am curious what happens if we don't have them and wait for the authoritative lease loss notification to happen.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing these lines breaks some of the new tests. This is certainly a test issue, but in general multi-tab is only safe if there is only one primary (we don't want two simultaneous WriteStreams for example). I prefer having no primary temporarily than two primaries at once.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer that too, but I don't think we can guarantee it. The only thing we can guarantee is that only one tab can make IndexedDb writes as primary at a time (via transactions that require the primary lease). But we could have two tabs with WriteStreams open for some period of time.

Ideally our design would allow for this. If it's infeasible, then I guess we should acknowledge that there are race conditions but convince ourselves that it'll be sufficiently unlikely and/or non-catastrophic or something...

}

Expand Down Expand Up @@ -912,11 +928,10 @@ export class SyncEngine implements RemoteSyncer, SharedClientStateSyncer {
// removed if it has been rejected by the backend.
if (queryView) {
this.remoteStore.unlisten(targetId);
await this.removeAndCleanupQuery(queryView);
await this.localStore.releaseQuery(
queryView.query,
/*keepPersistedQueryData=*/ false
);
await this.localStore
.releaseQuery(queryView.query, /*keepPersistedQueryData=*/ false)
.then(() => this.removeAndCleanupQuery(queryView))
.catch(err => this.tryRecoverFromPrimaryLeaseLoss(err));
}
}
}
Expand Down
8 changes: 7 additions & 1 deletion packages/firestore/src/local/indexeddb_persistence.ts
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,13 @@ export class IndexedDbPersistence implements Persistence {
this.isPrimary = canActAsPrimary;
// Always call the primary state listener, since SyncEngine may have
// changed the primary state to 'false'.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW this feels a hacky to me, but if we're going to do this and rely on it, you should document it on PrimaryStateListener interface and/or the setPrimaryStateListener() method ("Called when the primary state changes and again every N seconds while primary")

A cleaner (though perhaps more code?) approach would be to aggressively give up our primary lease in tryRecoverClient() so that we are definitely not the primary anymore.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with you here, but I don't think the plumbing to make this work is necessarily a worthwhile investment. For now, I changed the method comment in setPrimaryStateListener.

It might even be possible to remove the primary flag from the persistence layer. It is currently only used at startup and at shutdown.

this.queue.enqueue(() => this.primaryStateListener(this.isPrimary));
this.queue.enqueue(async () => {
// Verify that `shutdown()` hasn't been called yet by the time
// we invoke the `primaryStateListener`.
if (this.started) {
return this.primaryStateListener(this.isPrimary);
}
});

if (this.isPrimary) {
return this.acquireOrExtendPrimaryLease(txn);
Expand Down
6 changes: 3 additions & 3 deletions packages/firestore/src/local/persistence.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,9 @@ export interface Persistence {
shutdown(deleteData?: boolean): Promise<void>;

/**
* Registers a listener that gets called when the primary state of the
* instance changes. Upon registering, this listener is invoked immediately
* with the current primary state.
* Registers a listener that gets called immediately wih the current primary
* state and then periodically as the lease is verified (likely with the same
* state).
*
* PORTING NOTE: This is only used for Web multi-tab.
*/
Expand Down
31 changes: 13 additions & 18 deletions packages/firestore/src/remote/remote_store.ts
Original file line number Diff line number Diff line change
Expand Up @@ -565,18 +565,21 @@ export class RemoteStore implements TargetMetadataProvider {
this.writeStream.writeMutations(batch.mutations);
}
},
err => {
if (err.code === Code.FAILED_PRECONDITION) {
// We have temporarily lost our primary lease. If we recover,
// Sync Engine will re-enable the network.
return this.disableNetwork();
} else {
return Promise.reject(err);
}
}
err => this.tryRecoverFromPrimaryLeaseLoss(err)
);
}

private tryRecoverFromPrimaryLeaseLoss(err: FirestoreError): void {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please rename this if you rename the other one.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

if (err.code === Code.FAILED_PRECONDITION) {
// We have temporarily lost our primary lease. If we recover,
// Sync Engine will re-enable the network.
this.disableNetworkInternal();
this.onlineStateTracker.set(OnlineState.Unknown);
} else {
throw err;
}
}

private onMutationResult(
commitVersion: SnapshotVersion,
results: MutationResult[]
Expand Down Expand Up @@ -646,15 +649,7 @@ export class RemoteStore implements TargetMetadataProvider {

return this.localStore
.setLastStreamToken(emptyByteString())
.catch(err => {
if (err.code === Code.FAILED_PRECONDITION) {
// We have temporarily lost our primary lease. If we recover,
// Sync Engine will re-enable the network.
return this.disableNetwork();
} else {
return Promise.reject(err);
}
});
.catch(err => this.tryRecoverFromPrimaryLeaseLoss(err));
} else {
// Some other error, don't reset stream token. Our stream logic will
// just retry with exponential backoff.
Expand Down
71 changes: 49 additions & 22 deletions packages/firestore/test/unit/specs/listen_spec.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -820,27 +820,22 @@ describeSpec('Listens:', [], () => {
const docA = doc('collection/a', 1000, { key: 'a' });
const docB = doc('collection/b', 2000, { key: 'b' });

return (
client(0)
.expectPrimaryState(true)
.userListens(query)
.watchAcksFull(query, 1000, docA)
.expectEvents(query, { added: [docA] })
.client(1)
.userListens(query)
.expectEvents(query, { added: [docA] })
.stealPrimaryLease()
.expectListen(query, 'resume-token-1000')
.watchAcksFull(query, 2000, docB)
.expectEvents(query, { added: [docB] })
.client(0)
.expectPrimaryState(false)
// TODO(multitab): Suppres the additional `fromCache`, which is raised
// locally because we disable the network when we lose the primary
// lease.
.expectEvents(query, { fromCache: true })
.expectEvents(query, { added: [docB] })
);
return client(0)
.expectPrimaryState(true)
.userListens(query)
.watchAcksFull(query, 1000, docA)
.expectEvents(query, { added: [docA] })
.client(1)
.userListens(query)
.expectEvents(query, { added: [docA] })
.stealPrimaryLease()
.expectListen(query, 'resume-token-1000')
.watchAcksFull(query, 2000, docB)
.expectEvents(query, { added: [docB] })
.client(0)
.expectPrimaryState(false)
.expectEvents(query, { fromCache: true })
.expectEvents(query, { added: [docB] });
});

specTest(
Expand All @@ -862,7 +857,7 @@ describeSpec('Listens:', [], () => {
.stealPrimaryLease()
.client(0)
// Send a watch update to client 0, who is longer primary (but doesn't
// it yet). The watch update gets ignored.
// know it yet). The watch update gets ignored.
.watchAcksFull(query, 1000, docA)
.expectPrimaryState(false)
.client(1)
Expand All @@ -872,4 +867,36 @@ describeSpec('Listens:', [], () => {
);
}
);

specTest(
'Listen is established in newly started primary',
['multi-client'],
() => {
const query = Query.atPath(path('collection'));
const docA = doc('collection/a', 1000, { key: 'a' });
const docB = doc('collection/b', 2000, { key: 'b' });

// Client 0 executes a query on behalf of Client 1. When client 0 shuts
// down, client 2 starts up and becomes primary, taking ownership of the
// existing query.
return client(0)
.expectPrimaryState(true)
.client(1)
.userListens(query)
.expectEvents(query, { fromCache: true })
.client(0)
.expectListen(query)
.watchAcksFull(query, 1000, docA)
.client(1)
.expectEvents(query, { added: [docA] })
.client(0)
.shutdown()
.client(2)
.expectPrimaryState(true)
.expectListen(query, 'resume-token-1000')
.watchAcksFull(query, 2000, docB)
.client(1)
.expectEvents(query, { added: [docB] });
}
);
});
8 changes: 6 additions & 2 deletions packages/firestore/test/unit/specs/spec_test_runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ import {
DbOwnerKey,
SCHEMA_VERSION
} from '../../../src/local/indexeddb_schema';
import { TestPlatform, SharedFakeWebStorage } from '../../util/test_platform';

class MockConnection implements Connection {
watchStream: StreamBridge<
Expand Down Expand Up @@ -1010,8 +1011,8 @@ abstract class TestRunner {
if (this.expectedActiveTargets) {
if (!obj.isEmpty(this.expectedActiveTargets)) {
await this.connection.waitForWatchOpen();
await this.queue.drain();
}
await this.queue.drain();
}

const actualTargets = obj.shallowCopy(this.connection.activeTargets);
Expand Down Expand Up @@ -1529,7 +1530,10 @@ async function writeOwnerToIndexedDb(clientId: ClientId): Promise<void> {
);
await db.runTransaction('readwrite', ['owner'], txn => {
const owner = txn.store<DbOwnerKey, DbOwner>(DbOwner.store);
return owner.put('owner', new DbOwner(clientId, Date.now()));
return owner.put(
'owner',
new DbOwner(clientId, /* allowTabSynchronization=*/ true, Date.now())
);
});
db.close();
}
18 changes: 18 additions & 0 deletions packages/firestore/test/unit/specs/write_spec.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1158,6 +1158,24 @@ describeSpec('Writes:', [], () => {
});
});

specTest('Write is sent by newly started primary', ['multi-client'], () => {
return client(0)
.expectPrimaryState(true)
.client(1)
.expectPrimaryState(false)
.userSets('collection/a', { v: 1 })
.client(0)
.shutdown()
.client(2)
.expectPrimaryState(true)
.expectNumOutstandingWrites(1)
.writeAcks('collection/a', 1000, { expectUserCallback: false })
.client(1)
.expectUserCallbacks({
acknowledged: ['collection/a']
});
});

specTest(
'Unresponsive primary ignores acknowledged write',
['multi-client'],
Expand Down