Skip to content

Commit a213ed9

Browse files
committed
Refactor IDE spawn logic
- Instead of respawning when there is no join link, respawn when the PID is dead. - Give more time for the PID to become alive before trying to respawn. - More logging.
1 parent 502e33e commit a213ed9

File tree

3 files changed

+73
-41
lines changed

3 files changed

+73
-41
lines changed

CHANGELOG.md

+12
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,18 @@
44

55
## Unreleased
66

7+
### Changed
8+
9+
- Base respawning the IDE off whether the PID is alive rather than whether we
10+
have a join link. This works around a possible Gateway bug where respawning
11+
the IDE kills an existing IDE that is still trying to spawn.
12+
- Wait longer for the initial IDE spawn before trying to spawn it again. This
13+
works around a possible Gateway bug where the PID is not accurate.
14+
15+
### Added
16+
17+
- Extra logging around the IDE spawn to help debugging.
18+
719
## 2.13.0 - 2024-07-16
820

921
### Added

gradle.properties

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ pluginGroup=com.coder.gateway
44
# Zip file name.
55
pluginName=coder-gateway
66
# SemVer format -> https://semver.org
7-
pluginVersion=2.13.0
7+
pluginVersion=2.13.1
88
# See https://plugins.jetbrains.com/docs/intellij/build-number-ranges.html
99
# for insight into build numbers and IntelliJ Platform versions.
1010
pluginSinceBuild=233.6745

src/main/kotlin/com/coder/gateway/CoderRemoteConnectionHandle.kt

+60-40
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ import java.net.URI
4848
import java.time.Duration
4949
import java.time.LocalDateTime
5050
import java.time.format.DateTimeFormatter
51+
import java.util.concurrent.TimeUnit
5152
import java.util.concurrent.TimeoutException
5253
import kotlin.coroutines.resume
5354
import kotlin.coroutines.resumeWithException
@@ -227,16 +228,9 @@ class CoderRemoteConnectionHandle {
227228

228229
// Wait for the IDE to come up.
229230
indicator.text = "Waiting for ${workspace.ideName} backend..."
230-
var status: UnattendedHostStatus? = null
231231
val remoteProjectPath = accessor.makeRemotePath(ShellArgument.PlainText(workspace.projectPath))
232232
val logsDir = accessor.getLogsDir(workspace.ideProduct.productCode, remoteProjectPath)
233-
while (lifetime.status == LifetimeStatus.Alive) {
234-
status = ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime, null)
235-
if (!status?.joinLink.isNullOrBlank()) {
236-
break
237-
}
238-
delay(5000)
239-
}
233+
var status = ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime, null)
240234

241235
// We wait for non-null, so this only happens on cancellation.
242236
val joinLink = status?.joinLink
@@ -437,8 +431,8 @@ class CoderRemoteConnectionHandle {
437431
}
438432

439433
/**
440-
* Ensure the backend is started. Status and/or links may be null if the
441-
* backend has not started.
434+
* Ensure the backend is started. It will not return until a join link is
435+
* received or the lifetime expires.
442436
*/
443437
private suspend fun ensureIDEBackend(
444438
accessor: HighLevelHostAccessor,
@@ -449,41 +443,67 @@ class CoderRemoteConnectionHandle {
449443
lifetime: LifetimeDefinition,
450444
currentStatus: UnattendedHostStatus?,
451445
): UnattendedHostStatus? {
452-
val details = "${workspace.hostname}:${ideDir.toRawString()}, project=${remoteProjectPath.toRawString()}"
453-
return try {
454-
if (currentStatus?.appPid != null &&
455-
!currentStatus.joinLink.isNullOrBlank() &&
456-
accessor.isPidAlive(currentStatus.appPid.toInt())
457-
) {
458-
// If the PID is alive, assume the join link we have is still
459-
// valid. The join link seems to change even if it is the same
460-
// backend running, so if we always fetched the link the client
461-
// would relaunch over and over.
446+
val details = "${workspace.hostname}:${ideDir.toRawString()}, project=${remoteProjectPath.toRawString()}"
447+
448+
// Check if the current IDE is alive.
449+
if (currentStatus != null) {
450+
val isAlive = try {
451+
val isAlive = accessor.isPidAlive(currentStatus.appPid.toInt())
452+
logger.info("Got ${workspace.ideName} status: pid=${currentStatus.appPid}, alive=$isAlive")
453+
isAlive
454+
} catch (ex: Exception) {
455+
logger.info("Failed to check if ${workspace.ideName} is alive on $details: pid=${currentStatus.appPid}", ex)
456+
false
457+
}
458+
if (isAlive) {
459+
// Use the current status and join link.
462460
return currentStatus
461+
} else {
462+
logger.info("Relaunching ${workspace.ideName} since it is not alive...")
463463
}
464+
} else {
465+
logger.info("Launching ${workspace.ideName} for the first time on ${workspace.hostname}...")
466+
}
464467

465-
// See if there is already a backend running. Weirdly, there is
466-
// always a PID, even if there is no backend running, and
467-
// backendUnresponsive is always false, but the links are null so
468-
// hopefully that is an accurate indicator that the IDE is up.
469-
val status = accessor.getHostIdeStatus(ideDir, remoteProjectPath)
470-
if (!status.joinLink.isNullOrBlank()) {
471-
logger.info("Found existing ${workspace.ideName} backend on $details")
472-
return status
473-
}
468+
// If the PID is not alive, spawn a new backend. This may not be
469+
// idempotent, so only call if we are really sure we need to.
470+
accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
474471

475-
// Otherwise, spawn a new backend. This does not seem to spawn a
476-
// second backend if one is already running, yet it does somehow
477-
// cause a second client to launch. So only run this if we are
478-
// really sure we have to launch a new backend.
479-
logger.info("Starting ${workspace.ideName} backend on $details")
480-
accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
481-
// Get the newly spawned PID and join link.
482-
return accessor.getHostIdeStatus(ideDir, remoteProjectPath)
483-
} catch (ex: Exception) {
484-
logger.info("Failed to get ${workspace.ideName} status from $details", ex)
485-
currentStatus
472+
// Get the newly spawned PID and join link.
473+
var attempts = 0
474+
val maxAttempts = 6
475+
val wait = TimeUnit.SECONDS.toMillis(5)
476+
while (lifetime.status == LifetimeStatus.Alive) {
477+
try {
478+
attempts++
479+
val status = accessor.getHostIdeStatus(ideDir, remoteProjectPath)
480+
if (!status.joinLink.isNullOrBlank()) {
481+
logger.info("Found join link for ${workspace.ideName}; proceeding to connect: pid=${status.appPid}")
482+
return status
483+
}
484+
// If we did not get a join link, see if the IDE is alive in
485+
// case it died and we need to respawn.
486+
val isAlive = status.appPid > 0 && accessor.isPidAlive(status.appPid.toInt())
487+
logger.info("${workspace.ideName} status: pid=${status.appPid}, alive=$isAlive, unresponsive=${status.backendUnresponsive}, attempt=$attempts")
488+
// It is not clear whether the PID can be trusted because we get
489+
// one even when there is no backend at all. For now give it
490+
// some time and if it is still dead, only then try to respawn.
491+
if (!isAlive && attempts >= maxAttempts) {
492+
logger.info("${workspace.ideName} is still not alive after $attempts checks, respawning backend and waiting $wait ms to try again")
493+
accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
494+
attempts = 0
495+
} else {
496+
logger.info("No join link found in status; waiting $wait ms to try again")
497+
}
498+
} catch (ex: Exception) {
499+
logger.info("Failed to get ${workspace.ideName} status from $details; waiting $wait ms to try again", ex)
500+
}
501+
delay(wait)
486502
}
503+
504+
// This means the lifetime is no longer alive.
505+
logger.info("Connection to ${workspace.ideName} on $details aborted by user")
506+
return null
487507
}
488508

489509
companion object {

0 commit comments

Comments
 (0)