@@ -48,6 +48,7 @@ import java.net.URI
48
48
import java.time.Duration
49
49
import java.time.LocalDateTime
50
50
import java.time.format.DateTimeFormatter
51
+ import java.util.concurrent.TimeUnit
51
52
import java.util.concurrent.TimeoutException
52
53
import kotlin.coroutines.resume
53
54
import kotlin.coroutines.resumeWithException
@@ -227,16 +228,9 @@ class CoderRemoteConnectionHandle {
227
228
228
229
// Wait for the IDE to come up.
229
230
indicator.text = " Waiting for ${workspace.ideName} backend..."
230
- var status: UnattendedHostStatus ? = null
231
231
val remoteProjectPath = accessor.makeRemotePath(ShellArgument .PlainText (workspace.projectPath))
232
232
val logsDir = accessor.getLogsDir(workspace.ideProduct.productCode, remoteProjectPath)
233
- while (lifetime.status == LifetimeStatus .Alive ) {
234
- status = ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime, null )
235
- if (! status?.joinLink.isNullOrBlank()) {
236
- break
237
- }
238
- delay(5000 )
239
- }
233
+ var status = ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime, null )
240
234
241
235
// We wait for non-null, so this only happens on cancellation.
242
236
val joinLink = status?.joinLink
@@ -437,8 +431,8 @@ class CoderRemoteConnectionHandle {
437
431
}
438
432
439
433
/* *
440
- * Ensure the backend is started. Status and/or links may be null if the
441
- * backend has not started .
434
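+ * Ensure the backend is started. If the current status refers to a live
435
+ * process it is returned as-is; otherwise this does not return until a join
+ * link is received, or returns null once the lifetime is no longer alive.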
442
436
*/
443
437
private suspend fun ensureIDEBackend (
444
438
accessor : HighLevelHostAccessor ,
@@ -449,41 +443,67 @@ class CoderRemoteConnectionHandle {
449
443
lifetime : LifetimeDefinition ,
450
444
currentStatus : UnattendedHostStatus ? ,
451
445
): UnattendedHostStatus ? {
452
- val details = " ${workspace.hostname} :${ideDir.toRawString()} , project=${remoteProjectPath.toRawString()} "
453
- return try {
454
- if (currentStatus?.appPid != null &&
455
- ! currentStatus.joinLink.isNullOrBlank() &&
456
- accessor.isPidAlive(currentStatus.appPid.toInt())
457
- ) {
458
- // If the PID is alive, assume the join link we have is still
459
- // valid. The join link seems to change even if it is the same
460
- // backend running, so if we always fetched the link the client
461
- // would relaunch over and over.
446
+ // Human-readable location of the backend; only used in the log messages below.
+ val details = " ${workspace.hostname} :${ideDir.toRawString()} , project=${remoteProjectPath.toRawString()} "
447
+
448
+ // Check if the current IDE is alive.
449
+ if (currentStatus != null ) {
450
+ val isAlive = try {
451
+ val isAlive = accessor.isPidAlive(currentStatus.appPid.toInt())
452
+ logger.info(" Got ${workspace.ideName} status: pid=${currentStatus.appPid} , alive=$isAlive " )
453
+ isAlive
454
+ } catch (ex: Exception ) {
455
+ logger.info(" Failed to check if ${workspace.ideName} is alive on $details : pid=${currentStatus.appPid} " , ex)
456
+ false
457
+ }
458
+ if (isAlive) {
459
+ // Use the current status and join link.
462
460
return currentStatus
461
+ } else {
462
+ logger.info(" Relaunching ${workspace.ideName} since it is not alive..." )
463
463
}
464
+ } else {
465
+ logger.info(" Launching ${workspace.ideName} for the first time on ${workspace.hostname} ..." )
466
+ }
464
467
465
- // See if there is already a backend running. Weirdly, there is
466
- // always a PID, even if there is no backend running, and
467
- // backendUnresponsive is always false, but the links are null so
468
- // hopefully that is an accurate indicator that the IDE is up.
469
- val status = accessor.getHostIdeStatus(ideDir, remoteProjectPath)
470
- if (! status.joinLink.isNullOrBlank()) {
471
- logger.info(" Found existing ${workspace.ideName} backend on $details " )
472
- return status
473
- }
468
+ // If the PID is not alive, spawn a new backend. This may not be
469
+ // idempotent, so only call if we are really sure we need to.
470
+ accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
474
471
475
- // Otherwise, spawn a new backend. This does not seem to spawn a
476
- // second backend if one is already running, yet it does somehow
477
- // cause a second client to launch. So only run this if we are
478
- // really sure we have to launch a new backend.
479
- logger.info(" Starting ${workspace.ideName} backend on $details " )
480
- accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
481
- // Get the newly spawned PID and join link.
482
- return accessor.getHostIdeStatus(ideDir, remoteProjectPath)
483
- } catch (ex: Exception ) {
484
- logger.info(" Failed to get ${workspace.ideName} status from $details " , ex)
485
- currentStatus
472
+ // Get the newly spawned PID and join link.
473
+ var attempts = 0
474
+ val maxAttempts = 6
475
+ val wait = TimeUnit .SECONDS .toMillis(5 )
476
+ while (lifetime.status == LifetimeStatus .Alive ) {
477
+ try {
478
+ attempts++
479
+ val status = accessor.getHostIdeStatus(ideDir, remoteProjectPath)
480
+ if (! status.joinLink.isNullOrBlank()) {
481
+ logger.info(" Found join link for ${workspace.ideName} ; proceeding to connect: pid=${status.appPid} " )
482
+ return status
483
+ }
484
+ // If we did not get a join link, see if the IDE is alive in
485
+ // case it died and we need to respawn.
486
+ val isAlive = status.appPid > 0 && accessor.isPidAlive(status.appPid.toInt())
487
+ logger.info(" ${workspace.ideName} status: pid=${status.appPid} , alive=$isAlive , unresponsive=${status.backendUnresponsive} , attempt=$attempts " )
488
+ // It is not clear whether the PID can be trusted because we get
489
+ // one even when there is no backend at all. For now give it
490
+ // some time and if it is still dead, only then try to respawn.
491
+ if (! isAlive && attempts >= maxAttempts) {
492
+ logger.info(" ${workspace.ideName} is still not alive after $attempts checks, respawning backend and waiting $wait ms to try again" )
493
+ accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
494
+ attempts = 0
495
+ } else {
496
+ logger.info(" No join link found in status; waiting $wait ms to try again" )
497
+ }
498
+ } catch (ex: Exception ) {
499
+ logger.info(" Failed to get ${workspace.ideName} status from $details ; waiting $wait ms to try again" , ex)
500
+ }
501
+ delay(wait)
486
502
}
503
+
504
+ // This means the lifetime is no longer alive.
505
+ logger.info(" Connection to ${workspace.ideName} on $details aborted by user" )
506
+ return null
487
507
}
488
508
489
509
companion object {
0 commit comments