Skip to content

Commit ac11458

Browse files
authored
[nexus] Sled Expungement causes PhysicalDisk expungement (#5371)
Fixes #5369 This updates the "expunge sled" endpoint which is part of the internal API, but the same implementation should still apply when the endpoint is exposed to operators.
1 parent afb2e9a commit ac11458

File tree

1 file changed

+220
-60
lines changed
  • nexus/db-queries/src/db/datastore

1 file changed

+220
-60
lines changed

nexus/db-queries/src/db/datastore/sled.rs

+220-60
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,9 @@ impl DataStore {
329329
/// sufficient warning to the operator.
330330
///
331331
/// This is idempotent, and it returns the old policy of the sled.
332+
///
333+
/// Calling this function also implicitly marks the disks attached to a sled
334+
/// as "expunged".
332335
pub async fn sled_set_policy_to_expunged(
333336
&self,
334337
opctx: &OpContext,
@@ -348,73 +351,127 @@ impl DataStore {
348351
&self,
349352
opctx: &OpContext,
350353
authz_sled: &authz::Sled,
351-
new_policy: SledPolicy,
354+
new_sled_policy: SledPolicy,
352355
check: ValidateTransition,
353356
) -> Result<SledPolicy, TransitionError> {
354-
use db::schema::sled::dsl;
355-
356357
opctx.authorize(authz::Action::Modify, authz_sled).await?;
357358

358359
let sled_id = authz_sled.id();
359-
let query = diesel::update(dsl::sled)
360-
.filter(dsl::time_deleted.is_null())
361-
.filter(dsl::id.eq(sled_id));
362-
363-
let t = SledTransition::Policy(new_policy);
364-
let valid_old_policies = t.valid_old_policies();
365-
let valid_old_states = t.valid_old_states();
366-
367-
let query = match check {
368-
ValidateTransition::Yes => query
369-
.filter(dsl::sled_policy.eq_any(
370-
valid_old_policies.into_iter().map(to_db_sled_policy),
371-
))
372-
.filter(
373-
dsl::sled_state.eq_any(valid_old_states.iter().copied()),
374-
)
375-
.into_boxed(),
376-
#[cfg(test)]
377-
ValidateTransition::No => query.into_boxed(),
378-
};
360+
let err = OptionalError::new();
361+
let conn = self.pool_connection_authorized(opctx).await?;
362+
let policy = self
363+
.transaction_retry_wrapper("sled_set_policy")
364+
.transaction(&conn, |conn| {
365+
let err = err.clone();
379366

380-
let query = query
381-
.set((
382-
dsl::sled_policy.eq(to_db_sled_policy(new_policy)),
383-
dsl::time_modified.eq(Utc::now()),
384-
))
385-
.check_if_exists::<Sled>(sled_id);
367+
async move {
368+
let t = SledTransition::Policy(new_sled_policy);
369+
let valid_old_policies = t.valid_old_policies();
370+
let valid_old_states = t.valid_old_states();
371+
372+
use db::schema::sled::dsl;
373+
let query = diesel::update(dsl::sled)
374+
.filter(dsl::time_deleted.is_null())
375+
.filter(dsl::id.eq(sled_id));
376+
377+
let query = match check {
378+
ValidateTransition::Yes => query
379+
.filter(
380+
dsl::sled_policy.eq_any(
381+
valid_old_policies
382+
.into_iter()
383+
.map(to_db_sled_policy),
384+
),
385+
)
386+
.filter(
387+
dsl::sled_state
388+
.eq_any(valid_old_states.iter().copied()),
389+
)
390+
.into_boxed(),
391+
#[cfg(test)]
392+
ValidateTransition::No => query.into_boxed(),
393+
};
394+
395+
let query = query
396+
.set((
397+
dsl::sled_policy
398+
.eq(to_db_sled_policy(new_sled_policy)),
399+
dsl::time_modified.eq(Utc::now()),
400+
))
401+
.check_if_exists::<Sled>(sled_id);
402+
403+
let result = query.execute_and_check(&conn).await?;
404+
405+
let old_policy = match (check, result.status) {
406+
(ValidateTransition::Yes, UpdateStatus::Updated) => {
407+
result.found.policy()
408+
}
409+
(
410+
ValidateTransition::Yes,
411+
UpdateStatus::NotUpdatedButExists,
412+
) => {
413+
// Two reasons this can happen:
414+
// 1. An idempotent update: this is treated as a success.
415+
// 2. Invalid state transition: a failure.
416+
//
417+
// To differentiate between the two, check that the new policy
418+
// is the same as the old policy, and that the old state is
419+
// valid.
420+
if result.found.policy() == new_sled_policy
421+
&& valid_old_states
422+
.contains(&result.found.state())
423+
{
424+
result.found.policy()
425+
} else {
426+
return Err(err.bail(
427+
TransitionError::InvalidTransition {
428+
current: result.found,
429+
transition: SledTransition::Policy(
430+
new_sled_policy,
431+
),
432+
},
433+
));
434+
}
435+
}
436+
#[cfg(test)]
437+
(ValidateTransition::No, _) => result.found.policy(),
438+
};
439+
440+
// When a sled is expunged, the associated disks with that
441+
// sled should also be implicitly set to expunged.
442+
let new_disk_policy = match new_sled_policy {
443+
SledPolicy::InService { .. } => None,
444+
SledPolicy::Expunged => {
445+
Some(nexus_db_model::PhysicalDiskPolicy::Expunged)
446+
}
447+
};
448+
if let Some(new_disk_policy) = new_disk_policy {
449+
use db::schema::physical_disk::dsl as physical_disk_dsl;
450+
diesel::update(physical_disk_dsl::physical_disk)
451+
.filter(physical_disk_dsl::time_deleted.is_null())
452+
.filter(physical_disk_dsl::sled_id.eq(sled_id))
453+
.set(
454+
physical_disk_dsl::disk_policy
455+
.eq(new_disk_policy),
456+
)
457+
.execute_async(&conn)
458+
.await?;
459+
}
386460

387-
let result = query
388-
.execute_and_check(&*self.pool_connection_authorized(opctx).await?)
461+
Ok(old_policy)
462+
}
463+
})
389464
.await
390-
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
391-
392-
match (check, result.status) {
393-
(ValidateTransition::Yes, UpdateStatus::Updated) => {
394-
Ok(result.found.policy())
395-
}
396-
(ValidateTransition::Yes, UpdateStatus::NotUpdatedButExists) => {
397-
// Two reasons this can happen:
398-
// 1. An idempotent update: this is treated as a success.
399-
// 2. Invalid state transition: a failure.
400-
//
401-
// To differentiate between the two, check that the new policy
402-
// is the same as the old policy, and that the old state is
403-
// valid.
404-
if result.found.policy() == new_policy
405-
&& valid_old_states.contains(&result.found.state())
406-
{
407-
Ok(result.found.policy())
408-
} else {
409-
Err(TransitionError::InvalidTransition {
410-
current: result.found,
411-
transition: SledTransition::Policy(new_policy),
412-
})
465+
.map_err(|e| {
466+
if let Some(err) = err.take() {
467+
return err;
413468
}
414-
}
415-
#[cfg(test)]
416-
(ValidateTransition::No, _) => Ok(result.found.policy()),
417-
}
469+
TransitionError::from(public_error_from_diesel(
470+
e,
471+
ErrorHandler::Server,
472+
))
473+
})?;
474+
Ok(policy)
418475
}
419476

420477
/// Marks the state of the sled as decommissioned, as believed by Nexus.
@@ -675,6 +732,9 @@ mod test {
675732
use anyhow::{Context, Result};
676733
use itertools::Itertools;
677734
use nexus_db_model::Generation;
735+
use nexus_db_model::PhysicalDisk;
736+
use nexus_db_model::PhysicalDiskKind;
737+
use nexus_db_model::PhysicalDiskPolicy;
678738
use nexus_test_utils::db::test_setup_database;
679739
use nexus_types::identity::Asset;
680740
use omicron_common::api::external;
@@ -967,6 +1027,107 @@ mod test {
9671027
logctx.cleanup_successful();
9681028
}
9691029

1030+
async fn lookup_physical_disk(
1031+
datastore: &DataStore,
1032+
id: Uuid,
1033+
) -> PhysicalDisk {
1034+
use db::schema::physical_disk::dsl;
1035+
dsl::physical_disk
1036+
.filter(dsl::id.eq(id))
1037+
.filter(dsl::time_deleted.is_null())
1038+
.select(PhysicalDisk::as_select())
1039+
.get_result_async(
1040+
&*datastore
1041+
.pool_connection_for_tests()
1042+
.await
1043+
.expect("No connection"),
1044+
)
1045+
.await
1046+
.expect("Failed to lookup physical disk")
1047+
}
1048+
1049+
#[tokio::test]
1050+
async fn test_sled_expungement_also_expunges_disks() {
1051+
let logctx =
1052+
dev::test_setup_log("test_sled_expungement_also_expunges_disks");
1053+
let mut db = test_setup_database(&logctx.log).await;
1054+
1055+
let (opctx, datastore) = datastore_test(&logctx, &db).await;
1056+
1057+
// Set up a sled to test against.
1058+
let sled = datastore.sled_upsert(test_new_sled_update()).await.unwrap();
1059+
let sled_id = sled.id();
1060+
1061+
// Add a couple disks to this sled.
1062+
//
1063+
// (Note: This isn't really enough DB fakery to actually provision e.g.
1064+
// Crucible regions, but it creates enough of a control plane object to
1065+
// be associated with the Sled by UUID)
1066+
let disk1 = PhysicalDisk::new(
1067+
Uuid::new_v4(),
1068+
"vendor1".to_string(),
1069+
"serial1".to_string(),
1070+
"model1".to_string(),
1071+
PhysicalDiskKind::U2,
1072+
sled_id,
1073+
);
1074+
let disk2 = PhysicalDisk::new(
1075+
Uuid::new_v4(),
1076+
"vendor2".to_string(),
1077+
"serial2".to_string(),
1078+
"model2".to_string(),
1079+
PhysicalDiskKind::U2,
1080+
sled_id,
1081+
);
1082+
1083+
datastore
1084+
.physical_disk_upsert(&opctx, disk1.clone())
1085+
.await
1086+
.expect("Failed to upsert physical disk");
1087+
datastore
1088+
.physical_disk_upsert(&opctx, disk2.clone())
1089+
.await
1090+
.expect("Failed to upsert physical disk");
1091+
1092+
// Confirm the disks are "in-service".
1093+
//
1094+
// We verify this state because it should be changing below.
1095+
assert_eq!(
1096+
PhysicalDiskPolicy::InService,
1097+
lookup_physical_disk(&datastore, disk1.id()).await.disk_policy
1098+
);
1099+
assert_eq!(
1100+
PhysicalDiskPolicy::InService,
1101+
lookup_physical_disk(&datastore, disk2.id()).await.disk_policy
1102+
);
1103+
1104+
// Expunge the sled. As a part of this process, the query should UPDATE
1105+
// the physical_disk table.
1106+
sled_set_policy(
1107+
&opctx,
1108+
&datastore,
1109+
sled_id,
1110+
SledPolicy::Expunged,
1111+
ValidateTransition::Yes,
1112+
Expected::Ok(SledPolicy::provisionable()),
1113+
)
1114+
.await
1115+
.expect("Could not expunge sled");
1116+
1117+
// Observe that the disk state is now expunged
1118+
assert_eq!(
1119+
PhysicalDiskPolicy::Expunged,
1120+
lookup_physical_disk(&datastore, disk1.id()).await.disk_policy
1121+
);
1122+
assert_eq!(
1123+
PhysicalDiskPolicy::Expunged,
1124+
lookup_physical_disk(&datastore, disk2.id()).await.disk_policy
1125+
);
1126+
1127+
db.cleanup().await.unwrap();
1128+
logctx.cleanup_successful();
1129+
}
1130+
9701131
#[tokio::test]
9711132
async fn test_sled_transitions() {
9721133
// Test valid and invalid state and policy transitions.
@@ -1199,8 +1360,7 @@ mod test {
11991360
/// Tests listing large numbers of sleds via the batched interface
12001361
#[tokio::test]
12011362
async fn sled_list_batch() {
1202-
let logctx =
1203-
dev::test_setup_log("sled_reservation_create_non_provisionable");
1363+
let logctx = dev::test_setup_log("sled_list_batch");
12041364
let mut db = test_setup_database(&logctx.log).await;
12051365
let (opctx, datastore) = datastore_test(&logctx, &db).await;
12061366

0 commit comments

Comments
 (0)