Skip to content

Commit d231b40

Browse files
authored
[batch] Make GCP disk attachment idempotent (hail-is#13955)
This is a fix for an error Ben found. ``` Traceback (most recent call last): File "/usr/local/lib/python3.9/dist-packages/batch/worker/worker.py", line 1907, in run await self.setup_io() File "/usr/local/lib/python3.9/dist-packages/batch/worker/worker.py", line 1848, in setup_io await self.disk.create(labels=labels) File "/usr/local/lib/python3.9/dist-packages/batch/cloud/gcp/worker/disk.py", line 47, in create await self._attach() File "/usr/local/lib/python3.9/dist-packages/batch/cloud/gcp/worker/disk.py", line 112, in _attach self.last_response = await self.compute_client.attach_disk( File "/usr/local/lib/python3.9/dist-packages/hailtop/aiocloud/aiogoogle/client/compute_client.py", line 83, in attach_disk return await self._request_with_zonal_operations_response(self.post, path, params, **kwargs) File "/usr/local/lib/python3.9/dist-packages/hailtop/aiocloud/aiogoogle/client/compute_client.py", line 126, in _request_with_zonal_operations_response return await retry_transient_errors(request_and_wait) File "/usr/local/lib/python3.9/dist-packages/hailtop/utils/utils.py", line 763, in retry_transient_errors return await retry_transient_errors_with_debug_string('', 0, f, *args, **kwargs) File "/usr/local/lib/python3.9/dist-packages/hailtop/utils/utils.py", line 775, in retry_transient_errors_with_debug_string return await f(*args, **kwargs) File "/usr/local/lib/python3.9/dist-packages/hailtop/aiocloud/aiogoogle/client/compute_client.py", line 116, in request_and_wait raise GCPOperationError(result['httpErrorStatusCode'], hailtop.aiocloud.aiogoogle.client.compute_client.GCPOperationError: GCPOperationError: 400:BAD REQUEST ['RESOURCE_IN_USE_BY_ANOTHER_RESOURCE'] ["The disk resource 'projects/hail-vdc/zones/us-central1-b/disks/batch-disk-82XXXXX' is already being used by 'projects/hail-vdc/zones/us-central1-b/instances/batch-worker-default-standard-yjXXXX'"]; {'kind': 'compute#operation', 'id': 'XXXXX', 'name': 'operation-XXXXX', 'zone': 'https://www.googleapis.com/compute/v1/projects/hail-vdc/zones/us-central1-b', 'clientOperationId': 'XXXX', 'operationType': 'attachDisk', 'targetLink': 'https://www.googleapis.com/compute/v1/projects/hail-vdc/zones/us-central1-b/instances/batch-worker-default-standard-yjupd', 'targetId': 'XXXX', 'status': 'DONE', 'user': '[email protected]', 'progress': 100, 'insertTime': '2023-10-30T20:38:40.145-07:00', 'startTime': '2023-10-30T20:38:41.871-07:00', 'endTime': '2023-10-30T20:38:42.367-07:00', 'error': {'errors': [{'code': 'RESOURCE_IN_USE_BY_ANOTHER_RESOURCE', 'message': "The disk resource 'projects/hail-vdc/zones/us-central1-b/disks/batch-disk-82XXXXX' is already being used by 'projects/hail-vdc/zones/us-central1-b/instances/batch-worker-default-standard-yjXXXX'"}]}, 'httpErrorStatusCode': 400, 'httpErrorMessage': 'BAD REQUEST', 'selfLink': 'https://www.googleapis.com/compute/v1/projects/hail-vdc/zones/us-central1-b/operations/operation-XXX'} ```
1 parent 2c1188c commit d231b40

File tree

1 file changed

+16
-3
lines changed
  • batch/batch/cloud/gcp/worker

1 file changed

+16
-3
lines changed

batch/batch/cloud/gcp/worker/disk.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,14 @@ async def _create(self, labels: Optional[Dict[str, str]] = None):
101101
self.last_response = await self.compute_client.create_disk(f'/zones/{self.zone}/disks', json=config)
102102
self._created = True
103103

104+
def _disk_is_already_attached_to_this_machine(self, e: aiogoogle.client.compute_client.GCPOperationError) -> bool:
105+
if e.status == 400:
106+
assert e.error_messages and e.error_codes
107+
return all(self.instance_name in em for em in e.error_messages) and all(
108+
em == 'RESOURCE_IN_USE_BY_ANOTHER_RESOURCE' for em in e.error_codes
109+
)
110+
return False
111+
104112
async def _attach(self):
105113
async with LoggingTimer(f'attaching disk {self.name} to {self.instance_name}'):
106114
config = {
@@ -109,9 +117,14 @@ async def _attach(self):
109117
'deviceName': self.name,
110118
}
111119

112-
self.last_response = await self.compute_client.attach_disk(
113-
f'/zones/{self.zone}/instances/{self.instance_name}/attachDisk', json=config
114-
)
120+
try:
121+
self.last_response = await self.compute_client.attach_disk(
122+
f'/zones/{self.zone}/instances/{self.instance_name}/attachDisk', json=config
123+
)
124+
except aiogoogle.client.compute_client.GCPOperationError as e:
125+
if not self._disk_is_already_attached_to_this_machine(e):
126+
raise e
127+
115128
self._attached = True
116129

117130
async def _detach(self):

0 commit comments

Comments
 (0)