Skip to content

Commit 34031a5

Browse files
committed
azure: narrow exception handling in deployment operations helper
Catch Azure-specific exceptions (HttpResponseError, ResourceNotFoundError) explicitly and log them at debug. Keep a broad fallback for unexpected errors but log with exc_info=True so the traceback is preserved instead of being silently swallowed. The original error path is unchanged: this helper still never raises. Addresses review comment on PR #4438.
1 parent e9f5233 commit 34031a5

1 file changed

Lines changed: 50 additions & 16 deletions

File tree

lisa/sut_orchestrator/azure/platform_.py

Lines changed: 50 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
)
3333

3434
import requests
35-
from azure.core.exceptions import HttpResponseError, ResourceNotFoundError
35+
from azure.core.exceptions import AzureError, HttpResponseError, ResourceNotFoundError
3636
from azure.identity import DefaultAzureCredential
3737
from azure.mgmt.compute.models import (
3838
CommunityGalleryImage,
@@ -1877,12 +1877,12 @@ def _deploy(
18771877
# listing deployment operations to surface the real sub-resource
18781878
# errors.
18791879
top_code = getattr(e.error, "code", "") or ""
1880-
if (
1881-
"aggregated deployment error is too large" in error_message
1882-
or "ResourceDeploymentFailure" in top_code
1883-
or (
1884-
"DeploymentFailed" in top_code
1885-
and not getattr(e.error, "details", None)
1880+
has_details = bool(getattr(e.error, "details", None))
1881+
if "aggregated deployment error is too large" in error_message or (
1882+
not has_details
1883+
and (
1884+
"ResourceDeploymentFailure" in top_code
1885+
or "DeploymentFailed" in top_code
18861886
)
18871887
):
18881888
op_errors = self._collect_deployment_operation_errors(
@@ -1960,18 +1960,43 @@ def _collect_deployment_operation_errors(
19601960
) -> List[str]:
19611961
"""Fetch per-resource errors from a failed ARM deployment.
19621962
1963-
Used when the top-level HttpResponseError only carries the aggregated
1964-
"deployment error is too large" message and no nested details.
1963+
Used as a fallback when the top-level HttpResponseError does not
1964+
already carry actionable per-resource details. Callers in ``_deploy``
1965+
invoke this helper for any of the following cases:
1966+
1967+
* The aggregated "deployment error is too large" message, where ARM
1968+
truncates the nested error tree.
1969+
* ``ResourceDeploymentFailure`` errors with no nested
1970+
``error.details`` (e.g. transient internal server errors that
1971+
carry only a tracking id).
1972+
* ``DeploymentFailed`` errors that arrive without any nested
1973+
``details`` populated.
1974+
1975+
When the top-level error already carries actionable nested
1976+
``details``, callers skip this helper to avoid an extra ARM call
1977+
and duplicated/noisy output.
1978+
1979+
In all of these cases, listing the deployment operations is the
1980+
only way to surface the underlying per-resource failure messages.
19651981
"""
19661982
errors: List[str] = []
19671983
try:
1968-
operations = self._rm_client.deployment_operations.list(
1969-
resource_group_name=resource_group_name,
1970-
deployment_name=AZURE_DEPLOYMENT_NAME,
1971-
)
1984+
# Azure SDK calls share auth state via files on disk; serialize
1985+
# access to avoid intermittent failures during parallel runs.
1986+
# See common.py global_credential_access_lock for context.
1987+
with global_credential_access_lock:
1988+
operations = list(
1989+
self._rm_client.deployment_operations.list(
1990+
resource_group_name=resource_group_name,
1991+
deployment_name=AZURE_DEPLOYMENT_NAME,
1992+
)
1993+
)
19721994
for op in operations:
19731995
props = getattr(op, "properties", None)
1974-
if not props or props.provisioning_state != "Failed":
1996+
if not props:
1997+
continue
1998+
provisioning_state = getattr(props, "provisioning_state", None) or ""
1999+
if provisioning_state.lower() != "failed":
19752000
continue
19762001
target = getattr(props, "target_resource", None)
19772002
resource_type = getattr(target, "resource_type", "") if target else ""
@@ -1985,8 +2010,17 @@ def _collect_deployment_operation_errors(
19852010
)
19862011
else:
19872012
errors.append(f"{resource_type}/{resource_name}: {status}")
1988-
except Exception as ex:
1989-
log.debug(f"failed to list deployment operations: {ex}")
2013+
except (AzureError, ValueError, TypeError, AttributeError) as ex:
2014+
# Keep the original error path intact: never let this helper raise.
2015+
# Catch Azure SDK errors (AzureError covers HttpResponseError /
2016+
# ResourceNotFoundError) plus common parsing/shape mismatches
2017+
# (ValueError, TypeError, AttributeError). Programming errors
2018+
# outside this set will still propagate so they remain visible.
2019+
# Log with traceback so failures here are still debuggable.
2020+
log.debug(
2021+
f"failed to collect deployment operation errors: {ex}",
2022+
exc_info=True,
2023+
)
19902024
return errors
19912025

19922026
# the VM may not be queried after deployed. use retry to mitigate it.

0 commit comments

Comments
 (0)