3232)
3333
3434import requests
35- from azure .core .exceptions import HttpResponseError , ResourceNotFoundError
35+ from azure .core .exceptions import AzureError , HttpResponseError , ResourceNotFoundError
3636from azure .identity import DefaultAzureCredential
3737from azure .mgmt .compute .models import (
3838 CommunityGalleryImage ,
@@ -1877,12 +1877,12 @@ def _deploy(
18771877 # listing deployment operations to surface the real sub-resource
18781878 # errors.
18791879 top_code = getattr (e .error , "code" , "" ) or ""
1880- if (
1881- "aggregated deployment error is too large" in error_message
1882- or "ResourceDeploymentFailure" in top_code
1883- or (
1884- "DeploymentFailed " in top_code
1885- and not getattr ( e . error , "details" , None )
1880+ has_details = bool ( getattr ( e . error , "details" , None ))
1881+ if "aggregated deployment error is too large" in error_message or (
1882+ not has_details
1883+ and (
1884+ "ResourceDeploymentFailure " in top_code
1885+ or "DeploymentFailed" in top_code
18861886 )
18871887 ):
18881888 op_errors = self ._collect_deployment_operation_errors (
@@ -1960,18 +1960,43 @@ def _collect_deployment_operation_errors(
19601960 ) -> List [str ]:
19611961 """Fetch per-resource errors from a failed ARM deployment.
19621962
1963- Used when the top-level HttpResponseError only carries the aggregated
1964- "deployment error is too large" message and no nested details.
1963+ Used as a fallback when the top-level HttpResponseError does not
1964+ already carry actionable per-resource details. Callers in ``_deploy``
1965+ invoke this helper for any of the following cases:
1966+
1967+ * The aggregated "deployment error is too large" message, where ARM
1968+ truncates the nested error tree.
1969+ * ``ResourceDeploymentFailure`` errors with no nested
1970+ ``error.details`` (e.g. transient internal server errors that
1971+ carry only a tracking id).
1972+ * ``DeploymentFailed`` errors that arrive without any nested
1973+ ``details`` populated.
1974+
1975+ When the top-level error already carries actionable nested
1976+ ``details``, callers skip this helper to avoid an extra ARM call
1977+ and duplicated/noisy output.
1978+
1979+ In all of these cases, listing the deployment operations is the
1980+ only way to surface the underlying per-resource failure messages.
19651981 """
19661982 errors : List [str ] = []
19671983 try :
1968- operations = self ._rm_client .deployment_operations .list (
1969- resource_group_name = resource_group_name ,
1970- deployment_name = AZURE_DEPLOYMENT_NAME ,
1971- )
1984+ # Azure SDK calls share auth state via files on disk; serialize
1985+ # access to avoid intermittent failures during parallel runs.
1986+ # See common.py global_credential_access_lock for context.
1987+ with global_credential_access_lock :
1988+ operations = list (
1989+ self ._rm_client .deployment_operations .list (
1990+ resource_group_name = resource_group_name ,
1991+ deployment_name = AZURE_DEPLOYMENT_NAME ,
1992+ )
1993+ )
19721994 for op in operations :
19731995 props = getattr (op , "properties" , None )
1974- if not props or props .provisioning_state != "Failed" :
1996+ if not props :
1997+ continue
1998+ provisioning_state = getattr (props , "provisioning_state" , None ) or ""
1999+ if provisioning_state .lower () != "failed" :
19752000 continue
19762001 target = getattr (props , "target_resource" , None )
19772002 resource_type = getattr (target , "resource_type" , "" ) if target else ""
@@ -1985,8 +2010,17 @@ def _collect_deployment_operation_errors(
19852010 )
19862011 else :
19872012 errors .append (f"{ resource_type } /{ resource_name } : { status } " )
1988- except Exception as ex :
1989- log .debug (f"failed to list deployment operations: { ex } " )
2013+ except (AzureError , ValueError , TypeError , AttributeError ) as ex :
2014+ # Keep the original error path intact: never let this helper raise.
2015+ # Catch Azure SDK errors (AzureError covers HttpResponseError /
2016+ # ResourceNotFoundError) plus common parsing/shape mismatches
2017+ # (ValueError, TypeError, AttributeError). Programming errors
2018+ # outside this set will still propagate so they remain visible.
2019+ # Log with traceback so failures here are still debuggable.
2020+ log .debug (
2021+ f"failed to collect deployment operation errors: { ex } " ,
2022+ exc_info = True ,
2023+ )
19902024 return errors
19912025
19922026 # the VM may not be queried after deployed. use retry to mitigate it.
0 commit comments