-
Notifications
You must be signed in to change notification settings - Fork 232
[AI Generated] BugFix: Handle missing cache topology in lscpu output for verify_l3_cache #4458
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -121,6 +121,16 @@ def verify_l3_cache( | |||||||||||||||||||||||||||||||||||
| # For all other cases, check L3 cache mapping with socket awareness | ||||||||||||||||||||||||||||||||||||
| cpu_info = lscpu.get_cpu_info() | ||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||
| # On some VMs (e.g. confidential VMs), cache topology is not exposed | ||||||||||||||||||||||||||||||||||||
| # by the hypervisor, so lscpu reports "-" for all cache values. | ||||||||||||||||||||||||||||||||||||
| # In this case, we cannot verify L3 cache mapping. | ||||||||||||||||||||||||||||||||||||
| if any(cpu.l3_cache == -1 for cpu in cpu_info): | ||||||||||||||||||||||||||||||||||||
| raise SkippedException( | ||||||||||||||||||||||||||||||||||||
| "Cache topology is not exposed on this VM. " | ||||||||||||||||||||||||||||||||||||
| "lscpu reports no cache information (likely a confidential VM " | ||||||||||||||||||||||||||||||||||||
| "or a VM size that does not expose cache topology to the guest)." | ||||||||||||||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||
| # Build a mapping of socket -> NUMA nodes and socket -> L3 caches | ||||||||||||||||||||||||||||||||||||
| socket_to_numa_nodes: dict[int, set[int]] = {} | ||||||||||||||||||||||||||||||||||||
| socket_to_l3_caches: dict[int, set[int]] = {} | ||||||||||||||||||||||||||||||||||||
|
|
@@ -299,6 +309,11 @@ def _create_stimer_interrupts(self, node: Node, cpu_count: int) -> None: | |||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||
| def _verify_node_mapping(self, node: Node, numa_node_size: int) -> None: | ||||||||||||||||||||||||||||||||||||
| cpu_info = node.tools[Lscpu].get_cpu_info() | ||||||||||||||||||||||||||||||||||||
| if any(cpu.l3_cache == -1 for cpu in cpu_info): | ||||||||||||||||||||||||||||||||||||
| raise SkippedException( | ||||||||||||||||||||||||||||||||||||
| "Cache topology is not exposed on this VM. " | ||||||||||||||||||||||||||||||||||||
| "lscpu reports no cache information." | ||||||||||||||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||||||||||||||
|
Comment on lines
+312
to
+316
|
||||||||||||||||||||||||||||||||||||
| if any(cpu.l3_cache == -1 for cpu in cpu_info): | |
| raise SkippedException( | |
| "Cache topology is not exposed on this VM. " | |
| "lscpu reports no cache information." | |
| ) | |
| if all(cpu.l3_cache == -1 for cpu in cpu_info): | |
| raise SkippedException( | |
| "Cache topology is not exposed on this VM. " | |
| "lscpu reports no cache information." | |
| ) | |
| if any(cpu.l3_cache == -1 for cpu in cpu_info): | |
| raise LisaException( | |
| "Cache topology is partially exposed on this VM: some CPUs " | |
| "report unknown L3 cache IDs while others do not. Verify the " | |
| "guest cache topology reporting and investigate inconsistent " | |
| "lscpu output before rerunning the test." | |
| ) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -91,6 +91,12 @@ class Lscpu(Tool): | |
| r"(?P<l1_data_cache>\d+):(?P<l1_instruction_cache>\d+):" | ||
| r"(?P<l2_cache>\d+):(?P<l3_cache>\d+)$" | ||
| ) | ||
| # On some VMs (e.g. confidential VMs), cache topology is not exposed | ||
| # and lscpu outputs "-" instead of cache IDs: | ||
| # 0 0 0 - | ||
| _core_numa_no_cache = re.compile( | ||
| r"\s*(?P<cpu>\d+)\s+(?P<numa_node>\d+)\s+(?P<socket>\d+)\s+-$" | ||
| ) | ||
| # Model name: Intel(R) Xeon(R) Platinum 8168 CPU @ 2.70GHz | ||
| # Model name: AMD EPYC 7763 64-Core Processor | ||
| # Model name: AMD EPYC 7763 64-Core Processor | ||
|
|
@@ -267,6 +273,10 @@ def get_cpu_info(self) -> List[CPUInfo]: | |
| # CPU NODE SOCKET L1d:L1i:L2:L3 | ||
| # 0 0 0 0:0:0:0 | ||
| # 1 0 0 0:0:0:0 | ||
| # | ||
| # On some VMs (e.g. confidential VMs), cache topology is not exposed: | ||
| # CPU NODE SOCKET CACHE | ||
| # 0 0 0 - | ||
| result = self.run( | ||
| "--extended=cpu,node,socket,cache", expected_exit_code=0 | ||
| ).stdout | ||
|
|
@@ -278,21 +288,37 @@ def get_cpu_info(self) -> List[CPUInfo]: | |
| output: List[CPUInfo] = [] | ||
| for item in mappings: | ||
| match_result = self._core_numa_mappings.fullmatch(item) | ||
| assert ( | ||
| match_result | ||
| ), f"lscpu NUMA node mapping is not in expected format: {item}" | ||
| output.append( | ||
| CPUInfo( | ||
| cpu=int(match_result.group("cpu")), | ||
| numa_node=int(match_result.group("numa_node")), | ||
| socket=int(match_result.group("socket")), | ||
| l1_data_cache=int(match_result.group("l1_data_cache")), | ||
| l1_instruction_cache=int( | ||
| match_result.group("l1_instruction_cache") | ||
| ), | ||
| l2_cache=int(match_result.group("l2_cache")), | ||
| l3_cache=int(match_result.group("l3_cache")), | ||
| if match_result: | ||
| output.append( | ||
| CPUInfo( | ||
| cpu=int(match_result.group("cpu")), | ||
| numa_node=int(match_result.group("numa_node")), | ||
| socket=int(match_result.group("socket")), | ||
| l1_data_cache=int(match_result.group("l1_data_cache")), | ||
| l1_instruction_cache=int( | ||
| match_result.group("l1_instruction_cache") | ||
| ), | ||
| l2_cache=int(match_result.group("l2_cache")), | ||
| l3_cache=int(match_result.group("l3_cache")), | ||
| ) | ||
| ) | ||
| continue | ||
| no_cache_match = self._core_numa_no_cache.fullmatch(item) | ||
| if no_cache_match: | ||
| output.append( | ||
| CPUInfo( | ||
| cpu=int(no_cache_match.group("cpu")), | ||
| numa_node=int(no_cache_match.group("numa_node")), | ||
| socket=int(no_cache_match.group("socket")), | ||
| l1_data_cache=-1, | ||
| l1_instruction_cache=-1, | ||
| l2_cache=-1, | ||
| l3_cache=-1, | ||
| ) | ||
|
Comment on lines
+313
to
+317
|
||
| ) | ||
| continue | ||
| raise AssertionError( | ||
| f"lscpu NUMA node mapping is not in expected format: {item}" | ||
| ) | ||
| return output | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The skip condition uses `any(cpu.l3_cache == -1 ...)`, but the comment says lscpu reports "-" for all cache values. If only a subset of CPUs has unknown cache IDs (mixed output), `any()` will skip and potentially hide a real parsing/host issue. Consider using `all(...)` for the expected "no cache info at all" case, and treat a mixed state as a test failure (or raise a more specific exception).