Skip to content

Commit 96869b7

Browse files
vyadavmsft and Copilot authored
Harden OpenVMM Azure guest orchestration (#4420)
* Add cloud-init-based guest provisioning

  (cherry picked from commit 6787224)

* Make OpenVMM launch robust in detached mode

  (cherry picked from commit 03db0e8)

* Add TAP networking and host-side SSH reachability

  (cherry picked from commit cdd05e7)

* Add OpenVMM guest smoke coverage and runner integration

  Add an Azure smoke runbook and OpenVMM smoke suite that exercise guest boot, restart, and stop/start on a prepared host.

  Describe the LisaRunner guest-enabled flow explicitly:
  - schedule environment initialization when the outer environment reaches Deployed
  - initialize guest nodes after the outer environment connects
  - allow guest-targeted tests that require Deployed to run once the outer host is Connected
  - keep retry on the runner redeploy path instead of eagerly deleting the environment

  Also wire the Azure/OpenVMM guest handling, schema updates, diagnostics, and selftest coverage needed for this flow.

  (cherry picked from commit ac4c15c)

* Harden OpenVMM cleanup and host handling

  Make guest teardown best-effort so parent node cleanup and close continue even when a guest cleanup path fails.

  Tighten OpenVMM host-side handling by using host-native paths for copied artifacts, validating whoami before TAP creation, guarding working directory deletion, scoping dnsmasq state to the OpenVMM interface, and restoring the original ip_forward state when SSH forwarding is torn down.

  Also update the launcher wrapper to return the real OpenVMM PID so later liveness checks and forced cleanup target the VM process rather than the wrapper shell.

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 64340b8 commit 96869b7

11 files changed

Lines changed: 1685 additions & 101 deletions

File tree

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
name: openvmm azure smoke
2+
extension:
3+
- ../../testsuites
4+
variable:
5+
- name: host_admin_username
6+
value: lisatest
7+
- name: host_admin_password
8+
value: ""
9+
is_secret: true
10+
- name: host_admin_private_key_file
11+
value: ""
12+
is_secret: true
13+
- name: guest_admin_username
14+
value: lisatest
15+
- name: guest_admin_password
16+
value: ""
17+
is_secret: true
18+
- name: guest_admin_private_key_file
19+
value: ""
20+
is_secret: true
21+
- name: guest_extra_user_data
22+
value: ""
23+
- name: subscription_id
24+
value: ""
25+
- name: location
26+
value: "westus3"
27+
- name: marketplace_image
28+
value: ""
29+
- name: vm_size
30+
value: ""
31+
- name: openvmm_binary
32+
value: /usr/local/bin/openvmm
33+
- name: openvmm_install_path
34+
value: /usr/local/bin/openvmm
35+
- name: openvmm_installer_repo
36+
value: https://github.com/microsoft/openvmm.git
37+
- name: openvmm_installer_ref
38+
value: ""
39+
- name: openvmm_installer_force_install
40+
value: false
41+
- name: openvmm_host_working_dir
42+
value: /var/tmp
43+
- name: uefi_firmware_path
44+
value: ""
45+
- name: uefi_firmware_is_remote_path
46+
value: false
47+
- name: disk_img_path
48+
value: ""
49+
- name: disk_img_is_remote_path
50+
value: false
51+
- name: tap_name
52+
value: tap0
53+
- name: bridge_name
54+
value: ovmbr0
55+
- name: tap_host_cidr
56+
value: 10.0.0.1/24
57+
- name: forwarded_port
58+
value: 60022
59+
notifier:
60+
- type: html
61+
transformer:
62+
- type: openvmm_installer
63+
phase: environment_connected
64+
installer:
65+
type: source
66+
repo: $(openvmm_installer_repo)
67+
ref: $(openvmm_installer_ref)
68+
force_install: $(openvmm_installer_force_install)
69+
install_path: $(openvmm_install_path)
70+
platform:
71+
- type: azure
72+
admin_username: $(host_admin_username)
73+
admin_password: $(host_admin_password)
74+
admin_private_key_file: $(host_admin_private_key_file)
75+
guest_enabled: true
76+
guests:
77+
- type: openvmm
78+
use_parent_capability: false
79+
username: $(guest_admin_username)
80+
password: $(guest_admin_password)
81+
private_key_file: $(guest_admin_private_key_file)
82+
cloud_init:
83+
extra_user_data: $(guest_extra_user_data)
84+
lisa_working_dir: $(openvmm_host_working_dir)
85+
openvmm_binary: $(openvmm_binary)
86+
boot_mode: uefi
87+
capability:
88+
core_count: 2
89+
memory_mb: 2048
90+
uefi:
91+
firmware_path: $(uefi_firmware_path)
92+
firmware_is_remote_path: $(uefi_firmware_is_remote_path)
93+
disk_img: $(disk_img_path)
94+
disk_img_is_remote_path: $(disk_img_is_remote_path)
95+
serial:
96+
mode: file
97+
network:
98+
mode: tap
99+
address_mode: discover
100+
tap_name: $(tap_name)
101+
bridge_name: $(bridge_name)
102+
tap_host_cidr: $(tap_host_cidr)
103+
forward_ssh_port: true
104+
forwarded_port: $(forwarded_port)
105+
azure:
106+
subscription_id: $(subscription_id)
107+
requirement:
108+
azure:
109+
marketplace: $(marketplace_image)
110+
location: $(location)
111+
vm_size: $(vm_size)
112+
testcase:
113+
- criteria:
114+
name:
115+
- verify_openvmm_guest_boot
116+
- verify_openvmm_restart_via_platform
117+
- verify_openvmm_stop_start_in_platform
Lines changed: 75 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,60 +1,103 @@
11
# Copyright (c) Microsoft Corporation.
22
# Licensed under the MIT license.
33

4-
from assertpy import assert_that
4+
from typing import Any
55

6-
from lisa import Node, SkippedException, TestCaseMetadata, TestSuite, TestSuiteMetadata
6+
from lisa import (
7+
Logger,
8+
RemoteNode,
9+
SkippedException,
10+
TestCaseMetadata,
11+
TestSuite,
12+
TestSuiteMetadata,
13+
simple_requirement,
14+
)
15+
from lisa.environment import EnvironmentStatus
716
from lisa.features import StartStop
817
from lisa.sut_orchestrator.openvmm.node import OpenVmmGuestNode
9-
from lisa.testsuite import simple_requirement
18+
from lisa.tools import Uname
1019

1120

1221
@TestSuiteMetadata(
1322
area="openvmm",
1423
category="functional",
1524
description="""
16-
Smoke coverage for OpenVMM guest provisioning and platform lifecycle.
25+
This test suite validates OpenVMM guests running on a prepared L1 host.
1726
""",
1827
)
19-
class OpenVmmSmokeTestSuite(TestSuite):
28+
class OpenVmmPlatformSuite(TestSuite):
29+
def before_case(self, log: Logger, **kwargs: Any) -> None:
30+
node = kwargs["node"]
31+
if not isinstance(node, OpenVmmGuestNode):
32+
raise SkippedException(
33+
"This suite only applies to OpenVMM guest nodes. "
34+
f"Actual node type: {type(node).__name__}."
35+
)
36+
2037
@TestCaseMetadata(
2138
description="""
22-
Validate an OpenVMM guest is provisioned, reachable over SSH, and can
23-
execute a simple command after launch.
39+
This case validates that an OpenVMM guest is reachable over SSH and that
40+
the guest booted successfully.
2441
""",
25-
priority=1,
26-
requirement=simple_requirement(supported_features=[StartStop]),
42+
priority=0,
43+
requirement=simple_requirement(
44+
environment_status=EnvironmentStatus.Deployed,
45+
),
2746
)
28-
def verify_openvmm_provisioning(self, node: Node) -> None:
29-
openvmm_node = self._get_openvmm_guest(node)
47+
def verify_openvmm_guest_boot(
48+
self,
49+
log: Logger,
50+
node: RemoteNode,
51+
) -> None:
52+
kernel_release = node.tools[Uname].get_linux_information().kernel_version_raw
53+
log.info(f"Connected to OpenVMM guest kernel {kernel_release}")
3054

31-
result = openvmm_node.execute("echo openvmm-smoke", shell=True)
55+
@TestCaseMetadata(
56+
description="""
57+
This case validates that platform restart keeps the OpenVMM guest
58+
reachable after the restart.
59+
""",
60+
priority=0,
61+
requirement=simple_requirement(
62+
environment_status=EnvironmentStatus.Deployed,
63+
supported_features=[StartStop],
64+
),
65+
)
66+
def verify_openvmm_restart_via_platform(
67+
self,
68+
log: Logger,
69+
node: RemoteNode,
70+
) -> None:
71+
start_stop = node.features[StartStop]
72+
start_stop.restart()
3273

33-
result.assert_exit_code()
34-
assert_that(result.stdout.strip()).is_equal_to("openvmm-smoke")
74+
kernel_release = node.tools[Uname].get_linux_information().kernel_version_raw
75+
log.info(f"OpenVMM guest returned after restart on kernel {kernel_release}")
3576

3677
@TestCaseMetadata(
3778
description="""
38-
Validate the OpenVMM StartStop feature can stop and start a guest while
39-
preserving SSH connectivity for subsequent command execution.
79+
This case validates that platform stop/start keeps the OpenVMM guest
80+
reachable for subsequent command execution.
4081
""",
41-
priority=1,
42-
requirement=simple_requirement(supported_features=[StartStop]),
82+
priority=0,
83+
requirement=simple_requirement(
84+
environment_status=EnvironmentStatus.Deployed,
85+
supported_features=[StartStop],
86+
),
4387
)
44-
def verify_openvmm_stop_start_in_platform(self, node: Node) -> None:
45-
openvmm_node = self._get_openvmm_guest(node)
46-
47-
start_stop = openvmm_node.features[StartStop]
88+
def verify_openvmm_stop_start_in_platform(
89+
self,
90+
log: Logger,
91+
node: RemoteNode,
92+
) -> None:
93+
start_stop = node.features[StartStop]
94+
log.info("Stopping OpenVMM guest via platform")
4895
start_stop.stop(wait=True)
96+
log.info("Starting OpenVMM guest via platform")
4997
start_stop.start(wait=True)
5098

51-
result = openvmm_node.execute("echo openvmm-recovered", shell=True)
52-
53-
result.assert_exit_code()
54-
assert_that(result.stdout.strip()).is_equal_to("openvmm-recovered")
55-
56-
def _get_openvmm_guest(self, node: Node) -> OpenVmmGuestNode:
57-
if not isinstance(node, OpenVmmGuestNode):
58-
raise SkippedException("This suite only applies to OpenVMM guest nodes.")
59-
60-
return node
99+
kernel_release = node.tools[Uname].get_linux_information().kernel_version_raw
100+
log.info(
101+
f"OpenVMM guest returned after platform stop/start on kernel "
102+
f"{kernel_release}"
103+
)

lisa/node.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -355,12 +355,30 @@ def execute_async(
355355
)
356356

357357
def cleanup(self) -> None:
    """Clean up every guest node, then detach this node's log handler.

    Guest cleanup is best-effort: if a guest's ``cleanup()`` raises, the
    error is logged with its traceback and the loop moves on, so the
    remaining guests and the parent node are still cleaned up.
    """
    for guest in self.guests:
        try:
            guest.cleanup()
        except Exception:
            # Deliberately swallowed after logging; a broken guest must not
            # block teardown of the parent node.
            guest_label = guest.name or guest.index
            self.log.exception(
                f"failed to clean up guest '{guest_label}' while cleaning node "
                f"'{self.name}'. Continuing parent cleanup."
            )

    self.log.debug("cleaning up...")

    # The handler attribute may be missing; treat that the same as "unset".
    handler = getattr(self, "_log_handler", None)
    if handler:
        remove_handler(handler, self.log)
        handler.close()

363372
def close(self) -> None:
373+
for guest in self.guests:
374+
try:
375+
guest.close()
376+
except Exception:
377+
self.log.exception(
378+
"failed to close guest "
379+
f"'{guest.name or guest.index}' while closing node "
380+
f"'{self.name}'. Continuing parent close."
381+
)
364382
self.log.debug("closing node connection...")
365383
if self._shell:
366384
self._shell.close()
@@ -553,7 +571,12 @@ def mark_dirty(self) -> None:
553571
self._is_dirty = True
554572

555573
def test_connection(self) -> bool:
556-
assert self._shell
574+
if not self._shell:
575+
self.log.debug(
576+
f"connection test failed for node '{self.name}' because its "
577+
"shell is not initialized"
578+
)
579+
return False
557580
if not self._shell.is_remote:
558581
return True
559582
self.log.debug("testing connection...")

lisa/runners/lisa_runner.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,13 @@ def _dispatch_test_result(
200200
# run on deployed environment
201201
can_run_results = [x for x in can_run_results if x.can_run]
202202
if environment.status == EnvironmentStatus.Deployed and can_run_results:
203+
if self._guest_enabled:
204+
return self._generate_task(
205+
task_method=self._initialize_environment_task,
206+
environment=environment,
207+
test_results=can_run_results[:1],
208+
)
209+
203210
selected_test_results = self._get_test_result_to_run(
204211
test_results=test_results, environment=environment
205212
)
@@ -341,6 +348,9 @@ def _initialize_environment_task(
341348
phase=constants.TRANSFORMER_PHASE_ENVIRONMENT_CONNECTED,
342349
environment=environment,
343350
)
351+
if self._guest_enabled:
352+
guest_environment = environment.get_guest_environment()
353+
guest_environment.nodes.initialize()
344354
except Exception as e:
345355
self._attach_failed_environment_to_result(
346356
environment=environment,
@@ -639,8 +649,10 @@ def _get_runnable_test_results(
639649
)
640650
and (
641651
environment_status is None
642-
or x.runtime_data.metadata.requirement.environment_status
643-
== environment_status
652+
or self._matches_environment_status(
653+
x.runtime_data.metadata.requirement.environment_status,
654+
environment_status,
655+
)
644656
)
645657
]
646658
if environment:
@@ -688,6 +700,23 @@ def _get_runnable_test_results(
688700
results = self._sort_test_results(results)
689701
return results
690702

703+
def _matches_environment_status(
    self,
    requirement_status: EnvironmentStatus,
    actual_status: EnvironmentStatus,
) -> bool:
    """Tell whether an environment's status satisfies a test requirement.

    An exact match always satisfies the requirement. When guest support is
    enabled (``self._guest_enabled``), a requirement of ``Deployed`` is also
    satisfied by an environment that is already ``Connected``.

    Args:
        requirement_status: Status the test case requires.
        actual_status: Status the environment is currently being checked for.

    Returns:
        True when the requirement is met, otherwise False.
    """
    if requirement_status == actual_status:
        return True

    # The relaxed rule below applies only to guest-enabled runs.
    if not self._guest_enabled:
        return False

    return (
        actual_status == EnvironmentStatus.Connected
        and requirement_status == EnvironmentStatus.Deployed
    )
719+
691720
def _get_test_result_to_run(
692721
self, test_results: List[TestResult], environment: Environment
693722
) -> List[TestResult]:

lisa/sut_orchestrator/azure/platform_.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -807,7 +807,9 @@ def _get_node_information(self, node: Node) -> Dict[str, str]: # noqa: C901
807807

808808
# Guest nodes (like WslContainerNode) don't have features attribute
809809
# Skip security profile collection for guest nodes
810-
if hasattr(node, "features"):
810+
if hasattr(node, "features") and node.features.is_supported(
811+
SecurityProfile
812+
):
811813
security_profile = node.features[SecurityProfile].get_settings()
812814
else:
813815
security_profile = None
@@ -973,7 +975,11 @@ def _get_kernel_version(self, node: Node) -> str:
973975
linux_information = node.tools[Uname].get_linux_information()
974976
result = linux_information.kernel_version_raw
975977
elif not node.is_connected or node.is_posix:
976-
if not result and hasattr(node, ATTRIBUTE_FEATURES):
978+
if (
979+
not result
980+
and hasattr(node, ATTRIBUTE_FEATURES)
981+
and node.features.is_supported(features.SerialConsole)
982+
):
977983
# try to get kernel version in Azure. use it, when uname doesn't work
978984
node.log.debug("detecting kernel version from serial log...")
979985
serial_console = node.features[features.SerialConsole]
@@ -1009,7 +1015,11 @@ def _get_wala_version(self, node: Node) -> str:
10091015
node.log.debug(f"error on run waagent: {e}")
10101016

10111017
if not node.is_connected or node.is_posix:
1012-
if not result and hasattr(node, ATTRIBUTE_FEATURES):
1018+
if (
1019+
not result
1020+
and hasattr(node, ATTRIBUTE_FEATURES)
1021+
and node.features.is_supported(features.SerialConsole)
1022+
):
10131023
node.log.debug("detecting wala agent version from serial log...")
10141024
serial_console = node.features[features.SerialConsole]
10151025
result = serial_console.get_matched_str(WALA_VERSION_PATTERN)

0 commit comments

Comments
 (0)