diff --git a/src/planner/api/dependencies.py b/src/planner/api/dependencies.py index 30dea07f..3bab523c 100644 --- a/src/planner/api/dependencies.py +++ b/src/planner/api/dependencies.py @@ -15,7 +15,7 @@ from starlette.concurrency import run_in_threadpool from planner.cluster import KubernetesClusterManager, KubernetesDeploymentError -from planner.configuration import DeploymentGenerator, YAMLValidator +from planner.configuration import DeploymentGenerator, LlmdDeploymentGenerator, YAMLValidator from planner.knowledge_base.model_catalog import ModelCatalog from planner.knowledge_base.slo_templates import SLOTemplateRepository from planner.orchestration.workflow import RecommendationWorkflow @@ -101,6 +101,7 @@ def init_app_state(app: FastAPI) -> None: app.state.model_catalog = ModelCatalog() app.state.slo_repo = SLOTemplateRepository() app.state.deployment_generator = DeploymentGenerator(simulator_mode=False) + app.state.llmd_deployment_generator = LlmdDeploymentGenerator() app.state.yaml_validator = YAMLValidator() app.state.cluster_managers = {} # dict[str, KubernetesClusterManager] @@ -161,6 +162,11 @@ def get_deployment_generator(request: Request) -> DeploymentGenerator: return cast(DeploymentGenerator, request.app.state.deployment_generator) +def get_llmd_deployment_generator(request: Request) -> LlmdDeploymentGenerator: + """Get the llm-d deployment generator singleton.""" + return cast(LlmdDeploymentGenerator, request.app.state.llmd_deployment_generator) + + def get_yaml_validator(request: Request) -> YAMLValidator: """Get the YAML validator singleton.""" return cast(YAMLValidator, request.app.state.yaml_validator) diff --git a/src/planner/api/routes/configuration.py b/src/planner/api/routes/configuration.py index d7dff3d7..f2920672 100644 --- a/src/planner/api/routes/configuration.py +++ b/src/planner/api/routes/configuration.py @@ -3,7 +3,7 @@ import logging import random from datetime import datetime -from typing import Any +from typing import Any, Literal from fastapi import APIRouter, Depends, HTTPException, Request, status from pydantic import BaseModel @@ -12,11 +12,14 @@ from planner.api.dependencies import ( get_cluster_manager_or_raise, get_deployment_generator, + get_llmd_deployment_generator, get_yaml_validator, ) -from planner.configuration import DeploymentGenerator, YAMLValidator +from planner.configuration import DeploymentGenerator, LlmdDeploymentGenerator, YAMLValidator from planner.shared.schemas import DeploymentMode, DeploymentRecommendation +StackType = Literal["vllm", "llm-d"] + logger = logging.getLogger(__name__) router = APIRouter(prefix="/api/v1", tags=["configuration"]) @@ -27,6 +30,7 @@ class DeploymentRequest(BaseModel): recommendation: DeploymentRecommendation namespace: str = "default" + stack: StackType = "vllm" class DeploymentResponse(BaseModel): @@ -78,16 +82,30 @@ async def set_mode(request: DeploymentModeRequest, http_request: Request): async def deploy_model( request: DeploymentRequest, deployment_generator: DeploymentGenerator = Depends(get_deployment_generator), + llmd_generator: LlmdDeploymentGenerator = Depends(get_llmd_deployment_generator), yaml_validator: YAMLValidator = Depends(get_yaml_validator), ): """Generate deployment YAML and return contents inline.""" try: - logger.info(f"Generating deployment for model: {request.recommendation.model_name}") - - result = deployment_generator.generate_all( - recommendation=request.recommendation, namespace=request.namespace + logger.info( + f"Generating deployment for model: {request.recommendation.model_name}" + f" (stack={request.stack})" ) + if request.stack == "llm-d": + result = llmd_generator.generate_all( + recommendation=request.recommendation, namespace=request.namespace + ) + elif request.stack == "vllm": + result = deployment_generator.generate_all( + recommendation=request.recommendation, namespace=request.namespace + ) + else: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Unknown stack: {request.stack}", + ) + try: yaml_validator.validate_all(result["files"]) logger.info(f"All YAML files validated for deployment: {result['deployment_id']}") diff --git a/src/planner/configuration/__init__.py b/src/planner/configuration/__init__.py index 823ec6f1..b53a8116 100644 --- a/src/planner/configuration/__init__.py +++ b/src/planner/configuration/__init__.py @@ -1,4 +1,5 @@ """Configuration module for YAML generation and validation.""" from .generator import DeploymentGenerator +from .llmd_generator import LlmdDeploymentGenerator from .validator import YAMLValidator diff --git a/src/planner/configuration/generator.py b/src/planner/configuration/generator.py index c5d05ebe..218eb636 100644 --- a/src/planner/configuration/generator.py +++ b/src/planner/configuration/generator.py @@ -11,6 +11,7 @@ from jinja2 import Environment, FileSystemLoader +from planner.configuration.utils import generate_deployment_id as _generate_deployment_id from planner.knowledge_base.model_catalog import ModelCatalog from planner.shared.schemas import DeploymentRecommendation @@ -59,45 +60,8 @@ def __init__(self, output_dir: str | None = None, simulator_mode: bool = False): ) def generate_deployment_id(self, recommendation: DeploymentRecommendation) -> str: - """ - Generate a unique deployment ID that meets Kubernetes naming requirements: - - Must start with a letter - - Only lowercase alphanumeric and hyphens - - Max 44 characters (KServe adds "-predictor-default" suffix, total must be ≤63) - - Args: - recommendation: Deployment recommendation - - Returns: - Deployment ID (e.g., "chatbot-mistral-7b-20251003143022") - """ - import re - - timestamp = datetime.now().strftime("%Y%m%d%H%M%S") # 14 chars: YYYYMMDDHHMMSS - use_case = recommendation.intent.use_case.replace("_", "-") - - # Clean model name: remove special chars, keep alphanumeric and hyphens - model_name = (recommendation.model_id or "unknown").split("/")[-1].lower() - model_name = re.sub(r"[^a-z0-9-]", "-", model_name) - # Remove consecutive hyphens - model_name = re.sub(r"-+", "-", model_name).strip("-") - - # Build ID - deployment_id = f"{use_case}-{model_name}-{timestamp}" - - # KServe creates names like "{deployment_id}-predictor-default" (adds 19 chars) - # So deployment_id must be max 44 chars to stay under 63 char DNS limit - max_deployment_id_len = 44 - - if len(deployment_id) > max_deployment_id_len: - # Truncate model name to fit - max_model_len = ( - max_deployment_id_len - len(use_case) - len(timestamp) - 2 - ) # 2 for hyphens - model_name = model_name[:max_model_len].rstrip("-") - deployment_id = f"{use_case}-{model_name}-{timestamp}" - - return deployment_id + """Generate a unique deployment ID that meets Kubernetes naming requirements.""" + return _generate_deployment_id(recommendation) def _prepare_template_context( self, diff --git a/src/planner/configuration/llmd_generator.py b/src/planner/configuration/llmd_generator.py new file mode 100644 index 00000000..c0a0ddc1 --- /dev/null +++ b/src/planner/configuration/llmd_generator.py @@ -0,0 +1,113 @@ +"""llm-d Deployment Generator. + +Generates kustomize overlay + Helm values for the llm-d stack, +aligned with how llm-d recommends deployment: +- Model servers via kustomize (referencing llm-d base manifests) +- EPP + InferencePool via Helm (standalone chart) +""" + +from __future__ import annotations + +import logging +from pathlib import Path +from typing import Any + +from jinja2 import Environment, FileSystemLoader + +from planner.configuration.utils import ( + generate_deployment_id, + validate_model_id, + validate_namespace, +) +from planner.shared.schemas import DeploymentRecommendation + +logger = logging.getLogger(__name__) + + +class LlmdDeploymentGenerator: + """Generate llm-d deployment manifests (kustomize overlay + helm values).""" + + def __init__(self, output_dir: str | None = None): + template_dir = Path(__file__).parent / "templates" / "llmd" + self.env = Environment( + loader=FileSystemLoader(str(template_dir)), + trim_blocks=True, + lstrip_blocks=True, + keep_trailing_newline=True, + ) + + if output_dir: + self.output_dir = Path(output_dir) + else: + project_root = Path(__file__).parent.parent.parent.parent + self.output_dir = project_root / "generated_configs" + + self.output_dir.mkdir(parents=True, exist_ok=True) + + def _prepare_context( + self, + recommendation: DeploymentRecommendation, + deployment_id: str, + namespace: str, + ) -> dict[str, Any]: + """Prepare Jinja2 template context from recommendation.""" + gpu_config = recommendation.gpu_config + + model_id = recommendation.model_id or "unknown" + validate_model_id(model_id) + validate_namespace(namespace) + + tensor_parallel = gpu_config.tensor_parallel if gpu_config else 1 + + return { + "deployment_id": deployment_id, + "namespace": namespace, + "model_id": model_id, + "tensor_parallel": tensor_parallel, + "gpus_per_replica": tensor_parallel, + "replicas": gpu_config.replicas if gpu_config else 1, + } + + def generate_all( + self, + recommendation: DeploymentRecommendation, + namespace: str = "default", + ) -> dict[str, Any]: + """Generate all llm-d deployment files. + + Returns a dict with: deployment_id, namespace, files, contents. + """ + deployment_id = generate_deployment_id(recommendation) + context = self._prepare_context(recommendation, deployment_id, namespace) + + configs: list[tuple[str, str, str]] = [ + ("kustomization.yaml.j2", "modelserver/kustomization.yaml", "kustomization"), + ("patch-vllm.yaml.j2", "modelserver/patch-vllm.yaml", "patch_vllm"), + ("values.yaml.j2", "scheduler/values.yaml", "helm_values"), + ] + + deployment_dir = self.output_dir / deployment_id + (deployment_dir / "modelserver").mkdir(parents=True, exist_ok=True) + (deployment_dir / "scheduler").mkdir(parents=True, exist_ok=True) + + generated_files: dict[str, str] = {} + generated_contents: dict[str, str] = {} + + for template_name, output_rel_path, config_type in configs: + template = self.env.get_template(template_name) + rendered = template.render(**context) + + output_path = deployment_dir / output_rel_path + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, "w") as f: + f.write(rendered) + + generated_files[config_type] = str(output_path) + generated_contents[config_type] = rendered + + return { + "deployment_id": deployment_id, + "namespace": namespace, + "files": generated_files, + "contents": generated_contents, + } diff --git a/src/planner/configuration/templates/llmd/kustomization.yaml.j2 b/src/planner/configuration/templates/llmd/kustomization.yaml.j2 new file mode 100644 index 00000000..b275cf3d --- /dev/null +++ b/src/planner/configuration/templates/llmd/kustomization.yaml.j2 @@ -0,0 +1,42 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + # TODO: pin to release tag (e.g. ?ref=v0.1.0) per llm-d-planner release + - https://github.com/llm-d/llm-d//guides/recipes/modelserver/base/single-host/default + +namePrefix: "{{ deployment_id }}-" + +images: + - name: REPLACE_MODEL_SERVER_IMAGE + newName: vllm/vllm-openai + newTag: latest + +labels: + - pairs: + app: "{{ deployment_id }}" + includeSelectors: true + includeTemplates: true + fields: + - version: v1 + kind: ServiceAccount + path: metadata/labels + create: true + - group: apps + version: v1 + kind: Deployment + path: metadata/labels + create: true + - group: apps + version: v1 + kind: Deployment + path: spec/selector/matchLabels + create: true + - group: apps + version: v1 + kind: Deployment + path: spec/template/metadata/labels + create: true + +patches: + - path: patch-vllm.yaml diff --git a/src/planner/configuration/templates/llmd/patch-vllm.yaml.j2 b/src/planner/configuration/templates/llmd/patch-vllm.yaml.j2 new file mode 100644 index 00000000..5e74ea3c --- /dev/null +++ b/src/planner/configuration/templates/llmd/patch-vllm.yaml.j2 @@ -0,0 +1,25 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: decode +spec: + replicas: {{ replicas }} + template: + spec: + containers: + - name: modelserver + command: ["vllm", "serve"] + args: + - "{{ model_id }}" + - "--tensor-parallel-size={{ tensor_parallel }}" + - "--port=8000" + resources: + requests: + nvidia.com/gpu: "{{ gpus_per_replica }}" + limits: + nvidia.com/gpu: "{{ gpus_per_replica }}" + startupProbe: + initialDelaySeconds: 15 + periodSeconds: 30 + timeoutSeconds: 5 + failureThreshold: 120 diff --git a/src/planner/configuration/templates/llmd/values.yaml.j2 b/src/planner/configuration/templates/llmd/values.yaml.j2 new file mode 100644 index 00000000..bcba617e --- /dev/null +++ b/src/planner/configuration/templates/llmd/values.yaml.j2 @@ -0,0 +1,48 @@ +# Helm values for llm-d router deployment. +# Install with: +# helm install {{ deployment_id }} \ +# oci://registry.k8s.io/gateway-api-inference-extension/charts/standalone \ +# -f scheduler/values.yaml \ +# -n {{ namespace }} +# TODO: pin image tag and chart version per llm-d-planner release + +inferenceExtension: + replicas: 1 + image: + registry: ghcr.io + repository: llm-d/llm-d-inference-scheduler + tag: v0.8.0 + flags: + v: 2 + failureMode: "FailOpen" + pluginsConfigFile: "{{ deployment_id }}-plugins.yaml" + pluginsCustomConfig: + {{ deployment_id }}-plugins.yaml: | + apiVersion: llm-d.ai/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: prefix-cache-scorer + - type: decode-filter + - type: max-score-picker + - type: single-profile-handler + schedulingProfiles: + - name: default + plugins: + - pluginRef: decode-filter + - pluginRef: max-score-picker + - pluginRef: prefix-cache-scorer + weight: 2 + resources: + requests: + cpu: "4" + memory: 8Gi + limits: + memory: 16Gi + +inferencePool: + modelServers: + matchLabels: + app: "{{ deployment_id }}" + targetPorts: + - number: 8000 + appProtocol: http diff --git a/src/planner/configuration/utils.py b/src/planner/configuration/utils.py new file mode 100644 index 00000000..39417e69 --- /dev/null +++ b/src/planner/configuration/utils.py @@ -0,0 +1,47 @@ +"""Shared utilities for deployment configuration generators.""" + +from __future__ import annotations + +import re +from datetime import datetime + +from planner.shared.schemas import DeploymentRecommendation + +_MODEL_ID_RE = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._/-]*$") +_NAMESPACE_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,62}$") + + +def validate_model_id(model_id: str) -> None: + """Validate model_id to prevent YAML injection.""" + if not _MODEL_ID_RE.match(model_id): + raise ValueError(f"Invalid model_id format: {model_id}") + + +def validate_namespace(namespace: str) -> None: + """Validate Kubernetes namespace name.""" + if not _NAMESPACE_RE.match(namespace): + raise ValueError(f"Invalid namespace format: {namespace}") + + +def generate_deployment_id(recommendation: DeploymentRecommendation) -> str: + """Generate a Kubernetes-safe deployment ID. + + Must start with a letter, only lowercase alphanumeric and hyphens, + max 44 characters (KServe adds "-predictor-default" suffix, total must be <= 63). + """ + timestamp = datetime.now().strftime("%Y%m%d%H%M%S") + use_case = recommendation.intent.use_case.replace("_", "-") + + model_name = (recommendation.model_id or "unknown").split("/")[-1].lower() + model_name = re.sub(r"[^a-z0-9-]", "-", model_name) + model_name = re.sub(r"-+", "-", model_name).strip("-") + + deployment_id = f"{use_case}-{model_name}-{timestamp}" + + max_len = 44 + if len(deployment_id) > max_len: + max_model_len = max_len - len(use_case) - len(timestamp) - 2 + model_name = model_name[:max_model_len].rstrip("-") + deployment_id = f"{use_case}-{model_name}-{timestamp}" + + return deployment_id diff --git a/tests/unit/test_llmd_generator.py b/tests/unit/test_llmd_generator.py new file mode 100644 index 00000000..a9d30c18 --- /dev/null +++ b/tests/unit/test_llmd_generator.py @@ -0,0 +1,347 @@ +"""Unit tests for llm-d deployment generator.""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest +import yaml +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from planner.api.routes import configuration_router +from planner.configuration import DeploymentGenerator, YAMLValidator +from planner.configuration.llmd_generator import LlmdDeploymentGenerator +from planner.shared.schemas.intent import DeploymentIntent +from planner.shared.schemas.recommendation import ( + DeploymentRecommendation, + GPUConfig, +) +from planner.shared.schemas.specification import SLOTargets, TrafficProfile + + +@pytest.fixture +def client(tmp_path) -> TestClient: + """Create a test client with mocked app state (no DB or disk side-effects).""" + app = FastAPI() + + with patch("planner.configuration.generator.ModelCatalog"): + app.state.deployment_generator = DeploymentGenerator( + output_dir=str(tmp_path / "vllm"), simulator_mode=False + ) + app.state.llmd_deployment_generator = LlmdDeploymentGenerator(output_dir=str(tmp_path / "llmd")) + app.state.yaml_validator = YAMLValidator() + app.state.cluster_managers = {} + app.state.cluster_manager_lock = MagicMock() + + app.include_router(configuration_router) + + return TestClient(app) + + +@pytest.fixture +def sample_recommendation() -> DeploymentRecommendation: + return DeploymentRecommendation( + intent=DeploymentIntent( + use_case="chatbot_conversational", + experience_class="conversational", + user_count=100, + ), + traffic_profile=TrafficProfile(prompt_tokens=512, output_tokens=256, expected_qps=9.0), + slo_targets=SLOTargets( + ttft_p95_target_ms=150, + itl_p95_target_ms=25, + e2e_p95_target_ms=7000, + ), + model_id="meta-llama/Llama-3-8B-Instruct", + model_name="Llama-3-8B-Instruct", + model_uri=None, + meets_slo=True, + gpu_config=GPUConfig( + gpu_type="NVIDIA-A100-80GB", + gpu_count=6, + tensor_parallel=2, + replicas=3, + ), + reasoning="test recommendation", + ) + + +@pytest.fixture +def llmd_generator(tmp_path) -> LlmdDeploymentGenerator: + """Create an LlmdDeploymentGenerator writing to a temporary directory.""" + return LlmdDeploymentGenerator(output_dir=str(tmp_path)) + + +@pytest.mark.unit +class TestLlmdGeneratorOutput: + def test_invalid_model_id_raises(self, llmd_generator: LlmdDeploymentGenerator) -> None: + """Test that invalid model_id format raises ValueError.""" + rec = DeploymentRecommendation( + intent=DeploymentIntent( + use_case="chatbot_conversational", + experience_class="conversational", + user_count=100, + ), + traffic_profile=TrafficProfile(prompt_tokens=512, output_tokens=256, expected_qps=9.0), + slo_targets=SLOTargets( + ttft_p95_target_ms=150, + itl_p95_target_ms=25, + e2e_p95_target_ms=7000, + ), + model_id='bad-model"\nmalicious: code', + model_name="bad-model", + model_uri=None, + meets_slo=False, + gpu_config=GPUConfig( + gpu_type="NVIDIA-A100-80GB", + gpu_count=2, + tensor_parallel=2, + replicas=3, + ), + reasoning="test", + ) + with pytest.raises(ValueError, match="Invalid model_id format"): + llmd_generator.generate_all(rec) + + def test_invalid_namespace_raises(self, llmd_generator: LlmdDeploymentGenerator) -> None: + """Test that invalid namespace format raises ValueError.""" + rec = DeploymentRecommendation( + intent=DeploymentIntent( + use_case="chatbot_conversational", + experience_class="conversational", + user_count=100, + ), + traffic_profile=TrafficProfile(prompt_tokens=512, output_tokens=256, expected_qps=9.0), + slo_targets=SLOTargets( + ttft_p95_target_ms=150, + itl_p95_target_ms=25, + e2e_p95_target_ms=7000, + ), + model_id="meta-llama/Llama-3-8B-Instruct", + model_name="Llama-3-8B-Instruct", + model_uri=None, + meets_slo=True, + gpu_config=GPUConfig( + gpu_type="NVIDIA-A100-80GB", + gpu_count=2, + tensor_parallel=2, + replicas=3, + ), + reasoning="test", + ) + with pytest.raises(ValueError, match="Invalid namespace format"): + llmd_generator.generate_all(rec, namespace="INVALID NS!") + + def test_generate_all_returns_three_files( + self, + llmd_generator: LlmdDeploymentGenerator, + sample_recommendation: DeploymentRecommendation, + ) -> None: + from pathlib import Path + + result = llmd_generator.generate_all(sample_recommendation, namespace="prod") + + assert set(result["files"].keys()) == { + "kustomization", + "patch_vllm", + "helm_values", + } + assert set(result["contents"].keys()) == { + "kustomization", + "patch_vllm", + "helm_values", + } + for path in result["files"].values(): + assert Path(path).exists() + + def test_generate_all_returns_deployment_id_and_namespace( + self, + llmd_generator: LlmdDeploymentGenerator, + sample_recommendation: DeploymentRecommendation, + ) -> None: + result = llmd_generator.generate_all(sample_recommendation, namespace="myns") + + assert result["deployment_id"] + assert result["namespace"] == "myns" + + def test_all_outputs_are_valid_yaml( + self, + llmd_generator: LlmdDeploymentGenerator, + sample_recommendation: DeploymentRecommendation, + ) -> None: + result = llmd_generator.generate_all(sample_recommendation) + + for key in ("kustomization", "patch_vllm", "helm_values"): + parsed = yaml.safe_load(result["contents"][key]) + assert parsed is not None, f"{key} rendered as empty YAML" + + +@pytest.mark.unit +class TestKustomizationOutput: + def test_references_llmd_base( + self, + llmd_generator: LlmdDeploymentGenerator, + sample_recommendation: DeploymentRecommendation, + ) -> None: + result = llmd_generator.generate_all(sample_recommendation) + parsed = yaml.safe_load(result["contents"]["kustomization"]) + + assert any("llm-d/llm-d" in r for r in parsed["resources"]) + + def test_sets_name_prefix_from_deployment_id( + self, + llmd_generator: LlmdDeploymentGenerator, + sample_recommendation: DeploymentRecommendation, + ) -> None: + result = llmd_generator.generate_all(sample_recommendation) + parsed = yaml.safe_load(result["contents"]["kustomization"]) + + assert parsed["namePrefix"].startswith("chatbot") + assert parsed["namePrefix"].endswith("-") + + def test_sets_app_label( + self, + llmd_generator: LlmdDeploymentGenerator, + sample_recommendation: DeploymentRecommendation, + ) -> None: + result = llmd_generator.generate_all(sample_recommendation) + parsed = yaml.safe_load(result["contents"]["kustomization"]) + + label_pairs = parsed["labels"][0]["pairs"] + assert "app" in label_pairs + + +@pytest.mark.unit +class TestPatchVllmOutput: + def test_uses_model_id( + self, + llmd_generator: LlmdDeploymentGenerator, + sample_recommendation: DeploymentRecommendation, + ) -> None: + result = llmd_generator.generate_all(sample_recommendation) + parsed = yaml.safe_load(result["contents"]["patch_vllm"]) + + container = parsed["spec"]["template"]["spec"]["containers"][0] + assert "meta-llama/Llama-3-8B-Instruct" in container["args"] + + def test_uses_tensor_parallel( + self, + llmd_generator: LlmdDeploymentGenerator, + sample_recommendation: DeploymentRecommendation, + ) -> None: + result = llmd_generator.generate_all(sample_recommendation) + parsed = yaml.safe_load(result["contents"]["patch_vllm"]) + + container = parsed["spec"]["template"]["spec"]["containers"][0] + assert "--tensor-parallel-size=2" in container["args"] + + def test_uses_replicas( + self, + llmd_generator: LlmdDeploymentGenerator, + sample_recommendation: DeploymentRecommendation, + ) -> None: + result = llmd_generator.generate_all(sample_recommendation) + parsed = yaml.safe_load(result["contents"]["patch_vllm"]) + + assert parsed["spec"]["replicas"] == 3 + + def test_sets_gpu_resources_per_replica( + self, + llmd_generator: LlmdDeploymentGenerator, + sample_recommendation: DeploymentRecommendation, + ) -> None: + result = llmd_generator.generate_all(sample_recommendation) + parsed = yaml.safe_load(result["contents"]["patch_vllm"]) + + container = parsed["spec"]["template"]["spec"]["containers"][0] + # gpu_count=6, tensor_parallel=2, replicas=3 — each pod gets tensor_parallel GPUs + assert container["resources"]["requests"]["nvidia.com/gpu"] == "2" + assert container["resources"]["limits"]["nvidia.com/gpu"] == "2" + + +@pytest.mark.unit +class TestHelmValuesOutput: + def test_contains_inference_extension( + self, + llmd_generator: LlmdDeploymentGenerator, + sample_recommendation: DeploymentRecommendation, + ) -> None: + result = llmd_generator.generate_all(sample_recommendation) + parsed = yaml.safe_load(result["contents"]["helm_values"]) + + assert "inferenceExtension" in parsed + assert ( + parsed["inferenceExtension"]["image"]["repository"] == "llm-d/llm-d-inference-scheduler" + ) + + def test_contains_inference_pool_selector( + self, + llmd_generator: LlmdDeploymentGenerator, + sample_recommendation: DeploymentRecommendation, + ) -> None: + result = llmd_generator.generate_all(sample_recommendation) + parsed = yaml.safe_load(result["contents"]["helm_values"]) + + pool = parsed["inferencePool"] + assert "app" in pool["modelServers"]["matchLabels"] + + def test_contains_default_epp_config( + self, + llmd_generator: LlmdDeploymentGenerator, + sample_recommendation: DeploymentRecommendation, + ) -> None: + result = llmd_generator.generate_all(sample_recommendation) + parsed = yaml.safe_load(result["contents"]["helm_values"]) + + custom_config = parsed["inferenceExtension"]["pluginsCustomConfig"] + config_key = list(custom_config.keys())[0] + epp_config = yaml.safe_load(custom_config[config_key]) + assert epp_config["kind"] == "EndpointPickerConfig" + assert len(epp_config["plugins"]) == 4 + assert epp_config["schedulingProfiles"][0]["name"] == "default" + + def test_target_ports( + self, + llmd_generator: LlmdDeploymentGenerator, + sample_recommendation: DeploymentRecommendation, + ) -> None: + result = llmd_generator.generate_all(sample_recommendation) + parsed = yaml.safe_load(result["contents"]["helm_values"]) + + assert parsed["inferencePool"]["targetPorts"] == [{"number": 8000}] + + +@pytest.mark.unit +class TestDeployEndpointStack: + def test_deploy_with_stack_llmd( + self, client: TestClient, sample_recommendation: DeploymentRecommendation + ) -> None: + response = client.post( + "/api/v1/deploy", + json={ + "recommendation": sample_recommendation.model_dump(), + "namespace": "test-ns", + "stack": "llm-d", + }, + ) + assert response.status_code == 200 + data = response.json() + assert data["success"] is True + assert "kustomization" in data["yaml_contents"] + assert "helm_values" in data["yaml_contents"] + assert "patch_vllm" in data["yaml_contents"] + + def test_deploy_with_stack_vllm_is_default( + self, client: TestClient, sample_recommendation: DeploymentRecommendation + ) -> None: + response = client.post( + "/api/v1/deploy", + json={ + "recommendation": sample_recommendation.model_dump(), + "namespace": "test-ns", + }, + ) + assert response.status_code == 200 + data = response.json() + assert "inferenceservice" in data["yaml_contents"] diff --git a/ui/api_client.py b/ui/api_client.py index 4d7235db..d30a83b9 100644 --- a/ui/api_client.py +++ b/ui/api_client.py @@ -404,7 +404,7 @@ def extract_business_context(user_input: str) -> dict | None: return None -def deploy_and_generate_yaml(recommendation: dict) -> dict | None: +def deploy_and_generate_yaml(recommendation: dict, stack: str = "vllm") -> dict | None: """Deploy a recommendation and return generated YAML contents. Returns dict with deployment_id, yaml_contents, and success status, or None on error. @@ -412,7 +412,7 @@ def deploy_and_generate_yaml(recommendation: dict) -> dict | None: try: response = requests.post( f"{API_BASE_URL}/api/v1/deploy", - json={"recommendation": recommendation, "namespace": "default"}, + json={"recommendation": recommendation, "namespace": "default", "stack": stack}, timeout=30, ) response.raise_for_status() diff --git a/ui/components/deployment.py b/ui/components/deployment.py index 907a2c0d..49a51dc8 100644 --- a/ui/components/deployment.py +++ b/ui/components/deployment.py @@ -63,6 +63,22 @@ def render_deployment_tab(): st.markdown("---") + # Deployment stack selection + prev_stack = st.session_state.get("deployment_stack", "vllm") + stack = st.radio( + "Deployment Stack", + options=["vllm", "llm-d"], + format_func=lambda x: "vLLM (standalone)" if x == "vllm" else "llm-d (inference stack)", + horizontal=True, + key="deployment_stack", + ) + if stack != prev_stack and st.session_state.get("deployment_yaml_generated"): + st.session_state.deployment_yaml_generated = False + st.session_state.deployment_yaml_files = {} + st.session_state.deployment_id = None + st.session_state.deployment_error = None + st.rerun() + # YAML Generation Section if not st.session_state.get("deployment_yaml_generated"): st.subheader("Deployment Files") @@ -71,11 +87,13 @@ def render_deployment_tab(): if st.button("Generate YAML Files", type="primary", key="generate_yaml_btn"): with st.spinner("Generating deployment files..."): try: + stack = st.session_state.get("deployment_stack", "vllm") response = requests.post( f"{API_BASE_URL}/api/v1/deploy", json={ "recommendation": selected_config, "namespace": "default", + "stack": stack, }, timeout=30, ) @@ -140,20 +158,24 @@ def render_deployment_tab(): ) if yaml_files: - file_order = ["inferenceservice", "autoscaling", "servicemonitor"] - file_labels = { - "inferenceservice": "InferenceService (KServe)", - "autoscaling": "Autoscaling (HPA)", - "servicemonitor": "ServiceMonitor (Prometheus)", - } + stack = st.session_state.get("deployment_stack", "vllm") + if stack == "llm-d": + file_order = ["kustomization", "patch_vllm", "helm_values"] + file_labels = { + "kustomization": "Kustomization (Model Server)", + "patch_vllm": "vLLM Patch (Model Server)", + "helm_values": "Helm Values (EPP + InferencePool)", + } + else: + file_order = ["inferenceservice", "autoscaling", "servicemonitor"] + file_labels = { + "inferenceservice": "InferenceService (KServe)", + "autoscaling": "Autoscaling (HPA)", + "servicemonitor": "ServiceMonitor (Prometheus)", + } for file_key in file_order: - matching_content = None - for filename, content in yaml_files.items(): - if file_key in filename.lower(): - matching_content = content - break - + matching_content = yaml_files.get(file_key) if matching_content: label = file_labels.get(file_key, file_key) with st.expander(f"{label}", expanded=False): diff --git a/ui/components/recommendations.py b/ui/components/recommendations.py index 42eefe84..89a9ee67 100644 --- a/ui/components/recommendations.py +++ b/ui/components/recommendations.py @@ -241,7 +241,8 @@ def _render_category_card(title, recs_list, highlight_field, category_key, col): st.session_state.deployment_id = None st.session_state.deployed_to_cluster = False - result = deploy_and_generate_yaml(rec) + stack = st.session_state.get("deployment_stack", "vllm") + result = deploy_and_generate_yaml(rec, stack=stack) if result and result.get("success"): st.session_state.deployment_id = result["deployment_id"] st.session_state.deployment_yaml_files = result["yaml_contents"]