Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion src/planner/api/dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from starlette.concurrency import run_in_threadpool

from planner.cluster import KubernetesClusterManager, KubernetesDeploymentError
from planner.configuration import DeploymentGenerator, YAMLValidator
from planner.configuration import DeploymentGenerator, LlmdDeploymentGenerator, YAMLValidator
from planner.knowledge_base.model_catalog import ModelCatalog
from planner.knowledge_base.slo_templates import SLOTemplateRepository
from planner.orchestration.workflow import RecommendationWorkflow
Expand Down Expand Up @@ -101,6 +101,7 @@ def init_app_state(app: FastAPI) -> None:
app.state.model_catalog = ModelCatalog()
app.state.slo_repo = SLOTemplateRepository()
app.state.deployment_generator = DeploymentGenerator(simulator_mode=False)
app.state.llmd_deployment_generator = LlmdDeploymentGenerator()
app.state.yaml_validator = YAMLValidator()
app.state.cluster_managers = {} # dict[str, KubernetesClusterManager]

Expand Down Expand Up @@ -161,6 +162,11 @@ def get_deployment_generator(request: Request) -> DeploymentGenerator:
return cast(DeploymentGenerator, request.app.state.deployment_generator)


def get_llmd_deployment_generator(request: Request) -> LlmdDeploymentGenerator:
    """Return the llm-d deployment generator stored on the application state.

    The value is read from ``request.app.state.llmd_deployment_generator`` and
    only cast for the type checker; no new instance is created here.
    """
    app_state = request.app.state
    return cast(LlmdDeploymentGenerator, app_state.llmd_deployment_generator)


def get_yaml_validator(request: Request) -> YAMLValidator:
    """Return the YAML validator stored on the application state.

    The value is read from ``request.app.state.yaml_validator`` and only cast
    for the type checker; no new instance is created here.
    """
    app_state = request.app.state
    return cast(YAMLValidator, app_state.yaml_validator)
Expand Down
30 changes: 24 additions & 6 deletions src/planner/api/routes/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
import random
from datetime import datetime
from typing import Any
from typing import Any, Literal

from fastapi import APIRouter, Depends, HTTPException, Request, status
from pydantic import BaseModel
Expand All @@ -12,11 +12,14 @@
from planner.api.dependencies import (
get_cluster_manager_or_raise,
get_deployment_generator,
get_llmd_deployment_generator,
get_yaml_validator,
)
from planner.configuration import DeploymentGenerator, YAMLValidator
from planner.configuration import DeploymentGenerator, LlmdDeploymentGenerator, YAMLValidator
from planner.shared.schemas import DeploymentMode, DeploymentRecommendation

StackType = Literal["vllm", "llm-d"]

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/v1", tags=["configuration"])
Expand All @@ -27,6 +30,7 @@ class DeploymentRequest(BaseModel):

recommendation: DeploymentRecommendation
namespace: str = "default"
stack: StackType = "vllm"


class DeploymentResponse(BaseModel):
Expand Down Expand Up @@ -78,16 +82,30 @@ async def set_mode(request: DeploymentModeRequest, http_request: Request):
async def deploy_model(
request: DeploymentRequest,
deployment_generator: DeploymentGenerator = Depends(get_deployment_generator),
llmd_generator: LlmdDeploymentGenerator = Depends(get_llmd_deployment_generator),
yaml_validator: YAMLValidator = Depends(get_yaml_validator),
):
"""Generate deployment YAML and return contents inline."""
try:
logger.info(f"Generating deployment for model: {request.recommendation.model_name}")

result = deployment_generator.generate_all(
recommendation=request.recommendation, namespace=request.namespace
logger.info(
f"Generating deployment for model: {request.recommendation.model_name}"
f" (stack={request.stack})"
)

if request.stack == "llm-d":
result = llmd_generator.generate_all(

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

YAML validation is skipped for llm-d.

recommendation=request.recommendation, namespace=request.namespace
)
elif request.stack == "vllm":
result = deployment_generator.generate_all(
recommendation=request.recommendation, namespace=request.namespace
)
else:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Unknown stack: {request.stack}",
)

try:
yaml_validator.validate_all(result["files"])
logger.info(f"All YAML files validated for deployment: {result['deployment_id']}")
Expand Down
1 change: 1 addition & 0 deletions src/planner/configuration/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Configuration module for YAML generation and validation."""

from .generator import DeploymentGenerator
from .llmd_generator import LlmdDeploymentGenerator
from .validator import YAMLValidator
42 changes: 3 additions & 39 deletions src/planner/configuration/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from jinja2 import Environment, FileSystemLoader

from planner.configuration.utils import generate_deployment_id as _generate_deployment_id
from planner.knowledge_base.model_catalog import ModelCatalog
from planner.shared.schemas import DeploymentRecommendation

Expand Down Expand Up @@ -59,45 +60,8 @@ def __init__(self, output_dir: str | None = None, simulator_mode: bool = False):
)

def generate_deployment_id(self, recommendation: DeploymentRecommendation) -> str:
"""
Generate a unique deployment ID that meets Kubernetes naming requirements:
- Must start with a letter
- Only lowercase alphanumeric and hyphens
- Max 44 characters (KServe adds "-predictor-default" suffix, total must be ≤63)

Args:
recommendation: Deployment recommendation

Returns:
Deployment ID (e.g., "chatbot-mistral-7b-20251003143022")
"""
import re

timestamp = datetime.now().strftime("%Y%m%d%H%M%S") # 14 chars: YYYYMMDDHHMMSS
use_case = recommendation.intent.use_case.replace("_", "-")

# Clean model name: remove special chars, keep alphanumeric and hyphens
model_name = (recommendation.model_id or "unknown").split("/")[-1].lower()
model_name = re.sub(r"[^a-z0-9-]", "-", model_name)
# Remove consecutive hyphens
model_name = re.sub(r"-+", "-", model_name).strip("-")

# Build ID
deployment_id = f"{use_case}-{model_name}-{timestamp}"

# KServe creates names like "{deployment_id}-predictor-default" (adds 19 chars)
# So deployment_id must be max 44 chars to stay under 63 char DNS limit
max_deployment_id_len = 44

if len(deployment_id) > max_deployment_id_len:
# Truncate model name to fit
max_model_len = (
max_deployment_id_len - len(use_case) - len(timestamp) - 2
) # 2 for hyphens
model_name = model_name[:max_model_len].rstrip("-")
deployment_id = f"{use_case}-{model_name}-{timestamp}"

return deployment_id
"""Generate a unique deployment ID that meets Kubernetes naming requirements."""
return _generate_deployment_id(recommendation)

def _prepare_template_context(
self,
Expand Down
113 changes: 113 additions & 0 deletions src/planner/configuration/llmd_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""llm-d Deployment Generator.

Generates kustomize overlay + Helm values for the llm-d stack,
aligned with how llm-d recommends deployment:
- Model servers via kustomize (referencing llm-d base manifests)
- EPP + InferencePool via Helm (standalone chart)
"""

from __future__ import annotations

import logging
from pathlib import Path
from typing import Any

from jinja2 import Environment, FileSystemLoader

from planner.configuration.utils import (
generate_deployment_id,
validate_model_id,
validate_namespace,
)
from planner.shared.schemas import DeploymentRecommendation

logger = logging.getLogger(__name__)


class LlmdDeploymentGenerator:
"""Generate llm-d deployment manifests (kustomize overlay + helm values)."""

def __init__(self, output_dir: str | None = None):
template_dir = Path(__file__).parent / "templates" / "llmd"
self.env = Environment(
loader=FileSystemLoader(str(template_dir)),
trim_blocks=True,
lstrip_blocks=True,
keep_trailing_newline=True,
)

if output_dir:
self.output_dir = Path(output_dir)
else:
project_root = Path(__file__).parent.parent.parent.parent
self.output_dir = project_root / "generated_configs"

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not loving this. Why do we want to generate configs as part of the project dir?
What happens when this is deployed in a container? Do we populate output_dir? Is it mapped to an external volume?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This follows the same pattern as DeploymentGenerator (the vLLM/KServe generator): both default to project_root / "generated_configs" and accept an output_dir override. In practice the files are written to disk as a side effect, but the API returns the contents inline (the response includes yaml_contents), so the caller doesn't depend on the filesystem path.


self.output_dir.mkdir(parents=True, exist_ok=True)

def _prepare_context(
    self,
    recommendation: DeploymentRecommendation,
    deployment_id: str,
    namespace: str,
) -> dict[str, Any]:
    """Build the variable mapping handed to the llm-d Jinja2 templates.

    Args:
        recommendation: Source of the model ID and the optional GPU config.
        deployment_id: Identifier exposed to templates as ``deployment_id``.
        namespace: Namespace exposed to templates as ``namespace``.

    Returns:
        Template context with the keys ``deployment_id``, ``namespace``,
        ``model_id``, ``tensor_parallel``, ``gpus_per_replica`` and ``replicas``.
    """
    gpu = recommendation.gpu_config

    # A missing model ID falls back to the literal "unknown" before validation.
    resolved_model_id = recommendation.model_id or "unknown"
    # NOTE(review): the validators are imported helpers; presumably they raise
    # on invalid input -- confirm against planner.configuration.utils.
    validate_model_id(resolved_model_id)
    validate_namespace(namespace)

    # Without a GPU config, default to one GPU and one replica.
    if gpu:
        tp_size = gpu.tensor_parallel
        replica_count = gpu.replicas
    else:
        tp_size = 1
        replica_count = 1

    context: dict[str, Any] = {}
    context["deployment_id"] = deployment_id
    context["namespace"] = namespace
    context["model_id"] = resolved_model_id
    context["tensor_parallel"] = tp_size
    # GPUs requested per replica mirror the tensor-parallel size.
    context["gpus_per_replica"] = tp_size
    context["replicas"] = replica_count
    return context

def generate_all(
    self,
    recommendation: DeploymentRecommendation,
    namespace: str = "default",
) -> dict[str, Any]:
    """Generate all llm-d deployment files.

    Renders each llm-d template, writes the result under
    ``<output_dir>/<deployment_id>/`` and also keeps the rendered text in
    memory so callers do not have to read the files back.

    Args:
        recommendation: Deployment recommendation to render manifests for.
        namespace: Namespace passed to the templates (defaults to "default").

    Returns:
        A dict with: deployment_id, namespace, files (config type -> path of
        the written file) and contents (config type -> rendered text).
    """
    deployment_id = generate_deployment_id(recommendation)
    # Context preparation also runs the model-ID / namespace validators, so it
    # must happen before anything is written to disk.
    context = self._prepare_context(recommendation, deployment_id, namespace)

    # (template name, output path relative to the deployment dir, result key)
    configs: list[tuple[str, str, str]] = [
        ("kustomization.yaml.j2", "modelserver/kustomization.yaml", "kustomization"),
        ("patch-vllm.yaml.j2", "modelserver/patch-vllm.yaml", "patch_vllm"),
        ("values.yaml.j2", "scheduler/values.yaml", "helm_values"),
    ]

    deployment_dir = self.output_dir / deployment_id

    generated_files: dict[str, str] = {}
    generated_contents: dict[str, str] = {}

    for template_name, output_rel_path, config_type in configs:
        rendered = self.env.get_template(template_name).render(**context)

        output_path = deployment_dir / output_rel_path
        # Sub-directories are derived from the config table rather than
        # hard-coded, so adding a new entry needs no extra mkdir call.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8 keeps the manifests identical across platforms
        # instead of depending on the process locale.
        output_path.write_text(rendered, encoding="utf-8")

        generated_files[config_type] = str(output_path)
        generated_contents[config_type] = rendered

    return {
        "deployment_id": deployment_id,
        "namespace": namespace,
        "files": generated_files,
        "contents": generated_contents,
    }
42 changes: 42 additions & 0 deletions src/planner/configuration/templates/llmd/kustomization.yaml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
# TODO: pin to release tag (e.g. ?ref=v0.1.0) per llm-d-planner release

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to consider automating this in the release workflow.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. As a follow-up item, when we cut a release, the CI/release workflow should inject a pinned ref tag for both the kustomize base resource and the EPP image tag in values.yaml.j2. For now it tracks latest.

- https://github.com/llm-d/llm-d//guides/recipes/modelserver/base/single-host/default

namePrefix: "{{ deployment_id }}-"

images:
- name: REPLACE_MODEL_SERVER_IMAGE
newName: vllm/vllm-openai
newTag: latest

labels:
- pairs:
app: "{{ deployment_id }}"
includeSelectors: true
includeTemplates: true
fields:
- version: v1
kind: ServiceAccount
path: metadata/labels
create: true
- group: apps
version: v1
kind: Deployment
path: metadata/labels
create: true
- group: apps
version: v1
kind: Deployment
path: spec/selector/matchLabels
create: true
- group: apps
version: v1
kind: Deployment
path: spec/template/metadata/labels
create: true

patches:
- path: patch-vllm.yaml
25 changes: 25 additions & 0 deletions src/planner/configuration/templates/llmd/patch-vllm.yaml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: decode
spec:
replicas: {{ replicas }}
template:
spec:
containers:
- name: modelserver
command: ["vllm", "serve"]
args:
- "{{ model_id }}"
- "--tensor-parallel-size={{ tensor_parallel }}"
- "--port=8000"
resources:
requests:
nvidia.com/gpu: "{{ gpus_per_replica }}"

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need to populate gpu_count now that it's not used in the templates?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

limits:
nvidia.com/gpu: "{{ gpus_per_replica }}"
startupProbe:
initialDelaySeconds: 15
periodSeconds: 30
timeoutSeconds: 5
failureThreshold: 120
48 changes: 48 additions & 0 deletions src/planner/configuration/templates/llmd/values.yaml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Helm values for llm-d router deployment.
# Install with:
# helm install {{ deployment_id }} \
# oci://registry.k8s.io/gateway-api-inference-extension/charts/standalone \
# -f scheduler/values.yaml \
# -n {{ namespace }}
# TODO: pin image tag and chart version per llm-d-planner release

inferenceExtension:
replicas: 1
image:
registry: ghcr.io
repository: llm-d/llm-d-inference-scheduler
tag: v0.8.0

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will we pin this version per release of llm-d-planner?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. Right now I'm just using the latest available versions of the core components. We probably want to record the pinned versions we generate against somewhere.

flags:
v: 2
failureMode: "FailOpen"
pluginsConfigFile: "{{ deployment_id }}-plugins.yaml"
pluginsCustomConfig:
{{ deployment_id }}-plugins.yaml: |
apiVersion: llm-d.ai/v1alpha1
kind: EndpointPickerConfig
plugins:
- type: prefix-cache-scorer
- type: decode-filter
- type: max-score-picker
- type: single-profile-handler
schedulingProfiles:
- name: default
plugins:
- pluginRef: decode-filter
- pluginRef: max-score-picker
- pluginRef: prefix-cache-scorer
weight: 2
resources:
requests:
cpu: "4"
memory: 8Gi
limits:
memory: 16Gi

inferencePool:
modelServers:
matchLabels:
app: "{{ deployment_id }}"
targetPorts:
- number: 8000
appProtocol: http
Loading