-
Notifications
You must be signed in to change notification settings - Fork 11
Add minimal support for llm-d manifest generation #274
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
13d6d4d
38c58b1
c578531
b4e5da3
315c702
26d83d0
e550453
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| """Configuration module for YAML generation and validation.""" | ||
|
|
||
| from .generator import DeploymentGenerator | ||
| from .llmd_generator import LlmdDeploymentGenerator | ||
| from .validator import YAMLValidator |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,113 @@ | ||
| """llm-d Deployment Generator. | ||
|
|
||
| Generates kustomize overlay + Helm values for the llm-d stack, | ||
| aligned with how llm-d recommends deployment: | ||
| - Model servers via kustomize (referencing llm-d base manifests) | ||
| - EPP + InferencePool via Helm (standalone chart) | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import logging | ||
| from pathlib import Path | ||
| from typing import Any | ||
|
|
||
| from jinja2 import Environment, FileSystemLoader | ||
|
|
||
| from planner.configuration.utils import ( | ||
| generate_deployment_id, | ||
| validate_model_id, | ||
| validate_namespace, | ||
| ) | ||
| from planner.shared.schemas import DeploymentRecommendation | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
class LlmdDeploymentGenerator:
    """Generate llm-d deployment manifests (kustomize overlay + helm values).

    Templates are loaded from the ``templates/llmd`` directory that sits next
    to this module. Rendered files are written below
    ``<output_dir>/<deployment_id>/``.
    """

    # (template file name, output path relative to the deployment directory,
    #  key used for this file in the ``files`` / ``contents`` result dicts)
    _TEMPLATES: tuple[tuple[str, str, str], ...] = (
        ("kustomization.yaml.j2", "modelserver/kustomization.yaml", "kustomization"),
        ("patch-vllm.yaml.j2", "modelserver/patch-vllm.yaml", "patch_vllm"),
        ("values.yaml.j2", "scheduler/values.yaml", "helm_values"),
    )

    def __init__(self, output_dir: str | None = None):
        """Set up the Jinja2 environment and make sure the output root exists.

        Args:
            output_dir: Directory that receives the generated files. When it
                is ``None`` or empty, ``generated_configs`` under the
                directory four levels above this file is used.
        """
        template_dir = Path(__file__).parent / "templates" / "llmd"
        self.env = Environment(
            loader=FileSystemLoader(str(template_dir)),
            trim_blocks=True,
            lstrip_blocks=True,
            keep_trailing_newline=True,
        )

        if output_dir:
            self.output_dir = Path(output_dir)
        else:
            # NOTE(review): writing into the project tree by default was
            # questioned during review. Per the PR discussion it is kept to
            # mirror the default used by DeploymentGenerator; revisit both
            # together if that convention changes.
            project_root = Path(__file__).parent.parent.parent.parent
            self.output_dir = project_root / "generated_configs"

        self.output_dir.mkdir(parents=True, exist_ok=True)

    def _prepare_context(
        self,
        recommendation: DeploymentRecommendation,
        deployment_id: str,
        namespace: str,
    ) -> dict[str, Any]:
        """Prepare Jinja2 template context from recommendation.

        Args:
            recommendation: Source of ``model_id`` and ``gpu_config``.
            deployment_id: Identifier passed through to the templates.
            namespace: Target namespace passed through to the templates.

        Returns:
            A dict with the keys ``deployment_id``, ``namespace``,
            ``model_id``, ``tensor_parallel``, ``gpus_per_replica`` and
            ``replicas``.
        """
        gpu_config = recommendation.gpu_config

        # A missing model id is rendered as the literal "unknown".
        model_id = recommendation.model_id or "unknown"
        # Both values are interpolated into the rendered YAML, so they are
        # passed through the shared validators first (failure behaviour is
        # defined in planner.configuration.utils).
        validate_model_id(model_id)
        validate_namespace(namespace)

        # Without a GPU configuration fall back to one GPU and one replica.
        tensor_parallel = gpu_config.tensor_parallel if gpu_config else 1
        replicas = gpu_config.replicas if gpu_config else 1

        return {
            "deployment_id": deployment_id,
            "namespace": namespace,
            "model_id": model_id,
            "tensor_parallel": tensor_parallel,
            # The per-replica GPU count is set equal to the tensor-parallel degree.
            "gpus_per_replica": tensor_parallel,
            "replicas": replicas,
        }

    def generate_all(
        self,
        recommendation: DeploymentRecommendation,
        namespace: str = "default",
    ) -> dict[str, Any]:
        """Generate all llm-d deployment files.

        Every template listed in ``_TEMPLATES`` is rendered with the same
        context and written below ``<output_dir>/<deployment_id>/``.

        Args:
            recommendation: Recommendation the manifests are generated for.
            namespace: Namespace rendered into the templates.

        Returns:
            A dict with:
              - ``deployment_id``: identifier from ``generate_deployment_id``.
              - ``namespace``: the namespace that was passed in.
              - ``files``: result key -> path of the written file (as ``str``).
              - ``contents``: result key -> rendered text of that file.
        """
        deployment_id = generate_deployment_id(recommendation)
        # The context is built (and its inputs validated) before any
        # deployment file is written.
        context = self._prepare_context(recommendation, deployment_id, namespace)

        deployment_dir = self.output_dir / deployment_id

        generated_files: dict[str, str] = {}
        generated_contents: dict[str, str] = {}

        for template_name, output_rel_path, config_type in self._TEMPLATES:
            rendered = self.env.get_template(template_name).render(**context)

            output_path = deployment_dir / output_rel_path
            # Creates the modelserver/ and scheduler/ sub-directories on
            # demand, so no separate up-front mkdir is needed.
            output_path.parent.mkdir(parents=True, exist_ok=True)
            # Write UTF-8 explicitly so the bytes on disk do not depend on
            # the locale of the machine running the generator.
            output_path.write_text(rendered, encoding="utf-8")

            generated_files[config_type] = str(output_path)
            generated_contents[config_type] = rendered

        return {
            "deployment_id": deployment_id,
            "namespace": namespace,
            "files": generated_files,
            "contents": generated_contents,
        }
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,42 @@ | ||
| apiVersion: kustomize.config.k8s.io/v1beta1 | ||
| kind: Kustomization | ||
|
|
||
| resources: | ||
| # TODO: pin to release tag (e.g. ?ref=v0.1.0) per llm-d-planner release | ||
|
Collaborator
We need to consider automating this in the release workflow.
Collaborator
Author
Agreed. As a follow-up item: when we cut a release, the CI/release workflow should inject a pinned ref tag for both the kustomize base resource and the EPP image tag in the Helm values template (`values.yaml.j2`). |
||
| - https://github.com/llm-d/llm-d//guides/recipes/modelserver/base/single-host/default | ||
|
|
||
| namePrefix: "{{ deployment_id }}-" | ||
|
|
||
| images: | ||
| - name: REPLACE_MODEL_SERVER_IMAGE | ||
| newName: vllm/vllm-openai | ||
| newTag: latest | ||
|
|
||
| labels: | ||
| - pairs: | ||
| app: "{{ deployment_id }}" | ||
| includeSelectors: true | ||
| includeTemplates: true | ||
| fields: | ||
| - version: v1 | ||
| kind: ServiceAccount | ||
| path: metadata/labels | ||
| create: true | ||
| - group: apps | ||
| version: v1 | ||
| kind: Deployment | ||
| path: metadata/labels | ||
| create: true | ||
| - group: apps | ||
| version: v1 | ||
| kind: Deployment | ||
| path: spec/selector/matchLabels | ||
| create: true | ||
| - group: apps | ||
| version: v1 | ||
| kind: Deployment | ||
| path: spec/template/metadata/labels | ||
| create: true | ||
|
|
||
| patches: | ||
| - path: patch-vllm.yaml | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
| apiVersion: apps/v1 | ||
| kind: Deployment | ||
| metadata: | ||
| name: decode | ||
| spec: | ||
| replicas: {{ replicas }} | ||
| template: | ||
| spec: | ||
| containers: | ||
| - name: modelserver | ||
| command: ["vllm", "serve"] | ||
| args: | ||
| - "{{ model_id }}" | ||
| - "--tensor-parallel-size={{ tensor_parallel }}" | ||
| - "--port=8000" | ||
| resources: | ||
| requests: | ||
| nvidia.com/gpu: "{{ gpus_per_replica }}" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we still need to populate
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
| limits: | ||
| nvidia.com/gpu: "{{ gpus_per_replica }}" | ||
| startupProbe: | ||
| initialDelaySeconds: 15 | ||
| periodSeconds: 30 | ||
| timeoutSeconds: 5 | ||
| failureThreshold: 120 | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,48 @@ | ||
| # Helm values for llm-d router deployment. | ||
| # Install with: | ||
| # helm install {{ deployment_id }} \ | ||
| # oci://registry.k8s.io/gateway-api-inference-extension/charts/standalone \ | ||
| # -f scheduler/values.yaml \ | ||
| # -n {{ namespace }} | ||
| # TODO: pin image tag and chart version per llm-d-planner release | ||
|
|
||
| inferenceExtension: | ||
| replicas: 1 | ||
| image: | ||
| registry: ghcr.io | ||
| repository: llm-d/llm-d-inference-scheduler | ||
| tag: v0.8.0 | ||
|
Collaborator
Will we pin this version per release of llm-d-planner?
Collaborator
Author
Good point. Right now I'm just using the latest available versions of the core components. We probably want to store the pinned versions we generate against somewhere. |
||
| flags: | ||
| v: 2 | ||
| failureMode: "FailOpen" | ||
| pluginsConfigFile: "{{ deployment_id }}-plugins.yaml" | ||
| pluginsCustomConfig: | ||
| {{ deployment_id }}-plugins.yaml: | | ||
| apiVersion: llm-d.ai/v1alpha1 | ||
| kind: EndpointPickerConfig | ||
| plugins: | ||
| - type: prefix-cache-scorer | ||
| - type: decode-filter | ||
| - type: max-score-picker | ||
| - type: single-profile-handler | ||
| schedulingProfiles: | ||
| - name: default | ||
| plugins: | ||
| - pluginRef: decode-filter | ||
| - pluginRef: max-score-picker | ||
| - pluginRef: prefix-cache-scorer | ||
| weight: 2 | ||
| resources: | ||
| requests: | ||
| cpu: "4" | ||
| memory: 8Gi | ||
| limits: | ||
| memory: 16Gi | ||
|
|
||
| inferencePool: | ||
| modelServers: | ||
| matchLabels: | ||
| app: "{{ deployment_id }}" | ||
| targetPorts: | ||
| - number: 8000 | ||
| appProtocol: http | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
YAML validation is skipped for llm-d.