Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions config/charts/llm-d-router-gateway/templates/gke.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,30 @@ spec:
grpcHealthCheck:
portSpecification: "USE_FIXED_PORT"
port: {{ $eppHealthPort }}
{{- if .Values.router.epp.gkePreferredBackends.enabled }}
---
kind: HealthCheckPolicy
apiVersion: networking.gke.io/v1
metadata:
name: {{ printf "%s-backup" (include "llm-d-router.name" .) }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "llm-d-router.labels" . | nindent 4 }}
spec:
targetRef:
group: ""
kind: Service
name: {{ printf "%s-backup" (include "llm-d-router.name" .) }}
default:
config:
type: GRPC
grpcHealthCheck:
portSpecification: "USE_FIXED_PORT"
port: {{ $eppHealthPort }}
{{- end }}
{{- end }}
---
{{- if not .Values.router.epp.gkePreferredBackends.enabled }}
apiVersion: networking.gke.io/v1
kind: GCPBackendPolicy
metadata:
Expand All @@ -78,4 +100,52 @@ spec:
enabled: true # log all requests by default
---
{{- end }}
{{- if .Values.router.epp.gkePreferredBackends.enabled }}
# Preferred-backends mode renders two GCPBackendPolicy objects: one for the primary EPP Service
# and one for the "-backup" EPP Service.
# Primary policy: targets the primary Service and sets backendPreference to PREFERRED.
# balancingMode, maxRatePerEndpoint and capacityScalerPercent are emitted only when the
# corresponding gkePreferredBackends value is set (non-empty).
apiVersion: networking.gke.io/v1
kind: GCPBackendPolicy
metadata:
name: {{ .Release.Name }}
namespace: {{ .Release.Namespace }}
spec:
targetRef:
group: ""
kind: Service
name: {{ include "llm-d-router.name" . }}
default:
timeoutSec: 300
backendPreference: PREFERRED
{{- if .Values.router.epp.gkePreferredBackends.balancingMode }}
balancingMode: {{ .Values.router.epp.gkePreferredBackends.balancingMode }}
{{- end }}
{{- if .Values.router.epp.gkePreferredBackends.maxRatePerEndpoint }}
maxRatePerEndpoint: {{ .Values.router.epp.gkePreferredBackends.maxRatePerEndpoint }}
{{- end }}
{{- if .Values.router.epp.gkePreferredBackends.capacityScalerPercent }}
capacityScalerPercent: {{ .Values.router.epp.gkePreferredBackends.capacityScalerPercent }}
{{- end }}
---
# Backup policy: targets the "-backup" Service with the same optional tuning values, but sets
# backendPreference to DEFAULT.
apiVersion: networking.gke.io/v1
kind: GCPBackendPolicy
metadata:
name: {{ printf "%s-backup" .Release.Name }}
namespace: {{ .Release.Namespace }}
spec:
targetRef:
group: ""
kind: Service
name: {{ printf "%s-backup" (include "llm-d-router.name" .) }}
default:
timeoutSec: 300
backendPreference: DEFAULT
{{- if .Values.router.epp.gkePreferredBackends.balancingMode }}
balancingMode: {{ .Values.router.epp.gkePreferredBackends.balancingMode }}
{{- end }}
{{- if .Values.router.epp.gkePreferredBackends.maxRatePerEndpoint }}
maxRatePerEndpoint: {{ .Values.router.epp.gkePreferredBackends.maxRatePerEndpoint }}
{{- end }}
{{- if .Values.router.epp.gkePreferredBackends.capacityScalerPercent }}
capacityScalerPercent: {{ .Values.router.epp.gkePreferredBackends.capacityScalerPercent }}
{{- end }}
{{- end }}
{{- end }}
{{- include "llm-d-router.gke" . -}}
93 changes: 64 additions & 29 deletions config/charts/routerlib/templates/_deployment.yaml
Original file line number Diff line number Diff line change
@@ -1,31 +1,4 @@
{{- define "llm-d-epp.deployment" -}}
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "llm-d-router.name" . }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "llm-d-router.labels" . | nindent 4 }}
{{- include "llm-d-router.modeLabels" . | nindent 4 }}
spec:
replicas: {{ .Values.router.epp.replicas | default 1 }}
strategy:
# The current recommended EPP deployment pattern is to have a single active replica. This ensures
# optimal performance of stateful operations such as the prefix-cache-aware scorer.
# With the Recreate strategy the old replica is killed immediately, allowing the new replica(s) to
# quickly take over. This is particularly important in the high-availability setup with leader
# election, because a rolling update would prevent the old leader from being killed: doing so
# would require a maxUnavailable of 100%.
type: Recreate
selector:
matchLabels:
{{- include "llm-d-router.selectorLabels" . | nindent 6 }}
template:
metadata:
labels:
{{- include "llm-d-router.selectorLabels" . | nindent 8 }}
{{- include "llm-d-router.modeLabels" . | nindent 8 }}
spec:
{{- define "llm-d-epp.deployment-pod-spec" -}}
{{- $proxy := include "llm-d-router.proxy" . | fromYaml | default dict }}
{{- $proxyType := include "llm-d-router.proxyType" . | trim }}
{{- $proxyMode := include "llm-d-router.proxyMode" . | trim }}
Expand Down Expand Up @@ -129,7 +102,7 @@ spec:
- "json"
- --config-file
- "/config/{{ .Values.router.epp.pluginsConfigFile }}"
{{- if gt (.Values.router.epp.replicas | int) 1 }}
{{- if and (gt (.Values.router.epp.replicas | int) 1) (not .Values.router.epp.gkePreferredBackends.enabled) }}
- --ha-enable-leader-election
{{- end }}
{{- $grpcHealthPort := .Values.router.epp.grpcHealthPort | default 9003 }}
Expand Down Expand Up @@ -305,5 +278,67 @@ spec:
tolerations:
{{- toYaml .Values.router.epp.tolerations | nindent 8 }}
{{- end }}
{{- end }}

{{- define "llm-d-epp.deployment" -}}
# EPP Deployment. The pod spec comes from the "llm-d-epp.deployment-pod-spec" template.
# When router.epp.gkePreferredBackends.enabled is set, a second "-backup" Deployment that
# reuses the same pod spec is rendered after this one.
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "llm-d-router.name" . }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "llm-d-router.labels" . | nindent 4 }}
{{- include "llm-d-router.modeLabels" . | nindent 4 }}
spec:
# Replica count: router.epp.replicas by default, or gkePreferredBackends.preferredReplicas
# when preferred backends are enabled. Both fall back to 1 when unset.
# NOTE(review): `default 1` also replaces an explicit 0 with 1 - confirm that is intended.
{{- $replicas := .Values.router.epp.replicas | default 1 }}
{{- if .Values.router.epp.gkePreferredBackends.enabled }}
{{- $replicas = .Values.router.epp.gkePreferredBackends.preferredReplicas | default 1 }}
{{- end }}
replicas: {{ $replicas }}
strategy:
type: Recreate
selector:
matchLabels:
# With preferred backends enabled, pods are selected by the llm-d-router-gateway label
# (primary name here, "-backup" name on the backup Deployment) instead of the shared
# selector labels.
# NOTE(review): spec.selector is immutable on apps/v1 Deployments, so toggling the flag on
# an existing release presumably requires recreating the Deployment - confirm the upgrade path.
{{- if .Values.router.epp.gkePreferredBackends.enabled }}
llm-d-router-gateway: {{ include "llm-d-router.name" . }}
{{- else }}
{{- include "llm-d-router.selectorLabels" . | nindent 6 }}
{{- end }}
template:
metadata:
labels:
{{- if .Values.router.epp.gkePreferredBackends.enabled }}
llm-d-router-gateway: {{ include "llm-d-router.name" . }}
{{- else }}
{{- include "llm-d-router.selectorLabels" . | nindent 8 }}
{{- end }}
{{- include "llm-d-router.modeLabels" . | nindent 8 }}
spec:
{{ include "llm-d-epp.deployment-pod-spec" . }}
{{- if .Values.router.epp.gkePreferredBackends.enabled }}
---
# Backup Deployment: same pod spec as the primary, sized by
# gkePreferredBackends.defaultReplicas (1 when unset) and labelled with the "-backup" name.
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ printf "%s-backup" (include "llm-d-router.name" .) }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "llm-d-router.labels" . | nindent 4 }}
{{- include "llm-d-router.modeLabels" . | nindent 4 }}
spec:
replicas: {{ .Values.router.epp.gkePreferredBackends.defaultReplicas | default 1 }}
strategy:
type: Recreate
selector:
matchLabels:
llm-d-router-gateway: {{ printf "%s-backup" (include "llm-d-router.name" .) }}
template:
metadata:
labels:
llm-d-router-gateway: {{ printf "%s-backup" (include "llm-d-router.name" .) }}
{{- include "llm-d-router.modeLabels" . | nindent 8 }}
spec:
{{ include "llm-d-epp.deployment-pod-spec" . }}
{{- end }}
---
{{- end }}
14 changes: 14 additions & 0 deletions config/charts/routerlib/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -397,11 +397,25 @@ EPP resource validations
{{/*
EPP generic validations
*/}}
{{- define "llm-d-router.validations.epp.preferredBackends" -}}
{{- /*
Validates the replica counts of the GKE preferred-backends topology.
Runs only when .Values.router.epp.gkePreferredBackends.enabled is true and fails rendering
when preferredReplicas or defaultReplicas is set to a value below 1. An unset (null) value
falls back to 1.
*/ -}}
{{- $cfg := .Values.router.epp.gkePreferredBackends }}
{{- if $cfg.enabled }}
{{- /*
`default` treats 0 as empty, so `0 | default 1` would become 1 and an explicit 0 could never
trip the checks below. Substitute 1 only when the value is really unset.
*/ -}}
{{- $preferredReplicas := 1 }}
{{- if not (kindIs "invalid" $cfg.preferredReplicas) }}
{{- $preferredReplicas = int $cfg.preferredReplicas }}
{{- end }}
{{- $defaultReplicas := 1 }}
{{- if not (kindIs "invalid" $cfg.defaultReplicas) }}
{{- $defaultReplicas = int $cfg.defaultReplicas }}
{{- end }}
{{- if lt $preferredReplicas 1 }}
{{- fail ".Values.router.epp.gkePreferredBackends.preferredReplicas must be at least 1 when gkePreferredBackends.enabled is true" }}
{{- end }}
{{- if lt $defaultReplicas 1 }}
{{- fail ".Values.router.epp.gkePreferredBackends.defaultReplicas must be at least 1 when gkePreferredBackends.enabled is true" }}
{{- end }}
{{- end }}
{{- end -}}

{{/*
Aggregate entry point for EPP validations: includes each EPP validation template in turn
(deprecations, resources, inferenceObjectives, tokenizer, preferredBackends).
*/}}
{{- define "llm-d-router.validations.epp" -}}
{{- include "llm-d-router.validations.deprecations" . }}
{{- include "llm-d-router.validations.epp.resources" . }}
{{- include "llm-d-router.validations.epp.inferenceObjectives" . }}
{{- include "llm-d-router.validations.epp.tokenizer" . }}
{{- include "llm-d-router.validations.epp.preferredBackends" . }}
{{- end -}}

{{/*
Expand Down
4 changes: 4 additions & 0 deletions config/charts/routerlib/templates/_inferencepool.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,11 @@ spec:
{{- end }}
{{- end }}
endpointPickerRef:
{{- if .Values.router.epp.gkePreferredBackends.enabled }}
name: {{ printf "%s-backup" (include "llm-d-router.name" .) }}
{{- else }}
name: {{ include "llm-d-router.name" . }}
{{- end }}
port:
number: {{ .Values.router.epp.extProcPort | default 9002 }}
failureMode: {{ .Values.router.inferencePool.failureMode | default "FailOpen" }}
Expand Down
29 changes: 29 additions & 0 deletions config/charts/routerlib/templates/_service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,22 @@ metadata:
namespace: {{ .Release.Namespace }}
labels:
{{- include "llm-d-router.labels" . | nindent 4 }}
{{- if .Values.router.epp.gkePreferredBackends.enabled }}
annotations:
cloud.google.com/neg: '{"exposed_ports":{"{{ .Values.router.epp.extProcPort | default 9002 }}":{}}}'
{{- end }}
spec:
selector:
{{- if .Values.router.epp.gkePreferredBackends.enabled }}
llm-d-router-gateway: {{ include "llm-d-router.name" . }}
{{- else }}
{{- include "llm-d-router.selectorLabels" . | nindent 4 }}
{{- end }}
ports:
- name: grpc-ext-proc
protocol: TCP
port: {{ .Values.router.epp.extProcPort | default 9002 }}
appProtocol: kubernetes.io/h2c
- name: http-metrics
protocol: TCP
port: {{ .Values.router.metricsPort | default 9090 }}
Expand All @@ -21,4 +30,24 @@ spec:
{{- end }}
type: ClusterIP
---
{{- if .Values.router.epp.gkePreferredBackends.enabled }}
apiVersion: v1
kind: Service
metadata:
name: {{ printf "%s-backup" (include "llm-d-router.name" .) }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "llm-d-router.labels" . | nindent 4 }}
annotations:
cloud.google.com/neg: '{"exposed_ports":{"{{ .Values.router.epp.extProcPort | default 9002 }}":{}}}'
spec:
selector:
llm-d-router-gateway: {{ printf "%s-backup" (include "llm-d-router.name" .) }}
ports:
- name: grpc-ext-proc
protocol: TCP
port: {{ .Values.router.epp.extProcPort | default 9002 }}
appProtocol: kubernetes.io/h2c
type: ClusterIP
{{- end }}
{{- end }}
13 changes: 13 additions & 0 deletions config/charts/routerlib/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,19 @@ clusterDomain: cluster.local

epp:
replicas: 1
gkePreferredBackends:
enabled: false
# Number of primary active pod instances pinned to the PREFERRED load balancing tier.
# Setting >1 scales concurrent active routing.
preferredReplicas: 1
# Number of standby pod instances pinned to the DEFAULT load balancing tier.
# Setting >1 scales warm standby failover capacity.
defaultReplicas: 1
# Benchmark capacity and threshold settings for PREFERRED load balancing tier.
# When incoming traffic exceeds capacityScalerPercent of maxRatePerEndpoint, excess traffic spills over to DEFAULT standby pods.
balancingMode: RATE
maxRatePerEndpoint: 100
capacityScalerPercent: 100
image:
registry: ghcr.io/llm-d
repository: llm-d-router-endpoint-picker-dev
Expand Down