Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/many-seas-fry.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@fake-scope/fake-pkg": patch
---

Add turn detection protobufs
219 changes: 219 additions & 0 deletions protobufs/agent/livekit_agent_inference.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
// Copyright 2026 LiveKit, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package livekit.agent;

// Code-generation options: per-language package/namespace names, with
// generated code optimized for speed.
option go_package = "github.com/livekit/protocol/livekit/agent";
option csharp_namespace = "LiveKit.Proto";
option ruby_package = "LiveKit::Proto";
option optimize_for = SPEED;

// NOTE(review): ChatMessage (used by EotInputChatContext) is not declared in
// this file; presumably it is provided by this import — confirm.
import "agent/livekit_agent_session.proto";
// Well-known Duration and Timestamp types used by the messages below.
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";

// --- Shared Types ---

// Audio encodings referenced by SessionSettings and the inference request
// messages.
//
// NOTE(review): value 0 is a concrete encoding rather than an *_UNSPECIFIED
// placeholder, so an unset field reads as AUDIO_ENCODING_PCM_S16LE.
enum AudioEncoding {
  // Presumably signed 16-bit little-endian PCM — confirm.
  AUDIO_ENCODING_PCM_S16LE = 0;
  // Opus audio.
  AUDIO_ENCODING_OPUS = 1;
}

// Session configuration carried by SessionCreate.settings.
message SessionSettings {
  // Audio sample rate; presumably in Hz — confirm.
  uint32 sample_rate = 1;
  // Audio encoding.
  AudioEncoding encoding = 2;
  // Settings for one detection type; at most one member can be set.
  oneof type_settings {
    EotSettings eot_settings = 3;
    InterruptionSettings interruption_settings = 4;
  }
}

// Error payload; carried to the client as ServerMessage.error.
message Error {
  // Error message text.
  string message = 1;
  // Error code, following the HTTP status code convention:
  // 4xx for client errors, 5xx for server errors.
  uint32 code = 2;
}

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This Error message is too generic. Can we have another name?

Fwiw, we only do a string error in other services

@chenghao-mou chenghao-mou Apr 21, 2026

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code here is the convention from Inference. We use that for the status_code in APIStatusError.

Renamed it to InferenceError.


// --- End of Turn (EOT) Settings ---

// End-of-turn (EOT) specific settings; a member of
// SessionSettings.type_settings.
message EotSettings {
  // NOTE(review): presumably how often detection is run — confirm.
  google.protobuf.Duration detection_interval = 1;
}

// --- Interruption Settings ---

// Interruption-detection specific settings; a member of
// SessionSettings.type_settings.
message InterruptionSettings {
  // Detection threshold in the range [0.0, 1.0]; higher values are less
  // sensitive.
  float threshold = 1;
  // Minimum number of frames whose probability is greater than the threshold
  // before an interruption is triggered.
  uint32 min_frames = 2;
  // NOTE(review): presumably an upper bound on the audio considered per
  // detection — confirm.
  google.protobuf.Duration max_audio_duration = 3;
  // NOTE(review): presumably the amount of leading audio retained — confirm.
  google.protobuf.Duration audio_prefix_duration = 4;
  // NOTE(review): presumably how often detection is run — confirm.
  google.protobuf.Duration detection_interval = 5;
}

// --- Client -> Server ---

// Client-to-server message carrying the session settings; sent as
// ClientMessage.session_create.
message SessionCreate {
  // Settings for the session.
  SessionSettings settings = 1;
}

// Audio payload sent by the client as ClientMessage.input_audio.
message InputAudio {
  // Audio bytes. NOTE(review): presumably encoded as declared in
  // SessionSettings — confirm.
  bytes audio = 1;
  // Creation timestamp for this audio.
  google.protobuf.Timestamp created_at = 2;
  // Sample count; presumably the number of samples in `audio` — confirm.
  uint32 num_samples = 3;
}

// Chat messages supplied by the client for end-of-turn detection; sent as
// ClientMessage.eot_input_chat_context.
message EotInputChatContext {
  // ChatMessage is not declared in this file; presumably it comes from
  // agent/livekit_agent_session.proto — confirm.
  repeated ChatMessage messages = 1;
}

// Client-to-server message with no fields; sent as
// ClientMessage.session_flush.
// NOTE(review): presumably asks the server to flush buffered input — confirm.
message SessionFlush {}

// Client-to-server message with no fields; sent as
// ClientMessage.session_close.
// NOTE(review): presumably asks the server to end the session — confirm.
message SessionClose {}

// Client-to-server message sent as ClientMessage.inference_start.
message InferenceStart {
  // Identifier for the request. NOTE(review): presumably matched by
  // InferenceStop.request_id and ServerMessage.request_id — confirm.
  string request_id = 1;
}

// Client-to-server message sent as ClientMessage.inference_stop.
message InferenceStop {
  // Identifier for the request. NOTE(review): presumably the same value
  // given in the corresponding InferenceStart — confirm.
  string request_id = 1;
}

// Audio buffer sentinel message (start). Has no fields; sent as
// ClientMessage.buffer_start.
message BufferStart {}

// Has no fields; sent as ClientMessage.buffer_stop.
// NOTE(review): presumably the closing counterpart of BufferStart — confirm.
message BufferStop {}

// Envelope for client-to-server messages.
message ClientMessage {
  // Creation timestamp for this message.
  google.protobuf.Timestamp created_at = 1;
  // Payload; at most one member can be set.
  oneof message {
    SessionCreate session_create = 2;
    InputAudio input_audio = 3;
    SessionFlush session_flush = 4;
    SessionClose session_close = 5;
    InferenceStart inference_start = 6;
    InferenceStop inference_stop = 7;
    BufferStart buffer_start = 8;
    BufferStop buffer_stop = 9;
    // Only used for end of turn.
    EotInputChatContext eot_input_chat_context = 10;
  }
}

// --- Server -> Model ---

// End-of-turn inference request (server-to-model); a member of
// InferenceRequest.request.
message EotInferenceRequest {
  // Audio bytes, encoded as described by `encoding` and `sample_rate`.
  bytes audio = 1;
  // NOTE(review): presumably the assistant's text used as context — confirm.
  string assistant_text = 2;
  // Audio encoding.
  AudioEncoding encoding = 3;
  // Audio sample rate; presumably in Hz — confirm.
  uint32 sample_rate = 4;
}

// Interruption inference request (server-to-model); a member of
// InferenceRequest.request.
message InterruptionInferenceRequest {
  // Audio bytes, encoded as described by `encoding` and `sample_rate`.
  bytes audio = 1;
  // Audio encoding.
  AudioEncoding encoding = 2;
  // Audio sample rate; presumably in Hz — confirm.
  uint32 sample_rate = 3;
}

// Server-to-model request wrapper.
message InferenceRequest {
  // The concrete request; at most one member can be set.
  oneof request {
    EotInferenceRequest eot_inference_request = 1;
    InterruptionInferenceRequest interruption_inference_request = 2;
  }
}

// Timing statistics attached to inference responses and embedded in
// ProcessingStats.
message InferenceStats {
  // Server-side end-to-end latency (server input to server output).
  google.protobuf.Duration e2e_latency = 1;
  // Duration of preprocessing.
  google.protobuf.Duration preprocessing_duration = 2;
  // Duration of inference.
  google.protobuf.Duration inference_duration = 3;
}

// Statistics attached to predictions (EotPrediction and
// InterruptionPrediction).
message ProcessingStats {
  // NOTE(review): presumably the earliest client created_at among the inputs
  // that contributed to the prediction — confirm.
  google.protobuf.Timestamp earliest_client_created_at = 1;
  // NOTE(review): presumably the latest client created_at among the inputs
  // that contributed to the prediction — confirm.
  google.protobuf.Timestamp latest_client_created_at = 2;
  // Client-side end-to-end latency (client send to client receive).
  google.protobuf.Duration e2e_latency = 3;
  // Nested inference timing statistics.
  InferenceStats inference_stats = 4;
}

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe?

Suggested change
message InferenceStats {
// server-side e2e latency (server input to server output)
google.protobuf.Duration e2e_latency = 1;
google.protobuf.Duration preprocessing_duration = 2;
google.protobuf.Duration inference_duration = 3;
}
message ProcessingStats {
google.protobuf.Timestamp earliest_client_created_at = 1;
google.protobuf.Timestamp latest_client_created_at = 2;
// client-side e2e latency (client send to client receive)
google.protobuf.Duration e2e_latency = 3;
InferenceStats inference_stats = 4;
}
message ProcessingStats {
message InferenceStats {
// server-side e2e latency (server input to server output)
google.protobuf.Duration e2e_latency = 1;
google.protobuf.Duration preprocessing_duration = 2;
google.protobuf.Duration inference_duration = 3;
}
google.protobuf.Timestamp earliest_client_created_at = 1;
google.protobuf.Timestamp latest_client_created_at = 2;
// client-side e2e latency (client send to client receive)
google.protobuf.Duration e2e_latency = 3;
InferenceStats inference_stats = 4;
}

@chenghao-mou chenghao-mou Apr 21, 2026

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The model-to-Inference responses (EotInferenceResponse and InterruptionInferenceResponse) need to include InferenceStats but not ProcessingStats.

@theomonnom theomonnom Apr 23, 2026

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see — it's kind of confusing. Could we maybe just have one message for stats?

and two nested optional structs inside it?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Made InferenceStats a single flat message and marked the Inference-to-model fields as optional, so we now have only one message for both senders and receivers.



// End-of-turn inference response; a member of InferenceResponse.response.
message EotInferenceResponse {
  // Predicted probability. NOTE(review): presumably the probability that
  // the turn has ended — confirm.
  float probability = 1;
  // Timing statistics for this inference.
  InferenceStats stats = 2;
}

// Interruption inference response; a member of InferenceResponse.response.
message InterruptionInferenceResponse {
  // Whether an interruption was detected.
  bool is_interruption = 1;
  // Per-frame probabilities.
  repeated float probabilities = 2;
  // Timing statistics for this inference.
  InferenceStats stats = 3;
}

// Response wrapper paired with InferenceRequest.
message InferenceResponse {
  // The concrete response; at most one member can be set.
  oneof response {
    EotInferenceResponse eot_inference_response = 1;
    InterruptionInferenceResponse interruption_inference_response = 2;
  }
}

// --- Server -> Client ---

// Server-to-client message with no fields; sent as
// ServerMessage.session_created.
// NOTE(review): presumably acknowledges SessionCreate — confirm.
message SessionCreated {}

// Server-to-client message with no fields; sent as
// ServerMessage.inference_started.
// NOTE(review): presumably acknowledges InferenceStart — confirm.
message InferenceStarted {}

// Server-to-client message with no fields; sent as
// ServerMessage.inference_stopped.
// NOTE(review): presumably acknowledges InferenceStop — confirm.
message InferenceStopped {}

// Server-to-client message with no fields; sent as
// ServerMessage.session_closed.
// NOTE(review): presumably acknowledges SessionClose — confirm.
message SessionClosed {}

// End-of-turn prediction delivered to the client; a member of
// Prediction.prediction.
message EotPrediction {
  // Predicted probability. NOTE(review): presumably the probability that
  // the turn has ended — confirm.
  float probability = 1;
  // Statistics gathered while producing this prediction.
  ProcessingStats processing_stats = 2;
}

// Interruption prediction delivered to the client; a member of
// Prediction.prediction.
message InterruptionPrediction {
  // Whether an interruption was detected.
  bool is_interruption = 1;
  // NOTE(review): presumably per-frame probabilities, as in
  // InterruptionInferenceResponse.probabilities — confirm.
  repeated float probabilities = 2;
  // Statistics gathered while producing this prediction.
  ProcessingStats processing_stats = 3;
  // Creation timestamp. NOTE(review): which event it marks is not stated
  // here — confirm.
  google.protobuf.Timestamp created_at = 4;
  // NOTE(review): presumably the time taken to produce the prediction —
  // confirm.
  google.protobuf.Duration prediction_duration = 5;
}

// Prediction wrapper; sent to the client as ServerMessage.prediction.
message Prediction {
  // The concrete prediction; at most one member can be set.
  oneof prediction {
    EotPrediction eot_prediction = 1;
    InterruptionPrediction interruption_prediction = 2;
  }
}

Comment thread
chenghao-mou marked this conversation as resolved.
Outdated
// Envelope for server-to-client messages.
message ServerMessage {
  // Server-side creation timestamp for this message.
  google.protobuf.Timestamp server_created_at = 1;
  // Request identifier, when present. NOTE(review): presumably the
  // request_id from InferenceStart / InferenceStop — confirm.
  optional string request_id = 2;
  // Echoes the client-side created_at timestamp.
  optional google.protobuf.Timestamp client_created_at = 3;
  // Payload; at most one member can be set.
  oneof message {
    SessionCreated session_created = 4;
    InferenceStarted inference_started = 5;
    InferenceStopped inference_stopped = 6;
    SessionClosed session_closed = 7;
    Error error = 8;
    Prediction prediction = 9;
  }
}
Loading