Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions apps/daemon/src/media-models.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ export type MediaModel = {

export const MEDIA_PROVIDERS: MediaProvider[] = [
{ id: 'openai', label: 'OpenAI', hint: 'gpt-image-2 / dall-e-3', integrated: true, defaultBaseUrl: 'https://api.openai.com/v1' },
// Local Codex CLI built-in image_gen — drives the operator's own
// already-signed-in Codex CLI (ChatGPT subscription), so it needs no
// OPENAI_API_KEY. Like `hyperframes`, it's a local renderer: no settings
// card, no stored credentials.
{ id: 'codex-cli', label: 'Codex CLI', hint: 'Local Codex CLI image_gen — uses ChatGPT subscription, no API key', integrated: true, credentialsRequired: false, settingsVisible: false },
{ id: 'volcengine', label: 'Volcengine Ark (Doubao)', hint: 'Seedance 2.0 / Seedream', integrated: true, defaultBaseUrl: 'https://ark.cn-beijing.volces.com/api/v3' },
{ id: 'grok', label: 'xAI Grok Imagine', hint: 'grok-imagine — image + video with native audio', integrated: true, defaultBaseUrl: 'https://api.x.ai/v1' },
{ id: 'hyperframes', label: 'HyperFrames', hint: 'Local HTML -> MP4 renderer', integrated: true, credentialsRequired: false, settingsVisible: false },
Expand Down Expand Up @@ -91,6 +96,12 @@ export const IMAGE_MODELS: MediaModel[] = [
{ id: 'dall-e-3', label: 'dall-e-3', hint: 'OpenAI · classic', provider: 'openai', caps: ['t2i'] },
{ id: 'dall-e-2', label: 'dall-e-2', hint: 'OpenAI · legacy', provider: 'openai', caps: ['t2i'] },

// No-API-key path to ChatGPT-grade image generation: the daemon spawns a
// headless `codex exec` turn and lets the operator's signed-in Codex CLI
// run its built-in image_gen tool. Works for ANY coding agent (Claude
// Code, Gemini, …), not just when Codex is the chat agent.
{ id: 'codex-image-gen', label: 'codex-image-gen', hint: 'Codex CLI · built-in image_gen · ChatGPT subscription, no API key', provider: 'codex-cli', caps: ['t2i'] },

{ id: 'doubao-seedream-3-0-t2i-250415', label: 'seedream-3.0', hint: 'ByteDance · Doubao image', provider: 'volcengine', caps: ['t2i'] },
{ id: 'doubao-seededit-3-0-i2i-250628', label: 'seededit-3.0', hint: 'ByteDance · image edit', provider: 'volcengine', caps: ['i2i'] },

Expand Down
296 changes: 292 additions & 4 deletions apps/daemon/src/media.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ import {
aihubmixGeminiImageBytes,
classifyAIHubMixModel,
} from './aihubmix.js';
import {
codexAgentDef,
codexNeedsDangerFullAccessSandbox,
} from './runtimes/defs/codex.js';
import { resolveAgentExecutable } from './runtimes/executables.js';

const execFile = promisify(execFileCb);
type ProviderConfig = { apiKey?: string; baseUrl?: string; model?: string };
Expand Down Expand Up @@ -680,6 +685,11 @@ export async function generateMedia(args: {
bytes = result.bytes;
providerNote = result.providerNote;
suggestedExt = result.suggestedExt;
} else if (def.provider === 'codex-cli' && surface === 'image') {
const result = await renderCodexCliImage(ctx, args.onProgress);
bytes = result.bytes;
providerNote = result.providerNote;
suggestedExt = result.suggestedExt;
} else {
// No real renderer wired up for this (provider, surface). Gate the
// stub fallback behind OD_MEDIA_ALLOW_STUBS so release builds don't
Expand All @@ -699,10 +709,11 @@ export async function generateMedia(args: {
if (err instanceof StubProviderDisabledError) {
throw err;
}
// HyperFrames is a local render, not a remote provider. Falling back
// to a stub here hides actionable composition/preflight failures and
// can make the agent retry or narrate a fake MP4 as success.
if (def.provider === 'hyperframes') {
// HyperFrames and codex-cli are local renders, not remote providers.
// Falling back to a stub here hides actionable composition/preflight or
// image_gen failures and can make the agent retry or narrate a fake
// asset as success.
if (def.provider === 'hyperframes' || def.provider === 'codex-cli') {
throw err;
}
// A real provider failed (network blip, 4xx, missing key, …). We
Expand Down Expand Up @@ -3763,6 +3774,283 @@ function runHyperFramesRender(compAbs: string, tmpOutput: string, onProgress?: P
});
}

// ---------------------------------------------------------------------------
// Provider: codex-cli — local Codex CLI built-in image_gen (no API key).
//
// Unlike the `openai` provider (which POSTs to the Images API and needs an
// OPENAI_API_KEY), this provider drives the operator's already-signed-in
// Codex CLI. We spawn a headless `codex exec` turn, instruct it to use its
// built-in `image_gen` tool, and have it copy the result to a path we then
// read back. The bytes bill against the user's ChatGPT subscription; Open
// Design never sees an API key.
//
// Why a provider and not a skill: every coding agent (Claude Code, Gemini,
// …) routes media through `od media generate`, so wiring it here lets the
// no-key path work no matter which agent drives the chat. PR #622's
// renderCodexImagegenOverride only rewires generation when the chat agent
// ITSELF is Codex; a non-Codex agent still had no no-key route until now.
//
// The image_gen tool call leaves NO distinct event in `codex exec --json`
// (only command_execution + agent_message items show up), so we confirm
// success from ground truth: codex exits 0, the file exists at the path we
// handed it, and it carries a PNG signature. If codex reports the tool is
// unavailable, or no valid image lands, we throw — this provider never
// falls back to a stub (a fake "success" image is worse than a loud error;
// see the hyperframes precedent in the dispatcher's catch block).
// ---------------------------------------------------------------------------

// Upper bound (4 minutes) on one headless `codex exec` image turn. When it
// elapses, runCodexImageGen sends SIGTERM, escalates to SIGKILL two seconds
// later, and rejects with a timeout error.
const CODEX_CLI_IMAGE_TIMEOUT_MS = 4 * 60 * 1000;
// Prefix of the confirmation line the prompt asks codex to print once it has
// copied the image to the requested path. The parsing code in this section
// does not test for it; success is confirmed from the file on disk instead.
const CODEX_IMAGE_SAVED_MARKER = 'OD_IMAGE_SAVED:';
// Sentinel the prompt asks codex to print, alone on a line, when its built-in
// image_gen tool is not available. Seeing it in an agent message makes the
// run reject even when codex exits 0.
const CODEX_IMAGE_UNAVAILABLE_MARKER = 'IMAGE_GEN_UNAVAILABLE';
// File name, inside the per-render temp directory, that codex is told to copy
// the generated image to and that we read back afterwards.
const CODEX_OUTPUT_BASENAME = 'od-codex-image.png';

/**
 * Generate one image by driving the operator's locally installed Codex CLI.
 *
 * Resolves the codex executable, runs a single headless turn inside a
 * private temp directory, reads the PNG codex was told to write there, and
 * validates its signature before handing the bytes back to the dispatcher.
 *
 * @param ctx        Media request; `prompt` and `aspect` are read from it.
 * @param onProgress Optional sink for codex's agent messages while it runs.
 * @returns The PNG bytes, a human-readable provider note, and the `.png`
 *          extension suggestion.
 * @throws Error when the Codex CLI cannot be resolved, when the codex turn
 *         fails or times out, when no file lands at the requested path, or
 *         when the file is not a plausible PNG.
 */
async function renderCodexCliImage(ctx: MediaContext, onProgress?: ProgressFn): Promise<RenderResult> {
  const codexBin = resolveAgentExecutable(
    codexAgentDef,
    process.env as Record<string, string>,
  );
  if (!codexBin) {
    throw new Error(
      'codex-image-gen needs the Codex CLI on PATH (or set CODEX_BIN). Install it ' +
        'and sign in with your ChatGPT account — https://developers.openai.com/codex/cli — ' +
        'then this model generates images with no OPENAI_API_KEY.',
    );
  }

  // Render into a private temp workspace, not the project dir: the codex
  // turn writes scratch state (and its sandbox is rooted here), and we only
  // want the final PNG bytes to flow back to the generic dispatcher, which
  // writes them into the project under the user-supplied filename.
  const tmpRoot = await mkdtemp(path.join(os.tmpdir(), 'open-design-codex-img-'));
  const outPath = path.join(tmpRoot, CODEX_OUTPUT_BASENAME);
  try {
    const prompt = buildCodexImagePrompt(ctx, outPath);
    await runCodexImageGen(codexBin, prompt, tmpRoot, onProgress);
    let bytes: Buffer;
    try {
      bytes = await readFile(outPath);
    } catch {
      throw new Error(
        'Codex exited cleanly but produced no image at the requested path. The ' +
          'built-in image_gen tool is likely unavailable in this Codex session — ' +
          'it requires an official ChatGPT login, not an API key or third-party gateway.',
      );
    }
    assertCodexPngBytes(bytes);
    // Use the same fallback the prompt builder applies so the note never
    // reads "undefined" when the request carried no aspect ratio.
    const aspect = ctx.aspect || '1:1';
    return {
      bytes,
      providerNote: `codex-cli/image_gen · ${aspect} · ${bytes.length} bytes · ChatGPT subscription (no API key)`,
      suggestedExt: '.png',
    };
  } finally {
    // Cleanup is best-effort. A rejection thrown from `finally` would replace
    // the real outcome (the rendered image or the actionable codex error),
    // and removal can fail while a just-signalled codex process still holds
    // the directory. Retry briefly, then leave the temp dir behind rather
    // than mask the result.
    try {
      await rm(tmpRoot, { recursive: true, force: true, maxRetries: 3, retryDelay: 100 });
    } catch {
      // ignore: a leaked temp directory is preferable to a masked error
    }
  }
}

/**
 * Compose the instruction text sent to the headless codex turn.
 *
 * The text tells codex to generate the image with its built-in image_gen
 * tool, copy the result to `outPath`, print a confirmation line, and print
 * the unavailable sentinel instead if the tool is missing.
 *
 * @param ctx     Media request; a blank prompt falls back to a placeholder
 *                subject and a missing aspect falls back to `1:1`.
 * @param outPath Absolute path codex must copy the final image to.
 * @returns The newline-joined instruction block.
 */
function buildCodexImagePrompt(ctx: MediaContext, outPath: string): string {
  const subject = ctx.prompt?.trim() || 'an abstract placeholder image';
  const ratio = ctx.aspect || '1:1';

  const generateLine = `Use your built-in image_gen tool to generate this image: ${subject}`;
  const aspectLine = `Aspect ratio: ${ratio}.`;
  const copyLine = `After the image is generated, copy the final image file to exactly this absolute path: ${outPath}`;
  const confirmLine = `Then print one line in the form ${CODEX_IMAGE_SAVED_MARKER}${outPath} and stop.`;
  // Forbid the non-built-in routes and define the sentinel for the
  // "tool missing" case so the caller can detect it in codex's output.
  const fallbackLine =
    'Do not use any API-key, CLI, or scripts/image_gen.py fallback path. If the built-in ' +
    `image_gen tool is not available in this session, print exactly ${CODEX_IMAGE_UNAVAILABLE_MARKER} ` +
    'and stop without doing anything else.';

  return `${generateLine}\n${aspectLine}\n${copyLine}\n${confirmLine}\n${fallbackLine}`;
}

/**
 * Guard that the bytes codex left behind look like a real PNG.
 *
 * Accepts only buffers that are at least 128 bytes long and begin with the
 * 8-byte PNG signature, so a stray text file or a tiny stub PNG cannot pass
 * for a successful generation.
 *
 * @param bytes File contents read back from the path handed to codex.
 * @throws Error when the buffer is too short or lacks the PNG signature.
 */
function assertCodexPngBytes(bytes: Buffer): void {
  const pngSignature = [0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a];
  const longEnough = bytes.length >= 128;
  // Indexing past the end yields undefined, which never matches a signature
  // byte, so short buffers fail this check as well.
  const startsWithSignature = pngSignature.every((expected, i) => bytes[i] === expected);
  if (longEnough && startsWithSignature) {
    return;
  }
  throw new Error(
    `Codex wrote ${bytes.length} bytes but they are not a valid PNG. The image_gen ` +
      'tool did not produce a usable image (check that your Codex CLI is signed in ' +
      'to ChatGPT and that image_gen is enabled).',
  );
}

/**
 * Spawn one headless `codex exec` turn and resolve when it finishes a
 * successful run. Prompt is delivered over stdin (recent Codex CLIs reject
 * a bare `-` argv sentinel — see runtimes/defs/codex.ts). stdout carries
 * the `--json` event stream; stderr carries codex's own logs, which we keep
 * only to attach to a failure message. Both streams are decoded as UTF-8 by
 * the stream itself so a multi-byte character split across two chunks is
 * never corrupted.
 *
 * @param codexBin   Path of the codex executable to spawn.
 * @param prompt     Instruction text written to the child's stdin.
 * @param cwd        Working directory for the child, also passed as `-C`.
 * @param onProgress Optional sink for (truncated) agent messages.
 * @returns A promise that resolves on a zero exit with no unavailable
 *          sentinel seen.
 * @throws Rejects on: the IMAGE_GEN_UNAVAILABLE sentinel, a non-zero exit,
 *         spawn error, or timeout (SIGTERM→SIGKILL).
 */
function runCodexImageGen(
  codexBin: string,
  prompt: string,
  cwd: string,
  onProgress?: ProgressFn,
): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    // Match the sandbox policy the daemon's codex runtime already uses:
    // workspace-write + network on macOS/Linux, danger-full-access only
    // where Codex has no working sandbox (Windows/WSL). image_gen reaches
    // its backend over codex's own connection, not the sandboxed shell, so
    // workspace-write is sufficient (verified locally) and keeps the spawned
    // turn's file writes scoped to our temp workspace.
    const needsDangerFullAccess = codexNeedsDangerFullAccessSandbox();
    const sandboxArgs = needsDangerFullAccess
      ? ['--sandbox', 'danger-full-access']
      : ['--sandbox', 'workspace-write', '-c', 'sandbox_workspace_write.network_access=true'];
    // Mirror codexAgentDef.buildArgs: newer Codex builds honor the
    // permissions config over the legacy sandbox flags, and without this a
    // Windows/WSL launch can stay read-only and fail to write the PNG (#2834).
    const args = [
      'exec', '--json', '--skip-git-repo-check',
      ...sandboxArgs,
      '-c', 'default_permissions=":workspace"',
      '-C', cwd,
    ];

    const child = spawn(codexBin, args, {
      cwd,
      env: process.env,
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    // A fast-exiting codex (bad auth / image_gen unavailable) can close stdin
    // before we finish writing the prompt; the resulting EPIPE surfaces as an
    // async 'error' event that would otherwise crash the daemon. Swallow it —
    // the close handler reports the real exit reason. (Same guard the daemon's
    // main codex spawn path uses.)
    child.stdin.on('error', () => {});
    // Let the streams decode UTF-8 themselves: decoding each chunk on its own
    // turns a multi-byte character that straddles a chunk boundary into
    // replacement characters, which can corrupt a JSON event line.
    child.stdout.setEncoding('utf8');
    child.stderr.setEncoding('utf8');

    let settled = false;
    let unavailable = false;
    let stderrTail = '';
    let stdoutBuf = '';

    // Settle the promise at most once and always cancel the timeout first.
    const finish = (fn: () => void): void => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      fn();
    };

    // Handle one line of the `--json` event stream.
    const consumeLine = (raw: string): void => {
      const line = raw.trim();
      if (!line) return;
      const text = extractCodexAgentText(line);
      if (text) {
        // Anchor on a line that *is* the sentinel (the prompt asks codex to
        // "print exactly IMAGE_GEN_UNAVAILABLE"), so a message that merely
        // mentions the marker can't reject an otherwise-successful run.
        if (text.split('\n').some((l) => l.trim() === CODEX_IMAGE_UNAVAILABLE_MARKER)) {
          unavailable = true;
        }
        if (typeof onProgress === 'function') {
          try {
            onProgress(truncate(text, 200));
          } catch {
            // best-effort: never let a progress emitter kill the render
          }
        }
      }
    };

    child.stdout.on('data', (chunk: string) => {
      stdoutBuf += chunk;
      let nl: number;
      while ((nl = stdoutBuf.indexOf('\n')) >= 0) {
        consumeLine(stdoutBuf.slice(0, nl));
        stdoutBuf = stdoutBuf.slice(nl + 1);
      }
    });
    child.stderr.on('data', (chunk: string) => {
      // Keep only a bounded tail; it is used solely for failure messages.
      stderrTail += chunk;
      if (stderrTail.length > 8000) stderrTail = stderrTail.slice(-8000);
    });

    const timer = setTimeout(() => {
      try {
        child.kill('SIGTERM');
      } catch {
        // ignore
      }
      // Escalate if it ignores SIGTERM. unref so this stray timer can't keep
      // the process alive after we've already rejected.
      setTimeout(() => {
        try {
          child.kill('SIGKILL');
        } catch {
          // ignore
        }
      }, 2000).unref();
      finish(() =>
        reject(
          new Error(
            `codex image_gen timed out after ${Math.round(CODEX_CLI_IMAGE_TIMEOUT_MS / 1000)}s`,
          ),
        ),
      );
    }, CODEX_CLI_IMAGE_TIMEOUT_MS);

    // Deliver the prompt over stdin and close it; codex reads stdin once.
    try {
      child.stdin.end(prompt);
    } catch {
      // If stdin is already gone the close handler reports the real failure.
    }

    child.on('error', (err) => {
      finish(() => reject(err));
    });
    child.on('close', (code, signal) => {
      // Flush a final line that arrived without a trailing newline.
      if (stdoutBuf.trim()) consumeLine(stdoutBuf);
      finish(() => {
        // The sentinel wins over the exit code: codex can exit 0 after
        // reporting that image_gen is unavailable.
        if (unavailable) {
          reject(
            new Error(
              'Codex reported its built-in image_gen tool is unavailable in this session. ' +
                'It requires an official ChatGPT login (not an API key or third-party gateway). ' +
                'Run `codex login` and confirm `image_gen` is enabled, then retry.',
            ),
          );
          return;
        }
        if (code === 0) {
          resolve();
          return;
        }
        const reason = signal ? `signal ${signal}` : `exit ${code}`;
        const tail = stderrTail.trim().split('\n').slice(-12).join('\n');
        reject(new Error(`codex image_gen exited ${reason}` + (tail ? `\n${tail}` : '')));
      });
    });
  });
}

/**
 * Extract the agent's message text from a single `codex exec --json` line.
 *
 * Only a completed `agent_message` item yields text (that is where the
 * OD_IMAGE_SAVED / IMAGE_GEN_UNAVAILABLE sentinels appear). Unparseable
 * lines, other event types, other item types, and non-string text all yield
 * null so the caller can skip them.
 *
 * @param line One line of the JSON event stream.
 * @returns The message text, or null when the line carries none.
 */
function extractCodexAgentText(line: string): string | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(line);
  } catch {
    // Not JSON at all (e.g. a stray log line): nothing to extract.
    return null;
  }

  // Only completed items carry the final agent text; skip streaming partials.
  if (!isRecord(parsed) || parsed.type !== 'item.completed') return null;

  const payload = parsed.item;
  if (!isRecord(payload) || payload.type !== 'agent_message') return null;

  const { text } = payload;
  return typeof text === 'string' ? text : null;
}

// ---------------------------------------------------------------------------
// Stub renderer.
//
Expand Down
Loading
Loading