);
}
/**
 * Shared layout for one endpoint's documentation section.
 *
 * Renders, in order: the title and subtitle, the method followed by the
 * endpoint, a "Parameters" table (Field / Type / Description / Default) with
 * one row per entry of `params`, and the `example` under an "Example" heading.
 *
 * Each `params` entry is read for `name`, `required`, `type`, `desc` and
 * `default`. A truthy `required` adds a `*` marker after the name; a falsy
 * `default` is shown as an em dash.
 *
 * NOTE(review): the wrapping markup is not visible in this view, so this
 * comment describes only the values that are rendered. `toast` is not
 * referenced in the visible body — presumably it is used by markup outside
 * this view (e.g. a copy control); confirm.
 */
function DocSection({ title, sub, endpoint, method, params, example, toast }) {
return (
{title}
{sub}
{method}{endpoint}
Parameters
Field
Type
Description
Default
{params.map(p => (
{p.name}{p.required && *}
{p.type}
{p.desc}
{p.default || '—'}
))}
Example
{example}
);
}
// Docs tab component for the text docs (going by its name); accepts a `toast` prop.
// NOTE(review): the returned element is not visible in this view — presumably
// a DocSection configured for this endpoint with `toast` passed through; confirm.
function DocsText({ toast }) {
return ;
}
// Docs tab component for the image docs (going by its name); accepts a `toast` prop.
// NOTE(review): the returned element is not visible in this view — presumably
// a DocSection configured for this endpoint with `toast` passed through; confirm.
function DocsImage({ toast }) {
return ;
}
// Docs tab component for the voice docs (going by its name); accepts a `toast` prop.
// NOTE(review): the returned element is not visible in this view — presumably
// a DocSection configured for this endpoint with `toast` passed through; confirm.
function DocsVoice({ toast }) {
return ;
}
// Docs tab component for the speech-to-text docs (going by its name); accepts a `toast` prop.
// NOTE(review): the returned element is not visible in this view — presumably
// a DocSection configured for this endpoint with `toast` passed through; confirm.
function DocsStt({ toast }) {
return ;
}
// ───────── Live Call (WebSocket) ─────────
// Custom layout — DocSection is REST-shaped; the Live Call uses a bidirectional
// WS with a JSON control protocol, so we render the docs by hand using the same
// card / code-block / table primitives as the other tabs.
function DocsLive({ toast }) {
// Copies `text` to the system clipboard and reports the outcome through `toast`.
//
// Clipboard.writeText() is asynchronous and rejects when permission is denied
// or the document is not focused, and `navigator.clipboard` is undefined
// outside secure contexts. The write is therefore awaited inside try/catch so
// that "Copied" is only shown after the write actually succeeded and no
// rejection is left unhandled.
//
// Returns a Promise that always resolves (never rejects), so it is safe to use
// directly as an event handler.
const copy = async (text) => {
  try {
    await navigator.clipboard.writeText(text);
    toast('Copied', 'pos');
  } catch (err) {
    console.error('Clipboard write failed', err);
    // NOTE(review): 'neg' is assumed to be the failure counterpart of 'pos' —
    // confirm against the toast implementation.
    toast('Copy failed', 'neg');
  }
};
const wsUrl = 'wss://api.hidrogen.in/v1/live';
const connectParams = [
{
name: 'key', type: 'string', required: 'either',
desc: 'Hidrogen API key (prefix hi_ or sk_). Pass either as ?key=... in the URL OR as an Authorization: Bearer header — pick exactly one. Browsers can only set query params, so the dashboard tester uses ?key=; GSM gateways and server-side clients should prefer the header, which avoids leaking the key into nginx access logs.',
default: '—',
},
{
name: 'input_sample_rate', type: 'integer', required: false,
desc: 'Sample rate (in Hz) of the PCM16 mono audio frames YOU send to the server. Allowed: 8000, 16000, 24000, 48000. Use 8000 for GSM/SIP/telephony bridges — Hidrogen will up-sample to Gemini\'s native 16 kHz transparently. Use 48000 if you\'re feeding raw browser-mic samples without client-side resampling. Anything else and the server resamples.',
default: '16000',
},
{
name: 'output_sample_rate', type: 'integer', required: false,
desc: 'Sample rate (in Hz) you want the AI audio chunks delivered at. Allowed: 8000, 16000, 24000, 48000. Gemini emits at 24 kHz natively; the server resamples to whatever you ask. Use 8000 if you\'re writing the reply straight to a phone line, 24000 (default) for high-quality browser/app playback.',
default: '24000',
},
{
name: 'system_prompt', type: 'string', required: false,
desc: 'URL-encoded system instruction that defines the AI\'s persona, language, and behavior for the entire call. Examples: "You are a courteous customer-support agent. Reply in Hindi unless the caller speaks English." Capped at 4000 chars. Sent to the model exactly once at session start; cannot be changed mid-call.',
default: '—',
},
{
name: 'greet', type: 'string', required: false,
desc: 'Make the AI speak the first turn. Two modes: pass "1" for a sensible default opener ("Greet the user warmly and briefly. Ask how you can help."), or pass any URL-encoded directive of your own ("Namaste! Briefly introduce yourself in Hinglish."). The text is sent as a hidden user turn — Gemini responds in audio so the caller hears the greeting the moment the line opens. Capped at 1000 chars. Counts as a billed turn (same per-token pricing as any other turn).',
default: 'off (AI waits for caller audio)',
},
];
const protocol = [
{
dir: '← server',
frame: '{"type":"ready"}',
meaning: 'Sent once, after Gemini accepts the setup. From this point you may send audio. If you opened with ?greet=…, the AI will start emitting "audio" frames immediately after this — be ready to play.',
},
{
dir: '→ client',
frame: '{"type":"audio","data":""}',
meaning: 'Microphone / telephony audio chunk. PCM16 mono at the input_sample_rate you connected with, base64-encoded. Recommended chunk size: ~20–100 ms of audio (avoid mega-frames; the server forwards each one to Gemini as it arrives).',
},
{
dir: '→ client',
frame: '{"type":"end"}',
meaning: 'Explicit end-of-utterance signal. The server VAD already detects end-of-speech automatically, so this is only useful for pipelines without natural pauses (e.g. you\'re concatenating recorded clips). Safe to omit.',
},
{
dir: '→ client',
frame: '{"type":"text","text":"..."}',
meaning: 'Send a text message in the middle of a voice call (e.g. inject a quick instruction from your back-end logic). The model replies with audio as usual. Doesn\'t replace ongoing audio input — both streams coexist.',
},
{
dir: '← server',
frame: '{"type":"audio","data":""}',
meaning: 'AI voice reply chunk. PCM16 mono at the output_sample_rate you requested. Append directly to your speaker / phone-line buffer; chunks arrive in playback order.',
},
{
dir: '← server',
frame: '{"type":"interrupted"}',
meaning: 'The caller spoke while the AI was speaking ("barge-in"). The AI has stopped emitting audio for the current turn — flush every queued/scheduled audio buffer on your side immediately so the AI\'s tail doesn\'t bleed over the new question.',
},
{
dir: '← server',
frame: '{"type":"turn_complete"}',
meaning: 'The AI finished its current turn — no more audio frames coming until the caller speaks again. Use this to mute "AI speaking" indicators in your UI.',
},
{
dir: '← server',
frame: '{"type":"ending","reason":"..."}',
meaning: 'Call is about to close (the server closes the socket within a few hundred ms after this). reason ∈ {max_duration, insufficient_balance, server_goaway, upstream_closed, idle_timeout}. Final billing is settled before the close.',
},
{
dir: '← server',
frame: '{"type":"error","code":"...","message":"..."}',
meaning: 'Fatal error — the socket will close right after this frame. code ∈ {auth, insufficient_balance, upstream, server}. Inspect message for the human-readable cause.',
},
];
// Close-code reference — emitted on the WS close event.
const closeCodes = [
{ code: '1000', label: 'normal', desc: 'Clean close. Either side asked to end the call, or the upstream session reached its natural end.' },
{ code: '4401', label: 'auth', desc: 'API key missing, malformed, revoked, or belongs to a deactivated user.' },
{ code: '4402', label: 'insufficient_balance', desc: 'Wallet balance fell below min_balance_paise (set in Admin → Settings). Top up and reconnect.' },
{ code: '4408', label: 'timeout', desc: 'Either no caller audio for 60 s (idle) or the 15-minute hard session cap (max_duration).' },
{ code: '4502', label: 'upstream_error', desc: 'Gemini Live API rejected the session, refused the model ID, or dropped the connection mid-call.' },
];
const nodeExample = `// npm i ws
// Streams a raw PCM16 16 kHz mono file to Hidrogen Live and saves the reply.
const WebSocket = require('ws');
const fs = require('fs');
const API_KEY = 'hi_xxx'; // your Hidrogen API key
// Optional: make the AI speak first. Pass "1" for a default opener, or any
// custom directive — the model treats it as a hidden user turn and replies in audio.
const greet = encodeURIComponent('Greet the user warmly in one short sentence, then ask how you can help.');
const ws = new WebSocket(
'${wsUrl}?key=' + API_KEY +
'&input_sample_rate=16000&output_sample_rate=24000' +
'&greet=' + greet
);
ws.on('open', () => console.log('connecting…'));
const out = fs.createWriteStream('reply-24k.pcm');
ws.on('message', (raw) => {
let m; try { m = JSON.parse(raw.toString()); } catch { return; }
if (m.type === 'ready') {
// setup done — stream a PCM16 file in ~100 ms chunks (3200 bytes @ 16 kHz)
const pcm = fs.readFileSync('mic-16k-mono.pcm');
for (let i = 0; i < pcm.length; i += 3200) {
const chunk = pcm.subarray(i, i + 3200);
ws.send(JSON.stringify({ type: 'audio', data: chunk.toString('base64') }));
}
ws.send(JSON.stringify({ type: 'end' }));
}
if (m.type === 'audio') out.write(Buffer.from(m.data, 'base64'));
if (m.type === 'interrupted') console.log('barge-in — flush playback');
if (m.type === 'turn_complete') console.log('AI finished turn');
if (m.type === 'ending') console.log('call ending:', m.reason);
if (m.type === 'error') console.error('error:', m.code, m.message);
});
ws.on('close', (code) => { out.end(); console.log('closed', code); });`;
const gsmExample = `// GSM / telephony gateway — connect at 8 kHz, Hidrogen resamples both ways.
// Use the Authorization header from non-browser clients to keep the key out of URLs/logs.
// "greet" makes the AI speak first so the caller hears a greeting the moment the line opens.
const url = new URL('${wsUrl}');
url.searchParams.set('input_sample_rate', '8000');
url.searchParams.set('output_sample_rate', '8000');
url.searchParams.set('greet', 'Namaste! Hidrogen support me aapka swagat hai. Aap kaise sahayata chahte hain?');
const ws = new WebSocket(url.toString(), {
headers: { Authorization: 'Bearer hi_xxx' },
});
// On {type:'ready'}, pipe inbound phone audio (PCM16 8 kHz mono) as base64 JSON frames.
// On {type:'audio'}, decode base64 PCM16 8 kHz and write to the phone line.`;
return (
Live Call (real-time voice)
Bidirectional voice WebSocket — talk to the AI, hear it reply in real time.
Connect any audio source: a browser mic, a mobile app, or a GSM/telephony gateway bridging a phone call.
{/* Endpoint badge row */}
WSS{wsUrl}
{/* Authentication */}
Authentication
Every connection requires a Hidrogen API key (the same hi_… keys you use for the REST endpoints). Choose one of two delivery methods — never both at once:
HTTP headerAuthorization: Bearer hi_xxx — the recommended method for any server-side or GSM/telephony client. The key never appears in URLs or proxy logs, and the connection is indistinguishable from a normal REST call to security tooling.
Query parameter?key=hi_xxx — the only option for browser clients, since the WebSocket API doesn't expose request headers. The key will appear in nginx access logs unless you scrub it; rotate keys regularly if browser auth is your main path.
Authentication failures close the socket with code 4401. Successful connections share the same key cache as the REST endpoints, so a key already warm from /v1/chat/completions is a cache hit here too.
All audio — both directions — is 16-bit signed little-endian PCM, single-channel (mono), base64-encoded inside the JSON frames described below. No headers, no containers, no compression — just raw samples. Send chunks as fast as you produce them (typically 20–100 ms per chunk); the server forwards each one to Gemini immediately.
Default sample rates: 16 kHz for what you send (matches Gemini's native input), 24 kHz for what you receive (matches Gemini's native output). Both are overridable via the input_sample_rate / output_sample_rate query parameters; the server resamples transparently using linear interpolation.
For GSM / SIP / telephony bridges: set both to 8000. Your bridge can pipe phone-line PCM straight in and write the AI's reply straight back to the call without doing any sample-rate conversion of its own.
Allowed values: 8000, 16000, 24000, 48000. Anything outside that list is rejected with close code 4400.
{/* Protocol */}
JSON message protocol
Every frame in both directions is a UTF-8 JSON text frame (no binary WebSocket frames are used). The type field discriminates the shape; unknown types are ignored. Frames are processed in arrival order with no buffering on either side.
Direction
Frame
Meaning
{protocol.map((p, i) => (
{p.dir}
{p.frame}
{p.meaning}
))}
{/* WebSocket close codes */}
WebSocket close codes
When the server closes the socket, the close event carries a numeric code. Use it to distinguish "the call ended normally" from "fix something and retry":
Code
Label
Description
{closeCodes.map(c => (
{c.code}
{c.label}
{c.desc}
))}
{/* Node example */}
Example — Node.js client
{nodeExample}
{/* GSM example */}
Example — GSM / telephony bridge
{gsmExample}
Most telephony stacks (Asterisk AudioSocket, SIP gateways, SIM800/SIM900 modems via a host program) deliver 8 kHz µ-law or A-law audio. Decode to PCM16 in your bridge process, connect with input_sample_rate=8000 and output_sample_rate=8000, and Hidrogen handles the 8 ↔ 16/24 kHz resampling so you can write the reply straight back to the phone line.