From 1ca7772ea4d8691f85197051fb5b71116805758d Mon Sep 17 00:00:00 2001 From: racct <elias@adornis.de> Date: Thu, 12 Jun 2025 07:44:35 +0000 Subject: [PATCH 1/2] feat: provide call metadata to function calls --- lab/voice-assistant/call-providers/twilio.ts | 22 +++++--- .../llm-providers/google-ai.ts | 14 ++++- lab/voice-assistant/llm-providers/openai.ts | 55 ++++++++++++++----- lab/voice-assistant/voice-agent.ts | 2 + 4 files changed, 70 insertions(+), 23 deletions(-) diff --git a/lab/voice-assistant/call-providers/twilio.ts b/lab/voice-assistant/call-providers/twilio.ts index 1914bfd3f4..979358a464 100644 --- a/lab/voice-assistant/call-providers/twilio.ts +++ b/lab/voice-assistant/call-providers/twilio.ts @@ -1,12 +1,12 @@ +import { A } from '@adornis/base/env-info.js'; import { logger } from '@adornis/base/logging.js'; import express from 'express'; -import { WebSocketServer } from 'ws'; import type { WebSocket } from 'ws'; -import { A } from '@adornis/base/env-info.js'; +import { WebSocketServer } from 'ws'; import type { ICallProvider, ILLMSession } from '../voice-agent.js'; // --- Twilio WebSocket audio streaming --- -const twilioWebsockets = new WeakMap<WebSocket, string>(); +const twilioWebsockets = new WeakMap<WebSocket, { streamSid: string; callSid: string }>(); // function saveBinaryFile(fileName: string, content: Buffer) { // writeFile(fileName, content, 'utf8', err => { @@ -17,8 +17,8 @@ const twilioWebsockets = new WeakMap<WebSocket, string>(); // } async function sendAudioToTwilio(buffer: Buffer, ws: WebSocket) { - const streamSid = twilioWebsockets.get(ws); - if (!streamSid) { + const twilioWebsocketInfo = twilioWebsockets.get(ws); + if (!twilioWebsocketInfo) { logger.warn('No active Twilio WebSocket connection to send audio.'); return; } @@ -29,7 +29,7 @@ async function sendAudioToTwilio(buffer: Buffer, ws: WebSocket) { return ws.send( JSON.stringify({ event: 'media', - streamSid, + streamSid: twilioWebsocketInfo.streamSid, media: { payload: mulawBase64 }, }), ); @@ -46,7 +46,7 @@ function flushTwilioStream(ws: WebSocket) { ws.send( JSON.stringify({ event: 'clear', - streamSid: twilioWebsockets.get(ws), + streamSid: twilioWebsockets.get(ws)?.streamSid, }), ); } @@ -85,7 +85,7 @@ export const twilioCallProvider: ICallProvider = { // }); wss.on('connection', ws => { - twilioWebsockets.set(ws, ''); // Track the active Twilio websocket + twilioWebsockets.set(ws, { streamSid: '', callSid: '' }); // Track the active Twilio websocket ws.on('close', () => { twilioWebsockets.delete(ws); }); @@ -99,7 +99,7 @@ export const twilioCallProvider: ICallProvider = { break; case 'start': console.log(`Starting media stream...`); - twilioWebsockets.set(ws, msg.start.streamSid); + twilioWebsockets.set(ws, { streamSid: msg.start.streamSid, callSid: msg.start.callSid }); // mediaStreamSaver.twilioStreamStart(); llmSession = await llmProvider.init({ config, @@ -110,6 +110,10 @@ export const twilioCallProvider: ICallProvider = { const callAudioChunk = audioBufferTransformerProvider.llmToCall(audioChunk); void sendAudioToTwilio(callAudioChunk, ws); }, + callMetaData: { + callSid: msg.start.callSid, + streamSid: msg.start.streamSid, + }, }); break; diff --git a/lab/voice-assistant/llm-providers/google-ai.ts b/lab/voice-assistant/llm-providers/google-ai.ts index 12db08d92f..2c78640bc3 100644 --- a/lab/voice-assistant/llm-providers/google-ai.ts +++ b/lab/voice-assistant/llm-providers/google-ai.ts @@ -33,6 +33,7 @@ const sessionManagement = new WeakMap< sendAudioChunkToCall: (audioChunk: Buffer) => void | Promise<void>; closeAudioStream: () => void | Promise<void>; flushAudioStream: () => void | Promise<void>; + callMetaData: any; } >(); @@ -85,6 +86,7 @@ function handleModelTurn(message: LiveServerMessage, session: Session) { if (!sessionInfo) return; // TODO needs testing with real tools as soon as gemini properly supports tools + // TODO provide meta data to tools // if (message.toolCall) { // logger.info({ toolCall: message.toolCall }, 'toolCall'); @@ -148,6 +150,7 @@ export async function startLiveSession( sendAudioChunkToCall: (audioChunk: Buffer) => void | Promise<void>, closeAudioStream: () => void | Promise<void>, flushAudioStream: () => void | Promise<void>, + callMetaData: any, ) { const ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY, @@ -211,6 +214,7 @@ export async function startLiveSession( sendAudioChunkToCall, closeAudioStream, flushAudioStream, + callMetaData, }); } @@ -267,6 +271,7 @@ export async function startLiveSession( sendAudioChunkToCall, closeAudioStream, flushAudioStream, + callMetaData, }); } @@ -294,8 +299,15 @@ export const googleAiLLMProvider: ILLMProvider = { flushAudioStream, closeAudioStream, sendAudioChunkToCall, + callMetaData, }) => { - const session = await startLiveSession(config, sendAudioChunkToCall, closeAudioStream, flushAudioStream); + const session = await startLiveSession( + config, + sendAudioChunkToCall, + closeAudioStream, + flushAudioStream, + callMetaData, + ); return { sendAudioChunk: (audioChunk: Buffer) => { session.sendRealtimeInput({ diff --git a/lab/voice-assistant/llm-providers/openai.ts b/lab/voice-assistant/llm-providers/openai.ts index 4092774af0..8b82b6e0ed 100644 --- a/lab/voice-assistant/llm-providers/openai.ts +++ b/lab/voice-assistant/llm-providers/openai.ts @@ -7,12 +7,13 @@ import type { ILLMProvider, LLMProviderConfig } from '../voice-agent.js'; const sessionManagement = new WeakMap< OpenAIRealtimeWebSocket, { - config: any; + config: LLMProviderConfig; isWorking: boolean; responseQueue: Array<ResponseAudioDeltaEvent | ResponseDoneEvent>; sendAudioChunkToCall: (audioChunk: Buffer) => void | Promise<void>; closeAudioStream: () => void | Promise<void>; flushAudioStream: () => void | Promise<void>; + callMetaData: any; } >(); @@ -83,6 +84,7 @@ export async function startLiveSession( sendAudioChunkToCall: (audioChunk: Buffer) => void | Promise<void>, closeAudioStream: () => void | Promise<void>, flushAudioStream: () => void | Promise<void>, + callMetaData: any, ) { const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-4o-realtime-preview-2024-12-17', @@ -106,6 +108,7 @@ export async function startLiveSession( sendAudioChunkToCall, closeAudioStream, flushAudioStream, + callMetaData, }); } @@ -138,20 +141,39 @@ export async function startLiveSession( } }); - rt.on('response.done', message => { - sessionManagement.get(rt)!.responseQueue.push(message); + rt.on('response.done', async message => { + const sessionInfo = sessionManagement.get(rt); + if (!sessionInfo) return; + sessionInfo.responseQueue.push(message); for (const outputObject of message.response.output ?? []) { if (outputObject.type === 'function_call') { logger.info({ functionCall: outputObject }, 'function call'); - rt.send({ - type: 'conversation.item.create', - item: { - type: 'function_call_output', - call_id: outputObject.call_id, - output: 'Dossier Nummer existiert in Datenbank.', - }, - }); + const functionCallDefinition = sessionInfo.config.tools?.find(tool => tool.name === outputObject.name); + + if (!functionCallDefinition) { + logger.error({ functionCallDefinition }, 'function call definition not found'); + rt.send({ + type: 'conversation.item.create', + item: { + type: 'function_call_output', + call_id: outputObject.call_id, + output: 'Fehler: Funktion nicht gefunden.', + }, + }); + return; + } else { + const result = await functionCallDefinition.resolve(outputObject.arguments, sessionInfo.callMetaData); + rt.send({ + type: 'conversation.item.create', + item: { + type: 'function_call_output', + call_id: outputObject.call_id, + output: result, + }, + }); + } + rt.send({ type: 'response.create', response: { @@ -185,6 +207,7 @@ export async function startLiveSession( sendAudioChunkToCall, closeAudioStream, flushAudioStream, + callMetaData, }); } @@ -237,8 +260,14 @@ export async function startLiveSession( } export const openaiLLMProvider: ILLMProvider = { - init: async ({ config, flushAudioStream, closeAudioStream, sendAudioChunkToCall }) => { - const session = await startLiveSession(config, sendAudioChunkToCall, closeAudioStream, flushAudioStream); + init: async ({ config, flushAudioStream, closeAudioStream, sendAudioChunkToCall, callMetaData }) => { + const session = await startLiveSession( + config, + sendAudioChunkToCall, + closeAudioStream, + flushAudioStream, + callMetaData, + ); return { sendAudioChunk: (audioChunk: Buffer) => { session.send({ diff --git a/lab/voice-assistant/voice-agent.ts b/lab/voice-assistant/voice-agent.ts index 915c4293fe..9fce0755ba 100644 --- a/lab/voice-assistant/voice-agent.ts +++ b/lab/voice-assistant/voice-agent.ts @@ -38,12 +38,14 @@ export interface ILLMProvider { flushAudioStream, closeAudioStream, sendAudioChunkToCall, + callMetaData, }: { config: LLMProviderConfig; audioBufferTransformerProvider: IAudioBufferTransformerProvider; flushAudioStream: () => Promise<void> | void; closeAudioStream: () => Promise<void> | void; sendAudioChunkToCall: (audioChunk: Buffer) => Promise<void> | void; + callMetaData?: any; }) => Promise<ILLMSession>; } -- GitLab From a1090714adc22b0d5f7fa9d7bc4552fb2e91659d Mon Sep 17 00:00:00 2001 From: racct <elias@adornis.de> Date: Thu, 12 Jun 2025 07:44:51 +0000 Subject: [PATCH 2/2] feat: add twilio redirect to human tool --- lab/voice-assistant/package.json | 1 + .../tools/twilio-redirect-to-human.ts | 40 +++++++++++++++++++ pnpm-lock.yaml | 20 ++++++++++ 3 files changed, 61 insertions(+) create mode 100644 lab/voice-assistant/tools/twilio-redirect-to-human.ts diff --git a/lab/voice-assistant/package.json b/lab/voice-assistant/package.json index b0915cd197..e679e8f9a3 100644 --- a/lab/voice-assistant/package.json +++ b/lab/voice-assistant/package.json @@ -14,6 +14,7 @@ "alawmulaw": "^6.0.0", "express": "^4.18.2", "openai": "^4.100.0", + "twilio": "^5.7.0", "ws": "^8.18.2" }, "devDependencies": { diff --git a/lab/voice-assistant/tools/twilio-redirect-to-human.ts b/lab/voice-assistant/tools/twilio-redirect-to-human.ts new file mode 100644 index 0000000000..f3e27c57a1 --- /dev/null +++ b/lab/voice-assistant/tools/twilio-redirect-to-human.ts @@ -0,0 +1,40 @@ +import { A } from '@adornis/base/env-info.js'; +import twilio from 'twilio'; + +export const twilioRedirectToHuman = ({ + twilioNumber, + redirectNumber, + twilioAccountSid, + twilioAuthToken, +}: { + twilioNumber: string; + redirectNumber: string; + twilioAccountSid: string; + twilioAuthToken: string; +}) => { + return { + name: 'redirect_call', + description: 'Leitet den Anruf weiter an einen Supervisor', + type: 'function', + resolve: (params: any, callMetaData: { callSid: string; streamSid: string }) => { + console.log('redirect to supervisor', params, callMetaData); + const client = twilio(twilioAccountSid, twilioAuthToken); + client.calls(callMetaData.callSid).update({ + twiml: `<Response><Dial> + <Conference startConferenceOnEnter="true" endConferenceOnExit="true">${A.getGloballyUniqueID()}</Conference> + </Dial></Response>`, + }); + + client.calls.create({ + to: redirectNumber, + from: twilioNumber, + twiml: `<Response> + <Say> Sie werden verbunden mit dem Kunden. </Say> + <Dial> + <Conference startConferenceOnEnter="true" endConferenceOnExit="true">${A.getGloballyUniqueID()}</Conference> + </Dial></Response>`, + }); + return 'Erfolgreich an Supervisor weitergeleitet.'; + }, + }; +}; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0b6eb60303..0543ee1fef 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1014,6 +1014,9 @@ importers: openai: specifier: ^4.100.0 version: 4.100.0(encoding@0.1.13)(ws@8.18.2)(zod@3.24.4) + twilio: + specifier: ^5.7.0 + version: 5.7.0 ws: specifier: ^8.18.2 version: 8.18.2 @@ -11343,6 +11346,10 @@ packages: resolution: {integrity: sha512-yrRh6immcL5xEVX7VmHsl3vU01x/fmqxf38kvxMrrtvEtAkYARYOPor9lt5T7964zC7l31k5sTrnLJmd2jjeOA==} engines: {node: '>=14.0'} + twilio@5.7.0: + resolution: {integrity: sha512-AcN9jo/C0sFitprIg2G6CJF+EACvff+8fiTMxf7Puz+6jtmc0NgJTwmyQbPiAnJcpXWOrPdI92Obr3PV4ZKXkw==} + engines: {node: '>=14.0'} + type-check@0.3.2: resolution: {integrity: sha512-ZCmOJdvOWDBYJlzAoFkC+Q0+bUyEOS1ltgp1MGU03fqHG+dbi9tBFU2Rd9QKiDZFAYrhPh2JUf7rZRIuHRKtOg==} engines: {node: '>= 0.8.0'} @@ -21285,6 +21292,19 @@ snapshots: - debug - supports-color + twilio@5.7.0: + dependencies: + axios: 1.9.0(debug@4.4.1) + dayjs: 1.11.13 + https-proxy-agent: 5.0.1 + jsonwebtoken: 9.0.2 + qs: 6.14.0 + scmp: 2.1.0 + xmlbuilder: 13.0.2 + transitivePeerDependencies: + - debug + - supports-color + type-check@0.3.2: dependencies: prelude-ls: 1.1.2 -- GitLab