diff --git a/__tests__/vapi-early-media.test.ts b/__tests__/vapi-early-media.test.ts
new file mode 100644
index 000000000..62220525e
--- /dev/null
+++ b/__tests__/vapi-early-media.test.ts
@@ -0,0 +1,256 @@
+/**
+ * Tests for early getUserMedia acquisition in Vapi.start()
+ *
+ * Mobile browsers enforce strict "user gesture" policies that only allow
+ * a short window (~1-5s) between a user tap and a getUserMedia() call.
+ * The SDK must call getUserMedia() BEFORE making any network requests
+ * (e.g., the web call creation API call) to stay within that window.
+ *
+ * VAP-12773: https://linear.app/vapi/issue/VAP-12773
+ */
+
+// Ordered log of mock invocations; the `mock` prefix lets jest.mock() factories reference it
+let mockCallOrder: string[] = [];
+
+// Mock MediaStreamTrack
+const mockAudioTrack = {
+  kind: 'audio',
+  id: 'mock-audio-track-id',
+  enabled: true,
+  stop: jest.fn(),
+  addEventListener: jest.fn(),
+  removeEventListener: jest.fn(),
+} as unknown as MediaStreamTrack;
+
+const mockMediaStream = {
+  getAudioTracks: () => [mockAudioTrack],
+  getTracks: () => [mockAudioTrack],
+} as unknown as MediaStream;
+
+// Mock navigator.mediaDevices.getUserMedia
+const mockGetUserMedia = jest.fn().mockImplementation(async () => {
+  mockCallOrder.push('getUserMedia');
+  return mockMediaStream;
+});
+
+// Set up global navigator mock
+Object.defineProperty(global, 'navigator', {
+  value: {
+    mediaDevices: {
+      getUserMedia: mockGetUserMedia,
+    },
+    userAgent:
+      'Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15',
+  },
+  writable: true,
+  configurable: true,
+});
+
+// Set up global document mock for audio player creation
+Object.defineProperty(global, 'document', {
+  value: {
+    createElement: jest.fn().mockReturnValue({
+      dataset: {},
+      style: { setProperty: jest.fn() },
+      play: jest.fn().mockResolvedValue(undefined),
+      muted: false,
+      autoplay: false,
+      srcObject: null,
+    }),
+    body: {
+      appendChild: jest.fn(),
+    },
+    querySelector: jest.fn().mockReturnValue(null),
+  },
+  writable: true,
+  configurable: true,
+});
+
+// Mock Daily.co
+const mockDailyCallInstance = {
+  join: jest.fn().mockResolvedValue(undefined),
+  destroy: jest.fn().mockResolvedValue(undefined),
+  iframe: jest.fn().mockReturnValue({
+    style: { setProperty: jest.fn() },
+  }),
+  on: jest.fn(),
+  sendAppMessage: jest.fn(),
+  setLocalAudio: jest.fn(),
+  localAudio: jest.fn().mockReturnValue(true),
+  startRecording: jest.fn(),
+  stopRecording: jest.fn(),
+  startRemoteParticipantsAudioLevelObserver: jest.fn(),
+  updateInputSettings: jest.fn(),
+  updateParticipant: jest.fn(),
+  setInputDevicesAsync: jest.fn().mockResolvedValue(undefined),
+};
+
+jest.mock('@daily-co/daily-js', () => ({
+  __esModule: true,
+  default: {
+    createCallObject: jest.fn().mockImplementation(() => {
+      mockCallOrder.push('createCallObject');
+      return mockDailyCallInstance;
+    }),
+  },
+}));
+
+// Mock the API client
+jest.mock('../client', () => ({
+  client: {
+    baseUrl: 'https://api.vapi.ai',
+    setSecurityData: jest.fn(),
+    call: {
+      callControllerCreateWebCall: jest.fn().mockImplementation(async () => {
+        mockCallOrder.push('apiCall');
+        // Simulate network delay
+        await new Promise((resolve) => setTimeout(resolve, 50));
+        return {
+          data: {
+            id: 'test-call-id',
+            webCallUrl: 'https://test.daily.co/test-room',
+            artifactPlan: { videoRecordingEnabled: false },
+            assistant: { voice: { provider: 'default' } },
+          },
+        };
+      }),
+    },
+  },
+}));
+
+import Vapi from '../vapi';
+import DailyIframe from '@daily-co/daily-js';
+
+describe('Vapi.start() - Early getUserMedia Acquisition (VAP-12773)', () => {
+  let vapi: Vapi;
+
+  beforeEach(() => {
+    mockCallOrder = [];
+    jest.clearAllMocks();
+    vapi = new Vapi('test-token');
+  });
+
+  afterEach(async () => {
+    try {
+      await vapi.stop();
+    } catch {
+      // Ignore cleanup errors
+    }
+  });
+
+  it('should call getUserMedia BEFORE the API call to create the web call', async () => {
+    await vapi.start('test-assistant-id');
+
+    // Verify getUserMedia was called
+    expect(mockGetUserMedia).toHaveBeenCalled();
+
+    // Verify the order: getUserMedia must come before the API call
+    const getUserMediaIndex = mockCallOrder.indexOf('getUserMedia');
+    const apiCallIndex = mockCallOrder.indexOf('apiCall');
+
+    expect(getUserMediaIndex).not.toBe(-1);
+    expect(apiCallIndex).not.toBe(-1);
+    expect(getUserMediaIndex).toBeLessThan(apiCallIndex);
+  });
+
+  it('should pass the pre-acquired audio track to DailyIframe.createCallObject', async () => {
+    await vapi.start('test-assistant-id');
+
+    expect(DailyIframe.createCallObject).toHaveBeenCalledWith(
+      expect.objectContaining({
+        audioSource: mockAudioTrack,
+      }),
+    );
+  });
+
+  it('should still work when getUserMedia fails (fallback to default behavior)', async () => {
+    mockGetUserMedia.mockRejectedValueOnce(new Error('Permission denied'));
+
+    const result = await vapi.start('test-assistant-id');
+
+    // Should still proceed with the call (DailyIframe handles getUserMedia internally as fallback)
+    expect(DailyIframe.createCallObject).toHaveBeenCalledWith(
+      expect.objectContaining({
+        audioSource: true,
+      }),
+    );
+    expect(result).not.toBeNull();
+  });
+
+  it('should request audio-only from getUserMedia (not video)', async () => {
+    await vapi.start('test-assistant-id');
+
+    expect(mockGetUserMedia).toHaveBeenCalledWith({ audio: true });
+  });
+
+  it('should stop pre-acquired tracks on cleanup if call creation fails', async () => {
+    // Add error listener to prevent EventEmitter from throwing on 'error' events
+    const errorHandler = jest.fn();
+    vapi.on('error', errorHandler);
+
+    // Make the API call fail
+    const { client } = require('../client');
+    client.call.callControllerCreateWebCall.mockRejectedValueOnce(
+      new Error('API Error'),
+    );
+
+    const result = await vapi.start('test-assistant-id');
+
+    // The call should have failed gracefully
+    expect(result).toBeNull();
+
+    // The pre-acquired track should be stopped to free the microphone
+    expect(mockAudioTrack.stop).toHaveBeenCalled();
+
+    // Clean up
+    vapi.removeListener('error', errorHandler);
+  });
+
+  it('should accept a pre-acquired MediaStream in start options', async () => {
+    const userProvidedTrack = {
+      kind: 'audio',
+      id: 'user-provided-track',
+      enabled: true,
+      stop: jest.fn(),
+    } as unknown as MediaStreamTrack;
+
+    const userProvidedStream = {
+      getAudioTracks: () => [userProvidedTrack],
+      getTracks: () => [userProvidedTrack],
+    } as unknown as MediaStream;
+
+    await vapi.start('test-assistant-id', undefined, undefined, undefined, undefined, {
+      mediaStream: userProvidedStream,
+    });
+
+    // Should NOT call getUserMedia when a stream is provided
+    expect(mockGetUserMedia).not.toHaveBeenCalled();
+
+    // Should use the user-provided track
+    expect(DailyIframe.createCallObject).toHaveBeenCalledWith(
+      expect.objectContaining({
+        audioSource: userProvidedTrack,
+      }),
+    );
+  });
+
+  it('should not call getUserMedia when start is called without being in a gesture context but audioSource is already a track', async () => {
+    // A MediaStreamTrack configured as audioSource in the constructor must not trigger getUserMedia.
+    // The shared `vapi` is replaced (not shadowed) so afterEach() stops this instance too.
+    const existingTrack = {
+      kind: 'audio',
+      id: 'existing-track',
+      enabled: true,
+      stop: jest.fn(),
+    } as unknown as MediaStreamTrack;
+
+    vapi = new Vapi('test-token', undefined, undefined, {
+      audioSource: existingTrack,
+    });
+
+    await vapi.start('test-assistant-id');
+
+    // Should NOT call getUserMedia when audioSource is already a track
+    expect(mockGetUserMedia).not.toHaveBeenCalled();
+  });
+});
diff --git a/vapi.ts b/vapi.ts
index e659313ff..0a08bf24a 100644
--- a/vapi.ts
+++ b/vapi.ts
@@ -193,6 +193,23 @@ type StartCallOptions = {
    * @example true
    */
   roomDeleteOnUserLeaveEnabled?: boolean;
+  /**
+   * A pre-acquired MediaStream to use for the call. When provided, the SDK will
+   * skip its own `getUserMedia()` call and use this stream's audio track instead.
+   *
+   * Useful when the caller acquires the microphone early in the user-gesture lifecycle
+   * (e.g., in a button click handler) to avoid mobile `NotAllowedError` timeouts. The
+   * stream stays caller-owned: the `start()` error handler does not stop its tracks.
+   *
+   * @example
+   * ```ts
+   * const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+   * await vapi.start('assistant-id', undefined, undefined, undefined, undefined, {
+   *   mediaStream: stream,
+   * });
+   * ```
+   */
+  mediaStream?: MediaStream;
 }
 
 type WebCall = {
@@ -394,6 +411,83 @@ export default class Vapi extends VapiEventEmitter {
 
     this.started = true;
 
+    // Decide whether to acquire the microphone eagerly. On mobile browsers the
+    // "user gesture" window is very short (~1-5 s), so getUserMedia() must run
+    // before any network request. Skip it when a track already exists (constructor
+    // audioSource or options.mediaStream) or when audio is disabled (audioSource false).
+    let earlyAudioTrack: MediaStreamTrack | null = null;
+
+    const userProvidedStream = options?.mediaStream;
+    const constructorAudioSource = this.dailyCallObject.audioSource;
+    const skipEarlyAcquisition =
+      Boolean(userProvidedStream) ||
+      constructorAudioSource === false ||
+      (constructorAudioSource != null &&
+        typeof constructorAudioSource === 'object' &&
+        'kind' in (constructorAudioSource as any));
+
+    if (!skipEarlyAcquisition) {
+      // Acquire the microphone NOW, while we are still inside the user
+      // gesture window, before any async network calls.
+      this.emit('call-start-progress', {
+        stage: 'early-media-acquisition',
+        status: 'started',
+        timestamp: new Date().toISOString(),
+      });
+
+      const earlyMediaStartTime = Date.now();
+
+      try {
+        // A string audioSource is a device id: ask for exactly that microphone
+        // instead of silently substituting the default one.
+        const stream = await navigator.mediaDevices.getUserMedia({
+          audio:
+            typeof constructorAudioSource === 'string'
+              ? { deviceId: { exact: constructorAudioSource } }
+              : true,
+        });
+        earlyAudioTrack = stream.getAudioTracks()[0] ?? null;
+
+        const earlyMediaDuration = Date.now() - earlyMediaStartTime;
+        this.emit('call-start-progress', {
+          stage: 'early-media-acquisition',
+          status: 'completed',
+          duration: earlyMediaDuration,
+          timestamp: new Date().toISOString(),
+          metadata: { trackId: earlyAudioTrack?.id },
+        });
+      } catch (mediaError) {
+        const earlyMediaDuration = Date.now() - earlyMediaStartTime;
+        const serializedMediaError = serializeError(mediaError);
+        this.emit('call-start-progress', {
+          stage: 'early-media-acquisition',
+          status: 'failed',
+          duration: earlyMediaDuration,
+          timestamp: new Date().toISOString(),
+          metadata: { error: serializedMediaError.message },
+        });
+        // Non-fatal: fall back to letting Daily.co handle getUserMedia itself.
+        // This path may still fail on mobile due to the gesture timeout, but
+        // it preserves backward compatibility on desktop and other environments.
+      }
+    } else if (userProvidedStream) {
+      // Use the caller-provided MediaStream
+      earlyAudioTrack = userProvidedStream.getAudioTracks()[0] ?? null;
+      this.emit('call-start-progress', {
+        stage: 'early-media-acquisition',
+        status: 'completed',
+        timestamp: new Date().toISOString(),
+        metadata: { source: 'user-provided', trackId: earlyAudioTrack?.id },
+      });
+    } else {
+      this.emit('call-start-progress', {
+        stage: 'early-media-acquisition',
+        status: 'completed',
+        timestamp: new Date().toISOString(),
+        metadata: { source: 'constructor-audio-source' },
+      });
+    }
+
     try {
       // Stage 1: Create web call
       this.emit('call-start-progress', {
@@ -401,9 +495,9 @@ export default class Vapi extends VapiEventEmitter {
         status: 'started',
         timestamp: new Date().toISOString()
       });
-      
+
       const webCallStartTime = Date.now();
-      
+
       const webCall = (
         await client.call.callControllerCreateWebCall({
           assistant: typeof assistant === 'string' ? undefined : assistant,
@@ -446,24 +540,30 @@ export default class Vapi extends VapiEventEmitter {
 
       const isVideoEnabled = webCall?.assistant?.voice?.provider === 'tavus';
 
+      // Determine the audioSource for the Daily call object.
+      // If we pre-acquired a track, use it so Daily.co does not call
+      // getUserMedia again (which would fail outside the gesture window).
+      const resolvedAudioSource: boolean | string | MediaStreamTrack =
+        earlyAudioTrack ?? this.dailyCallObject.audioSource ?? true;
+
       // Stage 2: Create Daily call object
       this.emit('call-start-progress', {
         stage: 'daily-call-object-creation',
         status: 'started',
         timestamp: new Date().toISOString(),
         metadata: {
-          audioSource: this.dailyCallObject.audioSource ?? true,
+          audioSource: earlyAudioTrack ? 'pre-acquired-track' : (this.dailyCallObject.audioSource ?? true),
           videoSource: this.dailyCallObject.videoSource ?? isVideoRecordingEnabled,
           isVideoRecordingEnabled,
           isVideoEnabled
         }
       });
-      
+
       const dailyCallStartTime = Date.now();
-      
+
       try {
         this.call = DailyIframe.createCallObject({
-          audioSource: this.dailyCallObject.audioSource ?? true,
+          audioSource: resolvedAudioSource,
           videoSource: this.dailyCallObject.videoSource ?? isVideoRecordingEnabled,
           dailyConfig: this.dailyCallConfig,
         });
@@ -838,9 +938,18 @@ export default class Vapi extends VapiEventEmitter {
 
       return webCall;
     } catch (e) {
+      // Stop the track only if the SDK acquired it; a caller-provided stream stays caller-owned
+      if (earlyAudioTrack && !userProvidedStream) {
+        try {
+          earlyAudioTrack.stop();
+        } catch {
+          // Ignore errors stopping the track
+        }
+      }
+
       const totalDuration = Date.now() - startTime;
       const serializedError = serializeError(e);
-      
+
       this.emit('call-start-failed', {
         stage: 'unknown',
         totalDuration,
@@ -854,7 +963,7 @@ export default class Vapi extends VapiEventEmitter {
           isMobile: this.isMobileDevice()
         }
       });
-      
+
       // Also emit the generic error event for backward compatibility
       this.emit('error', {
         type: 'start-method-error',
@@ -869,7 +978,7 @@ export default class Vapi extends VapiEventEmitter {
           isMobile: this.isMobileDevice()
         }
       });
-      
+
       await this.cleanup();
       return null;
     }