# Using Whisper for Speech-to-Text in Web Apps
## Introduction
OpenAI’s Whisper provides state-of-the-art speech recognition. This guide walks through integrating the Whisper API into a web application: recording audio in the browser, transcribing it through a server route, and adding a real-time fallback for live feedback.
## Prerequisites
- A Next.js (App Router) or React app
- An OpenAI API key, available to the server as the `OPENAI_API_KEY` environment variable (e.g. via `.env.local`)
- HTTPS, since browsers only expose the microphone in a secure context (`localhost` counts too; see the check below)
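
Because microphone access silently disappears outside a secure context, it helps to feature-check before showing any recording UI. A minimal sketch (the helper name is ours):

```typescript
// Sketch: can this page capture audio at all?
// window.isSecureContext is false on plain HTTP (localhost excepted),
// and navigator.mediaDevices is undefined in that case.
export function canRecordAudio(): boolean {
  return (
    typeof window !== 'undefined' &&
    window.isSecureContext &&
    !!navigator.mediaDevices?.getUserMedia
  );
}
```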
## Step 1: Install Dependencies

```bash
npm install openai recordrtc
npm install --save-dev @types/recordrtc   # RecordRTC typings, if your editor needs them
```
## Step 2: Create Audio Recording Hook

Create `hooks/useAudioRecorder.ts`:
```typescript
import { useState, useRef, useCallback } from 'react';
import RecordRTC, { StereoAudioRecorder } from 'recordrtc';

export function useAudioRecorder() {
  const [isRecording, setIsRecording] = useState(false);
  const [audioBlob, setAudioBlob] = useState<Blob | null>(null);
  const recorderRef = useRef<RecordRTC | null>(null);
  const streamRef = useRef<MediaStream | null>(null);

  const startRecording = useCallback(async () => {
    try {
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          sampleRate: 16000,
          channelCount: 1,
          echoCancellation: true,
          noiseSuppression: true,
        },
      });
      streamRef.current = stream;
      recorderRef.current = new RecordRTC(stream, {
        type: 'audio',
        mimeType: 'audio/wav',
        // StereoAudioRecorder is needed for actual WAV output;
        // desiredSampRate resamples to 16 kHz, which suits Whisper.
        recorderType: StereoAudioRecorder,
        desiredSampRate: 16000,
        numberOfAudioChannels: 1,
      });
      recorderRef.current.startRecording();
      setIsRecording(true);
    } catch (error) {
      console.error('Error accessing microphone:', error);
    }
  }, []);

  const stopRecording = useCallback(() => {
    if (recorderRef.current && streamRef.current) {
      recorderRef.current.stopRecording(() => {
        const blob = recorderRef.current?.getBlob();
        setAudioBlob(blob || null);
        setIsRecording(false);
        // Stop all tracks to release the microphone
        streamRef.current?.getTracks().forEach(track => track.stop());
        streamRef.current = null;
      });
    }
  }, []);

  const resetRecording = useCallback(() => {
    setAudioBlob(null);
    setIsRecording(false);
  }, []);

  return {
    isRecording,
    audioBlob,
    startRecording,
    stopRecording,
    resetRecording,
  };
}
```
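
The hook leaves one edge case to its consumer: if the component unmounts while recording, the microphone stays open. A hedged sketch of a wrapper that adds unmount cleanup (the wrapper name is illustrative):

```typescript
import { useEffect } from 'react';
import { useAudioRecorder } from '@/hooks/useAudioRecorder';

// Illustrative wrapper: same API as useAudioRecorder, but releases the
// microphone if the consuming component unmounts mid-recording.
export function useAudioRecorderWithCleanup() {
  const recorder = useAudioRecorder();
  const { isRecording, stopRecording } = recorder;

  useEffect(() => {
    // The cleanup also fires when isRecording flips; the internal ref
    // guards in stopRecording make that extra call a no-op.
    return () => {
      if (isRecording) stopRecording();
    };
  }, [isRecording, stopRecording]);

  return recorder;
}
```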
## Step 3: Create Transcription API Route

Create `app/api/transcribe/route.ts`:
```typescript
import { NextRequest, NextResponse } from 'next/server';
import OpenAI from 'openai';

const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY!,
});

export async function POST(req: NextRequest) {
  try {
    const formData = await req.formData();
    const audioFile = formData.get('audio') as File;

    if (!audioFile) {
      return NextResponse.json(
        { error: 'No audio file provided' },
        { status: 400 }
      );
    }

    const transcription = await openai.audio.transcriptions.create({
      file: audioFile,
      model: 'whisper-1',
      language: 'en', // Optional: specify language
      response_format: 'verbose_json', // Get timestamps
      temperature: 0, // More deterministic
    });

    return NextResponse.json({
      text: transcription.text,
      segments: transcription.segments,
      language: transcription.language,
      duration: transcription.duration,
    });
  } catch (error) {
    console.error('Transcription error:', error);
    return NextResponse.json(
      { error: 'Failed to transcribe audio' },
      { status: 500 }
    );
  }
}
```
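
Whisper's API enforces a 25 MB upload limit, so it is worth rejecting oversized files before calling it. A sketch of a guard that could sit right after the `audioFile` check inside `POST` (the constant name is ours):

```typescript
// Whisper's documented upload limit is 25 MB; fail fast on oversized blobs.
const MAX_UPLOAD_BYTES = 25 * 1024 * 1024;

if (audioFile.size > MAX_UPLOAD_BYTES) {
  return NextResponse.json(
    { error: 'Audio file exceeds the 25 MB Whisper limit' },
    { status: 413 }
  );
}
```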
## Step 4: Create Speech-to-Text Component

Create `components/SpeechToText.tsx`:
```typescript
'use client';

import { useState } from 'react';
import { useAudioRecorder } from '@/hooks/useAudioRecorder';

interface TranscriptionResult {
  text: string;
  segments?: Array<{
    start: number;
    end: number;
    text: string;
  }>;
  language?: string;
  duration?: number;
}

export default function SpeechToText() {
  const [transcription, setTranscription] = useState<TranscriptionResult | null>(null);
  const [isTranscribing, setIsTranscribing] = useState(false);
  const [error, setError] = useState<string | null>(null);

  const {
    isRecording,
    audioBlob,
    startRecording,
    stopRecording,
    resetRecording,
  } = useAudioRecorder();

  const transcribeAudio = async () => {
    if (!audioBlob) return;

    setIsTranscribing(true);
    setError(null);

    try {
      const formData = new FormData();
      formData.append('audio', audioBlob, 'recording.wav');

      const response = await fetch('/api/transcribe', {
        method: 'POST',
        body: formData,
      });

      const data = await response.json();

      if (data.error) {
        throw new Error(data.error);
      }

      setTranscription(data);
    } catch (err) {
      console.error('Transcription failed:', err);
      setError(err instanceof Error ? err.message : 'Transcription failed');
    } finally {
      setIsTranscribing(false);
    }
  };

  const handleReset = () => {
    resetRecording();
    setTranscription(null);
    setError(null);
  };

  return (
    <div className="max-w-2xl mx-auto p-6">
      <h2 className="text-2xl font-bold mb-6">Speech to Text</h2>

      {/* Recording Controls */}
      <div className="mb-6">
        <div className="flex gap-3 mb-4">
          <button
            onClick={isRecording ? stopRecording : startRecording}
            className={`px-6 py-3 rounded-lg font-semibold ${
              isRecording
                ? 'bg-red-500 hover:bg-red-600 text-white'
                : 'bg-blue-500 hover:bg-blue-600 text-white'
            }`}
          >
            {isRecording ? '⏹️ Stop Recording' : '🎤 Start Recording'}
          </button>

          {audioBlob && !isRecording && (
            <button
              onClick={transcribeAudio}
              disabled={isTranscribing}
              className="px-6 py-3 bg-green-500 hover:bg-green-600 text-white rounded-lg font-semibold disabled:bg-gray-400"
            >
              {isTranscribing ? 'Transcribing...' : '📝 Transcribe'}
            </button>
          )}

          <button
            onClick={handleReset}
            className="px-6 py-3 bg-gray-500 hover:bg-gray-600 text-white rounded-lg font-semibold"
          >
            🔄 Reset
          </button>
        </div>

        {/* Recording Status */}
        {isRecording && (
          <div className="flex items-center gap-2 text-red-600">
            <div className="w-3 h-3 bg-red-600 rounded-full animate-pulse"></div>
            Recording...
          </div>
        )}
      </div>

      {/* Audio Player */}
      {audioBlob && (
        <div className="mb-6">
          <h3 className="font-semibold mb-2">Recorded Audio:</h3>
          <audio
            controls
            src={URL.createObjectURL(audioBlob)}
            className="w-full"
          />
        </div>
      )}

      {/* Error Display */}
      {error && (
        <div className="mb-6 p-4 bg-red-100 border border-red-400 text-red-700 rounded">
          Error: {error}
        </div>
      )}

      {/* Transcription Results */}
      {transcription && (
        <div className="space-y-4">
          <div>
            <h3 className="font-semibold mb-2">Transcription:</h3>
            <div className="p-4 bg-gray-100 rounded-lg">
              <p className="text-lg">{transcription.text}</p>
            </div>
          </div>

          {transcription.segments && transcription.segments.length > 0 && (
            <details className="bg-gray-50 p-4 rounded-lg">
              <summary className="font-semibold cursor-pointer">
                Segments with Timestamps ({transcription.segments.length})
              </summary>
              <div className="mt-3 space-y-2">
                {transcription.segments.map((segment, index) => (
                  <div key={index} className="flex gap-4 text-sm">
                    <span className="text-blue-600 font-mono">
                      {segment.start.toFixed(1)}s - {segment.end.toFixed(1)}s
                    </span>
                    <span>{segment.text}</span>
                  </div>
                ))}
              </div>
            </details>
          )}

          {transcription.language && (
            <div className="text-sm text-gray-600">
              Language: {transcription.language} | Duration: {transcription.duration?.toFixed(2)}s
            </div>
          )}
        </div>
      )}
    </div>
  );
}
```
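
One subtlety in the audio player above: `URL.createObjectURL(audioBlob)` mints a new blob URL on every render and never revokes it. A sketch of a small hook that memoizes and cleans up the URL (the hook name is illustrative); the player's `src` would become `src={useObjectUrl(audioBlob)}`:

```typescript
import { useEffect, useState } from 'react';

// Illustrative helper: create one object URL per blob and revoke it
// when the blob changes or the component unmounts.
export function useObjectUrl(blob: Blob | null): string | undefined {
  const [url, setUrl] = useState<string | undefined>(undefined);

  useEffect(() => {
    if (!blob) {
      setUrl(undefined);
      return;
    }
    const objectUrl = URL.createObjectURL(blob);
    setUrl(objectUrl);
    return () => URL.revokeObjectURL(objectUrl);
  }, [blob]);

  return url;
}
```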
## Step 5: Real-time Streaming (Advanced)

Whisper's transcription endpoint only accepts complete audio files, so it cannot stream partial results while the user is still speaking. For live feedback you can either send short recorded chunks to `/api/transcribe` in a loop, or fall back to the browser's built-in Web Speech API (note: this is not Whisper, and support varies by browser). A hook using the latter, e.g. in `hooks/useRealtimeTranscription.ts`:
```typescript
import { useState, useRef, useCallback } from 'react';

export function useRealtimeTranscription() {
  const [transcript, setTranscript] = useState('');
  const [isListening, setIsListening] = useState(false);
  // The Web Speech API is missing from many TypeScript lib setups,
  // so the ref is typed loosely here.
  const recognitionRef = useRef<any>(null);

  const startListening = useCallback(() => {
    const SpeechRecognitionCtor =
      (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition;
    if (!SpeechRecognitionCtor) return; // Unsupported browser

    const recognition = new SpeechRecognitionCtor();
    recognition.continuous = true;
    recognition.interimResults = true;
    recognition.lang = 'en-US';

    recognition.onstart = () => setIsListening(true);
    recognition.onend = () => setIsListening(false);

    recognition.onresult = (event: any) => {
      // Rebuild the full transcript from all results so far; starting at
      // event.resultIndex would drop earlier finalized phrases.
      let finalTranscript = '';
      let interimTranscript = '';
      for (let i = 0; i < event.results.length; i++) {
        if (event.results[i].isFinal) {
          finalTranscript += event.results[i][0].transcript;
        } else {
          interimTranscript += event.results[i][0].transcript;
        }
      }
      setTranscript(finalTranscript + interimTranscript);
    };

    recognition.start();
    recognitionRef.current = recognition;
  }, []);

  const stopListening = useCallback(() => {
    recognitionRef.current?.stop();
  }, []);

  return { transcript, isListening, startListening, stopListening };
}
```
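
A minimal consumer of this hook might look like the following (an illustrative component, assuming the hook lives in `hooks/useRealtimeTranscription.ts`):

```typescript
'use client';

import { useRealtimeTranscription } from '@/hooks/useRealtimeTranscription';

// Illustrative usage of the realtime hook above.
export default function LiveTranscript() {
  const { transcript, isListening, startListening, stopListening } =
    useRealtimeTranscription();

  return (
    <div>
      <button onClick={isListening ? stopListening : startListening}>
        {isListening ? 'Stop' : 'Start'} listening
      </button>
      <p>{transcript || 'Say something...'}</p>
    </div>
  );
}
```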
## Step 6: Use in App

Render the component from a page, e.g. `app/page.tsx`:
```typescript
import SpeechToText from '@/components/SpeechToText';

export default function Home() {
  return (
    <main className="container mx-auto py-12">
      <h1 className="text-4xl font-bold text-center mb-8">
        Speech to Text with Whisper
      </h1>
      <SpeechToText />
    </main>
  );
}
```
## Summary
Whisper integration provides highly accurate speech-to-text with support for multiple languages, segment-level timestamps, and quality signals such as the per-segment log-probabilities in the `verbose_json` response. Combined with browser audio recording, it forms a complete speech recognition pipeline; the Web Speech API fallback from Step 5 can cover live feedback while recordings are finalized and sent to Whisper.