The iOS Speech framework converts audio to text. The audio can be captured in real time or pre-recorded. Transcription models run on-device, with more powerful ones available from Apple's servers.
Swift
// Example: transcribing a pre-recorded audio file.
// Assumes the Info.plist usage descriptions and speech-recognition
// authorization have already been handled.
import Speech
let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))
let request = SFSpeechURLRecognitionRequest(url: audioFileURL)
recognizer?.recognitionTask(with: request) { result, error in
    // Surface failures instead of silently dropping them.
    if let error = error {
        print("Recognition failed: \(error.localizedDescription)")
        return
    }
    if let result = result {
        // This callback fires repeatedly with partial transcriptions;
        // `isFinal` marks the last, complete one.
        let transcription = result.bestTranscription.formattedString
        print(transcription)
        if result.isFinal {
            // Use the completed `transcription` here.
        }
    }
}
How does it work with real-time audio? For live audio capture you also need to import AVFoundation.
Flow:
Microphone → AVAudioEngine (captures) → Audio Buffers
→ SFSpeechAudioBufferRecognitionRequest (transcribes) → Text
Swift
import Speech
import AVFoundation
/// Streams microphone audio into the Speech framework and prints
/// partial transcriptions as they arrive.
///
/// Flow: microphone → AVAudioEngine tap → audio buffers →
/// SFSpeechAudioBufferRecognitionRequest → transcribed text.
class SpeechRecognizer {
    private let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))
    // Kept as a stored property so `stopTranscribing()` can call
    // `endAudio()` — without that the recognizer never finalizes.
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?
    private let audioEngine = AVAudioEngine()

    /// Requests authorization (if needed) and begins live transcription.
    /// Safe to call repeatedly: any in-flight session is torn down first.
    func startTranscribing() {
        // 1. Get permission. `[weak self]` because this escaping closure
        // may outlive the recognizer instance.
        SFSpeechRecognizer.requestAuthorization { [weak self] status in
            guard status == .authorized else { return }
            // The authorization callback is not guaranteed to arrive on
            // the main queue; hop over before touching the audio engine.
            DispatchQueue.main.async {
                self?.beginRecognition()
            }
        }
    }

    /// Sets up the recognition request, task, and audio tap. Runs on the
    /// main queue after authorization has been granted.
    private func beginRecognition() {
        // Recognition can be temporarily unavailable (e.g. no network
        // for server-based models); bail out rather than tap for nothing.
        guard let recognizer = recognizer, recognizer.isAvailable else { return }

        // Tear down any previous session so we never stack a second tap
        // on the input bus.
        stopTranscribing()

        // 2. Create recognition request.
        let request = SFSpeechAudioBufferRecognitionRequest()
        request.shouldReportPartialResults = true
        recognitionRequest = request

        // 3. Start recognition task.
        recognitionTask = recognizer.recognitionTask(with: request) { result, error in
            if let result = result {
                print(result.bestTranscription.formattedString)
            }
            if let error = error {
                print("Recognition error: \(error.localizedDescription)")
            }
        }

        // 4. Set up the audio engine: feed microphone buffers into the request.
        let inputNode = audioEngine.inputNode
        let recordingFormat = inputNode.outputFormat(forBus: 0)
        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { buffer, _ in
            request.append(buffer)
        }

        // 5. Start the audio engine; report failures instead of
        // swallowing them with `try?`.
        audioEngine.prepare()
        do {
            try audioEngine.start()
        } catch {
            print("Audio engine failed to start: \(error.localizedDescription)")
            stopTranscribing()
        }
    }

    /// Stops capture and finalizes the recognition session.
    func stopTranscribing() {
        audioEngine.stop()
        audioEngine.inputNode.removeTap(onBus: 0)
        // Signal end-of-audio so the recognizer can deliver a final
        // result, then cancel the task and drop the references.
        recognitionRequest?.endAudio()
        recognitionTask?.cancel()
        recognitionTask = nil
        recognitionRequest = nil
    }
}