OpenAI open-sourced its speech recognition model, Whisper:
https://github.com/openai/whisper
There are 5 sizes: tiny, base, small, medium, large
Note: you need to install the openai-whisper package from PyPI first (https://pypi.org/project/openai-whisper/):
Bash
$ pip install openai-whisper
Here is the code to transcribe an audio file:
Python
import json
import time
import whisper
# === Stage 1: Transcribe ===
# Name of the Whisper checkpoint to load; it is also embedded in the output
# file names written further down.
MODEL_TYPE = "base"
model = whisper.load_model(MODEL_TYPE)

file_name = "your_file_name"
extension = "mp4"
file_path = f"data/{file_name}.{extension}"
print(f"transcribing: {file_path}")

# Time only the transcription call (model loading above is excluded).
# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way time.time() can.
start = time.perf_counter()
# fp16=False requests full-precision (FP32) inference.
# NOTE(review): half precision is reportedly unsupported on CPU, where Whisper
# falls back to FP32 with a warning -- confirm whether your device needs this.
base_result = model.transcribe(file_path, fp16=False)
end = time.perf_counter()
total_seconds = end - start

# split() with no argument splits on runs of whitespace and drops empty
# strings; split(" ") would count every leading, trailing or doubled space
# as an extra "word" and inflate the total.
word_count = len(base_result["text"].split())
print(f"total words: {word_count}")
print(f"total seconds: {total_seconds}")
print(f"words/seconds: {word_count/total_seconds}")
# === Stage 2: Save transcription and segments locally for analysis ===
# The encoding is pinned to UTF-8 so that transcripts containing non-ASCII
# characters are written identically on every platform instead of depending
# on the locale's default encoding (which can raise UnicodeEncodeError).

# save ['text']
with open(f'{file_name}-transcript-{MODEL_TYPE}.txt', 'w', encoding='utf-8') as myfile:
    myfile.write(base_result['text'])

# save ['segments']
with open(f'{file_name}-segments-{MODEL_TYPE}.json', 'w', encoding='utf-8') as myfile:
    # json.dump streams straight to the file; the content is the same as
    # writing json.dumps(...) in one go.
    json.dump(base_result['segments'], myfile)
def get_segments_formatted(segments):
    """Render transcription segments as ``HH:MM:SS text`` lines.

    Args:
        segments: Iterable of mappings, each providing a ``'start'`` offset
            in seconds and a ``'text'`` string.

    Returns:
        All rendered lines joined with newlines (an empty string when there
        are no segments). Each line is also printed to stdout as it is built.
    """

    def _as_timestamp(offset_seconds):
        # Peel off whole minutes, then whole hours; int() drops any
        # fractional part before zero-padding each field to two digits.
        whole_minutes, secs = divmod(offset_seconds, 60)
        hrs, mins = divmod(whole_minutes, 60)
        return f"{int(hrs):02d}:{int(mins):02d}:{int(secs):02d}"

    lines = []
    for seg in segments:
        stamp = _as_timestamp(seg['start'])
        line = f"{stamp} {seg['text']}"
        print(line)
        lines.append(line)
    return "\n".join(lines)
# save ['segments'] but format the time
# UTF-8 is set explicitly so non-ASCII transcript text is written the same way
# on every platform rather than in the locale's default encoding.
with open(f'{file_name}-sentence-transcript-{MODEL_TYPE}.txt', 'w', encoding='utf-8') as myfile:
    myfile.write(get_segments_formatted(base_result['segments']))