How to use OpenAI Whisper with Python

OpenAI open-sourced Whisper, their speech recognition model.

There are 5 sizes: tiny, base, small, medium, large

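Note: you need to install the openai-whisper package from PyPI (https://pypi.org/project/openai-whisper/). Whisper also needs the ffmpeg command-line tool installed on your system to decode audio and video files.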

Bash

$ pip install openai-whisper

Here is the code to transcribe an audio file:

Python

import json
import time
import whisper

# === Stage 1: Transcribe ===
# Checkpoint name handed to whisper.load_model(); it is also embedded in the
# output file names written in stage 2.
MODEL_TYPE = "base"

model = whisper.load_model(MODEL_TYPE)

# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments the way time.time() can.
start = time.perf_counter()
file_name = "your_file_name"
extension = "mp4"
file_path = f"data/{file_name}.{extension}"
print(f"transcribing: {file_path}")

# fp16=False runs inference in FP32. Half precision is only usable on some
# devices (typically GPUs); on CPU Whisper warns and falls back to FP32.
base_result = model.transcribe(file_path, fp16=False)
end = time.perf_counter()
total_seconds = end - start

# split() with no argument splits on any whitespace run and drops empty
# strings, so leading or repeated spaces are not counted as words.
word_count = len(base_result["text"].split())

print(f"total words: {word_count}")
print(f"total seconds: {total_seconds}")
print(f"words/seconds: {word_count/total_seconds}")

# === Stage 2: Save transcription and segments locally for analysis ===

# save ['text']
# The encoding is explicit so non-ASCII transcripts are written the same way
# on every platform instead of depending on the locale default.
with open(f'{file_name}-transcript-{MODEL_TYPE}.txt', 'w', encoding='utf-8') as myfile:
  myfile.write(base_result['text'])

# save ['segments'] as a single JSON document
with open(f'{file_name}-segments-{MODEL_TYPE}.json', 'w', encoding='utf-8') as myfile:
  myfile.write(json.dumps(base_result['segments']))

def get_segments_formatted(segments):
  """Render segments as one "HH:MM:SS text" line per segment.

  Each formatted line is also printed as it is produced.

  Args:
    segments: iterable of mappings, each with a numeric 'start' (offset in
      seconds) and a string 'text'.

  Returns:
    The formatted lines joined with newlines; an empty string when
    `segments` is empty.
  """
  def seconds_to_HH_MM_SS(duration_seconds):
    """Convert a duration in seconds to a zero-padded HH:MM:SS string."""
    minutes, seconds = divmod(duration_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    # int() truncates the fractional part; sub-second precision is dropped.
    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"

  segment_transcript = []
  for segment in segments:
    line = f"{seconds_to_HH_MM_SS(segment['start'])} {segment['text']}"
    print(line)
    segment_transcript.append(line)

  return "\n".join(segment_transcript)

# save ['segments'] as plain text, one line per segment prefixed with its
# start time formatted as HH:MM:SS
# The encoding is explicit so non-ASCII transcript text is written the same
# way on every platform instead of depending on the locale default.
with open(f'{file_name}-sentence-transcript-{MODEL_TYPE}.txt', 'w', encoding='utf-8') as myfile:
  myfile.write(get_segments_formatted(base_result['segments']))