import os
from typing import Optional, Dict, Any

from llama_index.readers.whisper import WhisperReader
from llama_index.core.tools import FunctionTool
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import (
    DocxReader,
    HWPReader,
    PDFReader,
    EpubReader,
    FlatReader,
    HTMLTagReader,
    ImageCaptionReader,
    ImageReader,
    ImageVisionLLMReader,
    IPYNBReader,
    MarkdownReader,
    MboxReader,
    PptxReader,
    PandasCSVReader,
    VideoAudioReader,
    UnstructuredReader,
    PyMuPDFReader,
    ImageTabularChartReader,
    XMLReader,
    PagedCSVReader,
    CSVReader,
    RTFReader,
)


class WhisperTranscriber:
    """Transcribes audio to text using OpenAI's Whisper model."""

    def __init__(self, model: str = "whisper-1", api_key: Optional[str] = None):
        """Initialize the transcriber with the given model and API key.

        Falls back to the OPENAI_API_KEY environment variable when no
        key is passed explicitly.
        """
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        self.model = model
        self.reader = WhisperReader(
            model=self.model,
            api_key=self.api_key,
        )

    def transcribe(self, audio_file_path: str) -> str:
        """Transcribe an audio file to text.

        Args:
            audio_file_path: Path to the audio file (.mp3, .wav, etc.)

        Returns:
            Transcribed text from the audio file.
        """
        try:
            # Load data from the audio file; the reader returns a list of Documents.
            documents = self.reader.load_data(audio_file_path)

            # Extract and concatenate the text from all returned documents.
            if documents:
                return " ".join(
                    doc.text for doc in documents if hasattr(doc, "text")
                )
            return "No transcription was generated from the audio file."
        except Exception as e:
            return f"Error transcribing audio file: {e}"


# Initialize the transcriber
whisper_transcriber = WhisperTranscriber()

# Create a function tool for audio transcription
transcribe_audio_tool = FunctionTool.from_defaults(
    name="transcribe_audio",
    description=(
        "Transcribes speech from an audio file to text using OpenAI's "
        "Whisper model. Provide the full path to the audio file."
    ),
    fn=whisper_transcriber.transcribe,
)
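

# A minimal usage sketch, assuming OPENAI_API_KEY is set in the environment.
# "meeting_notes.mp3" is a hypothetical path; substitute a real audio file
# before running. The tool forwards to WhisperTranscriber.transcribe, so it
# can be exercised directly: FunctionTool.call returns a ToolOutput whose
# `content` field holds the transcribed text.
if __name__ == "__main__":
    output = transcribe_audio_tool.call("meeting_notes.mp3")
    print(output.content)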