import os
from typing import Optional

from llama_index.core.tools import FunctionTool
from llama_index.readers.whisper import WhisperReader

class WhisperTranscriber:
    """Class for transcribing audio using OpenAI's Whisper model."""
    
    def __init__(self, model: str = "whisper-1", api_key: Optional[str] = None):
        """Initialize the WhisperTranscriber with specified model and API key."""
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        self.model = model
        self.reader = WhisperReader(
            model=self.model,
            api_key=self.api_key,
        )
    
    def transcribe(self, audio_file_path: str) -> str:
        """
        Transcribe an audio file to text.
        
        Args:
            audio_file_path: Path to the audio file (.mp3, .wav, etc.)
            
        Returns:
            Transcribed text from the audio file
        """
        try:
            # Load data from audio file
            documents = self.reader.load_data(audio_file_path)
            
            # Extract and concatenate text from all returned documents
            if documents:
                return " ".join(doc.text for doc in documents if hasattr(doc, "text"))
            return "No transcription was generated from the audio file."
        except Exception as e:
            return f"Error transcribing audio file: {str(e)}"


# Initialize the transcriber
whisper_transcriber = WhisperTranscriber()

# Create a function tool for audio transcription
transcribe_audio_tool = FunctionTool.from_defaults(
    name="transcribe_audio",
    description="Transcribes speech from an audio file to text using OpenAI's Whisper model. Provide the full path to the audio file.",
    fn=whisper_transcriber.transcribe
)