whisper-tg

Paused

File size: 5,038 Bytes

6c226f9
1e5e969
daf7f7b
 
6c226f9
7c39690
 
1e5e969
7c39690
6c226f9
daf7f7b
 
 
 
7c39690
 
 
daf7f7b
 
 
 
 
478eee2
3c0cd8e
7c39690
3c0cd8e
6c226f9
1e5e969
 
 
 
7c39690
1e5e969
 
7c39690
1e5e969
 
7c39690
7c6ce6c
 
 
 
 
7c39690
7c6ce6c
7c39690
7c6ce6c
7c39690
 
1e5e969
7c39690
1e5e969
7c6ce6c
7c39690
7c6ce6c
 
 
7c39690
7c6ce6c
 
0147bd5
 
 
 
 
 
7c6ce6c
7c39690
 
 
 
 
 
 
 
 
0147bd5
 
 
7c39690
 
 
 
 
 
7c6ce6c
0147bd5
 
 
 
087dbfe
1e5e969
0147bd5
 
1e5e969
7c39690
1e5e969
6c226f9
47407ef
6c226f9
 
 
 
3ce82e9
3c0cd8e
478eee2
0147bd5
478eee2
17f14b2
3c0cd8e
1e5e969
3c0cd8e
7c6ce6c
087dbfe
 
 
 
 
 
3c0cd8e
 
 
 
 
3ce82e9
6c226f9
478eee2
0147bd5
478eee2
a5bfe25
6c226f9
087dbfe
6c226f9
7c6ce6c
087dbfe
 
 
 
 
 
6c226f9
 
 
cc96a73
6c226f9
7c39690
47407ef
7097513

import gradio as gr
import requests
import subprocess
from loguru import logger

# Configure loguru: send DEBUG-and-above records to app.log; loguru starts a
# new log file once the current one reaches 500 MB.
logger.add("app.log", rotation="500 MB", level="DEBUG")

# URL that transcribe() POSTs the raw audio bytes to (the host is under
# endpoints.huggingface.cloud).
API_URL = "https://skdpcqcdd929o4k3.us-east-1.aws.endpoints.huggingface.cloud"

# Check if ffmpeg is installed
def check_ffmpeg():
    """Verify that the ``ffmpeg`` executable can be launched.

    Runs ``ffmpeg -version`` with its output captured and logs the outcome.

    Raises:
        gr.Error: If the executable cannot be found or exits with a
            non-zero status.
    """
    probe_command = ['ffmpeg', '-version']
    try:
        subprocess.run(probe_command, check=True, capture_output=True)
    except (FileNotFoundError, subprocess.CalledProcessError) as probe_failure:
        logger.error(f"ffmpeg check failed: {str(probe_failure)}")
        raise gr.Error("ffmpeg is not installed. Please install ffmpeg to use this application.")
    else:
        logger.info("ffmpeg check passed successfully")

# Run the ffmpeg check once at import time: check_ffmpeg() raises gr.Error when
# ffmpeg cannot be run, so nothing below this line executes without it.
check_ffmpeg()

def transcribe(inputs):
    """Send an audio file to the transcription endpoint and format the reply.

    Args:
        inputs: Path of the audio file to transcribe, or ``None`` when the
            user submitted nothing.

    Returns:
        A dict with two keys: ``"text"`` (the full transcription returned by
        the API) and ``"chunks"``, a list of
        ``{"text": str, "timestamp": [start, end]}`` entries. Chunks whose
        start or end timestamp is missing are skipped with a warning. When the
        response has no ``"chunks"`` key, a single chunk holding the whole
        text with timestamp ``[0.0, None]`` is returned instead.

    Raises:
        gr.Error: If no file was submitted, the API reports an error, the
            response is not JSON or lacks ``"text"``, or any other failure
            (file I/O, network, timeout) occurs.
    """
    if inputs is None:
        logger.warning("No audio file submitted")
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    headers = {
        "Accept": "application/json",
        "Content-Type": "audio/flac"
    }
    logger.debug(f"Using headers: {headers}")

    # (connect, read) timeouts in seconds. Without a timeout, requests waits
    # forever on a stalled endpoint and the Gradio worker never returns.
    request_timeout = (10, 300)

    try:
        logger.info(f"Reading audio file: {inputs}")
        with open(inputs, "rb") as f:
            data = f.read()
        logger.debug(f"Audio file size: {len(data)} bytes")

        # Add parameters to request
        params = {
            "return_timestamps": True
        }
        logger.debug(f"Request parameters: {params}")

        logger.info("Sending request to API")
        response = requests.post(
            API_URL,
            headers=headers,
            data=data,
            params=params,
            timeout=request_timeout,
        )
        logger.debug(f"API Response status: {response.status_code}")

        # A gateway/HTML error page is not JSON; report the HTTP status rather
        # than an opaque JSON decoding message.
        try:
            result = response.json()
        except ValueError as json_error:
            logger.error(f"API returned a non-JSON response (HTTP {response.status_code})")
            raise gr.Error(
                f"API returned a non-JSON response (HTTP {response.status_code})"
            ) from json_error
        logger.debug(f"API Response: {result}")

        if "error" in result:
            logger.error(f"API returned error: {result['error']}")
            raise gr.Error(f"API Error: {result['error']}")

        if "text" not in result:
            logger.error("No transcription text in response")
            raise gr.Error("No transcription text in response")

        # Format response as JSON
        formatted_result = {
            "text": result["text"],
            "chunks": []
        }

        if "chunks" in result:
            logger.info(f"Processing {len(result['chunks'])} chunks")
            for i, chunk in enumerate(result["chunks"]):
                logger.debug(f"Processing chunk {i}: {chunk}")
                # Best effort per chunk: one malformed chunk is logged and
                # skipped instead of failing the whole transcription.
                try:
                    timestamp = chunk.get("timestamp", [None, None])
                    start_time = timestamp[0]
                    end_time = timestamp[1]
                    text = chunk.get("text", "").strip()

                    if start_time is not None and end_time is not None:
                        formatted_result["chunks"].append({
                            "text": text,
                            "timestamp": [start_time, end_time]
                        })
                    else:
                        logger.warning(f"Invalid timestamp in chunk {i}: {chunk}")
                except Exception as chunk_error:
                    logger.error(f"Error processing chunk {i}: {str(chunk_error)}")
        else:
            logger.info("No chunks found, using single chunk")
            formatted_result["chunks"].append({
                "text": result["text"],
                "timestamp": [0.0, None]
            })

        logger.info(f"Successfully processed transcription with {len(formatted_result['chunks'])} chunks")
        return formatted_result
    except gr.Error:
        # Already logged and phrased for the user where it was raised; do not
        # re-wrap it as "Failed to transcribe audio: API Error: ...".
        raise
    except Exception as e:
        logger.exception(f"Error during transcription: {str(e)}")
        raise gr.Error(f"Failed to transcribe audio: {str(e)}") from e

def _transcription_tab(audio_input, title):
    """Build one transcription ``gr.Interface`` around the given audio input.

    Both tabs share the same handler, JSON output, description and flagging
    setup; only the audio component and the title differ.
    """
    return gr.Interface(
        fn=transcribe,
        inputs=[audio_input],
        outputs=[gr.JSON(label="Transcription", open=True)],
        title=title,
        description="Transcribe long-form microphone or audio inputs with the click of a button! ",
        flagging_mode="manual",
        flagging_options=["Incorrect text", "Incorrect timestamp", "Other issue"],
        flagging_dir="flagged_data",
    )


# Top-level container that hosts both tabs.
demo = gr.Blocks(theme=gr.themes.Ocean())

# NOTE(review): the two titles name different models ("V3 Turbo" vs "V3")
# although both tabs call the same endpoint -- confirm which one is correct.
mf_transcribe = _transcription_tab(
    gr.Audio(sources="microphone", type="filepath"),
    "Whisper Large V3 Turbo: Transcribe Audio",
)
file_transcribe = _transcription_tab(
    gr.Audio(sources="upload", type="filepath", label="Audio file"),
    "Whisper Large V3: Transcribe Audio",
)

with demo:
    tabs = [mf_transcribe, file_transcribe]
    tab_titles = ["Microphone", "Audio file"]
    gr.TabbedInterface(tabs, tab_titles)

logger.info("Starting Gradio interface")
queued_app = demo.queue()
queued_app.launch(ssr_mode=False)