muhtasham committed on
Commit 22095b0 · 1 Parent(s): c5741b3

feat: enhance transcription with configurable parameters and feedback system


- Add configurable batch size (1-32) and chunk length (5-60s) parameters
- Implement comprehensive feedback system with quick rating and detailed corrections
- Switch to local pipeline processing with GPU support
- Add logging for better debugging
- Improve error handling and user feedback

Files changed (1)
app.py +126 -54
app.py CHANGED
@@ -1,5 +1,7 @@
+ import spaces
+ import torch
  import gradio as gr
- import requests
+ from transformers import pipeline
  import subprocess
  from loguru import logger
  import datetime
@@ -7,7 +9,7 @@ import datetime
  # Configure loguru
  logger.add("app.log", rotation="500 MB", level="DEBUG")

- API_URL = "https://skdpcqcdd929o4k3.us-east-1.aws.endpoints.huggingface.cloud"
+ MODEL_NAME = "muhtasham/whisper-tg"

  def format_time(seconds):
      """Convert seconds to SRT time format (HH:MM:SS,mmm)"""
@@ -40,44 +42,35 @@ def check_ffmpeg():
  # Initialize ffmpeg check
  check_ffmpeg()

- def transcribe(inputs, return_timestamps, generate_subs):
+ device = 0 if torch.cuda.is_available() else "cpu"
+ logger.info(f"Using device: {device}")
+
+ def create_pipeline(chunk_length_s):
+     """Create a new pipeline with specified chunk length"""
+     return pipeline(
+         task="automatic-speech-recognition",
+         model=MODEL_NAME,
+         chunk_length_s=chunk_length_s,
+         device=device,
+     )
+
+ # Initialize default pipeline
+ pipe = create_pipeline(30)
+ logger.info(f"Pipeline initialized: {pipe}")
+
+ @spaces.GPU
+ def transcribe(inputs, return_timestamps, generate_subs, batch_size, chunk_length_s):
      if inputs is None:
          logger.warning("No audio file submitted")
          raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

-     headers = {
-         "Accept": "application/json",
-         "Content-Type": "audio/flac"
-     }
-     logger.debug(f"Using headers: {headers}")
-
      try:
-         logger.info(f"Reading audio file: {inputs}")
-         with open(inputs, "rb") as f:
-             data = f.read()
-         logger.debug(f"Audio file size: {len(data)} bytes")
-
-         # Add parameters to request
-         params = {
-             "return_timestamps": return_timestamps
-         }
-         logger.debug(f"Request parameters: {params}")
-
-         logger.info("Sending request to API")
-         response = requests.post(API_URL, headers=headers, data=data, params=params)
-         logger.debug(f"API Response status: {response.status_code}")
-
-         result = response.json()
-         logger.debug(f"API Response: {result}")
+         logger.info(f"Processing audio file: {inputs}")
+         # Create new pipeline with specified chunk length
+         current_pipe = create_pipeline(chunk_length_s)
+         result = current_pipe(inputs, batch_size=batch_size, return_timestamps=return_timestamps)
+         logger.debug(f"Pipeline result: {result}")

-         if "error" in result:
-             logger.error(f"API returned error: {result['error']}")
-             raise gr.Error(f"API Error: {result['error']}")
-
-         if "text" not in result:
-             logger.error("No transcription text in response")
-             raise gr.Error("No transcription text in response")
-
          # Format response as JSON
          formatted_result = {
              "text": result["text"]
@@ -98,13 +91,14 @@ def transcribe(inputs, return_timestamps, generate_subs):
                      "text": text,
                      "timestamp": [start_time, end_time]
                  }
-                 formatted_result["chunks"] = chunks
                  chunks.append(chunk_data)
              else:
                  logger.warning(f"Invalid timestamp in chunk {i}: {chunk}")
          except Exception as chunk_error:
              logger.error(f"Error processing chunk {i}: {str(chunk_error)}")
              continue
+
+         formatted_result["chunks"] = chunks
          logger.info(f"Successfully processed transcription with {len(chunks)} chunks")

          # Generate subtitles if requested
@@ -121,12 +115,18 @@ def transcribe(inputs, return_timestamps, generate_subs):

  demo = gr.Blocks(theme=gr.themes.Ocean())

+ # Create flagging callback with custom options
+ flagging_callback = gr.CSVLogger()
+
+ # Define interfaces first
  mf_transcribe = gr.Interface(
      fn=transcribe,
      inputs=[
          gr.Audio(sources="microphone", type="filepath"),
          gr.Checkbox(label="Include timestamps", value=True),
          gr.Checkbox(label="Generate subtitles", value=True),
+         gr.Slider(minimum=1, maximum=32, value=8, step=1, label="Batch Size"),
+         gr.Slider(minimum=5, maximum=30, value=15, step=5, label="Chunk Length (seconds)"),
      ],
      outputs=[
          gr.JSON(label="Transcription", open=True),
@@ -134,16 +134,11 @@ mf_transcribe = gr.Interface(
      ],
      title="Whisper Large V3 Turbo: Transcribe Audio",
      description=(
-         "Transcribe long-form microphone or audio inputs with the click of a button! "
-         "Generate subtitles for your videos in SRT format."
+         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
+         f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
+         " of arbitrary length."
      ),
-     flagging_mode="manual",
-     flagging_options=[
-         "Incorrect text",
-         "Incorrect timestamp",
-         "Other issue"
-     ],
-     flagging_dir="flagged_data"
+     flagging_mode="manual"
  )

  file_transcribe = gr.Interface(
@@ -152,6 +147,8 @@ file_transcribe = gr.Interface(
          gr.Audio(sources="upload", type="filepath", label="Audio file"),
          gr.Checkbox(label="Include timestamps", value=True),
          gr.Checkbox(label="Generate subtitles", value=True),
+         gr.Slider(minimum=1, maximum=32, value=8, step=1, label="Batch Size"),
+         gr.Slider(minimum=10, maximum=60, value=30, step=5, label="Chunk Length (seconds)"),
      ],
      outputs=[
          gr.JSON(label="Transcription", open=True),
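
`gr.Interface` passes its `inputs` components to `fn` positionally, so the two sliders added in each tab line up with the new `batch_size` and `chunk_length_s` parameters of `transcribe`. With the file tab's defaults above, the call is equivalent to (illustrative path, not from the commit):

```python
# Equivalent direct call with the file tab's default widget values;
# "meeting.wav" is an illustrative path.
transcribe(
    "meeting.wav",  # gr.Audio upload (filepath)
    True,           # "Include timestamps"
    True,           # "Generate subtitles"
    8,              # "Batch Size" default
    30,             # "Chunk Length (seconds)" default
)
```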
@@ -159,20 +156,95 @@ file_transcribe = gr.Interface(
      ],
      title="Whisper Large V3: Transcribe Audio",
      description=(
-         "Transcribe long-form microphone or audio inputs with the click of a button! "
-         "Generate subtitles for your videos in SRT format."
+         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
+         f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
+         " of arbitrary length."
      ),
-     flagging_mode="manual",
-     flagging_options=[
-         "Incorrect text",
-         "Incorrect timestamp",
-         "Other issue"
-     ],
-     flagging_dir="flagged_data"
+     flagging_mode="manual"
  )

+ # Then set up the demo with the interfaces
  with demo:
-     gr.TabbedInterface([mf_transcribe, file_transcribe], ["Microphone", "Audio file"])
+     with gr.TabbedInterface([mf_transcribe, file_transcribe], ["Microphone", "Audio file"]) as tabs:
+         with gr.Row():
+             with gr.Column():
+                 # Quick feedback
+                 feedback_rating = gr.Radio(
+                     choices=["👍 Good", "👎 Bad"],
+                     label="Was this transcription accurate?",
+                     value="👍 Good"
+                 )
+
+                 # Detailed feedback
+                 with gr.Accordion("Detailed Feedback", open=False):
+                     flag_type = gr.Radio(
+                         choices=[
+                             "Text Issue",
+                             "Timestamp Issue",
+                             "Missing Content",
+                             "Other Issue"
+                         ],
+                         label="What type of issue did you find?",
+                         value="Text Issue"
+                     )
+
+                     # Correction submission
+                     with gr.Row():
+                         with gr.Column():
+                             gr.Markdown("### Original")
+                             original_text = gr.Textbox(
+                                 label="Original text",
+                                 interactive=False,
+                                 lines=2
+                             )
+                         with gr.Column():
+                             gr.Markdown("### Correction")
+                             corrected_text = gr.Textbox(
+                                 label="Corrected text",
+                                 placeholder="Enter the correct text here",
+                                 lines=2
+                             )
+
+                     # Timestamp correction
+                     with gr.Row():
+                         with gr.Column():
+                             gr.Markdown("### Original Timestamp")
+                             original_timestamp = gr.Textbox(
+                                 label="Original timestamp",
+                                 interactive=False,
+                                 lines=1
+                             )
+                         with gr.Column():
+                             gr.Markdown("### Corrected Timestamp")
+                             corrected_timestamp = gr.Textbox(
+                                 label="Corrected timestamp (HH:MM:SS,mmm)",
+                                 placeholder="00:00:00,000",
+                                 lines=1
+                             )
+
+                     flag_details = gr.Textbox(
+                         label="Additional notes",
+                         placeholder="Any other details about the issue...",
+                         lines=3
+                     )
+
+                     flag_button = gr.Button("Submit Feedback")
+
+         # Setup flagging callback with all feedback components
+         flagging_callback.setup(
+             [tabs, feedback_rating, flag_type, original_text, corrected_text,
+              original_timestamp, corrected_timestamp, flag_details],
+             "flagged_data"
+         )
+
+         # Handle flag submission
+         flag_button.click(
+             lambda *args: flagging_callback.flag(list(args)),
+             [tabs, feedback_rating, flag_type, original_text, corrected_text,
+              original_timestamp, corrected_timestamp, flag_details],
+             None,
+             preprocess=False
+         )

  logger.info("Starting Gradio interface")
  demo.queue().launch(ssr_mode=False)
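
On the feedback wiring above: `gr.CSVLogger` is Gradio's CSV flagging callback; `setup(components, flagging_dir)` registers the components and `flag(...)` appends one row per submission to a CSV log inside the directory (replacing the old `flagging_options`/`flagging_dir` arguments that this commit removes). A stripped-down sketch of the same pattern, with two hypothetical components standing in for the app's eight:

```python
import gradio as gr

# Minimal sketch of the CSVLogger pattern used above; `rating` and
# `notes` are hypothetical stand-ins for the app's components.
callback = gr.CSVLogger()

with gr.Blocks() as demo:
    rating = gr.Radio(["👍 Good", "👎 Bad"], label="Rating")
    notes = gr.Textbox(label="Notes")
    submit = gr.Button("Submit Feedback")

    callback.setup([rating, notes], "flagged_data")  # rows land in flagged_data/log.csv
    submit.click(
        lambda *args: callback.flag(list(args)),
        [rating, notes],
        None,
        preprocess=False,
    )
```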
 