whisper-tg

Paused

App Files Files Community

muhtasham committed on Mar 23

Commit

5476911

1 Parent(s): c2dcdf8

WIP

Browse files

Files changed (1) hide show

app.py +109 -210

app.py CHANGED Viewed

@@ -41,63 +41,21 @@ def format_time(seconds):
     milliseconds = td.microseconds // 1000
     return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
-def generate_srt(chunks):
-    """Generate SRT format subtitles from transcription chunks.
-    Args:
-        chunks (list): List of dictionaries containing transcription chunks.
-            Each chunk must have:
-            - "timestamp": List of [start_time, end_time] in seconds
-            - "text": The transcribed text for that time segment
-    Returns:
-        str: SRT formatted subtitles string with format:
-            ```
-            1
-            HH:MM:SS,mmm --> HH:MM:SS,mmm
-            Text content
-            2
-            HH:MM:SS,mmm --> HH:MM:SS,mmm
-            Text content
-            ...
-            ```
-    Example:
-        >>> chunks = [
-        ...     {"timestamp": [0.0, 1.5], "text": "Hello"},
-        ...     {"timestamp": [1.5, 3.0], "text": "World"}
-        ... ]
-        >>> generate_srt(chunks)
-        '1\\n00:00:00,000 --> 00:00:01,500\\nHello\\n\\n2\\n00:00:01,500 --> 00:00:03,000\\nWorld\\n\\n'
-    """
     srt_content = []
-    for i, chunk in enumerate(chunks, 1):
-        start_time = format_time(chunk["timestamp"][0])
-        end_time = format_time(chunk["timestamp"][1])
-        text = chunk.get("text", "").strip()
         srt_content.append(f"{i}\n{start_time} --> {end_time}\n{text}\n\n")
     return "".join(srt_content)
 def save_srt_to_file(srt_content):
-    """Save SRT content to a temporary file.
-    Args:
-        srt_content (str): The SRT formatted subtitles content to save.
-    Returns:
-        str or None: Path to the temporary file if content was saved,
-                    None if srt_content was empty.
-    Note:
-        The temporary file is created with delete=False to allow it to be
-        used after the function returns. The file should be deleted by the
-        caller when no longer needed.
-    """
     if not srt_content:
         return None
-    # Create a temporary file with .srt extension
     temp_file = tempfile.NamedTemporaryFile(suffix='.srt', delete=False)
     temp_file.write(srt_content.encode('utf-8'))
     temp_file.close()
@@ -116,17 +74,7 @@ def check_ffmpeg():
 check_ffmpeg()
 def download_youtube_audio(url):
-    """Download audio from YouTube using Sieve API.
-    Args:
-        url (str): YouTube video URL
-    Returns:
-        str: Path to downloaded audio file
-    Raises:
-        gr.Error: If download fails or API key is not set
-    """
     logger.info(f"Starting YouTube audio download process for URL: {url}")
     if not SIEVE_API_KEY:
@@ -134,13 +82,11 @@ def download_youtube_audio(url):
         raise gr.Error("SIEVE_API_KEY environment variable is not set")
     try:
-        # Create a temporary file for the audio
         temp_file = tempfile.NamedTemporaryFile(suffix='.mp3', delete=False)
         temp_file.close()
         output_path = temp_file.name
         logger.info(f"Created temporary file at: {output_path}")
-        # Prepare the request to Sieve API with exact parameters
         payload = {
             "function": "sieve/youtube-downloader",
             "inputs": {
@@ -158,53 +104,42 @@ def download_youtube_audio(url):
                 "audio_format": "mp3"
             }
         }
-        logger.debug(f"Prepared Sieve API payload: {payload}")
         # Send request to Sieve API with retries
         max_retries = 3
-        retry_delay = 5  # seconds
         for attempt in range(max_retries):
             try:
-                logger.info(f"Sending request to Sieve API (attempt {attempt + 1}/{max_retries})...")
                 response = requests.post(
                     f"{SIEVE_API_URL}/push",
                     headers={"X-API-Key": SIEVE_API_KEY, "Content-Type": "application/json"},
                     json=payload,
-                    timeout=1800  # Add timeout
                 )
                 response.raise_for_status()
                 response_data = response.json()
-                logger.debug(f"Sieve API response: {response_data}")
                 job_id = response_data.get("id")
                 if not job_id:
-                    logger.error("No job ID received from Sieve API")
                     if attempt < max_retries - 1:
-                        logger.warning(f"Retrying in {retry_delay} seconds...")
                         time.sleep(retry_delay)
                         continue
                     raise gr.Error("Failed to get job ID from Sieve API")
                 break
             except requests.exceptions.RequestException as e:
-                logger.warning(f"Request failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
                 if attempt < max_retries - 1:
-                    logger.info(f"Retrying in {retry_delay} seconds...")
                     time.sleep(retry_delay)
                     continue
                 raise
-        logger.info(f"Received job ID: {job_id}")
         # Poll for job completion
         poll_count = 0
         max_polls = 1800
-        last_status = None
         while True:
             poll_count += 1
-            logger.info(f"Polling job status (attempt {poll_count}/{max_polls})...")
             try:
                 job_response = requests.get(
@@ -216,212 +151,170 @@ def download_youtube_audio(url):
                 job_data = job_response.json()
                 status = job_data.get("status")
-                if status != last_status:
-                    logger.info(f"Job status changed: {status}")
-                    last_status = status
                 if status == "completed" or status == "finished":
-                    logger.debug(f"Job status response: {job_data}")
-                    logger.info("Job completed successfully")
-                    # Get the output data
                     output_data = job_data.get("outputs", [])
                     if not output_data:
-                        logger.error(f"No output data found in completed job response. Full response: {job_data}")
                         raise gr.Error("No output data in job response")
-                    # Get the first output which should contain the audio URL
                     first_output = output_data[0]
                     if not isinstance(first_output, dict):
-                        logger.error(f"Unexpected output format: {first_output}")
                         raise gr.Error("Unexpected output format from job response")
-                    # Get the data field which contains the URL
                     output_data = first_output.get("data", {})
                     if not isinstance(output_data, dict):
-                        logger.error(f"Unexpected data format: {output_data}")
                         raise gr.Error("Unexpected data format from job response")
-                    # Get the audio URL from the data
                     audio_url = output_data.get("url")
                     if not audio_url:
-                        logger.error(f"No audio URL found in output data. Output data: {output_data}")
                         raise gr.Error("No audio URL in output data")
-                    logger.info(f"Received audio URL from Sieve: {audio_url}")
-                    # Download the audio file
-                    logger.info("Downloading audio file from Sieve storage...")
                     audio_response = requests.get(audio_url, timeout=1800)
                     audio_response.raise_for_status()
-                    file_size = len(audio_response.content)
-                    logger.info(f"Downloaded audio file size: {file_size/1024/1024:.2f} MB")
-                    # Save the file
                     with open(output_path, "wb") as f:
                         f.write(audio_response.content)
-                    logger.info(f"Successfully saved audio to: {output_path}")
-                    # Break out of the polling loop after successful download
                     break
                 elif status == "failed":
                     error_msg = job_data.get("error", "Unknown error")
-                    logger.error(f"Job failed with error: {error_msg}")
                     raise gr.Error(f"Job failed: {error_msg}")
                 if poll_count >= max_polls:
-                    logger.error("Maximum polling attempts reached")
                     raise gr.Error("Download took too long. Please try again or check if the video is accessible.")
-                logger.info("Job still processing, waiting 2 seconds before next poll...")
                 time.sleep(2)
             except requests.exceptions.RequestException as e:
-                logger.warning(f"Poll request failed: {str(e)}")
                 if poll_count >= max_polls:
                     raise gr.Error("Failed to check job status. Please try again.")
                 time.sleep(2)
-    except requests.exceptions.RequestException as e:
-        logger.exception(f"Network error during YouTube download: {str(e)}")
-        raise gr.Error(f"Failed to download YouTube audio: Network error - {str(e)}")
     except Exception as e:
-        logger.exception(f"Unexpected error during YouTube download: {str(e)}")
         raise gr.Error(f"Failed to download YouTube audio: {str(e)}")
     return output_path
-def transcribe_youtube(url, return_timestamps, generate_subs):
-    """Transcribe audio from YouTube video.
-    Args:
-        url (str): YouTube video URL
-        return_timestamps (bool): Whether to include timestamps in output
-        generate_subs (bool): Whether to generate SRT subtitles
-    Returns:
-        tuple: (formatted_result, srt_file, correction_text)
-    """
-    logger.info(f"Starting YouTube transcription process for URL: {url}")
-    logger.info(f"Options - Timestamps: {return_timestamps}, Generate subtitles: {generate_subs}")
     try:
-        # Download audio from YouTube
-        logger.info("Step 1: Downloading audio from YouTube...")
-        audio_path = download_youtube_audio(url)
-        logger.info(f"Successfully downloaded audio to: {audio_path}")
-        # Transcribe the downloaded audio
-        logger.info("Step 2: Transcribing downloaded audio...")
-        result = transcribe(audio_path, return_timestamps, generate_subs)
-        logger.info("Successfully completed transcription")
-        # Clean up the temporary file
-        logger.info("Step 3: Cleaning up temporary files...")
-        try:
-            os.unlink(audio_path)
-            logger.info(f"Successfully deleted temporary file: {audio_path}")
-        except Exception as e:
-            logger.warning(f"Failed to delete temporary file: {str(e)}")
-        return result
-    except Exception as e:
-        logger.exception(f"Error in YouTube transcription: {str(e)}")
-        raise gr.Error(f"Failed to transcribe YouTube video: {str(e)}")
-def transcribe(inputs, return_timestamps, generate_subs):
-    """Transcribe audio input using Whisper model via Hugging Face Inference API.
-    Args:
-        inputs (str): Path to audio file to transcribe.
-        return_timestamps (bool): Whether to include timestamps in output.
-        generate_subs (bool): Whether to generate SRT subtitles.
-    Returns:
-        tuple: (formatted_result, srt_file, correction_text)
-            - formatted_result (dict): Transcription results
-            - srt_file (str): Path to SRT file if generated, None otherwise
-            - correction_text (str): Empty string for corrections
-    Raises:
-        gr.Error: If no audio file is provided or transcription fails.
-    """
-    logger.info(f"Starting transcription process for file: {inputs}")
-    logger.info(f"Options - Timestamps: {return_timestamps}, Generate subtitles: {generate_subs}")
-    if inputs is None:
-        logger.warning("No audio file submitted")
-        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     try:
-        # Read the audio file
-        logger.info("Step 1: Reading audio file...")
-        with open(inputs, "rb") as f:
-            data = f.read()
-        file_size = len(data)
-        logger.info(f"Successfully read audio file, size: {file_size/1024/1024:.2f} MB")
         # Prepare request parameters
-        params = {"return_timestamps": "true"} if return_timestamps else {}
         # Send request to API
-        logger.info("Step 2: Sending request to Whisper API...")
         response = requests.post(
-            API_URL,
-            headers=headers,
-            data=data,
             params=params
         )
         response.raise_for_status()
         result = response.json()
-        logger.debug(f"API response: {result}")
-        logger.info("Successfully received response from API")
-        # Format response as JSON
-        logger.info("Step 3: Processing API response...")
         formatted_result = {
-            "text": result.get("text", "")
         }
-        logger.info(f"Transcribed text length: {len(formatted_result['text'])} characters")
-        chunks = []
-        if return_timestamps and "chunks" in result:
-            logger.info(f"Processing {len(result['chunks'])} chunks for timestamps")
-            for i, chunk in enumerate(result["chunks"]):
-                logger.debug(f"Processing chunk {i}: {chunk}")
-                try:
-                    start_time = chunk.get("timestamp", [None, None])[0]
-                    end_time = chunk.get("timestamp", [None, None])[1]
-                    text = chunk.get("text", "").strip()
-                    if start_time is not None and end_time is not None:
-                        chunk_data = {
-                            "text": text,
-                            "timestamp": [start_time, end_time]
-                        }
-                        chunks.append(chunk_data)
-                    else:
-                        logger.warning(f"Invalid timestamp in chunk {i}: {chunk}")
-                except Exception as chunk_error:
-                    logger.error(f"Error processing chunk {i}: {str(chunk_error)}")
-                    continue
-            formatted_result["chunks"] = chunks
-            logger.info(f"Successfully processed {len(chunks)} chunks with timestamps")
         # Generate subtitles if requested
         srt_file = None
-        if generate_subs and chunks:
-            logger.info("Step 4: Generating SRT subtitles...")
-            srt_content = generate_srt(chunks)
             srt_file = save_srt_to_file(srt_content)
-            logger.info(f"Successfully generated SRT file: {srt_file}")
-        logger.info("Transcription process completed successfully")
-        return formatted_result, srt_file, ""  # Return empty string for correction textbox
     except requests.exceptions.RequestException as e:
         logger.exception(f"API request failed: {str(e)}")
         raise gr.Error(f"Failed to transcribe audio: API request failed - {str(e)}")
@@ -439,6 +332,8 @@ youtube_transcribe = gr.Interface(
         gr.Textbox(label="YouTube URL", placeholder="https://www.youtube.com/watch?v=..."),
         gr.Checkbox(label="Include timestamps", value=True),
         gr.Checkbox(label="Generate subtitles", value=True),
     ],
     outputs=[
         gr.JSON(label="Transcription", open=True),
@@ -461,6 +356,8 @@ mf_transcribe = gr.Interface(
         gr.Audio(sources="microphone", type="filepath"),
         gr.Checkbox(label="Include timestamps", value=True),
         gr.Checkbox(label="Generate subtitles", value=True),
     ],
     outputs=[
         gr.JSON(label="Transcription", open=True),
@@ -480,6 +377,8 @@ file_transcribe = gr.Interface(
         gr.Audio(sources="upload", type="filepath", label="Audio file"),
         gr.Checkbox(label="Include timestamps", value=True),
         gr.Checkbox(label="Generate subtitles", value=True),
     ],
     outputs=[
         gr.JSON(label="Transcription", open=True),

     milliseconds = td.microseconds // 1000
     return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
+def generate_srt(segments):
+    """Generate SRT format subtitles from transcription segments."""
     srt_content = []
+    for i, segment in enumerate(segments, 1):
+        start_time = format_time(segment["start"])
+        end_time = format_time(segment["end"])
+        text = segment.get("text", "").strip()
         srt_content.append(f"{i}\n{start_time} --> {end_time}\n{text}\n\n")
     return "".join(srt_content)
 def save_srt_to_file(srt_content):
+    """Save SRT content to a temporary file."""
     if not srt_content:
         return None
     temp_file = tempfile.NamedTemporaryFile(suffix='.srt', delete=False)
     temp_file.write(srt_content.encode('utf-8'))
     temp_file.close()
 check_ffmpeg()
 def download_youtube_audio(url):
+    """Download audio from YouTube using Sieve API."""
     logger.info(f"Starting YouTube audio download process for URL: {url}")
     if not SIEVE_API_KEY:
         raise gr.Error("SIEVE_API_KEY environment variable is not set")
     try:
         temp_file = tempfile.NamedTemporaryFile(suffix='.mp3', delete=False)
         temp_file.close()
         output_path = temp_file.name
         logger.info(f"Created temporary file at: {output_path}")
         payload = {
             "function": "sieve/youtube-downloader",
             "inputs": {
                 "audio_format": "mp3"
             }
         }
         # Send request to Sieve API with retries
         max_retries = 3
+        retry_delay = 5
         for attempt in range(max_retries):
             try:
                 response = requests.post(
                     f"{SIEVE_API_URL}/push",
                     headers={"X-API-Key": SIEVE_API_KEY, "Content-Type": "application/json"},
                     json=payload,
+                    timeout=1800
                 )
                 response.raise_for_status()
                 response_data = response.json()
                 job_id = response_data.get("id")
                 if not job_id:
                     if attempt < max_retries - 1:
                         time.sleep(retry_delay)
                         continue
                     raise gr.Error("Failed to get job ID from Sieve API")
                 break
             except requests.exceptions.RequestException as e:
                 if attempt < max_retries - 1:
                     time.sleep(retry_delay)
                     continue
                 raise
         # Poll for job completion
         poll_count = 0
         max_polls = 1800
         while True:
             poll_count += 1
             try:
                 job_response = requests.get(
                 job_data = job_response.json()
                 status = job_data.get("status")
                 if status == "completed" or status == "finished":
                     output_data = job_data.get("outputs", [])
                     if not output_data:
                         raise gr.Error("No output data in job response")
                     first_output = output_data[0]
                     if not isinstance(first_output, dict):
                         raise gr.Error("Unexpected output format from job response")
                     output_data = first_output.get("data", {})
                     if not isinstance(output_data, dict):
                         raise gr.Error("Unexpected data format from job response")
                     audio_url = output_data.get("url")
                     if not audio_url:
                         raise gr.Error("No audio URL in output data")
                     audio_response = requests.get(audio_url, timeout=1800)
                     audio_response.raise_for_status()
                     with open(output_path, "wb") as f:
                         f.write(audio_response.content)
                     break
                 elif status == "failed":
                     error_msg = job_data.get("error", "Unknown error")
                     raise gr.Error(f"Job failed: {error_msg}")
                 if poll_count >= max_polls:
                     raise gr.Error("Download took too long. Please try again or check if the video is accessible.")
                 time.sleep(2)
             except requests.exceptions.RequestException as e:
                 if poll_count >= max_polls:
                     raise gr.Error("Failed to check job status. Please try again.")
                 time.sleep(2)
     except Exception as e:
+        logger.exception(f"Error during YouTube download: {str(e)}")
         raise gr.Error(f"Failed to download YouTube audio: {str(e)}")
     return output_path
+def check_api_health():
+    """Check if the API is healthy before making requests."""
     try:
+        response = requests.get(f"{API_URL}/health")
+        response.raise_for_status()
+        health_data = response.json()
+        # Check if service is healthy
+        if health_data.get("status") != "healthy":
+            raise gr.Error("API service is not healthy. Please try again later.")
+        # Check resource usage
+        cpu_percent = health_data.get("cpu_percent", 0)
+        memory_percent = health_data.get("memory_percent", 0)
+        if cpu_percent > 90 or memory_percent > 90:
+            logger.warning(f"High resource usage detected - CPU: {cpu_percent}%, Memory: {memory_percent}%")
+        return True
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Health check failed: {str(e)}")
+        raise gr.Error("Failed to connect to the API service. Please try again later.")
+def transcribe_youtube(url, return_timestamps, generate_subs, chunk_length_s=30, batch_size=128):
+    """Transcribe audio from YouTube video using URL endpoint."""
+    logger.info(f"Starting YouTube transcription process for URL: {url}")
     try:
+        # Check API health first
+        check_api_health()
+        # Get audio URL from Sieve
+        audio_url = download_youtube_audio(url)
         # Prepare request parameters
+        params = {
+            "source_language": "tg",  # Tajik language
+            "timestamp_level": "sentence" if return_timestamps else None,
+            "task": "transcribe",
+            "chunk_length_s": chunk_length_s,
+            "batch_size": batch_size
+        }
         # Send request to API
         response = requests.post(
+            f"{API_URL}/transcribe/url",
+            json={"url": audio_url},
             params=params
         )
         response.raise_for_status()
         result = response.json()
+        # Format response
         formatted_result = {
+            "text": result["transcription"]["text"],
+            "language": result["transcription"]["language"]
         }
         # Generate subtitles if requested
         srt_file = None
+        if generate_subs and return_timestamps and "segments" in result["transcription"]:
+            srt_content = generate_srt(result["transcription"]["segments"])
             srt_file = save_srt_to_file(srt_content)
+        return formatted_result, srt_file, ""
+    except Exception as e:
+        logger.exception(f"Error in YouTube transcription: {str(e)}")
+        raise gr.Error(f"Failed to transcribe YouTube video: {str(e)}")
+def transcribe(inputs, return_timestamps, generate_subs, chunk_length_s=30, batch_size=128):
+    """Transcribe audio input using Whisper API."""
+    logger.info(f"Starting transcription process for file: {inputs}")
+    if inputs is None:
+        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+    try:
+        # Check API health first
+        check_api_health()
+        # Read the audio file
+        with open(inputs, "rb") as f:
+            files = {"file": f}
+            # Prepare request parameters
+            params = {
+                "source_language": "tg",  # Tajik language
+                "timestamp_level": "sentence" if return_timestamps else None,
+                "task": "transcribe",
+                "chunk_length_s": chunk_length_s,
+                "batch_size": batch_size
+            }
+            # Send request to API
+            response = requests.post(
+                f"{API_URL}/transcribe",
+                files=files,
+                params=params
+            )
+            response.raise_for_status()
+            result = response.json()
+            # Format response
+            formatted_result = {
+                "text": result["transcription"]["text"],
+                "language": result["transcription"]["language"]
+            }
+            # Generate subtitles if requested
+            srt_file = None
+            if generate_subs and return_timestamps and "segments" in result["transcription"]:
+                srt_content = generate_srt(result["transcription"]["segments"])
+                srt_file = save_srt_to_file(srt_content)
+            return formatted_result, srt_file, ""
     except requests.exceptions.RequestException as e:
         logger.exception(f"API request failed: {str(e)}")
         raise gr.Error(f"Failed to transcribe audio: API request failed - {str(e)}")
         gr.Textbox(label="YouTube URL", placeholder="https://www.youtube.com/watch?v=..."),
         gr.Checkbox(label="Include timestamps", value=True),
         gr.Checkbox(label="Generate subtitles", value=True),
+        gr.Slider(minimum=10, maximum=60, value=30, step=5, label="Chunk Length (seconds)"),
+        gr.Slider(minimum=32, maximum=256, value=128, step=32, label="Batch Size")
     ],
     outputs=[
         gr.JSON(label="Transcription", open=True),
         gr.Audio(sources="microphone", type="filepath"),
         gr.Checkbox(label="Include timestamps", value=True),
         gr.Checkbox(label="Generate subtitles", value=True),
+        gr.Slider(minimum=10, maximum=60, value=30, step=5, label="Chunk Length (seconds)"),
+        gr.Slider(minimum=32, maximum=256, value=128, step=32, label="Batch Size")
     ],
     outputs=[
         gr.JSON(label="Transcription", open=True),
         gr.Audio(sources="upload", type="filepath", label="Audio file"),
         gr.Checkbox(label="Include timestamps", value=True),
         gr.Checkbox(label="Generate subtitles", value=True),
+        gr.Slider(minimum=10, maximum=60, value=30, step=5, label="Chunk Length (seconds)"),
+        gr.Slider(minimum=32, maximum=256, value=128, step=32, label="Batch Size")
     ],
     outputs=[
         gr.JSON(label="Transcription", open=True),