# VocalWeb / app.py
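"""Web-to-Audio Converter.

Fetches an article from a URL (HTML via trafilatura, PDF via docling), cleans the extracted
markdown, detects its language, and synthesizes speech with KokoroTTS inside a Gradio UI.
"""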
import spaces # Import spaces first to avoid CUDA initialization issues
import os
import gradio as gr
import trafilatura
from trafilatura import extract
from docling.document_converter import DocumentConverter
import torch
import soundfile as sf
import numpy as np
from langdetect import detect
from kokoro import KPipeline
import re
import json
import nltk
nltk.download("punkt")
# Initialize KokoroTTS once at startup on CPU ('a' = American English); the TTS function
# below is wrapped with @spaces.GPU to request GPU time on ZeroGPU hardware.
kokoro_tts = KPipeline(lang_code='a', device="cpu")
# Supported TTS Languages
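# (keys are the ISO 639-1 codes returned by langdetect; values are KokoroTTS pipeline lang_codes)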
SUPPORTED_TTS_LANGUAGES = {
    "en": "a",  # English (default)
    "fr": "f",  # French
    "hi": "h",  # Hindi
    "it": "i",  # Italian
    "pt": "p",  # Brazilian Portuguese
}
# Available voices in KokoroTTS
AVAILABLE_VOICES = [
    'af_bella', 'af_sarah', 'am_adam', 'am_michael', 'bf_emma',
    'bf_isabella', 'bm_george', 'bm_lewis', 'af_nicole', 'af_sky',
]
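# Note (assumption, not something the code relies on): the voice names appear to encode
# accent and gender, e.g. 'af_' = American female, 'bm_' = British male.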
### 1️⃣ Fetch and Extract Content (Runs Immediately)
def fetch_and_display_content(url):
    """Fetch and extract text from a given URL (HTML or PDF)."""
    if "pdf" in url:  # covers direct .pdf links and most PDF-serving URLs
        converter = DocumentConverter()
        text = converter.convert(url).document.export_to_markdown()
    else:
        downloaded = trafilatura.fetch_url(url)
        text = extract(
            downloaded,
            output_format="markdown",
            with_metadata=True,
            include_tables=False,
            include_links=False,
            include_formatting=True,
            include_comments=False,
        )
    metadata, cleaned_text = extract_and_clean_text(text)
    detected_lang = detect_language(cleaned_text)
    # Add the detected language to the metadata shown in the UI
    metadata["Detected Language"] = detected_lang.upper()
    # The return order must match the `outputs=` list wired to process_text_button.click() below:
    # text, metadata, detected language, then three visibility updates.
    return cleaned_text, metadata, detected_lang, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
### 2️⃣ Cleaning Function
def extract_and_clean_text(data):
    metadata_dict = {}
    # Step 1: Extract metadata enclosed between "---" markers at the beginning
    metadata_pattern = re.match(r"^---(.*?)---", data, re.DOTALL)
    if metadata_pattern:
        metadata_raw = metadata_pattern.group(1).strip()
        data = data[metadata_pattern.end():].strip()  # Remove the metadata block from the text
        # Convert metadata into a dictionary manually (the block is not guaranteed to be valid YAML)
        for line in metadata_raw.split("\n"):
            if ": " in line:  # Only process lines with key-value pairs
                key, value = line.split(": ", 1)  # Split at the first ": "
                value = value.strip()
                # Convert JSON-style lists (wrapped in square brackets) into Python lists
                if value.startswith("[") and value.endswith("]"):
                    try:
                        value = json.loads(value)
                    except json.JSONDecodeError:
                        pass  # If JSON parsing fails, keep it as a string
                metadata_dict[key.strip()] = value

    # Step 2: Clean the extracted text
    def clean_text(text):
        # Remove inline citations like [2][4]
        text = re.sub(r'\[\d+\]', '', text)
        # Remove URLs (markdown-style links first, then bare links)
        text = re.sub(r'\[.*?\]\(http[s]?://\S+\)', '', text)  # Markdown links
        text = re.sub(r'http[s]?://\S+', '', text)  # Direct links
        # Remove References, Bibliography, External Links, and Comments sections
        patterns = [r'References\b.*', r'Bibliography\b.*', r'External Links\b.*', r'COMMENTS\b.*']
        for pattern in patterns:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)
        # Collapse extra whitespace and blank lines
        text = re.sub(r'\n\s*\n+', '\n\n', text).strip()
        return text

    return metadata_dict, clean_text(data)
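# Illustrative sketch (field names vary by page and by trafilatura version): with
# with_metadata=True, the markdown output is expected to begin with a block such as
#     ---
#     title: Example Article
#     author: Jane Doe
#     date: 2024-01-01
#     ---
# which extract_and_clean_text() would parse into
#     {"title": "Example Article", "author": "Jane Doe", "date": "2024-01-01"}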
### 3️⃣ Language Detection
def detect_language(text):
    """Detects the language of the extracted text."""
    try:
        lang = detect(text)
        return lang if lang in SUPPORTED_TTS_LANGUAGES else "en"  # Default to English if unsupported
    except Exception:
        return "en"  # Default to English if detection fails
### 4️⃣ TTS Functionality (KokoroTTS)
@spaces.GPU(duration=1000)
def generate_audio_kokoro(text, lang, selected_voice):
    """Generate speech using KokoroTTS for supported languages."""
    global kokoro_tts  # Access the preloaded model
    lang_code = SUPPORTED_TTS_LANGUAGES.get(lang, "a")  # Default to English
    # NOTE: the preloaded pipeline was created with lang_code='a' (English), so lang_code is
    # currently informational only; non-English text is synthesized by the English pipeline.
    generator = kokoro_tts(text, voice=selected_voice, speed=1, split_pattern=r'\n+')
    # Generate each segment and concatenate the audio into a single array
    audio_data_list = [audio for _, _, audio in generator]
    full_audio = np.concatenate(audio_data_list)
    output_file = f"audio_{lang}.wav"
    sf.write(output_file, full_audio, 24000)  # Kokoro outputs 24 kHz audio; save as WAV
    return output_file
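# Illustrative call (the file name is derived from the detected language):
#     generate_audio_kokoro("Hello world.", "en", "af_bella")  # writes and returns "audio_en.wav"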
### 5️⃣ Main Processing Function (one-shot helper; the UI below uses the two-step flow instead)
def process_url(url, selected_voice="af_bella"):
    """Processes the URL: extracts text, detects the language, and converts it to audio."""
    cleaned_text, metadata, detected_lang, *_ = fetch_and_display_content(url)
    audio_file = generate_audio_kokoro(cleaned_text, detected_lang, selected_voice)
    return cleaned_text, detected_lang, audio_file
### 6️⃣ Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# 🌍 Web-to-Audio Converter 🎙️")
    url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com/article")
    voice_selection = gr.Dropdown(AVAILABLE_VOICES, label="Select Voice", value="af_bella")
    process_text_button = gr.Button("Fetch Text & Detect Language")
    process_audio_button = gr.Button("Generate Audio", visible=False)

    # Layout: two adjacent columns (text and metadata)
    with gr.Row():
        extracted_text = gr.Textbox(label="Extracted Content", visible=False, interactive=False, lines=15)
        metadata_output = gr.JSON(label="Article Metadata", visible=False)  # Displays metadata
    detected_lang = gr.Textbox(label="Detected Language", visible=False)
    full_audio_output = gr.Audio(label="Generated Audio", visible=True)

    # Step 1: Fetch text, show metadata, and detect the language
    process_text_button.click(
        fetch_and_display_content,
        inputs=[url_input],
        outputs=[extracted_text, metadata_output, detected_lang, process_audio_button, extracted_text, metadata_output],
    )

    # Step 2: Generate audio once the text and language are displayed
    process_audio_button.click(
        generate_audio_kokoro,
        inputs=[extracted_text, detected_lang, voice_selection],
        outputs=[full_audio_output],
    )

demo.launch()