# Spaces: Running on Zero
import gradio as gr | |
import trafilatura | |
import docling | |
import torch | |
import soundfile as sf | |
import numpy as np | |
from langdetect import detect | |
from kokoro import KPipeline | |
import re | |
import json | |
import nltk | |
# Download NLTK's "punkt" resource when the module is loaded.
nltk.download("punkt")

# Shared KokoroTTS pipeline, built once at import time.
# Language code "a" selects American English.
kokoro_tts = KPipeline(lang_code="a")

# Languages the app can speak: detected language code -> Kokoro language code.
SUPPORTED_TTS_LANGUAGES = {
    "en": "a",  # English (default)
    "fr": "f",  # French
    "hi": "h",  # Hindi
    "it": "i",  # Italian
    "pt": "p",  # Brazilian Portuguese
}
### 1️⃣ Fetch and Extract Content | |
def fetch_content(url):
    """Fetch a URL and return its main text content as markdown.

    URLs that look like PDFs are converted with Docling; everything else is
    downloaded and extracted with trafilatura.

    Args:
        url: Address of the HTML page or PDF document to read.

    Returns:
        The extracted text as a string. An empty string is returned when the
        URL is blank, the download fails, or nothing could be extracted, so
        callers never receive ``None``.
    """
    if not isinstance(url, str) or not url.strip():
        return ""
    url = url.strip()

    # Same substring heuristic as before, but case-insensitive so ".PDF" and
    # "/PDF/" links are routed to the PDF converter as well.
    if "pdf" in url.lower():
        # Imported lazily: only needed for PDF inputs.
        from docling.document_converter import DocumentConverter

        # Docling's conversion entry point is DocumentConverter.convert(),
        # which accepts a local path or a URL.
        result = DocumentConverter().convert(url)
        text = result.document.export_to_markdown()
    else:
        downloaded = trafilatura.fetch_url(url)
        if downloaded is None:
            # fetch_url() yields None when the download fails.
            return ""
        text = trafilatura.extract(
            downloaded, output_format="markdown", with_metadata=False
        )

    # extract() may return None when no main content is found.
    return text or ""
### 2️⃣ Cleaning Function | |
def extract_and_clean_text(data):
    """Strip citations, URLs and markdown markers from text and tidy blank lines.

    Args:
        data: Raw extracted text (markdown).

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # Applied strictly in this order: URLs must be dropped before the
    # markdown characters, otherwise underscores inside a URL would be
    # removed first and change what the URL pattern matches.
    substitutions = (
        (r'\[\d+\]', ''),            # numeric citations such as [2][4]
        (r'http[s]?://\S+', ''),     # bare URLs
        (r'[*_`]', ''),              # markdown emphasis / code markers
        (r'\n\s*\n+', '\n\n'),       # collapse runs of blank lines
    )

    cleaned = data
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
### 3️⃣ Language Detection | |
def detect_language(text):
    """Detect the language of ``text``, restricted to the TTS-supported set.

    Args:
        text: The text whose language should be identified.

    Returns:
        A key of ``SUPPORTED_TTS_LANGUAGES``. Falls back to ``"en"`` when the
        text is empty, detection fails, or the detected language is not
        supported.
    """
    # Nothing to analyse: skip the detector, which cannot classify empty input.
    if not isinstance(text, str) or not text.strip():
        return "en"

    try:
        lang = detect(text)
    except Exception:
        # Best-effort fallback, but unlike a bare ``except`` this lets
        # KeyboardInterrupt / SystemExit propagate.
        return "en"

    return lang if lang in SUPPORTED_TTS_LANGUAGES else "en"
### 4️⃣ TTS Functionality (KokoroTTS) | |
# Voice used for each Kokoro language code; the prefix of a Kokoro voice name
# encodes its language, so the voice must match the pipeline's language.
_KOKORO_VOICES = {
    "a": "af_bella",  # American English
    "f": "ff_siwis",  # French
    "h": "hf_alpha",  # Hindi
    "i": "if_sara",   # Italian
    "p": "pf_dora",   # Brazilian Portuguese
}

# Pipelines created on demand for non-English languages, keyed by language code.
_KOKORO_PIPELINES = {}


def _get_kokoro_pipeline(lang_code):
    """Return a cached KPipeline for ``lang_code``, creating it on first use."""
    if lang_code == "a":
        # Reuse the module-level English pipeline instead of building another.
        return kokoro_tts
    if lang_code not in _KOKORO_PIPELINES:
        _KOKORO_PIPELINES[lang_code] = KPipeline(lang_code=lang_code)
    return _KOKORO_PIPELINES[lang_code]


def generate_audio_kokoro(text, lang):
    """Synthesize ``text`` with KokoroTTS in the requested language.

    Args:
        text: The text to speak. It is split into segments on newlines.
        lang: Language key as found in ``SUPPORTED_TTS_LANGUAGES``; unknown
            keys fall back to American English.

    Returns:
        Path of the WAV file that was written (``audio_<lang>.wav``).

    Raises:
        ValueError: If ``text`` is empty or the pipeline produced no audio.
    """
    if not text or not text.strip():
        raise ValueError("No text to synthesize.")

    lang_code = SUPPORTED_TTS_LANGUAGES.get(lang, "a")  # Default to English
    pipeline = _get_kokoro_pipeline(lang_code)
    voice = _KOKORO_VOICES.get(lang_code, "af_bella")

    generator = pipeline(text, voice=voice, speed=1, split_pattern=r'\n+')

    # Each item is (graphemes, phonemes, audio); only the audio is needed.
    segments = [audio for _, _, audio in generator]
    if not segments:
        raise ValueError("KokoroTTS produced no audio for the given text.")

    # Combine audio segments into a single file
    audio_data = np.concatenate(segments)
    output_file = f"audio_{lang}.wav"
    sf.write(output_file, audio_data, 24000)  # Save as WAV file at 24 kHz
    return output_file
### 5️⃣ Main Processing Function | |
def process_url(url):
    """Run the full pipeline for one URL: fetch, clean, detect language, speak.

    Args:
        url: Address of the page or PDF to convert.

    Returns:
        A tuple ``(cleaned_text, detected_lang, audio_file)`` matching the
        three output components of the interface.

    Raises:
        gr.Error: If the URL is blank or no readable text could be extracted,
            so the problem is shown to the user instead of an opaque traceback.
    """
    url = (url or "").strip()
    if not url:
        raise gr.Error("Please enter a URL.")

    content = fetch_content(url)
    if not content:
        # Covers both a failed download and a page with no extractable text.
        raise gr.Error("Could not extract any text from the given URL.")

    cleaned_text = extract_and_clean_text(content)
    if not cleaned_text:
        raise gr.Error("The page contained no readable text after cleaning.")

    detected_lang = detect_language(cleaned_text)
    audio_file = generate_audio_kokoro(cleaned_text, detected_lang)
    return cleaned_text, detected_lang, audio_file
### 6️⃣ Gradio Interface | |
# Single-page UI: one URL in; extracted text, language and audio out.
with gr.Blocks() as demo:
    gr.Markdown("# 🌍 Web-to-Audio Converter 🎙️")

    # Input controls.
    url_input = gr.Textbox(
        label="Enter URL",
        placeholder="https://example.com/article",
    )
    process_button = gr.Button("Generate Audio")

    # Output widgets, in the same order as the values process_url returns.
    extracted_text = gr.Markdown(label="Extracted Content")
    detected_language = gr.Textbox(label="Detected Language")
    full_audio_output = gr.Audio(label="Generated Audio")

    # Wire the button to the processing pipeline.
    process_button.click(
        fn=process_url,
        inputs=[url_input],
        outputs=[extracted_text, detected_language, full_audio_output],
    )

demo.launch()