File size: 3,308 Bytes
183aa58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import gradio as gr
import trafilatura
import docling
import torch
import soundfile as sf
import numpy as np
from langdetect import detect
from kokoro import KPipeline
import re
import json
import nltk

# Fetch the NLTK "punkt" data once, when the module is loaded.
nltk.download("punkt")

# Shared KokoroTTS pipeline; lang_code "a" selects American English.
kokoro_tts = KPipeline(lang_code="a")

# Language code (as checked by detect_language) -> KokoroTTS lang_code letter.
SUPPORTED_TTS_LANGUAGES = dict(
    en="a",  # English (default)
    fr="f",  # French
    hi="h",  # Hindi
    it="i",  # Italian
    pt="p",  # Brazilian Portuguese
)

### 1️⃣ Fetch and Extract Content
def fetch_content(url):
    """Fetch a URL and return its main text content.

    PDF documents are converted to markdown with docling; every other URL is
    downloaded and extracted as markdown with trafilatura.

    Args:
        url: Address of the web page or PDF to read.

    Returns:
        The extracted text, or an empty string when ``url`` is blank or
        nothing could be downloaded or extracted.
    """
    from urllib.parse import urlparse

    url = (url or "").strip()
    if not url:
        return ""

    # Look at the URL *path* only, case-insensitively. A bare substring test
    # also matches pages such as "https://example.com/pdf-tips.html". A "pdf"
    # path segment is still accepted for extension-less links like
    # "https://arxiv.org/pdf/2408.09869".
    path = urlparse(url).path.lower()
    is_pdf = path.endswith(".pdf") or "pdf" in path.split("/")

    if is_pdf:
        # docling exposes conversion through DocumentConverter; the package
        # has no top-level ``extract_text`` helper.
        from docling.document_converter import DocumentConverter

        result = DocumentConverter().convert(url)
        text = result.document.export_to_markdown()
    else:
        downloaded = trafilatura.fetch_url(url)
        if downloaded is None:
            # Download failed; there is nothing to hand to the extractor.
            return ""
        text = trafilatura.extract(downloaded, output_format="markdown", with_metadata=False)

    # Normalise "nothing extracted" (None) to an empty string for callers.
    return text or ""

### 2️⃣ Cleaning Function
def extract_and_clean_text(data):
    """Strip citations, links, and markdown emphasis from extracted text.

    Args:
        data: Raw text (typically markdown) to clean. ``None`` is accepted.

    Returns:
        The cleaned text, or an empty string when ``data`` is ``None`` or
        empty.
    """
    if not data:
        return ""

    text = data
    # Numeric citations like [2][4], including an attached link target as in
    # [2](https://example.com/ref).
    text = re.sub(r'\[\d+\](?:\([^)\s]*\))?', '', text)
    # Markdown links/images keep only their label. This has to run before the
    # bare-URL pass, which would otherwise swallow the closing ")" and leave
    # a dangling "[label](" behind.
    text = re.sub(r'!?\[([^\]]*)\]\([^)\s]*\)', r'\1', text)
    text = re.sub(r'http[s]?://\S+', '', text)  # Remove bare URLs
    text = re.sub(r'[*_`]', '', text)  # Remove markdown emphasis/code markers
    # Collapse runs of blank lines into a single paragraph break.
    return re.sub(r'\n\s*\n+', '\n\n', text).strip()

### 3️⃣ Language Detection
def detect_language(text):
    """Detect the language of ``text``, limited to the TTS-supported set.

    Args:
        text: Text whose language should be identified.

    Returns:
        A key of ``SUPPORTED_TTS_LANGUAGES``. Falls back to ``"en"`` when the
        text is blank, detection fails, or the detected language is not
        supported.
    """
    if not text or not text.strip():
        return "en"  # Nothing to analyse; skip the detector entirely

    try:
        lang = detect(text)
    except Exception:
        # Best-effort fallback to English on any detector failure. Unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        return "en"

    return lang if lang in SUPPORTED_TTS_LANGUAGES else "en"  # Default to English if not supported

### 4️⃣ TTS Functionality (KokoroTTS)
# Default voice for each KokoroTTS lang_code letter.
# NOTE(review): the non-English voice names are taken from the Kokoro-82M
# voice list -- confirm they are available in the deployed model.
_KOKORO_VOICES = {
    "a": "af_bella",
    "f": "ff_siwis",
    "h": "hf_alpha",
    "i": "if_sara",
    "p": "pf_dora",
}

# Pipelines created on demand for languages other than the module default.
_kokoro_pipelines = {}


def _get_kokoro_pipeline(lang_code):
    """Return the KPipeline for ``lang_code``, creating and caching it on first use."""
    if lang_code == "a":
        return kokoro_tts  # Reuse the module-level American English pipeline
    if lang_code not in _kokoro_pipelines:
        _kokoro_pipelines[lang_code] = KPipeline(lang_code=lang_code)
    return _kokoro_pipelines[lang_code]


def generate_audio_kokoro(text, lang):
    """Generate speech with KokoroTTS in the language given by ``lang``.

    Args:
        text: Text to synthesize. It is split into segments on newlines.
        lang: Language key looked up in ``SUPPORTED_TTS_LANGUAGES``; unknown
            keys fall back to American English.

    Returns:
        Path of the written WAV file (``audio_<lang>.wav``), or ``None`` when
        ``text`` is blank or the pipeline produced no audio.
    """
    if not text or not text.strip():
        return None

    lang_code = SUPPORTED_TTS_LANGUAGES.get(lang, "a")  # Default to English
    # Pipeline and voice must both follow lang_code; otherwise every language
    # is read with the American English pipeline and voice.
    pipeline = _get_kokoro_pipeline(lang_code)
    voice = _KOKORO_VOICES.get(lang_code, "af_bella")
    generator = pipeline(text, voice=voice, speed=1, split_pattern=r'\n+')

    # Collect the audio of every segment, skipping segments without audio.
    segments = [audio for _, _, audio in generator if audio is not None]
    if not segments:
        # np.concatenate raises on an empty list; report "no audio" instead.
        return None

    audio_data = np.concatenate(segments)
    output_file = f"audio_{lang}.wav"
    sf.write(output_file, audio_data, 24000)  # Save as WAV file at 24 kHz
    return output_file

### 5️⃣ Main Processing Function
def process_url(url):
    """Turn the page or PDF behind ``url`` into cleaned text and speech.

    Runs the whole pipeline: fetch, clean, detect language, synthesize audio.

    Args:
        url: Address entered by the user.

    Returns:
        A tuple ``(cleaned_text, detected_lang, audio_file)`` matching the
        three output components of the interface.

    Raises:
        gr.Error: If ``url`` is blank or no readable text could be obtained
            from it, so the interface shows a message instead of a traceback.
    """
    url = (url or "").strip()
    if not url:
        raise gr.Error("Please enter a URL.")

    content = fetch_content(url)
    if not content:
        # Without this guard a failed download reaches the regex cleanup as
        # None and surfaces as an unrelated TypeError.
        raise gr.Error("Could not fetch or extract any content from this URL.")

    cleaned_text = extract_and_clean_text(content)
    if not cleaned_text:
        # Nothing left to read aloud once links and markup are stripped.
        raise gr.Error("No readable text was found at this URL.")

    detected_lang = detect_language(cleaned_text)
    audio_file = generate_audio_kokoro(cleaned_text, detected_lang)

    return cleaned_text, detected_lang, audio_file

### 6️⃣ Gradio Interface
with gr.Blocks() as demo:
    # Page heading.
    gr.Markdown("# 🌍 Web-to-Audio Converter 🎙️")

    # Input row: the address to convert and the button that starts the run.
    url_input = gr.Textbox(
        label="Enter URL",
        placeholder="https://example.com/article",
    )
    process_button = gr.Button("Generate Audio")

    # Output components, listed in the order process_url returns its values.
    extracted_text = gr.Markdown(label="Extracted Content")
    detected_language = gr.Textbox(label="Detected Language")
    full_audio_output = gr.Audio(label="Generated Audio")

    # Clicking the button feeds the URL to process_url and fills the outputs.
    process_button.click(
        process_url,
        inputs=[url_input],
        outputs=[extracted_text, detected_language, full_audio_output],
    )

demo.launch()