Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import trafilatura
|
3 |
+
import docling
|
4 |
+
import torch
|
5 |
+
import soundfile as sf
|
6 |
+
import numpy as np
|
7 |
+
from langdetect import detect
|
8 |
+
from kokoro import KPipeline
|
9 |
+
import re
|
10 |
+
import json
|
11 |
+
import nltk
|
12 |
+
|
13 |
+
# Maps the language codes returned by detect() to the single-letter
# language codes that KPipeline expects.
SUPPORTED_TTS_LANGUAGES = {
    "en": "a",  # English (default)
    "fr": "f",  # French
    "hi": "h",  # Hindi
    "it": "i",  # Italian
    "pt": "p",  # Brazilian Portuguese
}

# Fetch the NLTK "punkt" data once at start-up.
nltk.download("punkt")

# Shared KokoroTTS pipeline; 'a' selects American English.
kokoro_tts = KPipeline(lang_code='a')
|
26 |
+
|
27 |
+
### 1️⃣ Fetch and Extract Content
|
28 |
+
def fetch_content(url):
    """Fetch a URL and return its main text as markdown.

    URLs that contain "pdf" (case-insensitive) are converted with docling;
    everything else is downloaded and extracted with trafilatura.

    Args:
        url: Address of the web page or PDF document.

    Returns:
        The extracted text, or None when the page could not be downloaded
        or no text could be extracted from it.
    """
    # Loose heuristic on purpose: it also catches PDF links without a
    # ".pdf" suffix (e.g. ".../pdf/<id>").
    if "pdf" in url.lower():
        # docling exposes conversion through DocumentConverter; the package
        # has no top-level extract_text() helper.
        from docling.document_converter import DocumentConverter

        result = DocumentConverter().convert(url)
        return result.document.export_to_markdown()

    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:
        # Nothing was downloaded, so there is nothing to extract.
        return None
    return trafilatura.extract(
        downloaded,
        output_format="markdown",
        with_metadata=False,
    )
|
36 |
+
|
37 |
+
### 2️⃣ Cleaning Function
|
38 |
+
def extract_and_clean_text(data):
    """Strip citation markers, URLs and markdown emphasis from *data*.

    The substitutions run in order: numeric citations such as ``[2]``,
    http(s) URLs, the characters ``*``, ``_`` and backtick, and finally
    runs of blank lines are collapsed to a single blank line. The result
    is returned with leading and trailing whitespace removed.
    """
    substitutions = (
        (r'\[\d+\]', ''),            # citations like [2][4]
        (r'http[s]?://\S+', ''),     # URLs
        (r'[*_`]', ''),              # markdown formatting characters
        (r'\n\s*\n+', '\n\n'),       # excessive blank lines
    )

    cleaned = data
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
|
48 |
+
|
49 |
+
### 3️⃣ Language Detection
|
50 |
+
def detect_language(text):
    """Return the language code of *text*, restricted to supported TTS languages.

    Args:
        text: The text whose language should be detected.

    Returns:
        A key of SUPPORTED_TTS_LANGUAGES. "en" is returned when the text is
        empty, when detection fails, or when the detected language is not
        supported.
    """
    # Nothing to analyse: skip the detector and use the default.
    if not text or not text.strip():
        return "en"

    try:
        lang = detect(text)
    except Exception:
        # Best-effort fallback. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return "en"

    return lang if lang in SUPPORTED_TTS_LANGUAGES else "en"
|
57 |
+
|
58 |
+
### 4️⃣ TTS Functionality (KokoroTTS)
|
59 |
+
# Voice used for each KPipeline language code.
# NOTE(review): voice names taken from the Kokoro voice list; confirm they
# exist in the deployed model repository.
_KOKORO_VOICES = {
    "a": "af_bella",
    "f": "ff_siwis",
    "h": "hf_alpha",
    "i": "if_sara",
    "p": "pf_dora",
}

# Pipelines for non-English languages, created on first use and then reused.
_KOKORO_PIPELINES = {}


def _get_kokoro_pipeline(lang_code):
    """Return the pipeline for *lang_code*, reusing the shared English one."""
    if lang_code == "a":
        return kokoro_tts
    if lang_code not in _KOKORO_PIPELINES:
        _KOKORO_PIPELINES[lang_code] = KPipeline(lang_code=lang_code)
    return _KOKORO_PIPELINES[lang_code]


def generate_audio_kokoro(text, lang):
    """Synthesise *text* with KokoroTTS and save it as a WAV file.

    Args:
        text: The text to read aloud; it is split into segments on newlines.
        lang: Language code (a key of SUPPORTED_TTS_LANGUAGES). Unknown
            codes fall back to American English.

    Returns:
        The path of the written file, ``audio_<lang>.wav``.

    Raises:
        ValueError: If the pipeline produced no audio (e.g. empty text).
    """
    lang_code = SUPPORTED_TTS_LANGUAGES.get(lang, "a")  # Default to English
    # Use the pipeline and voice that match the language; previously the
    # English pipeline and voice were used for every language.
    pipeline = _get_kokoro_pipeline(lang_code)
    voice = _KOKORO_VOICES.get(lang_code, "af_bella")

    generator = pipeline(text, voice=voice, speed=1, split_pattern=r'\n+')
    segments = [audio for _, _, audio in generator if audio is not None]
    if not segments:
        raise ValueError("KokoroTTS produced no audio for the given text.")

    # Combine audio segments into a single file
    audio_data = np.concatenate(segments)
    output_file = f"audio_{lang}.wav"
    sf.write(output_file, audio_data, 24000)  # Save as WAV file at 24 kHz
    return output_file
|
69 |
+
|
70 |
+
### 5️⃣ Main Processing Function
|
71 |
+
def process_url(url):
    """Fetch a URL, clean its text, detect the language and synthesise audio.

    Args:
        url: Address entered by the user.

    Returns:
        A tuple ``(cleaned_text, detected_lang, audio_file)`` matching the
        three output components wired to the button.

    Raises:
        gr.Error: If the URL is blank or no readable text could be
            extracted, so the UI shows a message instead of a traceback.
    """
    url = (url or "").strip()
    if not url:
        raise gr.Error("Please enter a URL.")

    content = fetch_content(url)
    if not content:
        # Fetching or extraction failed; the regex clean-up below would
        # otherwise crash on None.
        raise gr.Error("Could not extract any text from the given URL.")

    cleaned_text = extract_and_clean_text(content)
    if not cleaned_text:
        raise gr.Error("The page contains no readable text after cleaning.")

    detected_lang = detect_language(cleaned_text)
    audio_file = generate_audio_kokoro(cleaned_text, detected_lang)

    return cleaned_text, detected_lang, audio_file
|
79 |
+
|
80 |
+
### 6️⃣ Gradio Interface
|
81 |
+
# Single-page UI: URL box and button first, then the three result widgets.
with gr.Blocks() as demo:
    gr.Markdown("# 🌍 Web-to-Audio Converter 🎙️")

    # Input controls
    url_input = gr.Textbox(
        label="Enter URL",
        placeholder="https://example.com/article",
    )
    process_button = gr.Button("Generate Audio")

    # Result widgets, declared in the order process_url returns its values
    extracted_text = gr.Markdown(label="Extracted Content")
    detected_language = gr.Textbox(label="Detected Language")
    full_audio_output = gr.Audio(label="Generated Audio")

    # Run the whole pipeline when the button is pressed
    process_button.click(
        fn=process_url,
        inputs=[url_input],
        outputs=[extracted_text, detected_language, full_audio_output],
    )

demo.launch()
|