Spaces:
Sleeping
Sleeping
File size: 8,463 Bytes
a0dfdc4 183aa58 7eb5f41 183aa58 7eb5f41 183aa58 a0dfdc4 183aa58 b774e3c 23f13e6 183aa58 7eb5f41 7e5ccd3 791ecaf 183aa58 7eb5f41 791ecaf b774e3c d48b815 5faac94 183aa58 7eb5f41 183aa58 7eb5f41 791ecaf 7eb5f41 183aa58 b774e3c 7eb5f41 b774e3c 183aa58 7a01365 2a37fbd 183aa58 a0dfdc4 183aa58 b774e3c 183aa58 791ecaf 7eb5f41 791ecaf 7eb5f41 791ecaf 7eb5f41 791ecaf 183aa58 a30ee88 183aa58 7eb5f41 183aa58 bc23581 183aa58 d48b815 b774e3c 791ecaf b774e3c 183aa58 b774e3c d48b815 7e5ccd3 183aa58 791ecaf b774e3c 5faac94 791ecaf b774e3c 791ecaf b774e3c 2a37fbd 5faac94 791ecaf 183aa58 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 |
import spaces # Import spaces first to avoid CUDA initialization issues
import os
import gradio as gr
import trafilatura
from trafilatura import fetch_url, extract
import docling
from docling.document_converter import DocumentConverter
import torch
import soundfile as sf
import numpy as np
from langdetect import detect
from kokoro import KPipeline
import re
import json
import nltk
nltk.download("punkt")
# Shared KokoroTTS pipeline, built once at import time and reused by every
# request. Language code "a" selects American English, and the pipeline is
# initially created on the CPU.
kokoro_tts = KPipeline(
    lang_code="a",
    device="cpu",
)
# Maps a detected language code to the single-letter KokoroTTS language code.
SUPPORTED_TTS_LANGUAGES = dict(
    en="a",  # English (default)
    fr="f",  # French
    hi="h",  # Hindi
    it="i",  # Italian
    pt="p",  # Brazilian Portuguese
)

# Voice identifiers offered in the UI dropdown, in display order.
AVAILABLE_VOICES = [
    "af_bella",
    "af_sarah",
    "am_adam",
    "am_michael",
    "bf_emma",
    "bf_isabella",
    "bm_george",
    "bm_lewis",
    "af_nicole",
    "af_sky",
]
### 1️⃣ Fetch and Extract Content (Runs Immediately)
def fetch_and_display_content(url):
    """Fetch a URL, extract its readable text, and prepare the UI updates.

    URLs that mention "pdf" are converted to markdown with docling; every
    other URL is downloaded and extracted with trafilatura (markdown output
    with a metadata header).

    Args:
        url: Address of the web page or PDF to read.

    Returns:
        A 6-tuple ``(cleaned_text, metadata, detected_lang, update, update,
        update)``. ``metadata`` is a dict that also carries the upper-cased
        language under ``"Detected Language"``; the three trailing items are
        ``gr.update(visible=True)`` objects.

    Raises:
        gr.Error: If the URL is empty, the download fails, or no text could
            be extracted.
    """
    url = (url or "").strip()
    if not url:
        raise gr.Error("Please enter a URL.")

    # Any URL mentioning "pdf" is treated as a PDF. This covers ".pdf" files
    # as well as paths such as ".../pdf/1234"; the check is case-insensitive
    # so ".PDF" is recognised too.
    if "pdf" in url.lower():
        text = DocumentConverter().convert(url).document.export_to_markdown()
    else:
        downloaded = trafilatura.fetch_url(url)
        if downloaded is None:
            # fetch_url signals a failed download with None instead of raising.
            raise gr.Error("Could not download the page. Please check the URL.")
        text = extract(
            downloaded,
            output_format="markdown",
            with_metadata=True,
            include_tables=False,
            include_links=False,
            include_formatting=True,
            include_comments=False,
        )

    if not text:
        # Without this guard a None/empty result crashes in the regex cleanup.
        raise gr.Error("No readable text could be extracted from this URL.")

    metadata, cleaned_text = extract_and_clean_text(text)
    detected_lang = detect_language(cleaned_text)
    # Surface the detected language next to the article metadata.
    metadata["Detected Language"] = detected_lang.upper()

    return (
        cleaned_text,
        metadata,
        detected_lang,
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(visible=True),
    )
### 2️⃣ Cleaning Function
# A line that consists only of a trailing-section title (optionally written as
# a markdown heading or in bold/italic), plus everything after it. Anchoring
# to a whole line keeps ordinary words such as "preferences" or "comments"
# inside the article body from truncating the text.
_TRAILING_SECTION_RE = re.compile(
    r'^[ \t]*(?:#+[ \t]*)?[*_]*'
    r'(?:References|Bibliography|External Links|Comments)'
    r'[*_]*:?[ \t]*$.*',
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)
_CITATION_RE = re.compile(r'\[\d+\]')
_MARKDOWN_LINK_RE = re.compile(r'\[[^\]]*\]\(https?://[^\s)]+\)')
_BARE_URL_RE = re.compile(r'https?://\S+')
_BLANK_LINES_RE = re.compile(r'\n\s*\n+')


def _parse_front_matter(raw):
    """Turn ``key: value`` lines of a metadata header into a dict."""
    parsed = {}
    for line in raw.split("\n"):
        if ": " not in line:  # Only lines holding a key-value pair count.
            continue
        key, value = line.split(": ", 1)
        value = value.strip()
        # Values wrapped in square brackets are decoded as JSON lists; if
        # decoding fails the raw string is kept.
        if value.startswith("[") and value.endswith("]"):
            try:
                value = json.loads(value)
            except json.JSONDecodeError:
                pass
        parsed[key.strip()] = value
    return parsed


def _clean_text(text):
    """Strip citations, links, trailing sections and surplus blank lines."""
    text = _CITATION_RE.sub('', text)  # Inline citations like [2][4]
    # Markdown links must go before bare URLs; otherwise the bare-URL pattern
    # eats the closing parenthesis and leaves "[label](" behind.
    text = _MARKDOWN_LINK_RE.sub('', text)
    text = _BARE_URL_RE.sub('', text)
    text = _TRAILING_SECTION_RE.sub('', text)
    return _BLANK_LINES_RE.sub('\n\n', text).strip()


def extract_and_clean_text(data):
    """Split extracted markdown into a metadata dict and cleaned body text.

    A leading block enclosed between ``---`` markers is parsed as metadata
    and removed from the text. The remaining text has inline numeric
    citations, markdown links, bare URLs and trailing References /
    Bibliography / External Links / Comments sections removed, and runs of
    blank lines collapsed.

    Args:
        data: Extracted document text, optionally starting with a ``---``
            delimited metadata header. ``None`` or an empty string is
            accepted.

    Returns:
        A tuple ``(metadata_dict, cleaned_text)``. Metadata values are
        strings, or lists when the value was a JSON-style ``[...]`` list.
    """
    if not data:
        return {}, ""

    metadata_dict = {}
    front_matter = re.match(r"^---(.*?)---", data, re.DOTALL)
    if front_matter:
        metadata_dict = _parse_front_matter(front_matter.group(1).strip())
        data = data[front_matter.end():].strip()  # Drop the header from the text

    return metadata_dict, _clean_text(data)
### 3️⃣ Language Detection
def detect_language(text):
    """Return the language code of *text*, limited to TTS-supported languages.

    Args:
        text: Text whose language should be detected. ``None``, empty and
            whitespace-only values are accepted.

    Returns:
        A key of ``SUPPORTED_TTS_LANGUAGES``; ``"en"`` when the text is
        empty, detection fails, or the detected language is unsupported.
    """
    if not text or not text.strip():
        return "en"  # Nothing to analyse; use the default language.
    try:
        lang = detect(text)
    except Exception:
        # Best effort: any detection failure falls back to English. Unlike a
        # bare ``except``, this lets KeyboardInterrupt/SystemExit propagate.
        return "en"
    return lang if lang in SUPPORTED_TTS_LANGUAGES else "en"
### 4️⃣ TTS Functionality (KokoroTTS)
@spaces.GPU(duration=1000)
def generate_audio_kokoro(text, lang, selected_voice):
    """Synthesize *text* with the shared KokoroTTS pipeline and save a WAV.

    The text is split on newlines by the pipeline, each segment is
    synthesized, and the segments are concatenated into one file.

    Args:
        text: Text to read aloud.
        lang: Detected language code; used in the output file name.
        selected_voice: Voice identifier passed to the pipeline.

    Returns:
        Relative path of the written file, ``audio_{lang}.wav``.

    Raises:
        gr.Error: If there is no text or the pipeline produced no audio.
    """
    if not text or not text.strip():
        raise gr.Error("There is no text to convert to audio.")

    # NOTE(review): the module-level pipeline is created with lang_code 'a',
    # so `lang` does not select a language-specific pipeline here. The
    # previously computed (and unused) lang_code lookup was removed; confirm
    # whether per-language pipelines and matching voices are wanted.
    generator = kokoro_tts(text, voice=selected_voice, speed=1, split_pattern=r'\n+')

    # Each yielded item is a 3-tuple; only the audio element is kept.
    audio_segments = [audio for _, _, audio in generator]
    if not audio_segments:
        # np.concatenate raises an opaque ValueError on an empty list.
        raise gr.Error("No audio was generated for this text.")

    full_audio = np.concatenate(audio_segments)
    output_file = f"audio_{lang}.wav"
    sf.write(output_file, full_audio, 24000)  # Save as WAV at 24000 Hz
    return output_file
### 5️⃣ Main Processing Function
def process_url(url, selected_voice="af_bella"):
    """Run the whole pipeline for one URL: fetch, clean, detect, synthesize.

    Args:
        url: Address of the web page or PDF to convert.
        selected_voice: Voice identifier for the TTS step; defaults to the
            same voice the UI dropdown preselects.

    Returns:
        A tuple ``(cleaned_text, detected_lang, audio_file)``.
    """
    # Reuse the UI handler for fetching and cleaning; its trailing
    # gr.update(...) items are only meaningful to the interface and are
    # discarded here.
    cleaned_text, _metadata, detected_lang, *_ = fetch_and_display_content(url)
    audio_file = generate_audio_kokoro(cleaned_text, detected_lang, selected_voice)
    return cleaned_text, detected_lang, audio_file
### 6️⃣ Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# 🌍 Web-to-Audio Converter 🎙️")

    # Inputs and action buttons. The audio button stays hidden until text
    # has been fetched.
    url_input = gr.Textbox(
        label="Enter URL",
        placeholder="https://example.com/article",
    )
    voice_selection = gr.Dropdown(
        AVAILABLE_VOICES,
        label="Select Voice",
        value="af_bella",
    )
    process_text_button = gr.Button("Fetch Text & Detect Language")
    process_audio_button = gr.Button("Generate Audio", visible=False)

    # Layout: two adjacent columns (text and metadata), both hidden at first.
    with gr.Row():
        extracted_text = gr.Textbox(
            label="Extracted Content",
            visible=False,
            interactive=False,
            lines=15,
        )
        metadata_output = gr.JSON(label="Article Metadata", visible=False)

    # NOTE(review): nesting was reconstructed from the layout comment; these
    # two components are assumed to sit below the row — confirm.
    detected_lang = gr.Textbox(label="Detected Language", visible=False)
    full_audio_output = gr.Audio(label="Generated Audio", visible=True)

    # Step 1: fetch the text and detect its language. The handler returns six
    # items: values for the first three outputs, then visibility updates for
    # the audio button, the text box and the metadata panel.
    process_text_button.click(
        fn=fetch_and_display_content,
        inputs=[url_input],
        outputs=[
            extracted_text,
            metadata_output,
            detected_lang,
            process_audio_button,
            extracted_text,
            metadata_output,
        ],
    )

    # Step 2: generate audio once text and language are available.
    process_audio_button.click(
        fn=generate_audio_kokoro,
        inputs=[extracted_text, detected_lang, voice_selection],
        outputs=[full_audio_output],
    )

demo.launch()
|