import spaces  # import spaces first to avoid CUDA initialization issues
import os
import gradio as gr
from trafilatura import fetch_url, extract
from docling.document_converter import DocumentConverter
import torch
import soundfile as sf
import numpy as np
from langdetect import detect
from kokoro import KPipeline
import re
import json
import nltk

nltk.download("punkt")
# Initialize KokoroTTS with American English ('a'), loaded on CPU;
# reloaded for other languages in generate_audio_kokoro below
kokoro_tts = KPipeline(lang_code='a', device="cpu")
current_lang_code = "a"  # tracks which language the pipeline was loaded with
# Supported TTS languages: langdetect code -> Kokoro lang_code
SUPPORTED_TTS_LANGUAGES = {
    "en": "a",  # American English (default)
    "fr": "f",  # French
    "hi": "h",  # Hindi
    "it": "i",  # Italian
    "pt": "p",  # Brazilian Portuguese
}
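# Note: Kokoro's docs list further single-letter codes (e.g. 'b' British English,
# 'e' Spanish, 'j' Japanese, 'z' Mandarin); this mapping could be extended once
# matching voices are verified -- an untested assumption, not exercised by this app.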
# Available voices in KokoroTTS
AVAILABLE_VOICES = [
    'af_bella', 'af_sarah', 'am_adam', 'am_michael', 'bf_emma',
    'bf_isabella', 'bm_george', 'bm_lewis', 'af_nicole', 'af_sky',
]
### 1️⃣ Fetch and Extract Content (Runs Immediately)
def fetch_content(url):
    """Fetch raw markdown text from a URL: PDFs via docling, HTML via trafilatura."""
    if "pdf" in url.lower():  # heuristic: treat any URL mentioning "pdf" as a PDF
        converter = DocumentConverter()
        return converter.convert(url).document.export_to_markdown()
    downloaded = fetch_url(url)
    return extract(
        downloaded,
        output_format="markdown",
        with_metadata=True,  # keep the metadata front matter for parsing below
        include_tables=False,
        include_links=False,
        include_formatting=True,
        include_comments=False,
    )

def fetch_and_display_content(url):
    """Fetch and extract text from a given URL (HTML or PDF)."""
    text = fetch_content(url)
    metadata, cleaned_text = extract_and_clean_text(text)
    detected_lang = detect_language(cleaned_text)
    metadata["Detected Language"] = detected_lang.upper()  # surface the language in the metadata panel
    return (
        gr.update(value=cleaned_text, visible=True),
        gr.update(value=metadata, visible=True),
        detected_lang,
        gr.update(visible=True),  # reveal the "Generate Audio" button
    )
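# Quick manual check (illustrative; needs network access, URL is a placeholder):
#   text_update, meta_update, lang, _ = fetch_and_display_content("https://example.com/article")
#   print(lang)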
### 2️⃣ Cleaning Function
def extract_and_clean_text(data):
    metadata_dict = {}
    # Step 1: extract metadata enclosed between "---" markers at the beginning
    metadata_pattern = re.match(r"^---(.*?)---", data, re.DOTALL)
    if metadata_pattern:
        metadata_raw = metadata_pattern.group(1).strip()
        data = data[metadata_pattern.end():].strip()  # remove metadata from text
        # Parse metadata into a dictionary manually (the front matter isn't reliable YAML)
        for line in metadata_raw.split("\n"):
            if ": " not in line:  # only process key-value lines
                continue
            key, value = line.split(": ", 1)  # split at first ": "
            # Convert lists (wrapped in square brackets) into Python lists
            if value.startswith("[") and value.endswith("]"):
                try:
                    value = json.loads(value)
                except json.JSONDecodeError:
                    pass  # if JSON parsing fails, keep it as a string
            # .strip() only applies to strings; parsed lists are stored as-is
            metadata_dict[key.strip()] = value.strip() if isinstance(value, str) else value
    # Step 2: clean the extracted text
    def clean_text(text):
        # Remove inline citations like [2][4]
        text = re.sub(r'\[\d+\]', '', text)
        # Remove markdown-style links first, then any remaining bare URLs
        # (the reverse order would leave empty "[text]()" stubs behind)
        text = re.sub(r'\[.*?\]\(http[s]?://\S+\)', '', text)  # markdown links
        text = re.sub(r'http[s]?://\S+', '', text)  # direct links
        # Remove References, Bibliography, External Links, and Comments sections
        patterns = [r'References\b.*', r'Bibliography\b.*', r'External Links\b.*', r'COMMENTS\b.*']
        for pattern in patterns:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)
        # Collapse extra whitespace and blank lines
        text = re.sub(r'\n\s*\n+', '\n\n', text).strip()
        return text

    return metadata_dict, clean_text(data)
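# Example of the front matter this parser expects from trafilatura (illustrative
# values -- the actual keys depend on what the page exposes):
#   ---
#   title: Some Article
#   author: Jane Doe
#   tags: ["audio", "tts"]
#   ---
# which would yield: {"title": "Some Article", "author": "Jane Doe", "tags": ["audio", "tts"]}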
### 3️⃣ Language Detection
def detect_language(text):
    """Detects the language of extracted text."""
    try:
        lang = detect(text)
        return lang if lang in SUPPORTED_TTS_LANGUAGES else "en"  # default to English if unsupported
    except Exception:
        return "en"  # default to English if detection fails
### 4️⃣ TTS Functionality (KokoroTTS)
def generate_audio_kokoro(text, lang, selected_voice):
    """Generate speech using KokoroTTS for supported languages."""
    global kokoro_tts, current_lang_code  # access the preloaded pipeline
    lang_code = SUPPORTED_TTS_LANGUAGES.get(lang, "a")  # default to English
    # Reload the pipeline if the detected language differs from the one loaded
    if lang_code != current_lang_code:
        kokoro_tts = KPipeline(lang_code=lang_code, device="cpu")
        current_lang_code = lang_code
    generator = kokoro_tts(text, voice=selected_voice, speed=1, split_pattern=r'\n+')
    # Each yielded item is (graphemes, phonemes, audio); keep only the audio chunks
    audio_data_list = [audio for _, _, audio in generator]
    full_audio = np.concatenate(audio_data_list)
    output_file = f"audio_{lang}.wav"
    sf.write(output_file, full_audio, 24000)  # Kokoro outputs 24 kHz audio
    return output_file
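# Illustrative standalone use (assumes the model weights download on first run):
#   wav_path = generate_audio_kokoro("Hello world.\nSecond paragraph.", "en", "af_bella")
#   print(wav_path)  # -> audio_en.wav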
### 5️⃣ Main Processing Function
def process_url(url, voice="af_bella"):
    """Processes a URL end-to-end: extracts text, detects language, converts to audio."""
    content = fetch_content(url)
    metadata, cleaned_text = extract_and_clean_text(content)
    detected_lang = detect_language(cleaned_text)
    audio_file = generate_audio_kokoro(cleaned_text, detected_lang, voice)
    return cleaned_text, detected_lang, audio_file
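# End-to-end check without the UI (illustrative; URL is a placeholder):
#   text, lang, wav_path = process_url("https://example.com/article")
#   print(lang, wav_path)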
### 6️⃣ Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# 🌍 Web-to-Audio Converter 🎙️")
    url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com/article")
    voice_selection = gr.Dropdown(AVAILABLE_VOICES, label="Select Voice", value="af_bella")
    process_text_button = gr.Button("Fetch Text & Detect Language")
    process_audio_button = gr.Button("Generate Audio", visible=False)

    # Layout: two adjacent columns (text and metadata)
    with gr.Row():
        extracted_text = gr.Textbox(label="Extracted Content", visible=False, interactive=False, lines=15)
        metadata_output = gr.JSON(label="Article Metadata", visible=False)
    detected_lang = gr.Textbox(label="Detected Language", visible=False)  # hidden state passed to TTS
    full_audio_output = gr.Audio(label="Generated Audio", visible=True)

    # Step 1: fetch text & detect language first
    process_text_button.click(
        fetch_and_display_content,
        inputs=[url_input],
        outputs=[extracted_text, metadata_output, detected_lang, process_audio_button],
    )
    # Step 2: generate audio after text & language are displayed
    process_audio_button.click(
        generate_audio_kokoro,
        inputs=[extracted_text, detected_lang, voice_selection],
        outputs=[full_audio_output],
    )

demo.launch()