Update app.py

app.py CHANGED
@@ -1,6 +1,8 @@
 import gradio as gr
 import trafilatura
+from trafilatura import fetch_url, extract
 import docling
+from docling.document_converter import DocumentConverter
 import torch
 import soundfile as sf
 import numpy as np
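One note on this hunk: the code added below also calls re and json, which are not imported here. If they are not already imported elsewhere in app.py, the import block would also need:

import re    # used by the metadata and cleaning regexes in extract_and_clean_text
import json  # used to parse list-valued metadata fields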
@@ -28,23 +30,66 @@ SUPPORTED_TTS_LANGUAGES = {
 def fetch_content(url):
     """Fetch and extract text from a given URL (HTML or PDF)."""
     if url.endswith(".pdf") or "pdf" in url:
-
+        converter = DocumentConverter()
+        text = converter.convert(url).document.export_to_markdown()  # convert() returns a ConversionResult, not a string
     else:
         downloaded = trafilatura.fetch_url(url)
-        text =
+        text = extract(downloaded, output_format="markdown", with_metadata=True, include_tables=False, include_links=False, include_formatting=True, include_comments=False)  # markdown output with metadata block
     return text
 
 ### 2️⃣ Cleaning Function
 def extract_and_clean_text(data):
-
+
+    metadata_dict = {}
+
+    # Step 1: Extract metadata enclosed between "---" at the beginning
+    metadata_pattern = re.match(r"^---(.*?)---", data, re.DOTALL)
+
+    if metadata_pattern:
+        metadata_raw = metadata_pattern.group(1).strip()
+        data = data[metadata_pattern.end():].strip()  # Remove metadata from text
+
+        # Convert metadata into dictionary format manually (since YAML isn't reliable)
+        metadata_lines = metadata_raw.split("\n")
+        for line in metadata_lines:
+            if ": " in line:  # Only process lines with key-value pairs
+                key, value = line.split(": ", 1)  # Split at first ": "
+
+                # Convert lists (wrapped in square brackets) into Python lists
+                if value.startswith("[") and value.endswith("]"):
+                    try:
+                        value = json.loads(value)  # Convert to list
+                    except json.JSONDecodeError:
+                        pass  # If JSON parsing fails, keep it as a string
+
+                # Lists have no .strip(), so only strip string values
+                metadata_dict[key.strip()] = value.strip() if isinstance(value, str) else value
+
+    # Step 2: Clean the extracted text
     def clean_text(text):
-
-        text = re.sub(r'
-
-
+        # Remove inline citations like [2][4]
+        text = re.sub(r'\[\d+\]', '', text)
+
+        # Remove URLs (markdown-style links first, so the bare-URL pass doesn't leave "[label](" behind)
+        text = re.sub(r'\[.*?\]\(http[s]?://\S+\)', '', text)  # Markdown links
+        text = re.sub(r'http[s]?://\S+', '', text)  # Direct links
+
+        # Remove markdown-style headings and special characters (#, ##, *, etc.)
+        text = re.sub(r'^\s*#+\s*', '', text, flags=re.MULTILINE)  # Remove headings
+        text = re.sub(r'[*_`]', '', text)  # Remove bold/italic/monospace markers
+
+        # Remove References, Bibliography, External Links, and Comments sections
+        patterns = [r'References\b.*', r'Bibliography\b.*', r'External Links\b.*', r'COMMENTS\b.*']
+        for pattern in patterns:
+            text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)
+
+        # Remove extra whitespace and newlines
+        text = re.sub(r'\n\s*\n+', '\n\n', text).strip()
+
         return text
 
-
+    cleaned_text = clean_text(data)
+
+    return metadata_dict, cleaned_text
 
 ### 3️⃣ Language Detection
 def detect_language(text):
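A quick sketch of what the new cleaning path produces, assuming the functions above are in scope; the sample text is invented for illustration and mimics trafilatura's markdown output with a ----delimited metadata block:

sample = """---
title: Example Article
tags: ["tts", "demo"]
---
# Example Article

Some body text with a citation [2] and a [link](https://example.com).

References
[2] Something cited."""

metadata, cleaned = extract_and_clean_text(sample)
print(metadata)  # -> {'title': 'Example Article', 'tags': ['tts', 'demo']}
print(cleaned)   # heading marker, citation, link, and References section all stripped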
@@ -60,22 +105,33 @@ def generate_audio_kokoro(text, lang):
     """Generate speech using KokoroTTS for supported languages."""
     lang_code = SUPPORTED_TTS_LANGUAGES.get(lang, "a")  # Default to English
     generator = kokoro_tts(text, voice="af_bella", speed=1, split_pattern=r'\n+')
+    # 3. Specify Device
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
-    #
-
+    # Initialize an empty list to store audio data
+    audio_data_list = []
+    # Generate and collect audio data
+    for i, (gs, ps, audio) in enumerate(generator):
+        print(f"Processing segment {i + 1}")
+        print(gs)  # Print the text segment
+        audio_data_list.append(audio)  # Append audio data to the list
+
+    # Concatenate all audio data into a single array
+    full_audio = np.concatenate(audio_data_list)
+
     output_file = f"audio_{lang}.wav"
-    sf.write(output_file,
+    sf.write(output_file, full_audio, 24000)  # Save as WAV file (Kokoro outputs 24 kHz audio)
     return output_file
 
 ### 5️⃣ Main Processing Function
 def process_url(url):
     """Processes the URL, extracts text, detects language, and converts to audio."""
     content = fetch_content(url)
-    cleaned_text = extract_and_clean_text(content)
+    metadata, cleaned_text = extract_and_clean_text(content)
     detected_lang = detect_language(cleaned_text)
     audio_file = generate_audio_kokoro(cleaned_text, detected_lang)
 
-    return cleaned_text, detected_lang, audio_file
+    return metadata, cleaned_text, detected_lang, audio_file
 
 ### 6️⃣ Gradio Interface
 with gr.Blocks() as demo:
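And a minimal end-to-end smoke test of the new four-value return signature, assuming the rest of app.py (kokoro_tts, detect_language, the SUPPORTED_TTS_LANGUAGES table) is loaded; the URL is a placeholder. Note that lang_code and device are computed in generate_audio_kokoro but never passed to kokoro_tts, so the voice stays af_bella regardless of the detected language:

if __name__ == "__main__":
    # Placeholder URL, for illustration only
    metadata, cleaned_text, detected_lang, audio_file = process_url("https://example.com/article")
    print("Title:", metadata.get("title", "n/a"))
    print("Detected language:", detected_lang)
    print("Audio written to:", audio_file)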