PuristanLabs1 committed on
Commit
183aa58
·
verified ·
1 Parent(s): d28c9e7

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -0
app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import trafilatura
3
+ import docling
4
+ import torch
5
+ import soundfile as sf
6
+ import numpy as np
7
+ from langdetect import detect
8
+ from kokoro import KPipeline
9
+ import re
10
+ import json
11
+ import nltk
12
+
13
# Download the NLTK "punkt" resource when the module is loaded.
nltk.download("punkt")

# Module-wide KokoroTTS pipeline; lang_code 'a' selects American English.
kokoro_tts = KPipeline(lang_code='a')

# Maps a detected language code to the matching Kokoro lang_code.
SUPPORTED_TTS_LANGUAGES = dict(
    en="a",  # English (default)
    fr="f",  # French
    hi="h",  # Hindi
    it="i",  # Italian
    pt="p",  # Brazilian Portuguese
)
26
+
27
+ ### 1️⃣ Fetch and Extract Content
28
def fetch_content(url):
    """Fetch a URL and return its main content as Markdown text.

    PDF documents are converted with Docling; any other URL is downloaded
    and extracted with Trafilatura.

    Args:
        url: Address of the web page or PDF document.

    Returns:
        The extracted text. ``None`` is returned when the page download
        yields nothing, or when ``trafilatura.extract`` itself returns
        ``None``.
    """
    from urllib.parse import urlparse

    url = (url or "").strip()

    # Only the URL *path* decides whether this is a PDF. A plain substring
    # test on the whole URL would also match ordinary pages such as
    # "https://example.com/best-pdf-readers". A bare "pdf" path segment is
    # accepted too, so links like ".../pdf/2301.00001" are still recognised.
    path = urlparse(url).path.lower()
    is_pdf = path.endswith(".pdf") or "pdf" in path.split("/")

    if is_pdf:
        # Docling has no top-level ``extract_text``; conversion goes through
        # DocumentConverter. Imported here so the edit is self-contained.
        from docling.document_converter import DocumentConverter

        result = DocumentConverter().convert(url)
        return result.document.export_to_markdown()

    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:
        # Nothing was downloaded; do not hand ``None`` to the extractor.
        return None
    return trafilatura.extract(downloaded, output_format="markdown", with_metadata=False)
36
+
37
+ ### 2️⃣ Cleaning Function
38
def extract_and_clean_text(data):
    """Strip citations, links and Markdown markers from extracted text.

    Args:
        data: Raw text (typically Markdown). ``None`` and the empty string
            are accepted.

    Returns:
        The cleaned text with surrounding whitespace removed, or ``""`` when
        *data* is empty or ``None``.
    """
    if not data:
        return ""

    # Replace Markdown links/images with their label *before* stripping bare
    # URLs. Otherwise the URL pattern swallows the closing ")" and leaves
    # "[label](" debris behind.
    text = re.sub(r'!?\[([^\]]*)\]\(\s*http[s]?://[^)\s]*\s*\)', r'\1', data)
    text = re.sub(r'\[\d+\]', '', text)  # Citations like [2][4]
    text = re.sub(r'http[s]?://\S+', '', text)  # Remaining bare URLs
    text = re.sub(r'[*_`]', '', text)  # Markdown emphasis / code markers
    # Collapse runs of blank lines into a single paragraph break.
    return re.sub(r'\n\s*\n+', '\n\n', text).strip()
48
+
49
+ ### 3️⃣ Language Detection
50
def detect_language(text):
    """Detect the language of *text*, restricted to the TTS-supported set.

    Args:
        text: Text whose language should be detected.

    Returns:
        The detected language code when it is a key of
        ``SUPPORTED_TTS_LANGUAGES``; otherwise ``"en"``. ``"en"`` is also
        returned for empty or non-string input and when detection fails.
    """
    # Nothing to analyse: skip the detector and use the default.
    if not isinstance(text, str) or not text.strip():
        return "en"

    try:
        lang = detect(text)
    except Exception:
        # Best-effort fallback. ``Exception`` rather than a bare ``except``
        # so KeyboardInterrupt / SystemExit still propagate.
        return "en"

    return lang if lang in SUPPORTED_TTS_LANGUAGES else "en"
57
+
58
+ ### 4️⃣ TTS Functionality (KokoroTTS)
59
# Default Kokoro voice for each supported lang_code.
_KOKORO_VOICES = {
    "a": "af_bella",
    "f": "ff_siwis",
    "h": "hf_alpha",
    "i": "if_sara",
    "p": "pf_dora",
}

# Lazily created pipelines for non-default languages, keyed by lang_code.
_KOKORO_PIPELINES = {}


def generate_audio_kokoro(text, lang):
    """Synthesize *text* with KokoroTTS and save it as a WAV file.

    Args:
        text: Text to speak; it is split into segments on newlines.
        lang: Language code as used in ``SUPPORTED_TTS_LANGUAGES``; unknown
            codes fall back to American English.

    Returns:
        Path of the written WAV file (``audio_<lang>.wav``).

    Raises:
        ValueError: If the pipeline produced no audio segments.
    """
    lang_code = SUPPORTED_TTS_LANGUAGES.get(lang, "a")  # Default to English

    # Select a pipeline for the detected language instead of always using
    # the English one. The module-level pipeline is reused for English;
    # others are built on first use and cached.
    if lang_code == "a":
        pipeline = kokoro_tts
    else:
        pipeline = _KOKORO_PIPELINES.get(lang_code)
        if pipeline is None:
            pipeline = KPipeline(lang_code=lang_code)
            _KOKORO_PIPELINES[lang_code] = pipeline

    voice = _KOKORO_VOICES.get(lang_code, "af_bella")
    generator = pipeline(text, voice=voice, speed=1, split_pattern=r'\n+')

    # Each item is a 3-tuple; only the audio element is needed.
    segments = [audio for _, _, audio in generator]
    if not segments:
        # np.concatenate([]) would raise a far less helpful ValueError.
        raise ValueError("No audio was generated: the input text is empty.")

    audio_data = np.concatenate(segments)
    output_file = f"audio_{lang}.wav"
    sf.write(output_file, audio_data, 24000)  # Save as WAV at 24 kHz
    return output_file
69
+
70
+ ### 5️⃣ Main Processing Function
71
def process_url(url):
    """Run the full pipeline: fetch, clean, detect language, synthesize.

    Args:
        url: Address of the web page or PDF entered by the user.

    Returns:
        A tuple ``(cleaned_text, detected_lang, audio_file)`` matching the
        three Gradio output components.

    Raises:
        gr.Error: If the URL is empty or no readable text could be extracted.
    """
    url = (url or "").strip()
    if not url:
        raise gr.Error("Please enter a URL.")

    content = fetch_content(url)
    if not content:
        # Without this check a failed fetch surfaces as a TypeError from
        # the cleaning step.
        raise gr.Error("Could not extract any content from the given URL.")

    cleaned_text = extract_and_clean_text(content)
    if not cleaned_text:
        raise gr.Error("The page contains no readable text to convert.")

    detected_lang = detect_language(cleaned_text)
    audio_file = generate_audio_kokoro(cleaned_text, detected_lang)

    return cleaned_text, detected_lang, audio_file
79
+
80
+ ### 6️⃣ Gradio Interface
81
with gr.Blocks() as demo:
    # Page title.
    gr.Markdown("# 🌍 Web-to-Audio Converter 🎙️")

    # Input controls.
    url_input = gr.Textbox(
        label="Enter URL",
        placeholder="https://example.com/article",
    )
    process_button = gr.Button("Generate Audio")

    # Output components, in the order process_url returns its values.
    extracted_text = gr.Markdown(label="Extracted Content")
    detected_language = gr.Textbox(label="Detected Language")
    full_audio_output = gr.Audio(label="Generated Audio")

    # Clicking the button runs the whole pipeline on the entered URL.
    process_button.click(
        fn=process_url,
        inputs=[url_input],
        outputs=[extracted_text, detected_language, full_audio_output],
    )

# Start the Gradio app.
demo.launch()