Update app.py

app.py CHANGED
@@ -1,6 +1,8 @@
 import gradio as gr
 import trafilatura
+from trafilatura import fetch_url, extract
 import docling
+from docling.document_converter import DocumentConverter
 import torch
 import soundfile as sf
 import numpy as np
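One note on this hunk: the code added below also calls re and json, which are not imported here. If they are not already imported elsewhere in app.py, the import block would also need:

import re    # used by the metadata and cleaning regexes in extract_and_clean_text
import json  # used to parse list-valued metadata fields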
@@ -28,23 +30,66 @@ SUPPORTED_TTS_LANGUAGES = {
 def fetch_content(url):
     """Fetch and extract text from a given URL (HTML or PDF)."""
     if url.endswith(".pdf") or "pdf" in url:
-
+        converter = DocumentConverter()
+        text = converter.convert(url).document.export_to_markdown()  # convert() returns a ConversionResult, not a string
     else:
         downloaded = trafilatura.fetch_url(url)
-        text =
+        text = extract(downloaded, output_format="markdown", with_metadata=True, include_tables=False, include_links=False, include_formatting=True, include_comments=False)  # markdown output with metadata block
     return text
 
 ### 2️⃣ Cleaning Function
 def extract_and_clean_text(data):
-
+
+    metadata_dict = {}
+
+    # Step 1: Extract metadata enclosed between "---" at the beginning
+    metadata_pattern = re.match(r"^---(.*?)---", data, re.DOTALL)
+
+    if metadata_pattern:
+        metadata_raw = metadata_pattern.group(1).strip()
+        data = data[metadata_pattern.end():].strip()  # Remove metadata from text
+
+        # Convert metadata into dictionary format manually (since YAML isn't reliable)
+        metadata_lines = metadata_raw.split("\n")
+        for line in metadata_lines:
+            if ": " in line:  # Only process lines with key-value pairs
+                key, value = line.split(": ", 1)  # Split at first ": "
+
+                # Convert lists (wrapped in square brackets) into Python lists
+                if value.startswith("[") and value.endswith("]"):
+                    try:
+                        value = json.loads(value)  # Convert to list
+                    except json.JSONDecodeError:
+                        pass  # If JSON parsing fails, keep it as a string
+
+                # Lists have no .strip(), so only strip string values
+                metadata_dict[key.strip()] = value.strip() if isinstance(value, str) else value
+
+    # Step 2: Clean the extracted text
     def clean_text(text):
-
-        text = re.sub(r'
-
-
+        # Remove inline citations like [2][4]
+        text = re.sub(r'\[\d+\]', '', text)
+
+        # Remove URLs (markdown-style links first, so the bare-URL pass doesn't leave "[label](" behind)
+        text = re.sub(r'\[.*?\]\(http[s]?://\S+\)', '', text)  # Markdown links
+        text = re.sub(r'http[s]?://\S+', '', text)  # Direct links
+
+        # Remove markdown-style headings and special characters (#, ##, *, etc.)
+        text = re.sub(r'^\s*#+\s*', '', text, flags=re.MULTILINE)  # Remove headings
+        text = re.sub(r'[*_`]', '', text)  # Remove bold/italic/monospace markers
+
+        # Remove References, Bibliography, External Links, and Comments sections
+        patterns = [r'References\b.*', r'Bibliography\b.*', r'External Links\b.*', r'COMMENTS\b.*']
+        for pattern in patterns:
+            text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)
+
+        # Remove extra whitespace and newlines
+        text = re.sub(r'\n\s*\n+', '\n\n', text).strip()
+
         return text
 
-
+    cleaned_text = clean_text(data)
+
+    return metadata_dict, cleaned_text
 
 ### 3️⃣ Language Detection
 def detect_language(text):
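A quick sketch of what the new cleaning path produces, assuming the functions above are in scope; the sample text is invented for illustration and mimics trafilatura's markdown output with a ----delimited metadata block:

sample = """---
title: Example Article
tags: ["tts", "demo"]
---
# Example Article

Some body text with a citation [2] and a [link](https://example.com).

References
[2] Something cited."""

metadata, cleaned = extract_and_clean_text(sample)
print(metadata)  # -> {'title': 'Example Article', 'tags': ['tts', 'demo']}
print(cleaned)   # heading marker, citation, link, and References section all stripped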
@@ -60,22 +105,33 @@ def generate_audio_kokoro(text, lang):
     """Generate speech using KokoroTTS for supported languages."""
     lang_code = SUPPORTED_TTS_LANGUAGES.get(lang, "a")  # Default to English
     generator = kokoro_tts(text, voice="af_bella", speed=1, split_pattern=r'\n+')
+    # 3. Specify Device
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
-    #
-
+    # Initialize an empty list to store audio data
+    audio_data_list = []
+    # Generate and collect audio data
+    for i, (gs, ps, audio) in enumerate(generator):
+        print(f"Processing segment {i + 1}")
+        print(gs)  # Print the text segment
+        audio_data_list.append(audio)  # Append audio data to the list
+
+    # Concatenate all audio data into a single array
+    full_audio = np.concatenate(audio_data_list)
+
     output_file = f"audio_{lang}.wav"
-    sf.write(output_file,
+    sf.write(output_file, full_audio, 24000)  # Save as WAV file (Kokoro outputs 24 kHz audio)
     return output_file
 
 ### 5️⃣ Main Processing Function
 def process_url(url):
     """Processes the URL, extracts text, detects language, and converts to audio."""
     content = fetch_content(url)
-    cleaned_text = extract_and_clean_text(content)
+    metadata, cleaned_text = extract_and_clean_text(content)
     detected_lang = detect_language(cleaned_text)
     audio_file = generate_audio_kokoro(cleaned_text, detected_lang)
 
-    return cleaned_text, detected_lang, audio_file
+    return metadata, cleaned_text, detected_lang, audio_file
 
 ### 6️⃣ Gradio Interface
 with gr.Blocks() as demo:
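And a minimal end-to-end smoke test of the new four-value return signature, assuming the rest of app.py (kokoro_tts, detect_language, the SUPPORTED_TTS_LANGUAGES table) is loaded; the URL is a placeholder. Note that lang_code and device are computed in generate_audio_kokoro but never passed to kokoro_tts, so the voice stays af_bella regardless of the detected language:

if __name__ == "__main__":
    # Placeholder URL, for illustration only
    metadata, cleaned_text, detected_lang, audio_file = process_url("https://example.com/article")
    print("Title:", metadata.get("title", "n/a"))
    print("Detected language:", detected_lang)
    print("Audio written to:", audio_file)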