File size: 16,262 Bytes
0097003
a0dfdc4
183aa58
 
7eb5f41
23b607a
183aa58
 
 
 
 
 
 
 
07f2498
6e53660
 
3a2ae2d
 
 
 
392490b
877bcf1
5cd86ec
183aa58
 
ee078cd
183aa58
8b12154
183aa58
 
 
 
 
 
 
 
 
 
b774e3c
 
 
 
 
 
6e53660
 
392490b
 
 
 
8b12154
392490b
 
 
6e53660
877bcf1
 
 
392490b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23f13e6
392490b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183aa58
392490b
 
 
 
 
791ecaf
 
207f92d
b774e3c
 
392490b
c94e9c4
392490b
 
 
 
 
 
 
 
 
183aa58
0097003
183aa58
7eb5f41
 
 
 
 
 
 
 
 
 
0097003
7eb5f41
 
0097003
7eb5f41
 
0097003
7eb5f41
 
0097003
7eb5f41
0097003
7eb5f41
0097003
7eb5f41
32d9ffc
 
 
0097003
32d9ffc
 
 
0097003
 
32d9ffc
 
 
 
183aa58
0097003
7eb5f41
 
0097003
 
 
 
7eb5f41
 
 
 
 
 
183aa58
b774e3c
183aa58
 
 
0097003
183aa58
 
 
 
0097003
183aa58
0097003
07f2498
 
 
 
 
 
 
0097003
07f2498
 
 
 
 
 
 
 
 
 
 
 
0097003
07f2498
 
 
 
8c30d30
2a4c58e
 
 
49deb85
07f2498
3a2ae2d
0097003
392490b
 
 
3a2ae2d
392490b
3a2ae2d
 
 
 
 
 
 
 
 
0097003
3a2ae2d
 
 
183aa58
7a01365
2a37fbd
183aa58
0097003
9b5e89e
183aa58
b774e3c
183aa58
0097003
791ecaf
 
5cd86ec
1e95dc3
 
 
 
5cd86ec
1e95dc3
 
6e53660
 
 
59c2301
6e53660
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0097003
b362593
6e53660
 
8b12154
 
6e53660
392490b
8b12154
392490b
 
 
 
6e53660
392490b
 
 
 
 
 
 
8b12154
392490b
 
 
 
 
 
 
 
 
 
ce1491a
392490b
 
ce1491a
392490b
183aa58
392490b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
877bcf1
392490b
877bcf1
392490b
 
 
 
877bcf1
392490b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
877bcf1
 
183aa58
 
0097003
183aa58
7eb5f41
183aa58
 
 
bc23581
183aa58
 
 
 
 
 
d48b815
a3470c7
0097003
2a4c58e
 
 
 
b774e3c
207f92d
b774e3c
 
 
eca81bf
b774e3c
183aa58
d48b815
161a82d
7e5ccd3
2ff3fff
8b12154
 
 
877bcf1
 
183aa58
791ecaf
 
 
 
6e53660
eca81bf
791ecaf
3a2ae2d
 
4c1e97c
3a2ae2d
 
 
2ff3fff
3a2ae2d
eca81bf
392490b
 
 
 
 
 
eca81bf
392490b
eca81bf
392490b
eca81bf
 
b774e3c
392490b
09f4099
9b5e89e
6d76a3b
 
0097003
9b5e89e
207f92d
 
9b5e89e
9064672
791ecaf
 
392490b
07f2498
0097003
d2c615b
0097003
8b12154
07f2498
 
 
392490b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
import spaces  
import os
import gradio as gr
import trafilatura
from trafilatura import fetch_url, extract
from markitdown import MarkItDown
import torch
import soundfile as sf
import numpy as np
from langdetect import detect
from kokoro import KPipeline
import re
import json
import nltk
import stanza
from transformers import BartForConditionalGeneration, BartTokenizer
from nltk.tokenize import sent_tokenize
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import io
import requests
from gliner import GLiNER
import tempfile

# Import-time setup: everything below runs once when the module is loaded and
# may download data/models, so the order of these statements matters.

# Fetch the NLTK tokenizer resources needed by sent_tokenize().
nltk.download("punkt")
nltk.download("punkt_tab")

# Single shared Kokoro pipeline, created with language code 'a' (English in the
# table below). generate_audio_kokoro() reuses this instance for every request.
kokoro_tts = KPipeline(lang_code='a')  

# Supported TTS Languages
# Keys are the language codes returned by detect_language(); values are the
# single-letter codes associated with each language for Kokoro.
SUPPORTED_TTS_LANGUAGES = {
    "en": "a",  # English (default)
    "fr": "f",  # French
    "hi": "h",  # Hindi
    "it": "i",  # Italian
    "pt": "p",  # Brazilian Portuguese
}

# Available voices in KokoroTTS
# Offered verbatim as the choices of the "Select Voice" dropdown.
AVAILABLE_VOICES = [
    'af_bella', 'af_sarah', 'am_adam', 'am_michael', 'bf_emma',
    'bf_isabella', 'bm_george', 'bm_lewis', 'af_nicole', 'af_sky'
]

# Load BART Large CNN Model for Summarization
model_name = "facebook/bart-large-cnn"


# Tokenizer and model are cached under ./.cache relative to the working
# directory. Any failure aborts start-up with a RuntimeError.
try:
    tokenizer = BartTokenizer.from_pretrained(model_name, cache_dir=os.path.join(os.getcwd(), ".cache"))
    model = BartForConditionalGeneration.from_pretrained(model_name, cache_dir=os.path.join(os.getcwd(), ".cache"))

except Exception as e:
    raise RuntimeError(f"Error loading BART model: {e}")

# Initialize GLINER model
# Shared instance used by extract_entities_with_gliner().
gliner_model = GLiNER.from_pretrained("urchade/gliner_base")

def is_pdf_url(url):
    """Return True when *url* appears to point at a PDF document.

    Two checks are applied in order:

    1. A textual check: the URL ends with ".pdf" or contains "pdf" anywhere
       (case-insensitive). Note that this is a broad heuristic - any URL that
       merely mentions "pdf" is reported as a PDF without a network request.
    2. Otherwise a HEAD request (10 second timeout) is sent and the URL is a
       PDF when the Content-Type header contains "application/pdf".

    Network failures during the HEAD request are treated as "not a PDF".
    """
    looks_like_pdf = url.endswith(".pdf") or "pdf" in url.lower()
    if looks_like_pdf:
        return True

    # Fall back to asking the server what it would send.
    try:
        head = requests.head(url, timeout=10)
        declared_type = head.headers.get('Content-Type', '')
    except requests.RequestException:
        # Best effort only: an unreachable server simply means "unknown".
        return False

    return 'application/pdf' in declared_type
	
def fetch_and_display_content(url):
    """Fetch a URL (HTML page or PDF) and prepare its text for the UI.

    PDFs (as decided by ``is_pdf_url``) are converted with MarkItDown directly
    from the URL; everything else is downloaded and extracted as markdown with
    trafilatura. The extracted text is then split into metadata and cleaned
    body text, and the language of the body is detected.

    Args:
        url: Address of the article or PDF to process.

    Returns:
        An 8-tuple matching the outputs wired to the "Fetch Text" button:
        ``(cleaned_text, metadata, detected_lang, *five_visibility_updates)``.
        ``metadata`` additionally carries a "Detected Language" entry.

    Raises:
        ValueError: The page could not be downloaded or yielded no text.
        RuntimeError: MarkItDown failed to convert the PDF.
    """
    if is_pdf_url(url):
        # MarkItDown downloads the document itself, so the trafilatura fetch
        # is skipped: it would download the file a second time and could fail
        # the whole request even though its result is never used for PDFs.
        converter = MarkItDown(enable_plugins=False)
        try:
            text = converter.convert(url).text_content
        except Exception as e:
            raise RuntimeError(f"❌ Error converting PDF with MarkItDown: {e}") from e
    else:
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            raise ValueError(f"❌ Failed to fetch content from URL: {url}")
        text = extract(
            downloaded,
            output_format="markdown",
            with_metadata=True,
            include_tables=False,
            include_links=False,
            include_formatting=True,
            include_comments=False,
        )

    if not text or not text.strip():
        raise ValueError("❌ No content found in the extracted data.")

    metadata, cleaned_text = extract_and_clean_text(text)
    detected_lang = detect_language(cleaned_text)

    # Add detected language to metadata
    metadata["Detected Language"] = detected_lang.upper()

    return (
        cleaned_text,
        metadata,
        detected_lang,
        gr.update(visible=True),  # Show Word Cloud
        gr.update(visible=True),  # Show Process Audio Button
        gr.update(visible=True),  # Show Process NER Button
        gr.update(visible=True),  # Show Extracted Text
        gr.update(visible=True)   # Show Metadata Output
    )


def extract_and_clean_text(data):
    """Split extracted markdown into a metadata dict and cleaned body text.

    Processing steps:

    1. A leading ``---`` ... ``---`` header is parsed line by line as
       ``key: value`` pairs. Values that look like JSON lists are decoded
       into Python lists; everything else is kept as a stripped string.
    2. Everything before the first standalone word "abstract"
       (case-insensitive) is dropped, when such a word exists.
    3. Markdown links, bare URLs and numeric citations such as ``[12]`` are
       removed, everything from the first "References", "Bibliography",
       "External Links" or "COMMENTS" heading word to the end of the text is
       cut, and runs of blank lines are collapsed.

    Args:
        data: Raw text as produced by the extractor.

    Returns:
        ``(metadata_dict, cleaned_text)``.
    """
    metadata_dict = {}

    # Step 1: Extract metadata enclosed between "---" at the beginning
    metadata_pattern = re.match(r"^---(.*?)---", data, re.DOTALL)
    if metadata_pattern:
        metadata_raw = metadata_pattern.group(1).strip()
        data = data[metadata_pattern.end():].strip()  # Remove metadata from text

        for line in metadata_raw.split("\n"):
            if ": " not in line:
                continue
            key, value = line.split(": ", 1)  # Split at first ": "
            value = value.strip()

            if value.startswith("[") and value.endswith("]"):
                try:
                    # May turn the value into a list, so no string methods
                    # may be called on it after this point.
                    value = json.loads(value)
                except json.JSONDecodeError:
                    pass  # Not valid JSON: keep the raw string

            metadata_dict[key.strip()] = value

    # Step 2: Remove everything before the "Abstract" section
    def remove_text_before_abstract(text):
        """Removes all text before the first occurrence of 'Abstract'."""
        match = re.search(r"(?i)\babstract\b", text)
        return text[match.start():] if match else text

    data = remove_text_before_abstract(data)

    # Step 3: Clean the extracted text
    def clean_text(text):
        """Strip links, citations and trailing reference sections."""
        # Markdown links must be removed before bare URLs; otherwise the URL
        # pattern eats the link target (including the closing parenthesis)
        # and leaves "[label](" behind.
        text = re.sub(r'\[[^\]]*\]\(http[s]?://\S+\)', '', text)
        text = re.sub(r'http[s]?://\S+', '', text)
        text = re.sub(r'\[\d+\]', '', text)

        patterns = [r'References\b.*', r'Bibliography\b.*', r'External Links\b.*', r'COMMENTS\b.*']
        for pattern in patterns:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)

        return re.sub(r'\n\s*\n+', '\n\n', text).strip()

    return metadata_dict, clean_text(data)

### 3️⃣ Language Detection
def detect_language(text):
    """Detect the language of *text*, restricted to TTS-supported languages.

    Args:
        text: Text to analyse.

    Returns:
        The detected language code when it is a key of
        ``SUPPORTED_TTS_LANGUAGES``; otherwise "en". "en" is also returned
        when detection itself fails (for example on empty input).
    """
    try:
        lang = detect(text)
    except Exception:
        # Detection is best effort; fall back to English rather than failing
        # the request. Unlike a bare ``except``, this lets KeyboardInterrupt
        # and SystemExit propagate.
        return "en"
    return lang if lang in SUPPORTED_TTS_LANGUAGES else "en"  # Default to English if not supported

#Not using this one below. Using Gliner
def extract_entities_with_stanza(text, chunk_size=1000, nlp=None):
    """Split text into chunks, run Stanza NER on each, and format the results.

    Sentences are grouped greedily into chunks of at most ``chunk_size``
    characters (a single longer sentence becomes its own chunk).

    Args:
        text: Text to analyse.
        chunk_size: Maximum number of characters per chunk.
        nlp: Optional Stanza pipeline (any callable returning a document with
            ``.ents``). When omitted, an English tokenize+NER pipeline is
            created on first use and reused by later calls.

    Returns:
        One line per entity, formatted as ``"<n>: <text> --> <type>"``.
    """
    if nlp is None:
        # The original code referenced an undefined global ``nlp``; build the
        # pipeline lazily so importing the module stays cheap.
        nlp = getattr(extract_entities_with_stanza, "_pipeline", None)
        if nlp is None:
            nlp = stanza.Pipeline(lang="en", processors="tokenize,ner")
            extract_entities_with_stanza._pipeline = nlp

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sent_tokenize(text):
        # Only close a chunk that actually holds sentences; otherwise an
        # oversized first sentence would emit an empty chunk.
        if current_chunk and current_length + len(sentence) > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += len(sentence)

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    entities = []
    for chunk in chunks:
        doc = nlp(chunk)
        entities.extend({"text": ent.text, "type": ent.type} for ent in doc.ents)

    return "\n".join(
        f"{i+1}: {ent['text']} --> {ent['type']}" for i, ent in enumerate(entities)
    )

def generate_wordcloud(text):
    """Render a word cloud of *text* and return it as a PIL image.

    Args:
        text: Text whose word frequencies are visualised.

    Returns:
        A ``PIL.Image`` holding the PNG rendering of the word cloud.

    Raises:
        ValueError: *text* is ``None``, empty or whitespace only.
    """
    if not text or not text.strip():
        raise ValueError("❌ Text is empty or invalid for WordCloud generation.")

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    # Draw on an explicit figure instead of pyplot's implicit "current
    # figure", and always close it so a failed render does not leak it.
    fig = plt.figure(figsize=(10, 5))
    try:
        ax = fig.add_subplot(111)
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')

        buf = io.BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight', pad_inches=0)
    finally:
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)

### 4️⃣ TTS Functionality (KokoroTTS)
@spaces.GPU(duration=1000)
def generate_audio_kokoro(text, lang, selected_voice):
    """Synthesize *text* with the shared Kokoro pipeline and save it as WAV.

    Args:
        text: Text to speak. It is split into segments on blank-line/newline
            runs (``split_pattern=r'\\n+'``).
        lang: Detected language code. Currently not used for synthesis: audio
            is always produced by the module-level ``kokoro_tts`` pipeline.
        selected_voice: Kokoro voice name, e.g. one of ``AVAILABLE_VOICES``.

    Returns:
        Path of a temporary ``.wav`` file written at 24000 Hz. The file is
        created with ``delete=False`` and is not removed by this function.

    Raises:
        ValueError: The pipeline produced no audio (e.g. empty text).
    """
    generator = kokoro_tts(text, voice=selected_voice, speed=1, split_pattern=r'\n+')

    # Each generator item is a 3-tuple whose last element is the audio.
    audio_data_list = [audio for _, _, audio in generator]
    if not audio_data_list:
        # np.concatenate([]) would fail with an unhelpful message.
        raise ValueError("❌ No audio was generated; the input text may be empty.")
    full_audio = np.concatenate(audio_data_list)

    # Save to a temporary file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        sf.write(temp_file, full_audio, 24000, format='wav')
        temp_file_path = temp_file.name

    print("Audio generated successfully.")
    return temp_file_path
	
### 5️⃣ Chunk-Based Summarization
def split_text_with_optimized_overlap(text, max_tokens=1024, overlap_tokens=25):
    """Split text into sentence-aligned chunks with a token-based overlap.

    Sentences are packed greedily until adding the next one would exceed
    ``max_tokens`` (counted with the module-level BART tokenizer). Each chunk
    after the first is prefixed with the last ``overlap_tokens`` tokens of the
    previous chunk's own sentences, and that prefix counts towards the budget
    of the chunk it is prepended to.

    Args:
        text: Text to split.
        max_tokens: Token budget per chunk. A single sentence longer than the
            budget is still emitted as its own (oversized) chunk.
        overlap_tokens: Number of trailing tokens repeated at the start of
            the next chunk; ``0`` or less disables the overlap.

    Returns:
        List of non-empty chunk strings.
    """
    chunks = []

    def _close_chunk(chunk_sentences, prefix):
        """Append the finished chunk; return (overlap text, overlap length)."""
        body = " ".join(chunk_sentences)
        chunk = f"{prefix} {body}".strip()
        if chunk:
            chunks.append(chunk)
        if overlap_tokens <= 0:
            # Guard needed because a ``[-0:]`` slice would return everything.
            return "", 0
        tail_ids = tokenizer.encode(body, add_special_tokens=False)[-overlap_tokens:]
        # The decoded tail may start in the middle of a word.
        return tokenizer.decode(tail_ids, skip_special_tokens=True).strip(), len(tail_ids)

    current_chunk = []
    current_length = 0
    overlap_text = ""

    for sentence in sent_tokenize(text):
        token_length = len(tokenizer.encode(sentence, add_special_tokens=False))
        # Never close an empty chunk: an oversized first sentence would
        # otherwise produce a blank chunk.
        if current_chunk and current_length + token_length > max_tokens:
            overlap_text, overlap_length = _close_chunk(current_chunk, overlap_text)
            current_chunk = []
            current_length = overlap_length
        current_chunk.append(sentence)
        current_length += token_length

    if current_chunk:
        _close_chunk(current_chunk, overlap_text)
    return chunks
def summarize_text(text, max_input_tokens=1024, max_output_tokens=200):
    """Summarize a single piece of text with the module-level BART model.

    The text is prefixed with "summarize: ", encoded and truncated to
    ``max_input_tokens``, then summarized with 4-beam search.

    Args:
        text: Text to summarize.
        max_input_tokens: Truncation limit applied while encoding.
        max_output_tokens: Upper bound on the generated summary length.

    Returns:
        The decoded summary with special tokens removed.
    """
    prompt = "summarize: " + text
    input_ids = tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=max_input_tokens,
        truncation=True,
    )

    generation_options = {
        "max_length": max_output_tokens,
        "min_length": 50,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = model.generate(input_ids, **generation_options)

    # Only one input was encoded, so the first sequence is the summary.
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

@spaces.GPU(duration=1000)
def hierarchical_summarization(text):
    """Summarize long text by summarizing its chunks and joining the results.

    The text is split with ``split_text_with_optimized_overlap``; all chunks
    are summarized in one padded batch and the per-chunk summaries are joined
    with spaces. No second summarization pass is run over the joined result.

    Args:
        text: Cleaned article text.

    Returns:
        The concatenated chunk summaries, or an empty string when *text*
        contains nothing to summarize.
    """
    if not text or not text.strip():
        # Nothing to summarize; the tokenizer cannot encode an empty batch.
        return ""

    if len(text) > 10000:
        print("⚠️ Warning: Large input text detected. Summarization may take longer than usual.")

    chunks = split_text_with_optimized_overlap(text)
    if not chunks:
        return ""

    # Tokenize the input cleaned text
    encoded_inputs = tokenizer(
        ["summarize: " + chunk for chunk in chunks],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1024
    )

    # Generate the summary. The attention mask is required because the batch
    # is padded: without it the model also attends to the padding tokens of
    # the shorter chunks.
    summary_ids = model.generate(
        encoded_inputs["input_ids"],
        attention_mask=encoded_inputs["attention_mask"],
        max_length=200,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

    # Decode the summary generated in the step above
    chunk_summaries = [tokenizer.decode(ids, skip_special_tokens=True) for ids in summary_ids]
    final_summary = " ".join(chunk_summaries)
    return final_summary

def chunk_text_with_overlap(text, max_tokens=500, overlap_tokens=50):
    """Split text into sentence-aligned chunks with a word-based overlap.

    "Tokens" here are whitespace-separated words. Sentences (split after
    ``.``, ``!`` or ``?``) are packed greedily until adding the next one would
    exceed ``max_tokens``. Each chunk after the first starts with the last
    ``overlap_tokens`` words of the previous chunk's own sentences, and those
    words count towards the budget of the chunk they are prepended to.

    Args:
        text: Text to split.
        max_tokens: Word budget per chunk. A single sentence longer than the
            budget is still emitted as its own (oversized) chunk.
        overlap_tokens: Number of trailing words repeated at the start of the
            next chunk; ``0`` or less disables the overlap.

    Returns:
        List of non-empty chunk strings (empty for empty input).
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)  # Split on sentence boundaries
    chunks = []

    def _close_chunk(chunk_sentences, prefix_words):
        """Append the finished chunk and return the words to carry over."""
        body = " ".join(chunk_sentences)
        chunk = f"{' '.join(prefix_words)} {body}".strip()
        if chunk:
            chunks.append(chunk)
        if overlap_tokens <= 0:
            # Guard needed because a ``[-0:]`` slice would return everything.
            return []
        return body.split()[-overlap_tokens:]

    current_chunk = []
    current_length = 0
    overlap_words = []

    for sentence in sentences:
        token_length = len(sentence.split())
        # Never close an empty chunk: an oversized first sentence would
        # otherwise produce a blank chunk.
        if current_chunk and current_length + token_length > max_tokens:
            overlap_words = _close_chunk(current_chunk, overlap_words)
            current_chunk = []
            current_length = len(overlap_words)
        current_chunk.append(sentence)
        current_length += token_length

    if current_chunk:
        _close_chunk(current_chunk, overlap_words)

    return chunks

def extract_entities_with_gliner(text, default_entity_types, custom_entity_types, batch_size=4):
    """Extract named entities from *text* with the shared GLiNER model.

    The text is split with ``chunk_text_with_overlap`` and each chunk is sent
    to the model separately; a chunk that fails is reported on stdout and
    skipped so the remaining chunks are still processed.

    Args:
        text: Text to analyse.
        default_entity_types: Comma-separated entity labels.
        custom_entity_types: Additional comma-separated labels; may be empty
            or ``None``.
        batch_size: Accepted for interface compatibility; chunks are
            currently processed one at a time.

    Returns:
        One line per entity, formatted as ``"<n>: <text> --> <label>"``.
        Empty string when no labels were given or nothing was found.
    """
    # Entity types preparation: merge both lists, drop blanks and duplicates
    # while keeping the order in which the labels were entered.
    requested = (default_entity_types or "").split(",") + (custom_entity_types or "").split(",")
    entity_types = list(dict.fromkeys(label.strip() for label in requested if label.strip()))
    if not entity_types:
        return ""

    # Chunk the text to avoid overflow
    chunks = chunk_text_with_overlap(text)

    # Process each chunk individually for improved stability
    all_entities = []
    for i, chunk in enumerate(chunks):
        try:
            entities = gliner_model.predict_entities(chunk, entity_types)
            all_entities.extend(entities)
        except Exception as e:
            print(f"⚠️ Error processing chunk {i}: {e}")

    # Format the results
    return "\n".join(
        f"{i+1}: {ent['text']} --> {ent['label']}" for i, ent in enumerate(all_entities)
    )

### 5️⃣ Main Processing Function
def process_url(url, selected_voice="bm_george"):
    """Run the whole pipeline for one URL: fetch, clean, detect, synthesize.

    Args:
        url: Address of the article or PDF to process.
        selected_voice: Kokoro voice used for synthesis; defaults to the same
            voice the UI preselects.

    Returns:
        ``(cleaned_text, detected_lang, audio_file)`` where ``audio_file`` is
        the path returned by ``generate_audio_kokoro``.
    """
    # fetch_and_display_content already downloads, cleans and detects the
    # language; only its first three results are needed here (the rest are
    # UI visibility updates).
    cleaned_text, _metadata, detected_lang, *_ui_updates = fetch_and_display_content(url)
    audio_file = generate_audio_kokoro(cleaned_text, detected_lang, selected_voice)

    return cleaned_text, detected_lang, audio_file

### 6️⃣ Gradio Interface
# Layout and event wiring of the app. Components are created first, then the
# callbacks are attached; finally the app is launched.
with gr.Blocks() as demo:
    gr.Markdown("# 🌍 Web-to-Audio Converter 🎙️")

    url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com/article")

    voice_selection = gr.Dropdown(AVAILABLE_VOICES, label="Select Voice", value="bm_george")
    tts_option = gr.Radio(["TTS based on Summary", "TTS based on Raw Data"], value="TTS based on Summary", label="Select TTS Source")
    with gr.Row():
        process_text_button = gr.Button("Fetch Text & Detect Language", scale=1)
        # The next two buttons stay hidden until text has been fetched.
        process_audio_button = gr.Button("Generate Audio", visible=False, scale=1)
        process_ner_button = gr.Button("Extract Entities", visible=False, scale=1)  # ✅ New button for NER

    with gr.Row():
        extracted_text = gr.Textbox(label="Extracted Content", visible=False, interactive=False, lines=15)
        metadata_output = gr.JSON(label="Article Metadata", visible=False)  # Displays metadata
        wordcloud_output = gr.Image(label="Word Cloud", visible=False)

    # Hidden holder for the detected language; it is passed to the TTS callback.
    detected_lang = gr.Textbox(label="Detected Language", visible=False)
    summary_output = gr.Textbox(label="Summary", visible=True, interactive=False)
    full_audio_output = gr.Audio(label="Generated Audio", visible=True)
    ner_output = gr.Textbox(label="Extracted Entities", visible=True, interactive=False)

    default_entity_types = gr.Textbox(label="Default Entity Types", value="PERSON, Organization, location, Date, PRODUCT, EVENT", interactive=True)
    custom_entity_types = gr.Textbox(label="Custom Entity Types", placeholder="Enter additional entity types (comma-separated)", interactive=True)

    # Step 1: Fetch Text & Detect Language First
    # The callback returns eight values: three data values followed by five
    # visibility updates, which is why extracted_text and metadata_output are
    # listed twice (once for their value, once to make them visible).
    process_text_button.click(
        fetch_and_display_content,
        inputs=[url_input],
        outputs=[extracted_text, metadata_output, detected_lang, wordcloud_output, process_audio_button, process_ner_button, extracted_text, metadata_output]
    )

    # Automatically generate word cloud when extracted_text changes
    extracted_text.change(
        generate_wordcloud,
        inputs=[extracted_text],
        outputs=[wordcloud_output],
        show_progress=True
    )

    # Step 3: Summarization (Generate Summary Before Enabling TTS Button)
    def generate_summary_and_enable_tts(text):
        """Summarize *text* and reveal the audio button alongside the summary."""
        summary = hierarchical_summarization(text)
        return summary, gr.update(visible=True)  # Enable the TTS button only after summary is generated

    # Summarization
    extracted_text.change(
        generate_summary_and_enable_tts,
        inputs=[extracted_text],
        outputs=[summary_output, process_audio_button],
        show_progress=True
    )

    # Audio Generation
    # The source text is the summary or the full extracted text, depending on
    # the selected radio option.
    process_audio_button.click(
        lambda text, summary, lang, voice, tts_choice: (
            None,  # Clear previous audio
            generate_audio_kokoro(
                summary if tts_choice == "TTS based on Summary" else text, lang, voice
            )
        ),
        inputs=[extracted_text, summary_output, detected_lang, voice_selection, tts_option],
        outputs=[full_audio_output, full_audio_output],  # Clear first, then display new audio
        show_progress=True
    )

    # NER Extraction
    process_ner_button.click(
        extract_entities_with_gliner,
        inputs=[extracted_text, default_entity_types, custom_entity_types],
        outputs=[ner_output]
    )

demo.launch(share=True)