File size: 2,874 Bytes
10e9b7d
494dad8
 
 
eccf8e4
494dad8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e80aab9
494dad8
 
 
 
 
 
 
7d65c66
494dad8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e80aab9
494dad8
 
 
 
 
 
 
7d65c66
494dad8
3c4371f
494dad8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import gradio as gr
from duckduckgo_search import DDGS
from transformers import pipeline
from PIL import Image
import requests
from bs4 import BeautifulSoup
import re
import torch
from io import BytesIO

# Pipelines
# Hugging Face `transformers` pipelines, constructed once at import time so
# that every request handled by this module reuses the same objects.

# Question-answering pipeline backed by deepset/roberta-base-squad2.
# NOTE(review): not referenced anywhere else in the visible code — confirm it
# is still needed, since it is constructed on every startup.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
# Image-to-text pipeline (BLIP captioning base model); called by explain_image().
caption_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
# Summarization pipeline (distilbart-cnn-12-6); called by summarize_url().
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# Utils
def search_web(query, max_results=3):
    """Search DuckDuckGo for *query* and format the hits as Markdown.

    Args:
        query: Search terms. A blank or ``None`` query short-circuits without
            contacting the search backend.
        max_results: Maximum number of hits to request.

    Returns:
        One Markdown entry per hit (bold title, snippet, link), separated by
        blank lines. Returns ``"No results found."`` when there is nothing to
        show, or a string starting with ``"Web search failed: "`` when the
        backend raises. This function never raises, so a search hiccup cannot
        take down the whole UI handler.
    """
    if not query or not query.strip():
        return "No results found."

    try:
        with DDGS() as ddgs:
            # Materialize while the session is still open: depending on the
            # library version the result may be lazy, and it may be None.
            hits = list(ddgs.text(query, max_results=max_results) or [])
    except Exception as e:  # best-effort boundary: report in-band, like extract_text_from_url
        return f"Web search failed: {e}"

    if not hits:
        return "No results found."

    # .get() keeps one malformed hit from discarding the whole result set.
    return "\n\n".join(
        f"**{hit.get('title', '')}**\n{hit.get('body', '')}\n{hit.get('href', '')}"
        for hit in hits
    )

def explain_image(img):
    """Return the caption that the module-level ``caption_pipeline`` gives *img*.

    The pipeline yields a list of candidate dicts; only the first candidate's
    ``generated_text`` is returned.
    """
    candidates = caption_pipeline(img)
    best = candidates[0]
    return best['generated_text']

def extract_text_from_url(url):
    """Fetch a web page and return its visible text, whitespace-normalized.

    Args:
        url: Absolute ``http://`` or ``https://`` URL of the page to fetch.

    Returns:
        Up to 3000 characters of page text with ``<script>``/``<style>``
        content removed and every whitespace run collapsed to one space.
        On any failure (blank or non-http URL, network error, HTTP error
        status, parse error) returns a string starting with
        ``"Failed to extract text: "`` instead of raising.
    """
    if not url or not url.strip():
        return "Failed to extract text: no URL provided"
    url = url.strip()
    # The URL comes straight from a user-facing textbox; refuse anything that
    # is not plain web traffic before handing it to the HTTP client.
    if not url.lower().startswith(("http://", "https://")):
        return "Failed to extract text: URL must start with http:// or https://"

    try:
        res = requests.get(url, timeout=5)
        # Treat 4xx/5xx as a failure rather than extracting an error page.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
        # Remove scripts/styles so their source does not leak into the text.
        for tag in soup(["script", "style"]):
            tag.decompose()
        text = soup.get_text(separator=' ')
        clean_text = re.sub(r'\s+', ' ', text).strip()
        return clean_text[:3000]  # Limit to 3000 characters
    except Exception as e:  # deliberate best-effort: callers expect a string, not an exception
        return f"Failed to extract text: {str(e)}"

def summarize_url(url):
    """Summarize the visible text of the page at *url*.

    Args:
        url: Address of the page; passed unchanged to ``extract_text_from_url``.

    Returns:
        The summarizer's ``summary_text`` for the first 1024 characters of the
        page text. Returns ``"Not enough text to summarize."`` when the page
        yields 100 characters or fewer, or the extractor's
        ``"Failed to extract text: ..."`` message unchanged when fetching
        failed.
    """
    text = extract_text_from_url(url)

    # The extractor reports failures in-band. Those messages are often longer
    # than 100 characters, so without this check the error text itself would
    # be summarized and presented as page content.
    if text.startswith("Failed to extract text:"):
        return text

    if len(text) <= 100:
        return "Not enough text to summarize."

    # The slice bounds characters, not tokens; truncation=True keeps unusual
    # text from exceeding the model's input length.
    return summarizer(text[:1024], truncation=True)[0]['summary_text']

# Main Agent Function
def ai_agent(input_text, image=None, url=None):
    """Run every applicable tool on the user's inputs and merge the output.

    Args:
        input_text: The user's question; searched on the web when non-blank.
        image: Optional image; captioned via ``explain_image`` when provided.
        url: Optional page URL; summarized via ``summarize_url`` when non-blank.

    Returns:
        A Markdown string with one section per processed input (image, then
        URL, then question) separated by horizontal rules, or a short hint
        when no usable input was supplied.
    """
    # Textboxes may deliver None or whitespace-only strings; normalize once.
    question = (input_text or "").strip()
    page_url = (url or "").strip()
    results = []

    # Process Image. Compare against None explicitly: truthiness of an image
    # object is not a reliable "was something uploaded" test.
    if image is not None:
        results.append("🖼️ **Image Explanation:**\n" + explain_image(image))

    # Process URL
    if page_url:
        lowered = page_url.lower()
        if "youtube.com" in lowered or "youtu.be" in lowered:
            results.append("📹 **Video URL detected.** Currently only summaries of page content are available.")
        results.append("🔗 **Webpage Summary:**\n" + summarize_url(page_url))

    # Both short and long questions are answered by a web search; only the
    # heading differs (more than 10 words is treated as a complex question).
    if question:
        if len(question.split()) > 10:
            heading = "🔍 **Web Search Results:**\n"
        else:
            heading = "🧠 **Answer:**\n"
        results.append(heading + search_web(question))

    if not results:
        # Give feedback instead of rendering an empty output panel.
        return "Please provide a question, an image, or a URL."

    return "\n\n---\n\n".join(results)

# Gradio UI
# Three inputs (question, optional image, optional URL) feed ai_agent; its
# Markdown result is rendered in a single output panel.
with gr.Blocks() as demo:
    gr.Markdown("## 🌐🧠 Multi-Modal AI Agent (Web + Image + URL)")
    with gr.Row():
        input_text = gr.Textbox(label="Ask a Question", lines=2, placeholder="E.g. What are the latest AI trends?")
        image = gr.Image(type="pil", label="Upload an Image (optional)")
        url = gr.Textbox(label="Provide a URL (optional)", placeholder="https://example.com")
    submit = gr.Button("Get Answer")
    output = gr.Markdown()

    # Input order must match ai_agent's positional parameters.
    submit.click(fn=ai_agent, inputs=[input_text, image, url], outputs=output)

# Start the server only when executed as a script, so importing this module
# (tests, tooling) does not block on a running web server.
if __name__ == "__main__":
    demo.launch()