gaur3009 committed
Commit a8b2206 · verified · 1 Parent(s): c71bafb

Update app.py

Files changed (1):
  1. app.py +117 -80

app.py CHANGED
@@ -3,93 +3,130 @@ from search import search_google
  from scraper import scrape_url
  from rag import VectorStore
  from llm import generate_answer
- import time

  vs = VectorStore()

  def ask_agent(question):
-     # Search Google
-     with gr.Blocks(analytics_enabled=False) as progress_section:
-         with gr.Row():
-             gr.Textbox("Searching web...", show_label=False)
-
      urls = [u for u in search_google(question, num_results=3) if u.startswith("http")]
-     if not urls:
-         return "⚠️ No search results found. Try a different query."
-
-     # Scrape URLs
-     progress_section.children[0].children[0].value = "Scraping content..."
-     texts_images = []
-     for url in urls:
-         texts_images.append(scrape_url(url))
-
      texts = [ti[0] for ti in texts_images if not ti[0].startswith("[Error")]
-     images = [ti[1] for ti in texts_images]
-
-     # Add to vector store
-     if texts:
-         vs.add_texts(texts)
-
-     # Retrieve context
-     progress_section.children[0].children[0].value = "Analyzing content..."
      relevant = vs.retrieve(question, top_k=2)
-     context = "\n\n".join(relevant) if relevant else "No relevant context found."
-
-     # Generate answer
-     progress_section.children[0].children[0].value = "Generating answer..."
      answer = generate_answer(context, question)
-
-     # Prepare image gallery
-     image_gallery = []
      for url, imgs in zip(urls, images):
          if imgs:
-             image_gallery.extend(imgs[:3])  # Show max 3 images per site
-
-     # Prepare sources
-     sources = "\n".join([f"- [{url}]({url})" for url in urls])
-
-     return answer, image_gallery, sources
-
- with gr.Blocks(
-     theme=gr.themes.Soft(
-         primary_hue="violet",
-         font=[gr.themes.GoogleFont("Poppins")]
-     ),
-     css=".gradio-container {max-width: 900px !important}"
- ) as demo:
-     gr.Markdown("""
-     # 🌐 **Smart Web Research Agent**
-     Ask anything - I'll search the web, analyze content, and provide answers with sources!
-     """)
-
-     with gr.Row():
-         question = gr.Textbox(
-             label="Your question",
-             placeholder="e.g., Best budget laptop 2024?",
-             scale=4
-         )
-         submit_btn = gr.Button("Search", variant="primary", scale=1)
-
-     progress = gr.Textbox(visible=False)
-
-     with gr.Accordion("Answer", open=True):
-         answer = gr.Markdown()
-
-     with gr.Accordion("Sources", open=False):
-         sources = gr.Markdown()
-
-     with gr.Accordion("Images", open=False):
-         gallery = gr.Gallery(
-             columns=3,
-             object_fit="contain",
-             height="auto"
-         )
-
-     submit_btn.click(
-         fn=ask_agent,
-         inputs=question,
-         outputs=[answer, gallery, sources],
-         api_name="search"
-     )
-
- demo.launch()
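Review note on the removed version: it builds a `gr.Blocks` inside the event handler and then writes to `progress_section.children[0].children[0].value`, but components created inside a handler are never rendered, so those status updates are silently dropped. The supported way to stream status is a `gr.Progress` parameter; a minimal sketch, keeping this repo's helper names and abridging the body:

```python
import gradio as gr

def ask_agent(question, progress=gr.Progress()):
    progress(0.1, desc="Searching web...")
    urls = [u for u in search_google(question, num_results=3) if u.startswith("http")]
    progress(0.4, desc="Scraping content...")
    texts_images = [scrape_url(u) for u in urls]
    # ... index, retrieve, and generate as in app.py ...
    progress(0.9, desc="Generating answer...")
```

Gradio injects a live tracker for any argument whose default is `gr.Progress()`, so each `progress(fraction, desc=...)` call is reflected in the UI while the handler runs.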
  from scraper import scrape_url
  from rag import VectorStore
  from llm import generate_answer

  vs = VectorStore()

  def ask_agent(question):
      urls = [u for u in search_google(question, num_results=3) if u.startswith("http")]
+
+     texts_images = [scrape_url(url) for url in urls]
      texts = [ti[0] for ti in texts_images if not ti[0].startswith("[Error")]
+     images = [ti[1] for ti in texts_images]  # list of lists of images
+
+     # add to vector store
+     vs.add_texts(texts)
      relevant = vs.retrieve(question, top_k=2)
+     context = "\n\n".join(relevant)
+
+     # generate answer
      answer = generate_answer(context, question)
+
+     # build image markdown with source
+     image_markdown = ""
      for url, imgs in zip(urls, images):
          if imgs:
+             # show first image as thumbnail
+             img_url = imgs[0]
+             image_markdown += f"![image]({img_url})\n"
+             image_markdown += f"[Source]({url})\n\n"
+
+     final_output = f"## 🧠 Answer\n\n{answer}\n\n---\n## 📸 Images & Sources\n\n{image_markdown}"
+     return final_output
+
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet")) as demo:
+     gr.Markdown("# 🔍 **AI Web RAG Agent**\nAsk me anything, I'll search, scrape text & images, and answer!")
+     inp = gr.Textbox(label="Your question", placeholder="e.g., Best laptop under 50,000 INR")
+     btn = gr.Button("Ask")
+     out = gr.Markdown()
+
+     btn.click(fn=ask_agent, inputs=inp, outputs=out)
+
+ demo.launch()
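Review note on the new version: the `if texts:` guard from the removed version was dropped, so when every scrape fails (each tuple then starts with `[Error`), `vs.add_texts` receives an empty list, which is exactly the call that blows up in the traceback at the bottom of this page. A minimal re-guard, leaving the rest of the handler as committed:

```python
# restore the empty-input guard the previous version had
if texts:
    vs.add_texts(texts)
else:
    return "⚠️ Could not extract content from any result. Try a different query."
```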
+ rag.py:
+ from sentence_transformers import SentenceTransformer
+ import faiss
+ import numpy as np
+
+ # load model only once
+ embedder = SentenceTransformer('all-MiniLM-L6-v2')
+
+ class VectorStore:
+     def __init__(self):
+         self.texts = []
+         self.embeddings = []
+         self.index = None
+
+     def add_texts(self, texts):
+         """Add list of texts to the store."""
+         new_embeds = embedder.encode(texts)
+         self.texts.extend(texts)
+         self.embeddings.extend(new_embeds)
+         self.index = faiss.IndexFlatL2(new_embeds.shape[1])
+         self.index.add(np.array(self.embeddings))
+
+     def retrieve(self, query, top_k=3):
+         """Return top-k relevant texts for the query."""
+         if not self.index:
+             return []
+         query_embed = embedder.encode([query])
+         D, I = self.index.search(np.array(query_embed), k=top_k)
+         return [self.texts[i] for i in I[0]]
+
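This `add_texts` is where the reported `IndexError` originates: `embedder.encode([])` returns a 1-D array of shape `(0,)`, so `new_embeds.shape[1]` has no second axis to read. The method also re-derives the index dimension from each batch rather than from the model. A hedged rewrite of just this method, assuming the same module-level `embedder`:

```python
def add_texts(self, texts):
    """Add a list of texts to the store, tolerating an empty batch."""
    if not texts:
        return  # encode([]) yields shape (0,), so shape[1] would raise IndexError
    new_embeds = embedder.encode(texts)
    self.texts.extend(texts)
    self.embeddings.extend(new_embeds)
    if self.index is None:
        # take the dimension from the model, not from a particular batch
        dim = embedder.get_sentence_embedding_dimension()
        self.index = faiss.IndexFlatL2(dim)
    else:
        self.index.reset()  # rebuilt below from the full embedding list
    self.index.add(np.array(self.embeddings, dtype="float32"))
```

`retrieve` has a related edge case: FAISS pads results with `-1` when `k` exceeds the number of stored vectors, so clamping with `k=min(top_k, len(self.texts))` avoids accidentally returning `self.texts[-1]`.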
+ scarper.py:
+ import requests
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin
+
+ def scrape_url(url):
+     """Fetch text + image URLs from webpage."""
+     try:
+         res = requests.get(url, timeout=10)
+         res.raise_for_status()
+         soup = BeautifulSoup(res.text, 'html.parser')
+
+         # get text
+         text = soup.get_text(separator='\n', strip=True)
+
+         # get image URLs (absolute)
+         images = []
+         for img in soup.find_all('img'):
+             src = img.get('src')
+             if src:
+                 images.append(urljoin(url, src))
+
+         return text, images
+     except Exception as e:
+         return f"[Error scraping {url}: {e}]", []
+
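A plausible reason every scrape fails in the first place (which is what hands `add_texts` an empty list) is that many sites reject the default `python-requests` User-Agent. A small tweak to the `requests.get` call in the section labeled `scarper.py` (presumably `scraper.py`, matching the import at the top of the file); the header string here is just an example:

```python
# assumption: some targets return 403 for the default python-requests User-Agent
HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"}
res = requests.get(url, timeout=10, headers=HEADERS)
```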
+ from googlesearch import search
+
+ def search_google(query, num_results=5):
+     """Search Google and return list of URLs."""
+     return list(search(query, num_results=num_results))
+
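The unofficial `googlesearch` package scrapes Google's result pages and can start returning nothing once Google rate-limits the caller, so an empty `urls` list is a failure mode the handler has to tolerate. A defensive variant of the wrapper, folding in the `http` filter that `app.py` applies:

```python
def search_google(query, num_results=5):
    """Search Google; return [] instead of raising on rate limits."""
    try:
        return [u for u in search(query, num_results=num_results) if u.startswith("http")]
    except Exception:
        return []
```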
+ Make a proper one, because it is showing the error below; also make a better version (e.g., using Langchain) and a better UI:
+ Traceback (most recent call last):
+   File "/usr/local/lib/python3.10/site-packages/gradio/queueing.py", line 626, in process_events
+     response = await route_utils.call_process_api(
+   File "/usr/local/lib/python3.10/site-packages/gradio/route_utils.py", line 322, in call_process_api
+     output = await app.get_blocks().process_api(
+   File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 2229, in process_api
+     result = await self.call_function(
+   File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 1740, in call_function
+     prediction = await anyio.to_thread.run_sync(  # type: ignore
+   File "/usr/local/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
+     return await get_async_backend().run_sync_in_worker_thread(
+   File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2470, in run_sync_in_worker_thread
+     return await future
+   File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 967, in run
+     result = context.run(func, *args)
+   File "/usr/local/lib/python3.10/site-packages/gradio/utils.py", line 940, in wrapper
+     response = f(*args, **kwargs)
+   File "/home/user/app/app.py", line 17, in ask_agent
+     vs.add_texts(texts)
+   File "/home/user/app/rag.py", line 19, in add_texts
+     self.index = faiss.IndexFlatL2(new_embeds.shape[1])
+ IndexError: tuple index out of range
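Reading the traceback bottom-up: `ask_agent` called `vs.add_texts(texts)` with `texts == []` because every scrape had failed, `encode` on an empty list produced an array with only one axis, and `new_embeds.shape[1]` raised `IndexError: tuple index out of range`. A minimal reproduction, assuming the same model:

```python
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")
print(embedder.encode([]).shape)  # expected: (0,), a 1-D array
embedder.encode([]).shape[1]      # IndexError: tuple index out of range
```

The guard sketched after the new `ask_agent` and the `add_texts` rewrite above both remove this crash; the request to rebuild on Langchain is a separate, larger change and orthogonal to the error. Note also that the committed app.py embeds the `rag.py:` and `scarper.py:` listings, this note, and the traceback verbatim, so the file is not valid Python as committed.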