Runtime error

Update app.py

app.py CHANGED
@@ -3,93 +3,130 @@ from search import search_google
 from scraper import scrape_url
 from rag import VectorStore
 from llm import generate_answer
-import time

 vs = VectorStore()

 def ask_agent(question):
-    # Search Google
-    with gr.Blocks(analytics_enabled=False) as progress_section:
-        with gr.Row():
-            gr.Textbox("Searching web...", show_label=False)
     urls = [u for u in search_google(question, num_results=3) if u.startswith("http")]
-    # Scrape URLs
-    progress_section.children[0].children[0].value = "Scraping content..."
-    texts_images = []
-    for url in urls:
-        texts_images.append(scrape_url(url))
+
+    texts_images = [scrape_url(url) for url in urls]
     texts = [ti[0] for ti in texts_images if not ti[0].startswith("[Error")]
-    images = [ti[1] for ti in texts_images]
-    vs.add_texts(texts)
-    # Retrieve context
-    progress_section.children[0].children[0].value = "Analyzing content..."
+    images = [ti[1] for ti in texts_images]  # list of list of images
+
+    # add to vector store
+    vs.add_texts(texts)
     relevant = vs.retrieve(question, top_k=2)
-    context = "\n\n".join(relevant)
-    progress_section.children[0].children[0].value = "Generating answer..."
+    context = "\n\n".join(relevant)
+
+    # generate answer
     answer = generate_answer(context, question)
+
+    # build image markdown with source
+    image_markdown = ""
     for url, imgs in zip(urls, images):
         if imgs:
-    )
+            # show first image as thumbnail
+            img_url = imgs[0]
+            image_markdown += f"![]({img_url})\n"
+            image_markdown += f"[Source]({url})\n\n"
+
+    final_output = f"## 🧠 Answer\n\n{answer}\n\n---\n## 📸 Images & Sources\n\n{image_markdown}"
+    return final_output
+
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet")) as demo:
+    gr.Markdown("# 🔎 **AI Web RAG Agent**\nAsk me anything, I'll search, scrape text & images, and answer!")
+    inp = gr.Textbox(label="Your question", placeholder="e.g., Best laptop under 50,000 INR")
+    btn = gr.Button("Ask")
+    out = gr.Markdown()
+
+    btn.click(fn=ask_agent, inputs=inp, outputs=out)
+
+demo.launch()
+
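The removed progress_section block above created Gradio components and set their .value from inside the event handler, which does not update the UI; that is presumably why it was dropped. If per-step status is still wanted, Gradio's built-in gr.Progress can be injected into the handler instead. A minimal sketch reusing the same status messages (only the top of ask_agent is shown):

import gradio as gr

def ask_agent(question, progress=gr.Progress()):
    # Gradio fills in `progress` automatically when the handler runs via btn.click
    progress(0.1, desc="Searching web...")
    urls = [u for u in search_google(question, num_results=3) if u.startswith("http")]

    progress(0.4, desc="Scraping content...")
    texts_images = [scrape_url(url) for url in urls]

    progress(0.8, desc="Generating answer...")
    # ... rest of the function unchanged ...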
+rag.py:
+from sentence_transformers import SentenceTransformer
+import faiss
+import numpy as np
+
+# load model only once
+embedder = SentenceTransformer('all-MiniLM-L6-v2')
+
+class VectorStore:
+    def __init__(self):
+        self.texts = []
+        self.embeddings = []
+        self.index = None
+
+    def add_texts(self, texts):
+        """Add list of texts to the store."""
+        new_embeds = embedder.encode(texts)
+        self.texts.extend(texts)
+        self.embeddings.extend(new_embeds)
+        self.index = faiss.IndexFlatL2(new_embeds.shape[1])
+        self.index.add(np.array(self.embeddings))
+
+    def retrieve(self, query, top_k=3):
+        """Return top-k relevant texts for the query."""
+        if not self.index:
+            return []
+        query_embed = embedder.encode([query])
+        D, I = self.index.search(np.array(query_embed), k=top_k)
+        return [self.texts[i] for i in I[0]]
+
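The traceback at the bottom of this diff ends at the faiss.IndexFlatL2(new_embeds.shape[1]) line above. If add_texts is called with an empty list (for example, when every scrape returned an "[Error ..." placeholder and the filter in app.py dropped them all), embedder.encode([]) yields an array with no second dimension, so shape[1] raises IndexError: tuple index out of range. A defensive sketch of add_texts under that assumption, reusing the embedder, faiss, and np defined above:

def add_texts(self, texts):
    """Add a list of texts to the store, ignoring empty input."""
    texts = [t for t in texts if t and t.strip()]
    if not texts:
        return  # nothing to index; retrieve() already returns [] while self.index is None
    new_embeds = embedder.encode(texts)
    self.texts.extend(texts)
    self.embeddings.extend(new_embeds)
    # rebuild the flat L2 index over everything stored so far (FAISS expects float32)
    self.index = faiss.IndexFlatL2(new_embeds.shape[1])
    self.index.add(np.array(self.embeddings, dtype="float32"))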
+scraper.py:
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+
+def scrape_url(url):
+    """Fetch text + image URLs from webpage."""
+    try:
+        res = requests.get(url, timeout=10)
+        res.raise_for_status()
+        soup = BeautifulSoup(res.text, 'html.parser')
+
+        # get text
+        text = soup.get_text(separator='\n', strip=True)
+
+        # get image URLs (absolute)
+        images = []
+        for img in soup.find_all('img'):
+            src = img.get('src')
+            if src:
+                images.append(urljoin(url, src))
+
+        return text, images
+    except Exception as e:
+        return f"[Error scraping {url}: {e}]", []
+
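For reference, scrape_url always returns a (text, image_urls) tuple; failures are encoded as a text starting with "[Error", which is exactly what the list comprehension in app.py filters out. A small usage sketch (the URL is only an example):

text, image_urls = scrape_url("https://example.com")
if text.startswith("[Error"):
    print("scrape failed:", text)
else:
    print(f"got {len(text)} characters of text and {len(image_urls)} image URLs")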
+search.py:
+from googlesearch import search
+
+def search_google(query, num_results=5):
+    """Search Google and return list of URLs."""
+    return list(search(query, num_results=num_results))
+
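The search(query, num_results=...) call matches the googlesearch-python package (the older `google` package has a different signature), so that package needs to be listed in requirements.txt. Google also rate-limits automated queries at times, so a guarded variant keeps ask_agent from crashing when the search itself fails; a sketch under those assumptions:

from googlesearch import search

def search_google(query, num_results=5):
    """Search Google and return a list of URLs, or [] if the search fails."""
    try:
        return list(search(query, num_results=num_results))
    except Exception as e:
        print(f"[search error] {e}")
        return []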
+Make a proper one, because it is showing this error, and make a better version (for example, use LangChain), and also make a good UI:
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.10/site-packages/gradio/queueing.py", line 626, in process_events
+    response = await route_utils.call_process_api(
+  File "/usr/local/lib/python3.10/site-packages/gradio/route_utils.py", line 322, in call_process_api
+    output = await app.get_blocks().process_api(
+  File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 2229, in process_api
+    result = await self.call_function(
+  File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 1740, in call_function
+    prediction = await anyio.to_thread.run_sync(  # type: ignore
+  File "/usr/local/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
+    return await get_async_backend().run_sync_in_worker_thread(
+  File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2470, in run_sync_in_worker_thread
+    return await future
+  File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 967, in run
+    result = context.run(func, *args)
+  File "/usr/local/lib/python3.10/site-packages/gradio/utils.py", line 940, in wrapper
+    response = f(*args, **kwargs)
+  File "/home/user/app/app.py", line 17, in ask_agent
+    vs.add_texts(texts)
+  File "/home/user/app/rag.py", line 19, in add_texts
+    self.index = faiss.IndexFlatL2(new_embeds.shape[1])
+IndexError: tuple index out of range
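The last two frames show ask_agent calling vs.add_texts(texts) with a texts list that most likely ended up empty (no usable URLs, or every scrape failed), which is the shape[1] failure discussed after rag.py above. Since the request also asks for LangChain, the hand-rolled VectorStore could be swapped for LangChain's FAISS wrapper around the same MiniLM embeddings; a rough sketch, assuming the langchain-community, faiss-cpu, and sentence-transformers packages are installed:

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def retrieve_context(texts, question, top_k=2):
    """Build an in-memory FAISS index over texts and return the joined top-k chunks."""
    if not texts:
        return ""  # avoids indexing an empty corpus, the failure shown above
    store = FAISS.from_texts(texts, embeddings)
    docs = store.similarity_search(question, k=top_k)
    return "\n\n".join(doc.page_content for doc in docs)

In ask_agent this would replace the vs.add_texts / vs.retrieve pair, and returning an explicit "no content found" message when texts is empty would keep the Space from erroring out.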