Runtime error

Update app.py

app.py CHANGED
@@ -3,93 +3,130 @@ from search import search_google
 from scraper import scrape_url
 from rag import VectorStore
 from llm import generate_answer
-import time

 vs = VectorStore()

 def ask_agent(question):
-    # Search Google
-    with gr.Blocks(analytics_enabled=False) as progress_section:
-        with gr.Row():
-            gr.Textbox("Searching web...", show_label=False)
     urls = [u for u in search_google(question, num_results=3) if u.startswith("http")]
-    # Scrape URLs
-    progress_section.children[0].children[0].value = "Scraping content..."
-    texts_images = []
-    for url in urls:
-        texts_images.append(scrape_url(url))
+
+    texts_images = [scrape_url(url) for url in urls]
     texts = [ti[0] for ti in texts_images if not ti[0].startswith("[Error")]
-    images = [ti[1] for ti in texts_images]
-    vs.add_texts(texts)
-    # Retrieve context
-    progress_section.children[0].children[0].value = "Analyzing content..."
+    images = [ti[1] for ti in texts_images]  # list of list of images
+
+    # add to vector store
+    vs.add_texts(texts)
     relevant = vs.retrieve(question, top_k=2)
-    context = "\n\n".join(relevant)
-    progress_section.children[0].children[0].value = "Generating answer..."
+    context = "\n\n".join(relevant)
+
+    # generate answer
     answer = generate_answer(context, question)
+
+    # build image markdown with source
+    image_markdown = ""
     for url, imgs in zip(urls, images):
         if imgs:
-    )
+            # show first image as thumbnail
+            img_url = imgs[0]
+            image_markdown += f"![]({img_url})\n"
+            image_markdown += f"[Source]({url})\n\n"
+
+    final_output = f"## 🧠 Answer\n\n{answer}\n\n---\n## 📸 Images & Sources\n\n{image_markdown}"
+    return final_output
+
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet")) as demo:
+    gr.Markdown("# 🔎 **AI Web RAG Agent**\nAsk me anything, I'll search, scrape text & images, and answer!")
+    inp = gr.Textbox(label="Your question", placeholder="e.g., Best laptop under 50,000 INR")
+    btn = gr.Button("Ask")
+    out = gr.Markdown()
+
+    btn.click(fn=ask_agent, inputs=inp, outputs=out)
+
+demo.launch()
+
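The removed progress_section block above created Gradio components and set their .value from inside the event handler, which does not update the UI; that is presumably why it was dropped. If per-step status is still wanted, Gradio's built-in gr.Progress can be injected into the handler instead. A minimal sketch reusing the same status messages (only the top of ask_agent is shown):

import gradio as gr

def ask_agent(question, progress=gr.Progress()):
    # Gradio fills in `progress` automatically when the handler runs via btn.click
    progress(0.1, desc="Searching web...")
    urls = [u for u in search_google(question, num_results=3) if u.startswith("http")]

    progress(0.4, desc="Scraping content...")
    texts_images = [scrape_url(url) for url in urls]

    progress(0.8, desc="Generating answer...")
    # ... rest of the function unchanged ...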
+rag.py:
+from sentence_transformers import SentenceTransformer
+import faiss
+import numpy as np
+
+# load model only once
+embedder = SentenceTransformer('all-MiniLM-L6-v2')
+
+class VectorStore:
+    def __init__(self):
+        self.texts = []
+        self.embeddings = []
+        self.index = None
+
+    def add_texts(self, texts):
+        """Add list of texts to the store."""
+        new_embeds = embedder.encode(texts)
+        self.texts.extend(texts)
+        self.embeddings.extend(new_embeds)
+        self.index = faiss.IndexFlatL2(new_embeds.shape[1])
+        self.index.add(np.array(self.embeddings))
+
+    def retrieve(self, query, top_k=3):
+        """Return top-k relevant texts for the query."""
+        if not self.index:
+            return []
+        query_embed = embedder.encode([query])
+        D, I = self.index.search(np.array(query_embed), k=top_k)
+        return [self.texts[i] for i in I[0]]
+
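The traceback at the bottom of this diff ends at the faiss.IndexFlatL2(new_embeds.shape[1]) line above. If add_texts is called with an empty list (for example, when every scrape returned an "[Error ..." placeholder and the filter in app.py dropped them all), embedder.encode([]) yields an array with no second dimension, so shape[1] raises IndexError: tuple index out of range. A defensive sketch of add_texts under that assumption, reusing the embedder, faiss, and np defined above:

def add_texts(self, texts):
    """Add a list of texts to the store, ignoring empty input."""
    texts = [t for t in texts if t and t.strip()]
    if not texts:
        return  # nothing to index; retrieve() already returns [] while self.index is None
    new_embeds = embedder.encode(texts)
    self.texts.extend(texts)
    self.embeddings.extend(new_embeds)
    # rebuild the flat L2 index over everything stored so far (FAISS expects float32)
    self.index = faiss.IndexFlatL2(new_embeds.shape[1])
    self.index.add(np.array(self.embeddings, dtype="float32"))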
+scraper.py:
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+
+def scrape_url(url):
+    """Fetch text + image URLs from webpage."""
+    try:
+        res = requests.get(url, timeout=10)
+        res.raise_for_status()
+        soup = BeautifulSoup(res.text, 'html.parser')
+
+        # get text
+        text = soup.get_text(separator='\n', strip=True)
+
+        # get image URLs (absolute)
+        images = []
+        for img in soup.find_all('img'):
+            src = img.get('src')
+            if src:
+                images.append(urljoin(url, src))
+
+        return text, images
+    except Exception as e:
+        return f"[Error scraping {url}: {e}]", []
+
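For reference, scrape_url always returns a (text, image_urls) tuple; failures are encoded as a text starting with "[Error", which is exactly what the list comprehension in app.py filters out. A small usage sketch (the URL is only an example):

text, image_urls = scrape_url("https://example.com")
if text.startswith("[Error"):
    print("scrape failed:", text)
else:
    print(f"got {len(text)} characters of text and {len(image_urls)} image URLs")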
+search.py:
+from googlesearch import search
+
+def search_google(query, num_results=5):
+    """Search Google and return list of URLs."""
+    return list(search(query, num_results=num_results))
+
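The search(query, num_results=...) call matches the googlesearch-python package (the older `google` package has a different signature), so that package needs to be listed in requirements.txt. Google also rate-limits automated queries at times, so a guarded variant keeps ask_agent from crashing when the search itself fails; a sketch under those assumptions:

from googlesearch import search

def search_google(query, num_results=5):
    """Search Google and return a list of URLs, or [] if the search fails."""
    try:
        return list(search(query, num_results=num_results))
    except Exception as e:
        print(f"[search error] {e}")
        return []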
+Make a proper one, because it is showing this error, and make a better version (for example, use LangChain), and also make a good UI:
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.10/site-packages/gradio/queueing.py", line 626, in process_events
+    response = await route_utils.call_process_api(
+  File "/usr/local/lib/python3.10/site-packages/gradio/route_utils.py", line 322, in call_process_api
+    output = await app.get_blocks().process_api(
+  File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 2229, in process_api
+    result = await self.call_function(
+  File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 1740, in call_function
+    prediction = await anyio.to_thread.run_sync(  # type: ignore
+  File "/usr/local/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
+    return await get_async_backend().run_sync_in_worker_thread(
+  File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2470, in run_sync_in_worker_thread
+    return await future
+  File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 967, in run
+    result = context.run(func, *args)
+  File "/usr/local/lib/python3.10/site-packages/gradio/utils.py", line 940, in wrapper
+    response = f(*args, **kwargs)
+  File "/home/user/app/app.py", line 17, in ask_agent
+    vs.add_texts(texts)
+  File "/home/user/app/rag.py", line 19, in add_texts
+    self.index = faiss.IndexFlatL2(new_embeds.shape[1])
+IndexError: tuple index out of range
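The last two frames show ask_agent calling vs.add_texts(texts) with a texts list that most likely ended up empty (no usable URLs, or every scrape failed), which is the shape[1] failure discussed after rag.py above. Since the request also asks for LangChain, the hand-rolled VectorStore could be swapped for LangChain's FAISS wrapper around the same MiniLM embeddings; a rough sketch, assuming the langchain-community, faiss-cpu, and sentence-transformers packages are installed:

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def retrieve_context(texts, question, top_k=2):
    """Build an in-memory FAISS index over texts and return the joined top-k chunks."""
    if not texts:
        return ""  # avoids indexing an empty corpus, the failure shown above
    store = FAISS.from_texts(texts, embeddings)
    docs = store.similarity_search(question, k=top_k)
    return "\n\n".join(doc.page_content for doc in docs)

In ask_agent this would replace the vs.add_texts / vs.retrieve pair, and returning an explicit "no content found" message when texts is empty would keep the Space from erroring out.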