gaur3009 committed
Commit 316fb4c · verified · 1 Parent(s): e7b019e

Update app.py

Files changed (1): app.py +35 -68
app.py CHANGED
@@ -6,25 +6,20 @@ import time
 import zipfile
 from io import BytesIO
 from urllib.parse import urljoin, urlparse
-from dataclasses import dataclass, asdict
 from typing import List, Dict, Any, Optional, Tuple
 
 import requests
 import pandas as pd
 from bs4 import BeautifulSoup
-
 import gradio as gr
 
 # ---------- Optional LLM (OpenAI) ----------
 def openai_extract_json(html: str, url: str, fields: List[str], api_key: Optional[str]) -> Optional[List[Dict[str, Any]]]:
-    """Use OpenAI to extract structured data from HTML. Returns a list of dicts or None on failure."""
     if not api_key:
         return None
     try:
-        # Use modern OpenAI SDK
         from openai import OpenAI
         client = OpenAI(api_key=api_key)
-        # Create a concise instruction for robust JSON output
         field_hint = ", ".join(fields) if fields else "title, price, image, rating, url"
         system = (
             "You are a robust web extractor. Given raw HTML and the page URL, "
@@ -35,7 +30,7 @@ def openai_extract_json(html: str, url: str, fields: List[str], api_key: Optiona
             f"URL: {url}\n\n"
             f"Required fields to attempt: [{field_hint}]\n\n"
             "Return JSON array only. Do not include any commentary.\n\n"
-            f"HTML:\n{html[:180000]}"  # avoid extremely long prompts
+            f"HTML:\n{html[:180000]}"
         )
         resp = client.chat.completions.create(
             model="gpt-4o-mini",
@@ -43,7 +38,6 @@ def openai_extract_json(html: str, url: str, fields: List[str], api_key: Optiona
             temperature=0,
         )
         content = resp.choices[0].message.content.strip()
-        # If the model wrapped JSON in code fences, strip them
         content = re.sub(r"^```(?:json)?|```$", "", content).strip()
         data = json.loads(content)
         if isinstance(data, dict):
@@ -62,7 +56,6 @@ async def fetch_dom(url: str, wait_ms: int = 1500) -> str:
         browser = await p.chromium.launch(headless=True)
         page = await browser.new_page()
         await page.goto(url, wait_until="domcontentloaded")
-        # try to settle network
         try:
             await page.wait_for_load_state("networkidle", timeout=8000)
         except Exception:
@@ -75,15 +68,8 @@ async def fetch_dom(url: str, wait_ms: int = 1500) -> str:
 
 # ---------- Heuristic extraction ----------
 def extract_images_and_items(html: str, base_url: str, card_selector: Optional[str] = None) -> Tuple[List[Dict[str, Any]], List[str]]:
-    """
-    Heuristically extract items and image URLs.
-    Returns: (items, image_urls)
-    items: list of dicts with title/price/url/image if found
-    image_urls: all image URLs found on page
-    """
     soup = BeautifulSoup(html, "html.parser")
 
-    # collect all images on page
     images = []
     for img in soup.find_all("img"):
         src = img.get("src") or img.get("data-src") or img.get("data-original")
@@ -92,47 +78,39 @@ def extract_images_and_items(html: str, base_url: str, card_selector: Optional[s
             abs_src = urljoin(base_url, src)
             images.append(abs_src)
 
-    # find likely product/article cards
     items = []
     candidates = []
     if card_selector:
         candidates = soup.select(card_selector)
     else:
-        # common product/article containers
         candidates = soup.select(
             "div.product, li.product, div.card, article, div.product-item, div.s-result-item, div._1AtVbE, div._4ddWXP"
         )
     if not candidates:
-        # fallback: take top-level links with images
         candidates = [a.parent for a in soup.select("a img") if a.parent]
 
     for c in candidates:
         try:
             title = None
-            # title heuristics
             for sel in ["h1", "h2", "h3", ".title", ".product-title", "._4rR01T", ".s1Q9rs"]:
                 n = c.select_one(sel)
                 if n and n.get_text(strip=True):
                     title = n.get_text(strip=True)
                     break
             if not title:
-                # maybe the image alt
                 img = c.find("img")
                 if img and img.get("alt"):
                     title = img.get("alt").strip()
 
-            # price heuristics
             price = None
             price_text = c.get_text(" ", strip=True)
             m = re.search(r"(?:₹|Rs\.?|INR|\$|€|£)\s?\d[\d,]*(?:\.\d+)?", price_text)
             if m:
                 price = m.group(0)
 
-            # url
             link = c.find("a")
             href = urljoin(base_url, link.get("href")) if link and link.get("href") else base_url
 
-            # image
             img = c.find("img")
             img_src = None
             if img:
@@ -145,7 +123,6 @@ def extract_images_and_items(html: str, base_url: str, card_selector: Optional[s
         except Exception:
             continue
 
-    # de-duplicate images
     seen = set()
     unique_images = []
     for u in images:
@@ -162,7 +139,6 @@ def download_images(image_urls: List[str], out_dir: str) -> List[str]:
     for u in image_urls:
         try:
             name = os.path.basename(urlparse(u).path) or f"img_{len(saved)+1}.jpg"
-            # ensure extension
             if not os.path.splitext(name)[1]:
                 name += ".jpg"
             path = os.path.join(out_dir, name)
@@ -175,9 +151,7 @@ def download_images(image_urls: List[str], out_dir: str) -> List[str]:
             print("Image download failed:", u, e)
     return saved
 
-
 def caption_images(paths: List[str]) -> Dict[str, str]:
-    """Caption images with BLIP (optional, slow). Returns {path: caption}."""
     try:
         from transformers import BlipProcessor, BlipForConditionalGeneration
         from PIL import Image
@@ -209,6 +183,22 @@ def zip_paths(paths: List[str], zip_path: str) -> str:
             zf.write(p, arcname=os.path.basename(p))
     return zip_path
 
+# ---------- Search helper ----------
+def search_links(query: str, num_results: int = 5) -> List[str]:
+    search_url = "https://duckduckgo.com/html/"
+    params = {"q": query}
+    headers = {"User-Agent": "Mozilla/5.0"}
+    r = requests.get(search_url, params=params, headers=headers, timeout=15)
+    soup = BeautifulSoup(r.text, "html.parser")
+    links = []
+    for a in soup.select(".result__a"):
+        href = a.get("href")
+        if href:
+            links.append(href)
+        if len(links) >= num_results:
+            break
+    return links
+
 # ---------- Main scrape orchestrator ----------
 async def scrape_one(url: str, fields: List[str], use_llm: bool, api_key: Optional[str], card_selector: Optional[str]) -> Dict[str, Any]:
     html = await fetch_dom(url)
@@ -218,19 +208,11 @@ async def scrape_one(url: str, fields: List[str], use_llm: bool, api_key: Option
     if use_llm:
         llm_rows = openai_extract_json(html, url, fields, api_key)
 
-    return {
-        "url": url,
-        "html": html,
-        "items": items,
-        "images": images,
-        "llm_rows": llm_rows or []
-    }
-
+    return {"url": url, "html": html, "items": items, "images": images, "llm_rows": llm_rows or []}
 
 def to_dataframe(rows: List[Dict[str, Any]]) -> pd.DataFrame:
     if not rows:
         return pd.DataFrame()
-    # normalize list of dicts with possibly different keys
     all_keys = set()
     for r in rows:
         all_keys.update(r.keys())
@@ -239,7 +221,6 @@ def to_dataframe(rows: List[Dict[str, Any]]) -> pd.DataFrame:
         d = {k: r.get(k) for k in all_keys}
         ordered.append(d)
     df = pd.DataFrame(ordered)
-    # helpful column order
     preferred = [k for k in ["title", "name", "price", "rating", "image", "url"] if k in df.columns]
     others = [c for c in df.columns if c not in preferred]
     df = df[preferred + others]
@@ -254,13 +235,18 @@ def run_scrape(urls_text: str,
                download_imgs: bool,
                do_caption: bool):
     start = time.time()
-    urls = [u.strip() for u in urls_text.splitlines() if u.strip()]
+
+    # Auto-detect: if input doesn’t look like a URL, treat as keyword query
+    if not urls_text.strip().startswith("http"):
+        urls = search_links(urls_text.strip(), num_results=5)
+    else:
+        urls = [u.strip() for u in urls_text.splitlines() if u.strip()]
+
     fields = [f.strip() for f in fields_text.split(',')] if fields_text.strip() else []
 
     out_dir = os.path.abspath("scrape_output")
     os.makedirs(out_dir, exist_ok=True)
 
-    # scrape all urls
     results = []
     async def gather_all():
         return await asyncio.gather(*[
@@ -270,24 +256,17 @@
     try:
         scraped = asyncio.run(gather_all())
     except RuntimeError:
-        # if event loop already running (e.g. notebooks), fallback
         scraped = asyncio.get_event_loop().run_until_complete(gather_all())
 
-    # aggregate rows
-    heuristic_rows: List[Dict[str, Any]] = []
-    llm_rows: List[Dict[str, Any]] = []
-    all_images: List[str] = []
-
+    heuristic_rows, llm_rows, all_images = [], [], []
     for s in scraped:
-        heuristic_rows.extend(s["items"])  # might be empty
-        llm_rows.extend(s["llm_rows"])  # might be empty
-        all_images.extend(s["images"])  # all page images
+        heuristic_rows.extend(s["items"])
+        llm_rows.extend(s["llm_rows"])
+        all_images.extend(s["images"])
 
-    # Choose which rows to present: prefer LLM if available, else heuristics
     rows = llm_rows if use_llm and llm_rows else heuristic_rows
     df = to_dataframe(rows)
 
-    # save JSON/CSV
    ts = int(time.time())
     json_path = os.path.join(out_dir, f"scrape_{ts}.json")
     csv_path = os.path.join(out_dir, f"scrape_{ts}.csv")
@@ -295,23 +274,18 @@
     with open(json_path, "w", encoding="utf-8") as f:
         json.dump(rows, f, ensure_ascii=False, indent=2)
 
-    # optionally download images
-    gallery_paths: List[str] = []
-    zip_path = None
-
+    gallery_paths, zip_path = [], None
     if download_imgs and all_images:
         img_dir = os.path.join(out_dir, f"images_{ts}")
         saved = download_images(all_images, img_dir)
-        gallery_paths = saved[:120]  # limit gallery size for performance
-        # optional captioning
+        gallery_paths = saved[:120]
         captions_map: Dict[str, str] = {}
         if do_caption and saved:
            captions_map = caption_images(saved)
-            # if captions found and we have a df with image column, try to map
            if not df.empty:
                 img_col = None
                 for c in df.columns:
-                    if c.lower() in ("image", "image_url", "img", "imageUrl"):
+                    if c.lower() in ("image", "image_url", "img", "imageurl"):
                         img_col = c
                         break
                 if img_col:
@@ -319,26 +293,19 @@ def run_scrape(urls_text: str,
            df.to_csv(csv_path, index=False)
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(json.loads(df.to_json(orient="records")), f, ensure_ascii=False, indent=2)
-        # zip
        zip_path = os.path.join(out_dir, f"images_{ts}.zip")
        zip_paths(saved, zip_path)
 
     elapsed = round(time.time() - start, 2)
-
-    # Build gallery data: [(path, caption)]
-    gallery_data = []
-    for p in gallery_paths:
-        gallery_data.append((p, os.path.basename(p)))
-
+    gallery_data = [(p, os.path.basename(p)) for p in gallery_paths]
     status = f"Scraped {len(urls)} URL(s) • Rows: {len(df)} • Images found: {len(all_images)} • Time: {elapsed}s"
-
     return df, gallery_data, json_path, csv_path, (zip_path if zip_path and os.path.isfile(zip_path) else None), status
 
-
 # ---------- Gradio UI ----------
 with gr.Blocks(title="AI Scraper — Text + Images", css=".gradio-container {max-width: 1200px !important}") as demo:
     gr.Markdown("""
 # 🕷️ AI-Powered Web Scraper (2025)
+- Enter a **URL** or just a **keyword query**
 - Render dynamic pages (Playwright)
 - Extract **text + images**
 - Optional **LLM semantic parsing** to JSON
@@ -346,7 +313,7 @@ with gr.Blocks(title="AI Scraper — Text + Images", css=".gradio-container {max
     """)
 
     with gr.Row():
-        urls = gr.Textbox(label="Target URLs (one per line)", placeholder="https://example.com\nhttps://example.com/products")
+        urls = gr.Textbox(label="Target URLs or Keywords", placeholder="https://example.com\nOR\nred nike shoes site:amazon.in")
         fields = gr.Textbox(label="Fields to extract (comma-separated)", placeholder="title, price, image, rating, url")
         card_selector = gr.Textbox(label="Optional CSS selector for item cards (e.g., div.product, article, .card)")
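A minimal sketch for trying the keyword path introduced in this commit, assuming the updated file is importable as app and that https://duckduckgo.com/html/ is reachable from the test machine (neither assumption is part of the commit itself):

# Sketch only: exercise the new search helper added in this commit.
# Assumes the updated file is importable as `app` and DuckDuckGo's HTML
# endpoint is reachable from this machine.
from app import search_links

links = search_links("red nike shoes site:amazon.in", num_results=3)
for href in links:
    # Depending on DuckDuckGo's current markup, these hrefs may be redirect
    # URLs rather than direct result links.
    print(href)

In the UI, the same path is taken automatically whenever the textbox input does not start with "http".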