Create app.py
app.py
ADDED
@@ -0,0 +1,379 @@
import asyncio
import os
import re
import json
import time
import zipfile
from io import BytesIO
from urllib.parse import urljoin, urlparse
from dataclasses import dataclass, asdict
from typing import List, Dict, Any, Optional, Tuple

import requests
import pandas as pd
from bs4 import BeautifulSoup

import gradio as gr

# ---------- Optional LLM (OpenAI) ----------
def openai_extract_json(html: str, url: str, fields: List[str], api_key: Optional[str]) -> Optional[List[Dict[str, Any]]]:
    """Use OpenAI to extract structured data from HTML. Returns a list of dicts or None on failure."""
    if not api_key:
        return None
    try:
        # Use the modern OpenAI SDK
        from openai import OpenAI
        client = OpenAI(api_key=api_key)
        # Create a concise instruction for robust JSON output
        field_hint = ", ".join(fields) if fields else "title, price, image, rating, url"
        system = (
            "You are a robust web extractor. Given raw HTML and the page URL, "
            "return an array of JSON objects with fields you can infer (and the requested fields if present). "
            "Always output strictly valid JSON with double-quoted keys/strings. Include absolute image URLs if possible."
        )
        user = (
            f"URL: {url}\n\n"
            f"Required fields to attempt: [{field_hint}]\n\n"
            "Return JSON array only. Do not include any commentary.\n\n"
            f"HTML:\n{html[:180000]}"  # truncate to avoid extremely long prompts
        )
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "system", "content": system}, {"role": "user", "content": user}],
            temperature=0,
        )
        content = resp.choices[0].message.content.strip()
        # If the model wrapped the JSON in code fences, strip them
        content = re.sub(r"^```(?:json)?|```$", "", content).strip()
        data = json.loads(content)
        if isinstance(data, dict):
            data = [data]
        if isinstance(data, list):
            return data
        return None
    except Exception as e:
        print("OpenAI extraction failed:", e)
        return None

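# Illustrative use of the helper above (not called by the app itself); assumes an
# OPENAI_API_KEY environment variable and simply reuses the signature defined here:
#   rows = openai_extract_json(html, "https://example.com", ["title", "price"],
#                              os.getenv("OPENAI_API_KEY"))
#   # rows -> list of dicts on success, or None when the key is missing or parsing fails
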
# ---------- Playwright page loader ----------
async def fetch_dom(url: str, wait_ms: int = 1500) -> str:
    from playwright.async_api import async_playwright
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url, wait_until="domcontentloaded")
        # give the network a chance to settle
        try:
            await page.wait_for_load_state("networkidle", timeout=8000)
        except Exception:
            pass
        if wait_ms > 0:
            await asyncio.sleep(wait_ms / 1000)
        html = await page.content()
        await browser.close()
        return html

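# Deployment note (assumption about the runtime): fetch_dom expects Playwright's browser
# binaries to be present; on a fresh environment they are installed with
# `playwright install chromium` (plus `playwright install-deps` on minimal Linux images).
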
# ---------- Heuristic extraction ----------
def extract_images_and_items(html: str, base_url: str, card_selector: Optional[str] = None) -> Tuple[List[Dict[str, Any]], List[str]]:
    """
    Heuristically extract items and image URLs.
    Returns: (items, image_urls)
      items: list of dicts with title/price/url/image if found
      image_urls: all image URLs found on the page
    """
    soup = BeautifulSoup(html, "html.parser")

    # collect all images on the page
    images = []
    for img in soup.find_all("img"):
        src = img.get("src") or img.get("data-src") or img.get("data-original")
        if not src:
            continue
        abs_src = urljoin(base_url, src)
        images.append(abs_src)

    # find likely product/article cards
    items = []
    candidates = []
    if card_selector:
        candidates = soup.select(card_selector)
    else:
        # common product/article containers
        candidates = soup.select(
            "div.product, li.product, div.card, article, div.product-item, div.s-result-item, div._1AtVbE, div._4ddWXP"
        )
    if not candidates:
        # fallback: use the parents of images that sit inside links
        candidates = [img.parent for img in soup.select("a img") if img.parent]

    for c in candidates:
        try:
            title = None
            # title heuristics
            for sel in ["h1", "h2", "h3", ".title", ".product-title", "._4rR01T", ".s1Q9rs"]:
                n = c.select_one(sel)
                if n and n.get_text(strip=True):
                    title = n.get_text(strip=True)
                    break
            if not title:
                # fall back to the image alt text
                img = c.find("img")
                if img and img.get("alt"):
                    title = img.get("alt").strip()

            # price heuristics
            price = None
            price_text = c.get_text(" ", strip=True)
            m = re.search(r"(?:₹|Rs\.?|INR|\$|€|£)\s?\d[\d,]*(?:\.\d+)?", price_text)
            if m:
                price = m.group(0)

            # url
            link = c.find("a")
            href = urljoin(base_url, link.get("href")) if link and link.get("href") else base_url

            # image
            img = c.find("img")
            img_src = None
            if img:
                img_src = img.get("src") or img.get("data-src") or img.get("data-original")
                if img_src:
                    img_src = urljoin(base_url, img_src)

            if any([title, price, img_src]):
                items.append({"title": title, "price": price, "url": href, "image": img_src})
        except Exception:
            continue

    # de-duplicate images, preserving order
    seen = set()
    unique_images = []
    for u in images:
        if u not in seen:
            seen.add(u)
            unique_images.append(u)

    return items, unique_images

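# Illustrative output shape (assumed for a typical product grid): each extracted item looks like
#   {"title": "Red Mug", "price": "$12.99",
#    "url": "https://example.com/p/red-mug", "image": "https://example.com/img/red-mug.jpg"}
# and the second return value is the de-duplicated list of every image URL seen on the page.
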
# ---------- Image download & optional captioning ----------
def download_images(image_urls: List[str], out_dir: str) -> List[str]:
    os.makedirs(out_dir, exist_ok=True)
    saved = []
    for u in image_urls:
        try:
            name = os.path.basename(urlparse(u).path) or f"img_{len(saved)+1}.jpg"
            # ensure extension
            if not os.path.splitext(name)[1]:
                name += ".jpg"
            path = os.path.join(out_dir, name)
            r = requests.get(u, timeout=20)
            if r.status_code == 200 and r.content:
                with open(path, "wb") as f:
                    f.write(r.content)
                saved.append(path)
        except Exception as e:
            print("Image download failed:", u, e)
    return saved


def caption_images(paths: List[str]) -> Dict[str, str]:
    """Caption images with BLIP (optional, slow). Returns {path: caption}."""
    try:
        from transformers import BlipProcessor, BlipForConditionalGeneration
        from PIL import Image
        import torch
        device = "cuda" if torch.cuda.is_available() else "cpu"
        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

        captions = {}
        for p in paths:
            try:
                im = Image.open(p).convert("RGB")
                inputs = processor(im, return_tensors="pt").to(device)
                out = model.generate(**inputs, max_new_tokens=40)
                text = processor.decode(out[0], skip_special_tokens=True)
                captions[p] = text
            except Exception as e:
                captions[p] = f"(caption failed: {e})"
        return captions
    except Exception as e:
        print("Captioning unavailable:", e)
        return {}

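# Runtime note (assumption): the first call to caption_images downloads the
# Salesforce/blip-image-captioning-base weights from the Hugging Face Hub and runs on CPU
# unless CUDA is available, so captioning can be slow on a basic Space.
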
# ---------- ZIP helper ----------
def zip_paths(paths: List[str], zip_path: str) -> str:
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for p in paths:
            if os.path.isfile(p):
                zf.write(p, arcname=os.path.basename(p))
    return zip_path

# ---------- Main scrape orchestrator ----------
async def scrape_one(url: str, fields: List[str], use_llm: bool, api_key: Optional[str], card_selector: Optional[str]) -> Dict[str, Any]:
    html = await fetch_dom(url)
    items, images = extract_images_and_items(html, url, card_selector)

    llm_rows = None
    if use_llm:
        llm_rows = openai_extract_json(html, url, fields, api_key)

    return {
        "url": url,
        "html": html,
        "items": items,
        "images": images,
        "llm_rows": llm_rows or []
    }


def to_dataframe(rows: List[Dict[str, Any]]) -> pd.DataFrame:
    if not rows:
        return pd.DataFrame()
    # normalize a list of dicts with possibly different keys
    all_keys = set()
    for r in rows:
        all_keys.update(r.keys())
    ordered = []
    for r in rows:
        d = {k: r.get(k) for k in all_keys}
        ordered.append(d)
    df = pd.DataFrame(ordered)
    # helpful column order
    preferred = [k for k in ["title", "name", "price", "rating", "image", "url"] if k in df.columns]
    others = [c for c in df.columns if c not in preferred]
    df = df[preferred + others]
    return df

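# Illustrative standalone use (outside the Gradio UI, names as defined above):
#   result = asyncio.run(scrape_one("https://example.com", ["title", "price"], False, None, None))
#   df = to_dataframe(result["items"])
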
# ---------- Gradio wrapper ----------
def run_scrape(urls_text: str,
               fields_text: str,
               card_selector: str,
               use_llm: bool,
               api_key: str,
               download_imgs: bool,
               do_caption: bool):
    start = time.time()
    urls = [u.strip() for u in urls_text.splitlines() if u.strip()]
    fields = [f.strip() for f in fields_text.split(',')] if fields_text.strip() else []

    out_dir = os.path.abspath("scrape_output")
    os.makedirs(out_dir, exist_ok=True)

    # scrape all URLs concurrently
    async def gather_all():
        return await asyncio.gather(*[
            scrape_one(u, fields, use_llm, api_key if use_llm else None, card_selector or None)
            for u in urls
        ])
    try:
        scraped = asyncio.run(gather_all())
    except RuntimeError:
        # An event loop is already running (e.g. in notebooks); run the coroutine on a
        # fresh loop in a worker thread, since the running loop cannot be re-entered.
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as ex:
            scraped = ex.submit(asyncio.run, gather_all()).result()

    # aggregate rows
    heuristic_rows: List[Dict[str, Any]] = []
    llm_rows: List[Dict[str, Any]] = []
    all_images: List[str] = []

    for s in scraped:
        heuristic_rows.extend(s["items"])    # might be empty
        llm_rows.extend(s["llm_rows"])       # might be empty
        all_images.extend(s["images"])       # all page images

    # Choose which rows to present: prefer LLM output if available, else heuristics
    rows = llm_rows if use_llm and llm_rows else heuristic_rows
    df = to_dataframe(rows)

    # save JSON/CSV
    ts = int(time.time())
    json_path = os.path.join(out_dir, f"scrape_{ts}.json")
    csv_path = os.path.join(out_dir, f"scrape_{ts}.csv")
    df.to_csv(csv_path, index=False)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(rows, f, ensure_ascii=False, indent=2)

    # optionally download images
    gallery_paths: List[str] = []
    zip_path = None

    if download_imgs and all_images:
        img_dir = os.path.join(out_dir, f"images_{ts}")
        saved = download_images(all_images, img_dir)
        gallery_paths = saved[:120]  # limit gallery size for performance
        # optional captioning
        captions_map: Dict[str, str] = {}
        if do_caption and saved:
            captions_map = caption_images(saved)
            # if captions were produced and the table has an image column, map them in
            if not df.empty:
                img_col = None
                for c in df.columns:
                    if c.lower() in ("image", "image_url", "img", "imageurl"):
                        img_col = c
                        break
                if img_col:
                    df["caption"] = df[img_col].map(
                        lambda u: captions_map.get(os.path.join(img_dir, os.path.basename(urlparse(str(u)).path)), "")
                    )
                    df.to_csv(csv_path, index=False)
                    with open(json_path, "w", encoding="utf-8") as f:
                        json.dump(json.loads(df.to_json(orient="records")), f, ensure_ascii=False, indent=2)
        # zip the downloaded images
        zip_path = os.path.join(out_dir, f"images_{ts}.zip")
        zip_paths(saved, zip_path)

    elapsed = round(time.time() - start, 2)

    # Build gallery data: [(path, caption)]
    gallery_data = []
    for p in gallery_paths:
        gallery_data.append((p, os.path.basename(p)))

    status = f"Scraped {len(urls)} URL(s) • Rows: {len(df)} • Images found: {len(all_images)} • Time: {elapsed}s"

    return df, gallery_data, json_path, csv_path, (zip_path if zip_path and os.path.isfile(zip_path) else None), status

# ---------- Gradio UI ----------
with gr.Blocks(title="AI Scraper — Text + Images", css=".gradio-container {max-width: 1200px !important}") as demo:
    gr.Markdown("""
    # 🕷️ AI-Powered Web Scraper (2025)
    - Render dynamic pages (Playwright)
    - Extract **text + images**
    - Optional **LLM semantic parsing** to JSON
    - Optional **image captioning** (BLIP)
    """)

    with gr.Row():
        urls = gr.Textbox(label="Target URLs (one per line)", placeholder="https://example.com\nhttps://example.com/products")
        fields = gr.Textbox(label="Fields to extract (comma-separated)", placeholder="title, price, image, rating, url")
        card_selector = gr.Textbox(label="Optional CSS selector for item cards (e.g., div.product, article, .card)")

    with gr.Row():
        use_llm = gr.Checkbox(label="Use OpenAI for semantic extraction", value=False)
        api_key = gr.Textbox(label="OpenAI API Key (if using LLM)", type="password")
        download_imgs = gr.Checkbox(label="Download images", value=True)
        do_caption = gr.Checkbox(label="Caption images (slow)", value=False)

    run_btn = gr.Button("🚀 Run Scraper", variant="primary")

    with gr.Row():
        table = gr.Dataframe(label="Extracted Data (preview)", interactive=False)
        gallery = gr.Gallery(label="Scraped Images (subset)", show_label=True, height=400, allow_preview=True)

    with gr.Row():
        json_file = gr.File(label="Download JSON")
        csv_file = gr.File(label="Download CSV")
        zip_file = gr.File(label="Download Images ZIP")

    status = gr.Markdown("Ready.")

    run_btn.click(
        fn=run_scrape,
        inputs=[urls, fields, card_selector, use_llm, api_key, download_imgs, do_caption],
        outputs=[table, gallery, json_file, csv_file, zip_file, status]
    )

if __name__ == "__main__":
    demo.launch()
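
# Dependencies implied by the imports above (a sketch of requirements.txt, not verified here):
#   gradio, requests, pandas, beautifulsoup4, playwright
#   optional: openai (LLM extraction), transformers + torch + pillow (BLIP captioning)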