Create app.py
app.py
ADDED
@@ -0,0 +1,379 @@
import asyncio
import os
import re
import json
import time
import zipfile
from io import BytesIO
from urllib.parse import urljoin, urlparse
from dataclasses import dataclass, asdict
from typing import List, Dict, Any, Optional, Tuple

import requests
import pandas as pd
from bs4 import BeautifulSoup

import gradio as gr

# ---------- Optional LLM (OpenAI) ----------
def openai_extract_json(html: str, url: str, fields: List[str], api_key: Optional[str]) -> Optional[List[Dict[str, Any]]]:
    """Use OpenAI to extract structured data from HTML. Returns a list of dicts or None on failure."""
    if not api_key:
        return None
    try:
        # Use the modern OpenAI SDK
        from openai import OpenAI
        client = OpenAI(api_key=api_key)
        # Create a concise instruction for robust JSON output
        field_hint = ", ".join(fields) if fields else "title, price, image, rating, url"
        system = (
            "You are a robust web extractor. Given raw HTML and the page URL, "
            "return an array of JSON objects with fields you can infer (and the requested fields if present). "
            "Always output strictly valid JSON with double-quoted keys/strings. Include absolute image URLs if possible."
        )
        user = (
            f"URL: {url}\n\n"
            f"Required fields to attempt: [{field_hint}]\n\n"
            "Return JSON array only. Do not include any commentary.\n\n"
            f"HTML:\n{html[:180000]}"  # truncate to avoid extremely long prompts
        )
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "system", "content": system}, {"role": "user", "content": user}],
            temperature=0,
        )
        content = resp.choices[0].message.content.strip()
        # If the model wrapped the JSON in code fences, strip them
        content = re.sub(r"^```(?:json)?|```$", "", content).strip()
        data = json.loads(content)
        if isinstance(data, dict):
            data = [data]
        if isinstance(data, list):
            return data
        return None
    except Exception as e:
        print("OpenAI extraction failed:", e)
        return None

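# Illustrative use of the helper above (not called by the app itself); assumes an
# OPENAI_API_KEY environment variable and simply reuses the signature defined here:
#   rows = openai_extract_json(html, "https://example.com", ["title", "price"],
#                              os.getenv("OPENAI_API_KEY"))
#   # rows -> list of dicts on success, or None when the key is missing or parsing fails
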
# ---------- Playwright page loader ----------
async def fetch_dom(url: str, wait_ms: int = 1500) -> str:
    from playwright.async_api import async_playwright
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url, wait_until="domcontentloaded")
        # give the network a chance to settle
        try:
            await page.wait_for_load_state("networkidle", timeout=8000)
        except Exception:
            pass
        if wait_ms > 0:
            await asyncio.sleep(wait_ms / 1000)
        html = await page.content()
        await browser.close()
        return html

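# Deployment note (assumption about the runtime): fetch_dom expects Playwright's browser
# binaries to be present; on a fresh environment they are installed with
# `playwright install chromium` (plus `playwright install-deps` on minimal Linux images).
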
# ---------- Heuristic extraction ----------
def extract_images_and_items(html: str, base_url: str, card_selector: Optional[str] = None) -> Tuple[List[Dict[str, Any]], List[str]]:
    """
    Heuristically extract items and image URLs.
    Returns: (items, image_urls)
      items: list of dicts with title/price/url/image if found
      image_urls: all image URLs found on the page
    """
    soup = BeautifulSoup(html, "html.parser")

    # collect all images on the page
    images = []
    for img in soup.find_all("img"):
        src = img.get("src") or img.get("data-src") or img.get("data-original")
        if not src:
            continue
        abs_src = urljoin(base_url, src)
        images.append(abs_src)

    # find likely product/article cards
    items = []
    candidates = []
    if card_selector:
        candidates = soup.select(card_selector)
    else:
        # common product/article containers
        candidates = soup.select(
            "div.product, li.product, div.card, article, div.product-item, div.s-result-item, div._1AtVbE, div._4ddWXP"
        )
    if not candidates:
        # fallback: use the parents of images that sit inside links
        candidates = [img.parent for img in soup.select("a img") if img.parent]

    for c in candidates:
        try:
            title = None
            # title heuristics
            for sel in ["h1", "h2", "h3", ".title", ".product-title", "._4rR01T", ".s1Q9rs"]:
                n = c.select_one(sel)
                if n and n.get_text(strip=True):
                    title = n.get_text(strip=True)
                    break
            if not title:
                # fall back to the image alt text
                img = c.find("img")
                if img and img.get("alt"):
                    title = img.get("alt").strip()

            # price heuristics
            price = None
            price_text = c.get_text(" ", strip=True)
            m = re.search(r"(?:₹|Rs\.?|INR|\$|€|£)\s?\d[\d,]*(?:\.\d+)?", price_text)
            if m:
                price = m.group(0)

            # url
            link = c.find("a")
            href = urljoin(base_url, link.get("href")) if link and link.get("href") else base_url

            # image
            img = c.find("img")
            img_src = None
            if img:
                img_src = img.get("src") or img.get("data-src") or img.get("data-original")
                if img_src:
                    img_src = urljoin(base_url, img_src)

            if any([title, price, img_src]):
                items.append({"title": title, "price": price, "url": href, "image": img_src})
        except Exception:
            continue

    # de-duplicate images, preserving order
    seen = set()
    unique_images = []
    for u in images:
        if u not in seen:
            seen.add(u)
            unique_images.append(u)

    return items, unique_images

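# Illustrative output shape (assumed for a typical product grid): each extracted item looks like
#   {"title": "Red Mug", "price": "$12.99",
#    "url": "https://example.com/p/red-mug", "image": "https://example.com/img/red-mug.jpg"}
# and the second return value is the de-duplicated list of every image URL seen on the page.
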
# ---------- Image download & optional captioning ----------
def download_images(image_urls: List[str], out_dir: str) -> List[str]:
    os.makedirs(out_dir, exist_ok=True)
    saved = []
    for u in image_urls:
        try:
            name = os.path.basename(urlparse(u).path) or f"img_{len(saved)+1}.jpg"
            # ensure extension
            if not os.path.splitext(name)[1]:
                name += ".jpg"
            path = os.path.join(out_dir, name)
            r = requests.get(u, timeout=20)
            if r.status_code == 200 and r.content:
                with open(path, "wb") as f:
                    f.write(r.content)
                saved.append(path)
        except Exception as e:
            print("Image download failed:", u, e)
    return saved


def caption_images(paths: List[str]) -> Dict[str, str]:
    """Caption images with BLIP (optional, slow). Returns {path: caption}."""
    try:
        from transformers import BlipProcessor, BlipForConditionalGeneration
        from PIL import Image
        import torch
        device = "cuda" if torch.cuda.is_available() else "cpu"
        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

        captions = {}
        for p in paths:
            try:
                im = Image.open(p).convert("RGB")
                inputs = processor(im, return_tensors="pt").to(device)
                out = model.generate(**inputs, max_new_tokens=40)
                text = processor.decode(out[0], skip_special_tokens=True)
                captions[p] = text
            except Exception as e:
                captions[p] = f"(caption failed: {e})"
        return captions
    except Exception as e:
        print("Captioning unavailable:", e)
        return {}

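# Runtime note (assumption): the first call to caption_images downloads the
# Salesforce/blip-image-captioning-base weights from the Hugging Face Hub and runs on CPU
# unless CUDA is available, so captioning can be slow on a basic Space.
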
# ---------- ZIP helper ----------
def zip_paths(paths: List[str], zip_path: str) -> str:
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for p in paths:
            if os.path.isfile(p):
                zf.write(p, arcname=os.path.basename(p))
    return zip_path

# ---------- Main scrape orchestrator ----------
async def scrape_one(url: str, fields: List[str], use_llm: bool, api_key: Optional[str], card_selector: Optional[str]) -> Dict[str, Any]:
    html = await fetch_dom(url)
    items, images = extract_images_and_items(html, url, card_selector)

    llm_rows = None
    if use_llm:
        llm_rows = openai_extract_json(html, url, fields, api_key)

    return {
        "url": url,
        "html": html,
        "items": items,
        "images": images,
        "llm_rows": llm_rows or []
    }


def to_dataframe(rows: List[Dict[str, Any]]) -> pd.DataFrame:
    if not rows:
        return pd.DataFrame()
    # normalize a list of dicts with possibly different keys
    all_keys = set()
    for r in rows:
        all_keys.update(r.keys())
    ordered = []
    for r in rows:
        d = {k: r.get(k) for k in all_keys}
        ordered.append(d)
    df = pd.DataFrame(ordered)
    # helpful column order
    preferred = [k for k in ["title", "name", "price", "rating", "image", "url"] if k in df.columns]
    others = [c for c in df.columns if c not in preferred]
    df = df[preferred + others]
    return df

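# Illustrative standalone use (outside the Gradio UI, names as defined above):
#   result = asyncio.run(scrape_one("https://example.com", ["title", "price"], False, None, None))
#   df = to_dataframe(result["items"])
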
# ---------- Gradio wrapper ----------
def run_scrape(urls_text: str,
               fields_text: str,
               card_selector: str,
               use_llm: bool,
               api_key: str,
               download_imgs: bool,
               do_caption: bool):
    start = time.time()
    urls = [u.strip() for u in urls_text.splitlines() if u.strip()]
    fields = [f.strip() for f in fields_text.split(',')] if fields_text.strip() else []

    out_dir = os.path.abspath("scrape_output")
    os.makedirs(out_dir, exist_ok=True)

    # scrape all URLs concurrently
    async def gather_all():
        return await asyncio.gather(*[
            scrape_one(u, fields, use_llm, api_key if use_llm else None, card_selector or None)
            for u in urls
        ])
    try:
        scraped = asyncio.run(gather_all())
    except RuntimeError:
        # An event loop is already running (e.g. in notebooks); run the coroutine on a
        # fresh loop in a worker thread, since the running loop cannot be re-entered.
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as ex:
            scraped = ex.submit(asyncio.run, gather_all()).result()

    # aggregate rows
    heuristic_rows: List[Dict[str, Any]] = []
    llm_rows: List[Dict[str, Any]] = []
    all_images: List[str] = []

    for s in scraped:
        heuristic_rows.extend(s["items"])    # might be empty
        llm_rows.extend(s["llm_rows"])       # might be empty
        all_images.extend(s["images"])       # all page images

    # Choose which rows to present: prefer LLM output if available, else heuristics
    rows = llm_rows if use_llm and llm_rows else heuristic_rows
    df = to_dataframe(rows)

    # save JSON/CSV
    ts = int(time.time())
    json_path = os.path.join(out_dir, f"scrape_{ts}.json")
    csv_path = os.path.join(out_dir, f"scrape_{ts}.csv")
    df.to_csv(csv_path, index=False)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(rows, f, ensure_ascii=False, indent=2)

    # optionally download images
    gallery_paths: List[str] = []
    zip_path = None

    if download_imgs and all_images:
        img_dir = os.path.join(out_dir, f"images_{ts}")
        saved = download_images(all_images, img_dir)
        gallery_paths = saved[:120]  # limit gallery size for performance
        # optional captioning
        captions_map: Dict[str, str] = {}
        if do_caption and saved:
            captions_map = caption_images(saved)
            # if captions were produced and the table has an image column, map them in
            if not df.empty:
                img_col = None
                for c in df.columns:
                    if c.lower() in ("image", "image_url", "img", "imageurl"):
                        img_col = c
                        break
                if img_col:
                    df["caption"] = df[img_col].map(
                        lambda u: captions_map.get(os.path.join(img_dir, os.path.basename(urlparse(str(u)).path)), "")
                    )
                    df.to_csv(csv_path, index=False)
                    with open(json_path, "w", encoding="utf-8") as f:
                        json.dump(json.loads(df.to_json(orient="records")), f, ensure_ascii=False, indent=2)
        # zip the downloaded images
        zip_path = os.path.join(out_dir, f"images_{ts}.zip")
        zip_paths(saved, zip_path)

    elapsed = round(time.time() - start, 2)

    # Build gallery data: [(path, caption)]
    gallery_data = []
    for p in gallery_paths:
        gallery_data.append((p, os.path.basename(p)))

    status = f"Scraped {len(urls)} URL(s) • Rows: {len(df)} • Images found: {len(all_images)} • Time: {elapsed}s"

    return df, gallery_data, json_path, csv_path, (zip_path if zip_path and os.path.isfile(zip_path) else None), status

# ---------- Gradio UI ----------
with gr.Blocks(title="AI Scraper — Text + Images", css=".gradio-container {max-width: 1200px !important}") as demo:
    gr.Markdown("""
    # 🕷️ AI-Powered Web Scraper (2025)
    - Render dynamic pages (Playwright)
    - Extract **text + images**
    - Optional **LLM semantic parsing** to JSON
    - Optional **image captioning** (BLIP)
    """)

    with gr.Row():
        urls = gr.Textbox(label="Target URLs (one per line)", placeholder="https://example.com\nhttps://example.com/products")
        fields = gr.Textbox(label="Fields to extract (comma-separated)", placeholder="title, price, image, rating, url")
        card_selector = gr.Textbox(label="Optional CSS selector for item cards (e.g., div.product, article, .card)")

    with gr.Row():
        use_llm = gr.Checkbox(label="Use OpenAI for semantic extraction", value=False)
        api_key = gr.Textbox(label="OpenAI API Key (if using LLM)", type="password")
        download_imgs = gr.Checkbox(label="Download images", value=True)
        do_caption = gr.Checkbox(label="Caption images (slow)", value=False)

    run_btn = gr.Button("🚀 Run Scraper", variant="primary")

    with gr.Row():
        table = gr.Dataframe(label="Extracted Data (preview)", interactive=False)
        gallery = gr.Gallery(label="Scraped Images (subset)", show_label=True, height=400, allow_preview=True)

    with gr.Row():
        json_file = gr.File(label="Download JSON")
        csv_file = gr.File(label="Download CSV")
        zip_file = gr.File(label="Download Images ZIP")

    status = gr.Markdown("Ready.")

    run_btn.click(
        fn=run_scrape,
        inputs=[urls, fields, card_selector, use_llm, api_key, download_imgs, do_caption],
        outputs=[table, gallery, json_file, csv_file, zip_file, status]
    )

if __name__ == "__main__":
    demo.launch()
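
# Dependencies implied by the imports above (a sketch of requirements.txt, not verified here):
#   gradio, requests, pandas, beautifulsoup4, playwright
#   optional: openai (LLM extraction), transformers + torch + pillow (BLIP captioning)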