gaur3009 committed
Commit e7c9c17 · verified · 1 Parent(s): 41d77cd

Create app.py

Files changed (1)
  1. app.py +379 -0
app.py ADDED
@@ -0,0 +1,379 @@
import asyncio
import os
import re
import json
import time
import zipfile
from io import BytesIO
from urllib.parse import urljoin, urlparse
from dataclasses import dataclass, asdict
from typing import List, Dict, Any, Optional, Tuple

import requests
import pandas as pd
from bs4 import BeautifulSoup

import gradio as gr

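# Assumed runtime dependencies, inferred from the imports above (the commit does
# not pin them itself); openai/transformers/torch/pillow are only needed for the
# optional LLM and captioning features:
#   pip install gradio requests pandas beautifulsoup4 playwright openai transformers torch pillow
#   playwright install chromium   # fetch_dom() needs a Chromium build
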
# ---------- Optional LLM (OpenAI) ----------
def openai_extract_json(html: str, url: str, fields: List[str], api_key: Optional[str]) -> Optional[List[Dict[str, Any]]]:
    """Use OpenAI to extract structured data from HTML. Returns a list of dicts or None on failure."""
    if not api_key:
        return None
    try:
        # Use modern OpenAI SDK
        from openai import OpenAI
        client = OpenAI(api_key=api_key)
        # Create a concise instruction for robust JSON output
        field_hint = ", ".join(fields) if fields else "title, price, image, rating, url"
        system = (
            "You are a robust web extractor. Given raw HTML and the page URL, "
            "return an array of JSON objects with fields you can infer (and the requested fields if present). "
            "Always output strictly valid JSON with double-quoted keys/strings. Include absolute image URLs if possible."
        )
        user = (
            f"URL: {url}\n\n"
            f"Required fields to attempt: [{field_hint}]\n\n"
            "Return JSON array only. Do not include any commentary.\n\n"
            f"HTML:\n{html[:180000]}"  # avoid extremely long prompts
        )
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "system", "content": system}, {"role": "user", "content": user}],
            temperature=0,
        )
        content = resp.choices[0].message.content.strip()
        # If the model wrapped JSON in code fences, strip them
        content = re.sub(r"^```(?:json)?|```$", "", content).strip()
        data = json.loads(content)
        if isinstance(data, dict):
            data = [data]
        if isinstance(data, list):
            return data
        return None
    except Exception as e:
        print("OpenAI extraction failed:", e)
        return None

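# Minimal sketch (assumption, not wired into the UI flow): the key could also be
# read from the environment instead of the password textbox, e.g.
#   rows = openai_extract_json(html, url, fields, api_key or os.getenv("OPENAI_API_KEY"))
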
# ---------- Playwright page loader ----------
async def fetch_dom(url: str, wait_ms: int = 1500) -> str:
    from playwright.async_api import async_playwright
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url, wait_until="domcontentloaded")
        # try to settle network
        try:
            await page.wait_for_load_state("networkidle", timeout=8000)
        except Exception:
            pass
        if wait_ms > 0:
            await asyncio.sleep(wait_ms / 1000)
        html = await page.content()
        await browser.close()
        return html

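# Standalone usage sketch (assumption; in the app this is only called via scrape_one):
#   html = asyncio.run(fetch_dom("https://example.com"))
#   items, image_urls = extract_images_and_items(html, "https://example.com")
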
# ---------- Heuristic extraction ----------
def extract_images_and_items(html: str, base_url: str, card_selector: Optional[str] = None) -> Tuple[List[Dict[str, Any]], List[str]]:
    """
    Heuristically extract items and image URLs.
    Returns: (items, image_urls)
      items: list of dicts with title/price/url/image if found
      image_urls: all image URLs found on page
    """
    soup = BeautifulSoup(html, "html.parser")

    # collect all images on page
    images = []
    for img in soup.find_all("img"):
        src = img.get("src") or img.get("data-src") or img.get("data-original")
        if not src:
            continue
        abs_src = urljoin(base_url, src)
        images.append(abs_src)

    # find likely product/article cards
    items = []
    candidates = []
    if card_selector:
        candidates = soup.select(card_selector)
    else:
        # common product/article containers
        candidates = soup.select(
            "div.product, li.product, div.card, article, div.product-item, div.s-result-item, div._1AtVbE, div._4ddWXP"
        )
        if not candidates:
            # fallback: take top-level links with images
            candidates = [a.parent for a in soup.select("a img") if a.parent]

    for c in candidates:
        try:
            title = None
            # title heuristics
            for sel in ["h1", "h2", "h3", ".title", ".product-title", "._4rR01T", ".s1Q9rs"]:
                n = c.select_one(sel)
                if n and n.get_text(strip=True):
                    title = n.get_text(strip=True)
                    break
            if not title:
                # maybe the image alt
                img = c.find("img")
                if img and img.get("alt"):
                    title = img.get("alt").strip()

            # price heuristics
            price = None
            price_text = c.get_text(" ", strip=True)
            m = re.search(r"(?:₹|Rs\.?|INR|\$|€|£)\s?\d[\d,]*(?:\.\d+)?", price_text)
            if m:
                price = m.group(0)

            # url
            link = c.find("a")
            href = urljoin(base_url, link.get("href")) if link and link.get("href") else base_url

            # image
            img = c.find("img")
            img_src = None
            if img:
                img_src = img.get("src") or img.get("data-src") or img.get("data-original")
                if img_src:
                    img_src = urljoin(base_url, img_src)

            if any([title, price, img_src]):
                items.append({"title": title, "price": price, "url": href, "image": img_src})
        except Exception:
            continue

    # de-duplicate images
    seen = set()
    unique_images = []
    for u in images:
        if u not in seen:
            seen.add(u)
            unique_images.append(u)

    return items, unique_images

# ---------- Image download & optional captioning ----------
def download_images(image_urls: List[str], out_dir: str) -> List[str]:
    os.makedirs(out_dir, exist_ok=True)
    saved = []
    for u in image_urls:
        try:
            name = os.path.basename(urlparse(u).path) or f"img_{len(saved)+1}.jpg"
            # ensure extension
            if not os.path.splitext(name)[1]:
                name += ".jpg"
            path = os.path.join(out_dir, name)
            r = requests.get(u, timeout=20)
            if r.status_code == 200 and r.content:
                with open(path, "wb") as f:
                    f.write(r.content)
                saved.append(path)
        except Exception as e:
            print("Image download failed:", u, e)
    return saved


def caption_images(paths: List[str]) -> Dict[str, str]:
    """Caption images with BLIP (optional, slow). Returns {path: caption}."""
    try:
        from transformers import BlipProcessor, BlipForConditionalGeneration
        from PIL import Image
        import torch
        device = "cuda" if torch.cuda.is_available() else "cpu"
        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

        captions = {}
        for p in paths:
            try:
                im = Image.open(p).convert("RGB")
                inputs = processor(im, return_tensors="pt").to(device)
                out = model.generate(**inputs, max_new_tokens=40)
                text = processor.decode(out[0], skip_special_tokens=True)
                captions[p] = text
            except Exception as e:
                captions[p] = f"(caption failed: {e})"
        return captions
    except Exception as e:
        print("Captioning unavailable:", e)
        return {}

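# Note (assumption about the deployment environment): from_pretrained() fetches the
# Salesforce/blip-image-captioning-base checkpoint from the Hugging Face Hub on first
# use, so the first captioned run is slow, especially on CPU, and needs outbound access.
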
# ---------- ZIP helper ----------
def zip_paths(paths: List[str], zip_path: str) -> str:
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for p in paths:
            if os.path.isfile(p):
                zf.write(p, arcname=os.path.basename(p))
    return zip_path

# ---------- Main scrape orchestrator ----------
async def scrape_one(url: str, fields: List[str], use_llm: bool, api_key: Optional[str], card_selector: Optional[str]) -> Dict[str, Any]:
    html = await fetch_dom(url)
    items, images = extract_images_and_items(html, url, card_selector)

    llm_rows = None
    if use_llm:
        llm_rows = openai_extract_json(html, url, fields, api_key)

    return {
        "url": url,
        "html": html,
        "items": items,
        "images": images,
        "llm_rows": llm_rows or []
    }


def to_dataframe(rows: List[Dict[str, Any]]) -> pd.DataFrame:
    if not rows:
        return pd.DataFrame()
    # normalize list of dicts with possibly different keys
    all_keys = set()
    for r in rows:
        all_keys.update(r.keys())
    ordered = []
    for r in rows:
        d = {k: r.get(k) for k in all_keys}
        ordered.append(d)
    df = pd.DataFrame(ordered)
    # helpful column order
    preferred = [k for k in ["title", "name", "price", "rating", "image", "url"] if k in df.columns]
    others = [c for c in df.columns if c not in preferred]
    df = df[preferred + others]
    return df

# ---------- Gradio wrapper ----------
def run_scrape(urls_text: str,
               fields_text: str,
               card_selector: str,
               use_llm: bool,
               api_key: str,
               download_imgs: bool,
               do_caption: bool):
    start = time.time()
    urls = [u.strip() for u in urls_text.splitlines() if u.strip()]
    fields = [f.strip() for f in fields_text.split(',')] if fields_text.strip() else []

    out_dir = os.path.abspath("scrape_output")
    os.makedirs(out_dir, exist_ok=True)

    # scrape all urls
    results = []
    async def gather_all():
        return await asyncio.gather(*[
            scrape_one(u, fields, use_llm, api_key if use_llm else None, card_selector or None)
            for u in urls
        ])
    try:
        scraped = asyncio.run(gather_all())
    except RuntimeError:
        # if event loop already running (e.g. notebooks), fallback
        scraped = asyncio.get_event_loop().run_until_complete(gather_all())

    # aggregate rows
    heuristic_rows: List[Dict[str, Any]] = []
    llm_rows: List[Dict[str, Any]] = []
    all_images: List[str] = []

    for s in scraped:
        heuristic_rows.extend(s["items"])  # might be empty
        llm_rows.extend(s["llm_rows"])  # might be empty
        all_images.extend(s["images"])  # all page images

    # Choose which rows to present: prefer LLM if available, else heuristics
    rows = llm_rows if use_llm and llm_rows else heuristic_rows
    df = to_dataframe(rows)

    # save JSON/CSV
    ts = int(time.time())
    json_path = os.path.join(out_dir, f"scrape_{ts}.json")
    csv_path = os.path.join(out_dir, f"scrape_{ts}.csv")
    df.to_csv(csv_path, index=False)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(rows, f, ensure_ascii=False, indent=2)

    # optionally download images
    gallery_paths: List[str] = []
    zip_path = None

    if download_imgs and all_images:
        img_dir = os.path.join(out_dir, f"images_{ts}")
        saved = download_images(all_images, img_dir)
        gallery_paths = saved[:120]  # limit gallery size for performance
        # optional captioning
        captions_map: Dict[str, str] = {}
        if do_caption and saved:
            captions_map = caption_images(saved)
            # if captions found and we have a df with image column, try to map
            if not df.empty:
                img_col = None
                for c in df.columns:
                    if c.lower() in ("image", "image_url", "img", "imageurl"):
                        img_col = c
                        break
                if img_col:
                    df["caption"] = df[img_col].map(
                        lambda u: captions_map.get(os.path.join(img_dir, os.path.basename(urlparse(str(u)).path)), "")
                    )
                    df.to_csv(csv_path, index=False)
                    with open(json_path, "w", encoding="utf-8") as f:
                        json.dump(json.loads(df.to_json(orient="records")), f, ensure_ascii=False, indent=2)
        # zip
        zip_path = os.path.join(out_dir, f"images_{ts}.zip")
        zip_paths(saved, zip_path)

    elapsed = round(time.time() - start, 2)

    # Build gallery data: [(path, caption)]
    gallery_data = []
    for p in gallery_paths:
        gallery_data.append((p, os.path.basename(p)))

    status = f"Scraped {len(urls)} URL(s) • Rows: {len(df)} • Images found: {len(all_images)} • Time: {elapsed}s"

    return df, gallery_data, json_path, csv_path, (zip_path if zip_path and os.path.isfile(zip_path) else None), status


# ---------- Gradio UI ----------
with gr.Blocks(title="AI Scraper — Text + Images", css=".gradio-container {max-width: 1200px !important}") as demo:
    gr.Markdown("""
# 🕷️ AI-Powered Web Scraper (2025)
- Render dynamic pages (Playwright)
- Extract **text + images**
- Optional **LLM semantic parsing** to JSON
- Optional **image captioning** (BLIP)
""")

    with gr.Row():
        urls = gr.Textbox(label="Target URLs (one per line)", placeholder="https://example.com\nhttps://example.com/products")
        fields = gr.Textbox(label="Fields to extract (comma-separated)", placeholder="title, price, image, rating, url")
        card_selector = gr.Textbox(label="Optional CSS selector for item cards (e.g., div.product, article, .card)")

    with gr.Row():
        use_llm = gr.Checkbox(label="Use OpenAI for semantic extraction", value=False)
        api_key = gr.Textbox(label="OpenAI API Key (if using LLM)", type="password")
        download_imgs = gr.Checkbox(label="Download images", value=True)
        do_caption = gr.Checkbox(label="Caption images (slow)", value=False)

    run_btn = gr.Button("🚀 Run Scraper", variant="primary")

    with gr.Row():
        table = gr.Dataframe(label="Extracted Data (preview)", interactive=False)
        gallery = gr.Gallery(label="Scraped Images (subset)", show_label=True, height=400, allow_preview=True)

    with gr.Row():
        json_file = gr.File(label="Download JSON")
        csv_file = gr.File(label="Download CSV")
        zip_file = gr.File(label="Download Images ZIP")

    status = gr.Markdown("Ready.")

    run_btn.click(
        fn=run_scrape,
        inputs=[urls, fields, card_selector, use_llm, api_key, download_imgs, do_caption],
        outputs=[table, gallery, json_file, csv_file, zip_file, status]
    )

if __name__ == "__main__":
    demo.launch()
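
# Alternative entry-point sketch (assumption, not part of this commit): enabling the
# Gradio request queue can help long-running scrapes (Playwright rendering, image
# downloads, captioning) avoid request timeouts:
#   if __name__ == "__main__":
#       demo.queue().launch()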