gaur3009 committed
Commit 316fb4c · verified · 1 Parent(s): e7b019e

Update app.py

Files changed (1): app.py +35 -68
app.py CHANGED
@@ -6,25 +6,20 @@ import time
 import zipfile
 from io import BytesIO
 from urllib.parse import urljoin, urlparse
-from dataclasses import dataclass, asdict
 from typing import List, Dict, Any, Optional, Tuple
 
 import requests
 import pandas as pd
 from bs4 import BeautifulSoup
-
 import gradio as gr
 
 # ---------- Optional LLM (OpenAI) ----------
 def openai_extract_json(html: str, url: str, fields: List[str], api_key: Optional[str]) -> Optional[List[Dict[str, Any]]]:
-    """Use OpenAI to extract structured data from HTML. Returns a list of dicts or None on failure."""
     if not api_key:
         return None
     try:
-        # Use modern OpenAI SDK
         from openai import OpenAI
         client = OpenAI(api_key=api_key)
-        # Create a concise instruction for robust JSON output
         field_hint = ", ".join(fields) if fields else "title, price, image, rating, url"
         system = (
             "You are a robust web extractor. Given raw HTML and the page URL, "
@@ -35,7 +30,7 @@ def openai_extract_json(html: str, url: str, fields: List[str], api_key: Optiona
             f"URL: {url}\n\n"
             f"Required fields to attempt: [{field_hint}]\n\n"
             "Return JSON array only. Do not include any commentary.\n\n"
-            f"HTML:\n{html[:180000]}"  # avoid extremely long prompts
+            f"HTML:\n{html[:180000]}"
         )
         resp = client.chat.completions.create(
             model="gpt-4o-mini",
@@ -43,7 +38,6 @@ def openai_extract_json(html: str, url: str, fields: List[str], api_key: Optiona
             temperature=0,
         )
         content = resp.choices[0].message.content.strip()
-        # If the model wrapped JSON in code fences, strip them
         content = re.sub(r"^```(?:json)?|```$", "", content).strip()
         data = json.loads(content)
         if isinstance(data, dict):
@@ -62,7 +56,6 @@ async def fetch_dom(url: str, wait_ms: int = 1500) -> str:
         browser = await p.chromium.launch(headless=True)
         page = await browser.new_page()
         await page.goto(url, wait_until="domcontentloaded")
-        # try to settle network
         try:
             await page.wait_for_load_state("networkidle", timeout=8000)
         except Exception:
@@ -75,15 +68,8 @@ async def fetch_dom(url: str, wait_ms: int = 1500) -> str:
 
 # ---------- Heuristic extraction ----------
 def extract_images_and_items(html: str, base_url: str, card_selector: Optional[str] = None) -> Tuple[List[Dict[str, Any]], List[str]]:
-    """
-    Heuristically extract items and image URLs.
-    Returns: (items, image_urls)
-    items: list of dicts with title/price/url/image if found
-    image_urls: all image URLs found on page
-    """
     soup = BeautifulSoup(html, "html.parser")
 
-    # collect all images on page
     images = []
     for img in soup.find_all("img"):
         src = img.get("src") or img.get("data-src") or img.get("data-original")
@@ -92,47 +78,39 @@ def extract_images_and_items(html: str, base_url: str, card_selector: Optional[s
             abs_src = urljoin(base_url, src)
             images.append(abs_src)
 
-    # find likely product/article cards
     items = []
     candidates = []
     if card_selector:
         candidates = soup.select(card_selector)
     else:
-        # common product/article containers
         candidates = soup.select(
             "div.product, li.product, div.card, article, div.product-item, div.s-result-item, div._1AtVbE, div._4ddWXP"
         )
     if not candidates:
-        # fallback: take top-level links with images
         candidates = [a.parent for a in soup.select("a img") if a.parent]
 
     for c in candidates:
         try:
             title = None
-            # title heuristics
             for sel in ["h1", "h2", "h3", ".title", ".product-title", "._4rR01T", ".s1Q9rs"]:
                 n = c.select_one(sel)
                 if n and n.get_text(strip=True):
                     title = n.get_text(strip=True)
                     break
             if not title:
-                # maybe the image alt
                 img = c.find("img")
                 if img and img.get("alt"):
                     title = img.get("alt").strip()
 
-            # price heuristics
             price = None
             price_text = c.get_text(" ", strip=True)
             m = re.search(r"(?:₹|Rs\.?|INR|\$|€|£)\s?\d[\d,]*(?:\.\d+)?", price_text)
             if m:
                 price = m.group(0)
 
-            # url
             link = c.find("a")
             href = urljoin(base_url, link.get("href")) if link and link.get("href") else base_url
 
-            # image
             img = c.find("img")
             img_src = None
             if img:
@@ -145,7 +123,6 @@ def extract_images_and_items(html: str, base_url: str, card_selector: Optional[s
         except Exception:
             continue
 
-    # de-duplicate images
     seen = set()
     unique_images = []
     for u in images:
@@ -162,7 +139,6 @@ def download_images(image_urls: List[str], out_dir: str) -> List[str]:
     for u in image_urls:
         try:
             name = os.path.basename(urlparse(u).path) or f"img_{len(saved)+1}.jpg"
-            # ensure extension
             if not os.path.splitext(name)[1]:
                 name += ".jpg"
             path = os.path.join(out_dir, name)
@@ -175,9 +151,7 @@ def download_images(image_urls: List[str], out_dir: str) -> List[str]:
             print("Image download failed:", u, e)
     return saved
 
-
 def caption_images(paths: List[str]) -> Dict[str, str]:
-    """Caption images with BLIP (optional, slow). Returns {path: caption}."""
     try:
         from transformers import BlipProcessor, BlipForConditionalGeneration
         from PIL import Image
@@ -209,6 +183,22 @@ def zip_paths(paths: List[str], zip_path: str) -> str:
             zf.write(p, arcname=os.path.basename(p))
     return zip_path
 
+# ---------- Search helper ----------
+def search_links(query: str, num_results: int = 5) -> List[str]:
+    search_url = "https://duckduckgo.com/html/"
+    params = {"q": query}
+    headers = {"User-Agent": "Mozilla/5.0"}
+    r = requests.get(search_url, params=params, headers=headers, timeout=15)
+    soup = BeautifulSoup(r.text, "html.parser")
+    links = []
+    for a in soup.select(".result__a"):
+        href = a.get("href")
+        if href:
+            links.append(href)
+        if len(links) >= num_results:
+            break
+    return links
+
 # ---------- Main scrape orchestrator ----------
 async def scrape_one(url: str, fields: List[str], use_llm: bool, api_key: Optional[str], card_selector: Optional[str]) -> Dict[str, Any]:
     html = await fetch_dom(url)
@@ -218,19 +208,11 @@ async def scrape_one(url: str, fields: List[str], use_llm: bool, api_key: Option
     if use_llm:
         llm_rows = openai_extract_json(html, url, fields, api_key)
 
-    return {
-        "url": url,
-        "html": html,
-        "items": items,
-        "images": images,
-        "llm_rows": llm_rows or []
-    }
-
+    return {"url": url, "html": html, "items": items, "images": images, "llm_rows": llm_rows or []}
 
 def to_dataframe(rows: List[Dict[str, Any]]) -> pd.DataFrame:
     if not rows:
         return pd.DataFrame()
-    # normalize list of dicts with possibly different keys
     all_keys = set()
     for r in rows:
         all_keys.update(r.keys())
@@ -239,7 +221,6 @@ def to_dataframe(rows: List[Dict[str, Any]]) -> pd.DataFrame:
         d = {k: r.get(k) for k in all_keys}
         ordered.append(d)
     df = pd.DataFrame(ordered)
-    # helpful column order
     preferred = [k for k in ["title", "name", "price", "rating", "image", "url"] if k in df.columns]
     others = [c for c in df.columns if c not in preferred]
     df = df[preferred + others]
@@ -254,13 +235,18 @@ def run_scrape(urls_text: str,
                download_imgs: bool,
                do_caption: bool):
     start = time.time()
-    urls = [u.strip() for u in urls_text.splitlines() if u.strip()]
+
+    # Auto-detect: if input doesn’t look like a URL, treat as keyword query
+    if not urls_text.strip().startswith("http"):
+        urls = search_links(urls_text.strip(), num_results=5)
+    else:
+        urls = [u.strip() for u in urls_text.splitlines() if u.strip()]
+
     fields = [f.strip() for f in fields_text.split(',')] if fields_text.strip() else []
 
     out_dir = os.path.abspath("scrape_output")
     os.makedirs(out_dir, exist_ok=True)
 
-    # scrape all urls
     results = []
     async def gather_all():
         return await asyncio.gather(*[
@@ -270,24 +256,17 @@
     try:
         scraped = asyncio.run(gather_all())
     except RuntimeError:
-        # if event loop already running (e.g. notebooks), fallback
         scraped = asyncio.get_event_loop().run_until_complete(gather_all())
 
-    # aggregate rows
-    heuristic_rows: List[Dict[str, Any]] = []
-    llm_rows: List[Dict[str, Any]] = []
-    all_images: List[str] = []
-
+    heuristic_rows, llm_rows, all_images = [], [], []
     for s in scraped:
-        heuristic_rows.extend(s["items"])  # might be empty
-        llm_rows.extend(s["llm_rows"])  # might be empty
-        all_images.extend(s["images"])  # all page images
+        heuristic_rows.extend(s["items"])
+        llm_rows.extend(s["llm_rows"])
+        all_images.extend(s["images"])
 
-    # Choose which rows to present: prefer LLM if available, else heuristics
     rows = llm_rows if use_llm and llm_rows else heuristic_rows
     df = to_dataframe(rows)
 
-    # save JSON/CSV
    ts = int(time.time())
     json_path = os.path.join(out_dir, f"scrape_{ts}.json")
     csv_path = os.path.join(out_dir, f"scrape_{ts}.csv")
@@ -295,23 +274,18 @@
     with open(json_path, "w", encoding="utf-8") as f:
         json.dump(rows, f, ensure_ascii=False, indent=2)
 
-    # optionally download images
-    gallery_paths: List[str] = []
-    zip_path = None
-
+    gallery_paths, zip_path = [], None
     if download_imgs and all_images:
         img_dir = os.path.join(out_dir, f"images_{ts}")
         saved = download_images(all_images, img_dir)
-        gallery_paths = saved[:120]  # limit gallery size for performance
-        # optional captioning
+        gallery_paths = saved[:120]
         captions_map: Dict[str, str] = {}
         if do_caption and saved:
            captions_map = caption_images(saved)
-            # if captions found and we have a df with image column, try to map
            if not df.empty:
                 img_col = None
                 for c in df.columns:
-                    if c.lower() in ("image", "image_url", "img", "imageUrl"):
+                    if c.lower() in ("image", "image_url", "img", "imageurl"):
                         img_col = c
                         break
                 if img_col:
@@ -319,26 +293,19 @@ def run_scrape(urls_text: str,
            df.to_csv(csv_path, index=False)
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(json.loads(df.to_json(orient="records")), f, ensure_ascii=False, indent=2)
-        # zip
        zip_path = os.path.join(out_dir, f"images_{ts}.zip")
        zip_paths(saved, zip_path)
 
     elapsed = round(time.time() - start, 2)
-
-    # Build gallery data: [(path, caption)]
-    gallery_data = []
-    for p in gallery_paths:
-        gallery_data.append((p, os.path.basename(p)))
-
+    gallery_data = [(p, os.path.basename(p)) for p in gallery_paths]
     status = f"Scraped {len(urls)} URL(s) • Rows: {len(df)} • Images found: {len(all_images)} • Time: {elapsed}s"
-
     return df, gallery_data, json_path, csv_path, (zip_path if zip_path and os.path.isfile(zip_path) else None), status
 
-
 # ---------- Gradio UI ----------
 with gr.Blocks(title="AI Scraper — Text + Images", css=".gradio-container {max-width: 1200px !important}") as demo:
     gr.Markdown("""
 # 🕷️ AI-Powered Web Scraper (2025)
+- Enter a **URL** or just a **keyword query**
 - Render dynamic pages (Playwright)
 - Extract **text + images**
 - Optional **LLM semantic parsing** to JSON
@@ -346,7 +313,7 @@ with gr.Blocks(title="AI Scraper — Text + Images", css=".gradio-container {max
     """)
 
     with gr.Row():
-        urls = gr.Textbox(label="Target URLs (one per line)", placeholder="https://example.com\nhttps://example.com/products")
+        urls = gr.Textbox(label="Target URLs or Keywords", placeholder="https://example.com\nOR\nred nike shoes site:amazon.in")
         fields = gr.Textbox(label="Fields to extract (comma-separated)", placeholder="title, price, image, rating, url")
         card_selector = gr.Textbox(label="Optional CSS selector for item cards (e.g., div.product, article, .card)")
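A minimal sketch for trying the keyword path introduced in this commit, assuming the updated file is importable as app and that https://duckduckgo.com/html/ is reachable from the test machine (neither assumption is part of the commit itself):

# Sketch only: exercise the new search helper added in this commit.
# Assumes the updated file is importable as `app` and DuckDuckGo's HTML
# endpoint is reachable from this machine.
from app import search_links

links = search_links("red nike shoes site:amazon.in", num_results=3)
for href in links:
    # Depending on DuckDuckGo's current markup, these hrefs may be redirect
    # URLs rather than direct result links.
    print(href)

In the UI, the same path is taken automatically whenever the textbox input does not start with "http".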