gaur3009 committed
Commit 228aaa0 · verified · 1 Parent(s): 316fb4c

Update app.py

Files changed (1)
  1. app.py +326 -88
app.py CHANGED
@@ -4,16 +4,30 @@ import re
  import json
  import time
  import zipfile
- from io import BytesIO
  from urllib.parse import urljoin, urlparse
- from typing import List, Dict, Any, Optional, Tuple

  import requests
  import pandas as pd
  from bs4 import BeautifulSoup
  import gradio as gr

- # ---------- Optional LLM (OpenAI) ----------
  def openai_extract_json(html: str, url: str, fields: List[str], api_key: Optional[str]) -> Optional[List[Dict[str, Any]]]:
      if not api_key:
          return None
@@ -49,13 +63,15 @@ def openai_extract_json(html: str, url: str, fields: List[str], api_key: Optiona
          print("OpenAI extraction failed:", e)
          return None

- # ---------- Playwright page loader ----------
- async def fetch_dom(url: str, wait_ms: int = 1500) -> str:
      from playwright.async_api import async_playwright
      async with async_playwright() as p:
          browser = await p.chromium.launch(headless=True)
-         page = await browser.new_page()
-         await page.goto(url, wait_until="domcontentloaded")
          try:
              await page.wait_for_load_state("networkidle", timeout=8000)
          except Exception:
@@ -66,10 +82,23 @@ async def fetch_dom(url: str, wait_ms: int = 1500) -> str:
          await browser.close()
      return html

- # ---------- Heuristic extraction ----------
  def extract_images_and_items(html: str, base_url: str, card_selector: Optional[str] = None) -> Tuple[List[Dict[str, Any]], List[str]]:
      soup = BeautifulSoup(html, "html.parser")

      images = []
      for img in soup.find_all("img"):
          src = img.get("src") or img.get("data-src") or img.get("data-original")
@@ -78,13 +107,15 @@ def extract_images_and_items(html: str, base_url: str, card_selector: Optional[s
              abs_src = urljoin(base_url, src)
              images.append(abs_src)

      items = []
-     candidates = []
      if card_selector:
          candidates = soup.select(card_selector)
      else:
          candidates = soup.select(
-             "div.product, li.product, div.card, article, div.product-item, div.s-result-item, div._1AtVbE, div._4ddWXP"
          )
      if not candidates:
          candidates = [a.parent for a in soup.select("a img") if a.parent]
@@ -92,7 +123,7 @@ def extract_images_and_items(html: str, base_url: str, card_selector: Optional[s
      for c in candidates:
          try:
              title = None
-             for sel in ["h1", "h2", "h3", ".title", ".product-title", "._4rR01T", ".s1Q9rs"]:
                  n = c.select_one(sel)
                  if n and n.get_text(strip=True):
                      title = n.get_text(strip=True)
@@ -123,6 +154,7 @@ def extract_images_and_items(html: str, base_url: str, card_selector: Optional[s
          except Exception:
              continue

      seen = set()
      unique_images = []
      for u in images:
@@ -132,17 +164,21 @@ def extract_images_and_items(html: str, base_url: str, card_selector: Optional[s

      return items, unique_images

- # ---------- Image download & optional captioning ----------
  def download_images(image_urls: List[str], out_dir: str) -> List[str]:
      os.makedirs(out_dir, exist_ok=True)
      saved = []
      for u in image_urls:
          try:
              name = os.path.basename(urlparse(u).path) or f"img_{len(saved)+1}.jpg"
              if not os.path.splitext(name)[1]:
                  name += ".jpg"
              path = os.path.join(out_dir, name)
-             r = requests.get(u, timeout=20)
              if r.status_code == 200 and r.content:
                  with open(path, "wb") as f:
                      f.write(r.content)
@@ -175,7 +211,9 @@ def caption_images(paths: List[str]) -> Dict[str, str]:
          print("Captioning unavailable:", e)
          return {}

- # ---------- ZIP helper ----------
  def zip_paths(paths: List[str], zip_path: str) -> str:
      with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
          for p in paths:
@@ -183,32 +221,154 @@ def zip_paths(paths: List[str], zip_path: str) -> str:
              zf.write(p, arcname=os.path.basename(p))
      return zip_path

- # ---------- Search helper ----------
- def search_links(query: str, num_results: int = 5) -> List[str]:
-     search_url = "https://duckduckgo.com/html/"
-     params = {"q": query}
-     headers = {"User-Agent": "Mozilla/5.0"}
-     r = requests.get(search_url, params=params, headers=headers, timeout=15)
-     soup = BeautifulSoup(r.text, "html.parser")
-     links = []
-     for a in soup.select(".result__a"):
-         href = a.get("href")
-         if href:
-             links.append(href)
-             if len(links) >= num_results:
                  break
-     return links

- # ---------- Main scrape orchestrator ----------
- async def scrape_one(url: str, fields: List[str], use_llm: bool, api_key: Optional[str], card_selector: Optional[str]) -> Dict[str, Any]:
-     html = await fetch_dom(url)
-     items, images = extract_images_and_items(html, url, card_selector)

-     llm_rows = None
-     if use_llm:
-         llm_rows = openai_extract_json(html, url, fields, api_key)

-     return {"url": url, "html": html, "items": items, "images": images, "llm_rows": llm_rows or []}

  def to_dataframe(rows: List[Dict[str, Any]]) -> pd.DataFrame:
      if not rows:
@@ -226,100 +386,173 @@ def to_dataframe(rows: List[Dict[str, Any]]) -> pd.DataFrame:
      df = df[preferred + others]
      return df

- # ---------- Gradio wrapper ----------
- def run_scrape(urls_text: str,
                 fields_text: str,
                 card_selector: str,
                 use_llm: bool,
                 api_key: str,
                 download_imgs: bool,
                 do_caption: bool):
      start = time.time()
-
-     # Auto-detect: if input doesn't look like a URL, treat as keyword query
-     if not urls_text.strip().startswith("http"):
-         urls = search_links(urls_text.strip(), num_results=5)
      else:
-         urls = [u.strip() for u in urls_text.splitlines() if u.strip()]

      fields = [f.strip() for f in fields_text.split(',')] if fields_text.strip() else []

      out_dir = os.path.abspath("scrape_output")
      os.makedirs(out_dir, exist_ok=True)

-     results = []
      async def gather_all():
-         return await asyncio.gather(*[
-             scrape_one(u, fields, use_llm, api_key if use_llm else None, card_selector or None)
              for u in urls
-         ])
      try:
          scraped = asyncio.run(gather_all())
      except RuntimeError:
          scraped = asyncio.get_event_loop().run_until_complete(gather_all())

-     heuristic_rows, llm_rows, all_images = [], [], []
      for s in scraped:
-         heuristic_rows.extend(s["items"])
-         llm_rows.extend(s["llm_rows"])
-         all_images.extend(s["images"])

      rows = llm_rows if use_llm and llm_rows else heuristic_rows
      df = to_dataframe(rows)

      ts = int(time.time())
      json_path = os.path.join(out_dir, f"scrape_{ts}.json")
      csv_path = os.path.join(out_dir, f"scrape_{ts}.csv")
-     df.to_csv(csv_path, index=False)
-     with open(json_path, "w", encoding="utf-8") as f:
-         json.dump(rows, f, ensure_ascii=False, indent=2)

      gallery_paths, zip_path = [], None
      if download_imgs and all_images:
-         img_dir = os.path.join(out_dir, f"images_{ts}")
-         saved = download_images(all_images, img_dir)
-         gallery_paths = saved[:120]
-         captions_map: Dict[str, str] = {}
-         if do_caption and saved:
-             captions_map = caption_images(saved)
-         if not df.empty:
-             img_col = None
-             for c in df.columns:
-                 if c.lower() in ("image", "image_url", "img", "imageurl"):
-                     img_col = c
-                     break
-             if img_col:
-                 df["caption"] = df[img_col].map(lambda u: captions_map.get(os.path.join(img_dir, os.path.basename(urlparse(str(u)).path)), ""))
-                 df.to_csv(csv_path, index=False)
-                 with open(json_path, "w", encoding="utf-8") as f:
-                     json.dump(json.loads(df.to_json(orient="records")), f, ensure_ascii=False, indent=2)
-         zip_path = os.path.join(out_dir, f"images_{ts}.zip")
-         zip_paths(saved, zip_path)

      elapsed = round(time.time() - start, 2)
      gallery_data = [(p, os.path.basename(p)) for p in gallery_paths]
      status = f"Scraped {len(urls)} URL(s) • Rows: {len(df)} • Images found: {len(all_images)} • Time: {elapsed}s"
-     return df, gallery_data, json_path, csv_path, (zip_path if zip_path and os.path.isfile(zip_path) else None), status
-
- # ---------- Gradio UI ----------
- with gr.Blocks(title="AI Scraper — Text + Images", css=".gradio-container {max-width: 1200px !important}") as demo:
      gr.Markdown("""
- # 🕷️ AI-Powered Web Scraper (2025)
- - Enter a **URL** or just a **keyword query**
- - Render dynamic pages (Playwright)
- - Extract **text + images**
- - Optional **LLM semantic parsing** to JSON
- - Optional **image captioning** (BLIP)
  """)

      with gr.Row():
-         urls = gr.Textbox(label="Target URLs or Keywords", placeholder="https://example.com\nOR\nred nike shoes site:amazon.in")
          fields = gr.Textbox(label="Fields to extract (comma-separated)", placeholder="title, price, image, rating, url")
-         card_selector = gr.Textbox(label="Optional CSS selector for item cards (e.g., div.product, article, .card)")

      with gr.Row():
          use_llm = gr.Checkbox(label="Use OpenAI for semantic extraction", value=False)
          api_key = gr.Textbox(label="OpenAI API Key (if using LLM)", type="password")
          download_imgs = gr.Checkbox(label="Download images", value=True)
          do_caption = gr.Checkbox(label="Caption images (slow)", value=False)
@@ -327,7 +560,7 @@ with gr.Blocks(title="AI Scraper — Text + Images", css=".gradio-container {max
      with gr.Row():
          table = gr.Dataframe(label="Extracted Data (preview)", interactive=False)
-         gallery = gr.Gallery(label="Scraped Images (subset)", show_label=True, height=400, allow_preview=True)

      with gr.Row():
          json_file = gr.File(label="Download JSON")
@@ -335,11 +568,16 @@ with gr.Blocks(title="AI Scraper — Text + Images", css=".gradio-container {max
          zip_file = gr.File(label="Download Images ZIP")

      status = gr.Markdown("Ready.")

      run_btn.click(
          fn=run_scrape,
-         inputs=[urls, fields, card_selector, use_llm, api_key, download_imgs, do_caption],
-         outputs=[table, gallery, json_file, csv_file, zip_file, status]
      )

  if __name__ == "__main__":
 
  import json
  import time
  import zipfile
  from urllib.parse import urljoin, urlparse
+ from typing import List, Dict, Any, Optional, Tuple, Set

  import requests
  import pandas as pd
  from bs4 import BeautifulSoup
  import gradio as gr

+ # =========================
+ # Config
+ # =========================
+ MAX_CONCURRENCY = 4 # concurrent pages to scrape
+ PLAYWRIGHT_WAIT_MS = 1500 # wait a bit for JS
+ FETCH_RETRIES = 2 # playwright retries per URL
+ SEARCH_PAGES = 2 # DDG result pages per query
+ RESULTS_PER_QUERY = 10 # target results per query
+ USER_AGENT = (
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+     "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
+ )
+
+ # =========================
+ # Optional LLM (OpenAI)
+ # =========================
  def openai_extract_json(html: str, url: str, fields: List[str], api_key: Optional[str]) -> Optional[List[Dict[str, Any]]]:
      if not api_key:
          return None
 
          print("OpenAI extraction failed:", e)
          return None

+ # =========================
+ # Playwright page loader (with retries)
+ # =========================
+ async def _fetch_dom_once(url: str, wait_ms: int) -> str:
      from playwright.async_api import async_playwright
      async with async_playwright() as p:
          browser = await p.chromium.launch(headless=True)
+         page = await browser.new_page(user_agent=USER_AGENT)
+         await page.goto(url, wait_until="domcontentloaded", timeout=30000)
          try:
              await page.wait_for_load_state("networkidle", timeout=8000)
          except Exception:
 
          await browser.close()
      return html

+ async def fetch_dom(url: str, wait_ms: int = PLAYWRIGHT_WAIT_MS, retries: int = FETCH_RETRIES) -> str:
+     last_err = None
+     for attempt in range(1, retries + 2):
+         try:
+             return await _fetch_dom_once(url, wait_ms)
+         except Exception as e:
+             last_err = e
+             await asyncio.sleep(0.6 * attempt)
+     raise last_err
+
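A minimal sketch (not part of the commit) of how the retry-wrapped loader could be exercised on its own; it assumes Playwright and its Chromium build are installed, and the URL is only an example:

import asyncio

async def _demo():
    # fetch_dom retries transient Playwright failures with a linearly growing backoff
    html = await fetch_dom("https://example.com", wait_ms=1500, retries=2)
    print(len(html), "characters of rendered DOM")

asyncio.run(_demo())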
+ # =========================
+ # Heuristic extraction
+ # =========================
  def extract_images_and_items(html: str, base_url: str, card_selector: Optional[str] = None) -> Tuple[List[Dict[str, Any]], List[str]]:
      soup = BeautifulSoup(html, "html.parser")

+     # Collect all images on page
      images = []
      for img in soup.find_all("img"):
          src = img.get("src") or img.get("data-src") or img.get("data-original")
 
              abs_src = urljoin(base_url, src)
              images.append(abs_src)

+     # Find likely product/article cards
      items = []
      if card_selector:
          candidates = soup.select(card_selector)
      else:
          candidates = soup.select(
+             "div.product, li.product, div.card, article, div.product-item, "
+             "div.s-result-item, div._1AtVbE, div._4ddWXP, div.MuiCard-root, "
+             "section, li.grid-item"
          )
      if not candidates:
          candidates = [a.parent for a in soup.select("a img") if a.parent]
 
      for c in candidates:
          try:
              title = None
+             for sel in ["h1", "h2", "h3", ".title", ".product-title", "._4rR01T", ".s1Q9rs", "a[title]"]:
                  n = c.select_one(sel)
                  if n and n.get_text(strip=True):
                      title = n.get_text(strip=True)
 
          except Exception:
              continue

+     # De-duplicate images
      seen = set()
      unique_images = []
      for u in images:
 
      return items, unique_images

+ # =========================
+ # Image download & optional captioning
+ # =========================
  def download_images(image_urls: List[str], out_dir: str) -> List[str]:
      os.makedirs(out_dir, exist_ok=True)
      saved = []
+     s = requests.Session()
+     s.headers.update({"User-Agent": USER_AGENT})
      for u in image_urls:
          try:
              name = os.path.basename(urlparse(u).path) or f"img_{len(saved)+1}.jpg"
              if not os.path.splitext(name)[1]:
                  name += ".jpg"
              path = os.path.join(out_dir, name)
+             r = s.get(u, timeout=20)
              if r.status_code == 200 and r.content:
                  with open(path, "wb") as f:
                      f.write(r.content)
 
          print("Captioning unavailable:", e)
          return {}

+ # =========================
+ # ZIP helper
+ # =========================
  def zip_paths(paths: List[str], zip_path: str) -> str:
      with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
          for p in paths:
 
              zf.write(p, arcname=os.path.basename(p))
      return zip_path

+ # =========================
+ # Search helpers (Prompt → Queries → Links)
+ # =========================
+ ADS_PRESETS = [
+     # public/archival ad sources (safer than scraping walled platforms)
+     "site:adsoftheworld.com",
+     "site:theinspiration.com",
+     "site:ads-of-the-world.s3",  # mirrors sometimes
+     "site:behance.net ad campaign",
+     "site:dribbble.com case study ad",
+ ]
+
+ NEWS_SIGNAL = [
+     "site:news.ycombinator.com", "site:techcrunch.com", "site:theverge.com",
+     "site:adage.com", "site:campaignlive.com"
+ ]
+
+ def build_queries_from_prompt(prompt: str, include_ads_sources: bool) -> List[str]:
+     # very lightweight keyword clean
+     base = re.sub(r"[^a-zA-Z0-9\s:+\-_/\.]", " ", prompt).strip()
+     base = re.sub(r"\s+", " ", base)
+
+     core_variants = [
+         base,
+         f'{base} best examples',
+         f'{base} recent campaigns',
+         f'{base} case study',
+         f'{base} images',
+     ]
+
+     queries = []
+     for v in core_variants:
+         queries.append(v)
+         # tilt towards news relevance
+         for ns in NEWS_SIGNAL[:2]:
+             queries.append(f"{v} {ns}")
+
+     if include_ads_sources:
+         for v in core_variants:
+             for siteq in ADS_PRESETS:
+                 queries.append(f"{v} {siteq}")
+
+     # de-dup while keeping order
+     seen = set()
+     uniq = []
+     for q in queries:
+         if q not in seen:
+             seen.add(q)
+             uniq.append(q)
+     return uniq[:12]  # cap
+
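A quick illustration (not part of the commit) of what build_queries_from_prompt produces; the prompt string below is just an example:

# Each core variant is emitted as-is plus two news-biased forms (NEWS_SIGNAL[:2]);
# with include_ads_sources=True, site-restricted variants are appended as well.
# The final list is de-duplicated in order and capped at 12 queries.
for q in build_queries_from_prompt("gen z skincare ad campaign india", include_ads_sources=False):
    print(q)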
+ def ddg_search(query: str, pages: int = 1) -> List[Tuple[str, str]]:
+     """
+     Returns list of (title, url) from DuckDuckGo HTML results, across pages.
+     """
+     results = []
+     session = requests.Session()
+     session.headers.update({"User-Agent": USER_AGENT})
+
+     for page in range(pages):
+         params = {"q": query}
+         if page > 0:
+             params["s"] = str(page * 50)  # pagination hint
+         r = session.get("https://duckduckgo.com/html/", params=params, timeout=20)
+         soup = BeautifulSoup(r.text, "html.parser")
+         for res in soup.select(".result"):
+             a = res.select_one(".result__a")
+             if not a:
+                 continue
+             title = a.get_text(strip=True)
+             href = a.get("href")
+             if not href:
+                 continue
+             results.append((title, href))
+     return results
+
+ def pick_best_links(all_results: List[Tuple[str, str]], want: int = 10) -> List[str]:
+     """
+     Simple pragmatic ranking:
+     - de-duplicate by URL & domain
+     - prefer diverse domains
+     """
+     picked = []
+     seen_urls: Set[str] = set()
+     seen_domains: Set[str] = set()
+
+     for _, url in all_results:
+         u = url.strip()
+         if not u or u in seen_urls:
+             continue
+         dom = urlparse(u).netloc.lower()
+         if dom.startswith("www."):
+             dom = dom[4:]
+         # skip obvious DDG redirectors or trackers if any
+         if dom in {"duckduckgo.com"}:
+             continue
+         if dom in seen_domains and len(picked) < want // 2:
+             # allow later, but early phase enforce domain diversity
+             continue
+
+         seen_urls.add(u)
+         seen_domains.add(dom)
+         picked.append(u)
+         if len(picked) >= want:
              break
+     return picked
+
+ def search_links_from_prompt(prompt: str, include_ads_sources: bool, per_query: int, pages: int) -> List[str]:
+     queries = build_queries_from_prompt(prompt, include_ads_sources)
+     all_results: List[Tuple[str, str]] = []
+     for q in queries:
+         try:
+             res = ddg_search(q, pages=pages)
+             # take top-k per query
+             all_results.extend(res[:per_query])
+         except Exception as e:
+             print("Search failed for query:", q, e)
+             continue
+     # global pick
+     best = pick_best_links(all_results, want=max(5, per_query * 2))
+     return best
+
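A hedged end-to-end sketch (not part of the commit) of the prompt → queries → links path; it issues live DuckDuckGo requests, so results vary and fewer links than requested may come back:

links = search_links_from_prompt(
    "pink organic skincare ad campaign india",  # example prompt, not from the repo
    include_ads_sources=True,
    per_query=6,
    pages=2,
)
print(f"{len(links)} candidate URLs")
for u in links:
    print(" -", u)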
+ # =========================
+ # Main scrape orchestrator (async with semaphore)
+ # =========================
+ async def scrape_one(url: str, fields: List[str], use_llm: bool, api_key: Optional[str],
+                      card_selector: Optional[str], log: List[str], sem: asyncio.Semaphore) -> Dict[str, Any]:
+     async with sem:
+         try:
+             html = await fetch_dom(url)
+         except Exception as e:
+             log.append(f"[ERROR] Failed to load: {url} -> {e}")
+             return {"url": url, "html": "", "items": [], "images": [], "llm_rows": []}
+
+         items, images = [], []
+         try:
+             items, images = extract_images_and_items(html, url, card_selector)
+         except Exception as e:
+             log.append(f"[WARN] Parse issue on: {url} -> {e}")
+
+         llm_rows = []
+         if use_llm:
+             try:
+                 llm_rows = openai_extract_json(html, url, fields, api_key) or []
+             except Exception as e:
+                 log.append(f"[WARN] LLM extraction failed: {url} -> {e}")
+
+         return {"url": url, "html": html, "items": items, "images": images, "llm_rows": llm_rows}
 
  def to_dataframe(rows: List[Dict[str, Any]]) -> pd.DataFrame:
      if not rows:

      df = df[preferred + others]
      return df

+ # =========================
+ # Gradio wrapper
+ # =========================
+ def run_scrape(input_mode: str,
+                prompt_or_urls: str,
                 fields_text: str,
                 card_selector: str,
+                include_ads_sources: bool,
+                per_query_results: int,
+                search_pages: int,
                 use_llm: bool,
                 api_key: str,
                 download_imgs: bool,
                 do_caption: bool):
      start = time.time()
+     log: List[str] = []
+
+     # Resolve URLs
+     if input_mode == "Prompt":
+         if not prompt_or_urls.strip():
+             return pd.DataFrame(), [], None, None, None, "Enter a prompt.", "No prompt given."
+         log.append(f"[INFO] Building queries from prompt: {prompt_or_urls!r}")
+         urls = search_links_from_prompt(
+             prompt_or_urls.strip(),
+             include_ads_sources=include_ads_sources,
+             per_query=per_query_results,
+             pages=max(1, search_pages)
+         )
+         if not urls:
+             return pd.DataFrame(), [], None, None, None, "No links found.", "\n".join(log)
+         log.append(f"[INFO] Selected {len(urls)} links from search.")
      else:
+         urls = [u.strip() for u in prompt_or_urls.splitlines() if u.strip()]
+         if not urls:
+             return pd.DataFrame(), [], None, None, None, "Enter at least one URL.", "No URLs supplied."
+         log.append(f"[INFO] Using {len(urls)} direct URL(s).")

      fields = [f.strip() for f in fields_text.split(',')] if fields_text.strip() else []

      out_dir = os.path.abspath("scrape_output")
      os.makedirs(out_dir, exist_ok=True)

+     # Async scrape with semaphore
+     sem = asyncio.Semaphore(MAX_CONCURRENCY)
+
      async def gather_all():
+         tasks = [
+             scrape_one(u, fields, use_llm, api_key if use_llm else None, card_selector or None, log, sem)
              for u in urls
+         ]
+         return await asyncio.gather(*tasks)
+
      try:
          scraped = asyncio.run(gather_all())
      except RuntimeError:
          scraped = asyncio.get_event_loop().run_until_complete(gather_all())
+     except Exception as e:
+         log.append(f"[FATAL] Async run failed: {e}")
+         return pd.DataFrame(), [], None, None, None, "Run failed.", "\n".join(log)
+
+     heuristic_rows: List[Dict[str, Any]] = []
+     llm_rows: List[Dict[str, Any]] = []
+     all_images: List[str] = []

      for s in scraped:
+         if not isinstance(s, dict):
+             continue
+         heuristic_rows.extend(s.get("items", []))
+         llm_rows.extend(s.get("llm_rows", []))
+         all_images.extend(s.get("images", []))

+     # prefer LLM rows if available
      rows = llm_rows if use_llm and llm_rows else heuristic_rows
      df = to_dataframe(rows)

      ts = int(time.time())
      json_path = os.path.join(out_dir, f"scrape_{ts}.json")
      csv_path = os.path.join(out_dir, f"scrape_{ts}.csv")
+     try:
+         df.to_csv(csv_path, index=False)
+         with open(json_path, "w", encoding="utf-8") as f:
+             json.dump(rows, f, ensure_ascii=False, indent=2)
+     except Exception as e:
+         log.append(f"[WARN] Failed to save CSV/JSON: {e}")
+         json_path = None
+         csv_path = None

      gallery_paths, zip_path = [], None
      if download_imgs and all_images:
+         try:
+             img_dir = os.path.join(out_dir, f"images_{ts}")
+             saved = download_images(all_images, img_dir)
+             gallery_paths = saved[:120]
+             if do_caption and saved:
+                 try:
+                     captions_map = caption_images(saved)
+                     if not df.empty:
+                         img_col = None
+                         for c in df.columns:
+                             if c.lower() in ("image", "image_url", "img", "imageurl"):
+                                 img_col = c
+                                 break
+                         if img_col:
+                             def _map_caption(u):
+                                 if not u:
+                                     return ""
+                                 fname = os.path.basename(urlparse(str(u)).path)
+                                 return captions_map.get(os.path.join(img_dir, fname), "")
+                             df["caption"] = df[img_col].map(_map_caption)
+                             df.to_csv(csv_path, index=False)
+                             with open(json_path, "w", encoding="utf-8") as f:
+                                 json.dump(json.loads(df.to_json(orient="records")), f, ensure_ascii=False, indent=2)
+                 except Exception as e:
+                     log.append(f"[WARN] Captioning failed: {e}")
+
+             zip_path = os.path.join(out_dir, f"images_{ts}.zip")
+             try:
+                 zip_paths(saved, zip_path)
+             except Exception as e:
+                 log.append(f"[WARN] ZIP failed: {e}")
+                 zip_path = None
+         except Exception as e:
+             log.append(f"[WARN] Image pipeline failed: {e}")

      elapsed = round(time.time() - start, 2)
      gallery_data = [(p, os.path.basename(p)) for p in gallery_paths]
      status = f"Scraped {len(urls)} URL(s) • Rows: {len(df)} • Images found: {len(all_images)} • Time: {elapsed}s"
+     return df, gallery_data, (json_path if json_path and os.path.isfile(json_path) else None), \
+            (csv_path if csv_path and os.path.isfile(csv_path) else None), \
+            (zip_path if zip_path and os.path.isfile(zip_path) else None), \
+            status, "\n".join(log) if log else "OK"
+
+ # =========================
+ # Gradio UI
+ # =========================
+ with gr.Blocks(title="AI Scraper — Prompt → Best Links → Text+Images", css=".gradio-container {max-width: 1200px !important}") as demo:
      gr.Markdown("""
+ # 🕷️ AI-Powered Prompt Scraper (2025)
+ - Give a **prompt** (e.g., "Gen Z pink organic skincare ad campaign in India 2024")
+   → we search smartly, pick strong links (optionally ad archives), and scrape **text + images**
+ - Or switch to **Direct URLs** mode and paste URLs.
+ - Optional **LLM semantic parsing** to structured JSON.
  """)

      with gr.Row():
+         input_mode = gr.Radio(choices=["Prompt", "Direct URLs"], value="Prompt", label="Input Mode")
+
+     with gr.Row():
+         prompt_or_urls = gr.Textbox(
+             label="Prompt (or URLs if in Direct mode)",
+             placeholder="e.g., gen z pink skincare ad campaign india 2024"
+         )
+
+     with gr.Row():
          fields = gr.Textbox(label="Fields to extract (comma-separated)", placeholder="title, price, image, rating, url")
+         card_selector = gr.Textbox(label="Optional CSS selector for item cards", placeholder="div.product, article, .card")
+
+     with gr.Row():
+         include_ads_sources = gr.Checkbox(label="Bias search towards ad archives/sources", value=True)
+         per_query_results = gr.Slider(1, 15, value=6, step=1, label="Top results to keep per query")
+         search_pages = gr.Slider(1, 3, value=2, step=1, label="Search pages per query (DDG)")

      with gr.Row():
          use_llm = gr.Checkbox(label="Use OpenAI for semantic extraction", value=False)
          api_key = gr.Textbox(label="OpenAI API Key (if using LLM)", type="password")
+
+     with gr.Row():
          download_imgs = gr.Checkbox(label="Download images", value=True)
          do_caption = gr.Checkbox(label="Caption images (slow)", value=False)

      with gr.Row():
          table = gr.Dataframe(label="Extracted Data (preview)", interactive=False)
+         gallery = gr.Gallery(label="Scraped Images (subset)", show_label=True, height=420, allow_preview=True)

      with gr.Row():
          json_file = gr.File(label="Download JSON")

          zip_file = gr.File(label="Download Images ZIP")

      status = gr.Markdown("Ready.")
+     logs = gr.Textbox(label="Run Logs", lines=10)

      run_btn.click(
          fn=run_scrape,
+         inputs=[
+             input_mode, prompt_or_urls, fields, card_selector,
+             include_ads_sources, per_query_results, search_pages,
+             use_llm, api_key, download_imgs, do_caption
+         ],
+         outputs=[table, gallery, json_file, csv_file, zip_file, status, logs]
      )

  if __name__ == "__main__":
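For quick testing outside the UI, the updated wrapper can also be invoked directly; a sketch under the assumption that the module is imported as-is (it runs a real scrape and returns seven values matching the Gradio outputs):

df, gallery, json_path, csv_path, zip_path, status, logs = run_scrape(
    "Direct URLs",           # input_mode
    "https://example.com",   # prompt_or_urls (example URL)
    "title, image, url",     # fields_text
    "",                      # card_selector
    False, 6, 1,             # include_ads_sources, per_query_results, search_pages
    False, "",               # use_llm, api_key
    True, False,             # download_imgs, do_caption
)
print(status)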