Update app.py
app.py (CHANGED)
@@ -6,25 +6,20 @@ import time
 import zipfile
 from io import BytesIO
 from urllib.parse import urljoin, urlparse
-from dataclasses import dataclass, asdict
 from typing import List, Dict, Any, Optional, Tuple
 
 import requests
 import pandas as pd
 from bs4 import BeautifulSoup
-
 import gradio as gr
 
 # ---------- Optional LLM (OpenAI) ----------
 def openai_extract_json(html: str, url: str, fields: List[str], api_key: Optional[str]) -> Optional[List[Dict[str, Any]]]:
-    """Use OpenAI to extract structured data from HTML. Returns a list of dicts or None on failure."""
     if not api_key:
         return None
     try:
-        # Use modern OpenAI SDK
         from openai import OpenAI
         client = OpenAI(api_key=api_key)
-        # Create a concise instruction for robust JSON output
         field_hint = ", ".join(fields) if fields else "title, price, image, rating, url"
         system = (
             "You are a robust web extractor. Given raw HTML and the page URL, "
@@ -35,7 +30,7 @@ def openai_extract_json(html: str, url: str, fields: List[str], api_key: Optiona
             f"URL: {url}\n\n"
             f"Required fields to attempt: [{field_hint}]\n\n"
             "Return JSON array only. Do not include any commentary.\n\n"
-            f"HTML:\n{html[:180000]}"
+            f"HTML:\n{html[:180000]}"
         )
         resp = client.chat.completions.create(
             model="gpt-4o-mini",
@@ -43,7 +38,6 @@ def openai_extract_json(html: str, url: str, fields: List[str], api_key: Optiona
             temperature=0,
         )
         content = resp.choices[0].message.content.strip()
-        # If the model wrapped JSON in code fences, strip them
         content = re.sub(r"^```(?:json)?|```$", "", content).strip()
         data = json.loads(content)
         if isinstance(data, dict):
@@ -62,7 +56,6 @@ async def fetch_dom(url: str, wait_ms: int = 1500) -> str:
         browser = await p.chromium.launch(headless=True)
         page = await browser.new_page()
         await page.goto(url, wait_until="domcontentloaded")
-        # try to settle network
         try:
             await page.wait_for_load_state("networkidle", timeout=8000)
         except Exception:
@@ -75,15 +68,8 @@ async def fetch_dom(url: str, wait_ms: int = 1500) -> str:
 
 # ---------- Heuristic extraction ----------
 def extract_images_and_items(html: str, base_url: str, card_selector: Optional[str] = None) -> Tuple[List[Dict[str, Any]], List[str]]:
-    """
-    Heuristically extract items and image URLs.
-    Returns: (items, image_urls)
-      items: list of dicts with title/price/url/image if found
-      image_urls: all image URLs found on page
-    """
     soup = BeautifulSoup(html, "html.parser")
 
-    # collect all images on page
     images = []
     for img in soup.find_all("img"):
         src = img.get("src") or img.get("data-src") or img.get("data-original")
@@ -92,47 +78,39 @@ def extract_images_and_items(html: str, base_url: str, card_selector: Optional[s
             abs_src = urljoin(base_url, src)
             images.append(abs_src)
 
-    # find likely product/article cards
     items = []
     candidates = []
     if card_selector:
         candidates = soup.select(card_selector)
     else:
-        # common product/article containers
         candidates = soup.select(
             "div.product, li.product, div.card, article, div.product-item, div.s-result-item, div._1AtVbE, div._4ddWXP"
         )
     if not candidates:
-        # fallback: take top-level links with images
         candidates = [a.parent for a in soup.select("a img") if a.parent]
 
     for c in candidates:
         try:
             title = None
-            # title heuristics
             for sel in ["h1", "h2", "h3", ".title", ".product-title", "._4rR01T", ".s1Q9rs"]:
                 n = c.select_one(sel)
                 if n and n.get_text(strip=True):
                     title = n.get_text(strip=True)
                     break
             if not title:
-                # maybe the image alt
                 img = c.find("img")
                 if img and img.get("alt"):
                     title = img.get("alt").strip()
 
-            # price heuristics
             price = None
             price_text = c.get_text(" ", strip=True)
             m = re.search(r"(?:₹|Rs\.?|INR|\$|€|£)\s?\d[\d,]*(?:\.\d+)?", price_text)
             if m:
                 price = m.group(0)
 
-            # url
             link = c.find("a")
             href = urljoin(base_url, link.get("href")) if link and link.get("href") else base_url
 
-            # image
             img = c.find("img")
             img_src = None
             if img:
@@ -145,7 +123,6 @@ def extract_images_and_items(html: str, base_url: str, card_selector: Optional[s
         except Exception:
             continue
 
-    # de-duplicate images
     seen = set()
     unique_images = []
     for u in images:
@@ -162,7 +139,6 @@ def download_images(image_urls: List[str], out_dir: str) -> List[str]:
     for u in image_urls:
         try:
             name = os.path.basename(urlparse(u).path) or f"img_{len(saved)+1}.jpg"
-            # ensure extension
             if not os.path.splitext(name)[1]:
                 name += ".jpg"
             path = os.path.join(out_dir, name)
@@ -175,9 +151,7 @@ def download_images(image_urls: List[str], out_dir: str) -> List[str]:
             print("Image download failed:", u, e)
     return saved
 
-
 def caption_images(paths: List[str]) -> Dict[str, str]:
-    """Caption images with BLIP (optional, slow). Returns {path: caption}."""
     try:
         from transformers import BlipProcessor, BlipForConditionalGeneration
         from PIL import Image
@@ -209,6 +183,22 @@ def zip_paths(paths: List[str], zip_path: str) -> str:
             zf.write(p, arcname=os.path.basename(p))
     return zip_path
 
+# ---------- Search helper ----------
+def search_links(query: str, num_results: int = 5) -> List[str]:
+    search_url = "https://duckduckgo.com/html/"
+    params = {"q": query}
+    headers = {"User-Agent": "Mozilla/5.0"}
+    r = requests.get(search_url, params=params, headers=headers, timeout=15)
+    soup = BeautifulSoup(r.text, "html.parser")
+    links = []
+    for a in soup.select(".result__a"):
+        href = a.get("href")
+        if href:
+            links.append(href)
+            if len(links) >= num_results:
+                break
+    return links
+
 # ---------- Main scrape orchestrator ----------
 async def scrape_one(url: str, fields: List[str], use_llm: bool, api_key: Optional[str], card_selector: Optional[str]) -> Dict[str, Any]:
     html = await fetch_dom(url)
@@ -218,19 +208,11 @@ async def scrape_one(url: str, fields: List[str], use_llm: bool, api_key: Option
     if use_llm:
         llm_rows = openai_extract_json(html, url, fields, api_key)
 
-    return {
-        "url": url,
-        "html": html,
-        "items": items,
-        "images": images,
-        "llm_rows": llm_rows or []
-    }
-
+    return {"url": url, "html": html, "items": items, "images": images, "llm_rows": llm_rows or []}
 
 def to_dataframe(rows: List[Dict[str, Any]]) -> pd.DataFrame:
     if not rows:
         return pd.DataFrame()
-    # normalize list of dicts with possibly different keys
     all_keys = set()
     for r in rows:
         all_keys.update(r.keys())
@@ -239,7 +221,6 @@ def to_dataframe(rows: List[Dict[str, Any]]) -> pd.DataFrame:
         d = {k: r.get(k) for k in all_keys}
         ordered.append(d)
     df = pd.DataFrame(ordered)
-    # helpful column order
     preferred = [k for k in ["title", "name", "price", "rating", "image", "url"] if k in df.columns]
     others = [c for c in df.columns if c not in preferred]
     df = df[preferred + others]
@@ -254,13 +235,18 @@ def run_scrape(urls_text: str,
                download_imgs: bool,
                do_caption: bool):
     start = time.time()
-
+
+    # Auto-detect: if input doesn’t look like a URL, treat as keyword query
+    if not urls_text.strip().startswith("http"):
+        urls = search_links(urls_text.strip(), num_results=5)
+    else:
+        urls = [u.strip() for u in urls_text.splitlines() if u.strip()]
+
     fields = [f.strip() for f in fields_text.split(',')] if fields_text.strip() else []
 
     out_dir = os.path.abspath("scrape_output")
     os.makedirs(out_dir, exist_ok=True)
 
-    # scrape all urls
     results = []
     async def gather_all():
         return await asyncio.gather(*[
@@ -270,24 +256,17 @@
     try:
         scraped = asyncio.run(gather_all())
     except RuntimeError:
-        # if event loop already running (e.g. notebooks), fallback
         scraped = asyncio.get_event_loop().run_until_complete(gather_all())
 
-
-    heuristic_rows: List[Dict[str, Any]] = []
-    llm_rows: List[Dict[str, Any]] = []
-    all_images: List[str] = []
-
+    heuristic_rows, llm_rows, all_images = [], [], []
     for s in scraped:
-        heuristic_rows.extend(s["items"])
-        llm_rows.extend(s["llm_rows"])
-        all_images.extend(s["images"])
+        heuristic_rows.extend(s["items"])
+        llm_rows.extend(s["llm_rows"])
+        all_images.extend(s["images"])
 
-    # Choose which rows to present: prefer LLM if available, else heuristics
     rows = llm_rows if use_llm and llm_rows else heuristic_rows
     df = to_dataframe(rows)
 
-    # save JSON/CSV
     ts = int(time.time())
     json_path = os.path.join(out_dir, f"scrape_{ts}.json")
     csv_path = os.path.join(out_dir, f"scrape_{ts}.csv")
@@ -295,23 +274,18 @@
     with open(json_path, "w", encoding="utf-8") as f:
         json.dump(rows, f, ensure_ascii=False, indent=2)
 
-
-    gallery_paths: List[str] = []
-    zip_path = None
-
+    gallery_paths, zip_path = [], None
     if download_imgs and all_images:
         img_dir = os.path.join(out_dir, f"images_{ts}")
         saved = download_images(all_images, img_dir)
-        gallery_paths = saved[:120]
-        # optional captioning
+        gallery_paths = saved[:120]
         captions_map: Dict[str, str] = {}
         if do_caption and saved:
             captions_map = caption_images(saved)
-        # if captions found and we have a df with image column, try to map
         if not df.empty:
             img_col = None
             for c in df.columns:
-                if c.lower() in ("image", "image_url", "img", "
+                if c.lower() in ("image", "image_url", "img", "imageurl"):
                     img_col = c
                     break
             if img_col:
@@ -319,26 +293,19 @@
             df.to_csv(csv_path, index=False)
             with open(json_path, "w", encoding="utf-8") as f:
                 json.dump(json.loads(df.to_json(orient="records")), f, ensure_ascii=False, indent=2)
-        # zip
         zip_path = os.path.join(out_dir, f"images_{ts}.zip")
         zip_paths(saved, zip_path)
 
     elapsed = round(time.time() - start, 2)
-
-    # Build gallery data: [(path, caption)]
-    gallery_data = []
-    for p in gallery_paths:
-        gallery_data.append((p, os.path.basename(p)))
-
+    gallery_data = [(p, os.path.basename(p)) for p in gallery_paths]
     status = f"Scraped {len(urls)} URL(s) • Rows: {len(df)} • Images found: {len(all_images)} • Time: {elapsed}s"
-
     return df, gallery_data, json_path, csv_path, (zip_path if zip_path and os.path.isfile(zip_path) else None), status
 
-
 # ---------- Gradio UI ----------
 with gr.Blocks(title="AI Scraper — Text + Images", css=".gradio-container {max-width: 1200px !important}") as demo:
     gr.Markdown("""
     # 🕷️ AI-Powered Web Scraper (2025)
+    - Enter a **URL** or just a **keyword query**
     - Render dynamic pages (Playwright)
     - Extract **text + images**
     - Optional **LLM semantic parsing** to JSON
@@ -346,7 +313,7 @@ with gr.Blocks(title="AI Scraper — Text + Images", css=".gradio-container {max
     """)
 
     with gr.Row():
-        urls = gr.Textbox(label="Target URLs
+        urls = gr.Textbox(label="Target URLs or Keywords", placeholder="https://example.com\nOR\nred nike shoes site:amazon.in")
         fields = gr.Textbox(label="Fields to extract (comma-separated)", placeholder="title, price, image, rating, url")
         card_selector = gr.Textbox(label="Optional CSS selector for item cards (e.g., div.product, article, .card)")
 
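To sanity-check the new keyword path without launching the UI, the added search_links helper can be called on its own. A minimal sketch, assuming this revision is saved as app.py on the import path (note that importing it also executes the module-level Gradio setup) and that DuckDuckGo's HTML endpoint is reachable:

# Hypothetical usage sketch, not part of the commit: resolve a keyword query to
# candidate URLs the same way run_scrape() now does for input that does not start with "http".
from app import search_links  # assumes this revision of app.py is importable

if __name__ == "__main__":
    for url in search_links("red nike shoes site:amazon.in", num_results=5):
        print(url)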