apexherbert200 committed
Commit: 9e4b598 · 1 Parent(s): 0540d86

Focus scraper on body content, links, and webpage images


Features:
- Extract clean body text content (removes scripts/styles)
- Get all meaningful links with text and URLs
- Take full page screenshots (not just viewport)
- Extract page title and meta description
- Filter links to only include valid HTTP URLs
- Limit link text to 200 characters for better performance
- Changed parameter from get_content to get_body for clarity

Example: /scrape?url=https://example.com&screenshot=true&get_links=true&get_body=true
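A minimal client sketch for the endpoint above (assumes the API is served locally on port 8000, e.g. via uvicorn, and that the requests package is available; the host, port, and output file name are illustrative, not part of this commit):

import base64
import requests

# Hypothetical local deployment; point this at wherever the API actually runs.
resp = requests.get(
    "http://localhost:8000/scrape",
    params={
        "url": "https://example.com",
        "screenshot": "true",
        "get_links": "true",
        "get_body": "true",
    },
    timeout=120,  # networkidle waits plus full-page screenshots can be slow
)
data = resp.json()

print(data["page_title"], data["meta_description"])
print(len(data["links"] or []), "links found")

# The screenshot field holds base64-encoded PNG bytes.
if data.get("screenshot"):
    with open("page.png", "wb") as f:
        f.write(base64.b64decode(data["screenshot"]))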

Files changed (1)
  1. scrape.py +60 -20
scrape.py CHANGED
@@ -17,27 +17,35 @@ class LinkInfo(BaseModel):
     href: str

 class ScrapeResponse(BaseModel):
-    content: Optional[str] = None
+    body_content: Optional[str] = None
     screenshot: Optional[str] = None
     links: Optional[List[LinkInfo]] = None
+    page_title: Optional[str] = None
+    meta_description: Optional[str] = None

 @app.get("/")
 async def root():
     return {
-        "message": "Playwright Web Scraper API",
+        "message": "Playwright Web Scraper API - Body, Links & Images",
         "endpoints": {
-            "/scrape": "Scrape a webpage (GET request)",
+            "/scrape": "Scrape webpage body content, links, and take screenshot",
             "/docs": "API documentation"
         },
-        "example": "/scrape?url=https://example.com&screenshot=true&get_links=true&get_content=false"
+        "example": "/scrape?url=https://example.com&screenshot=true&get_links=true&get_body=true",
+        "features": [
+            "Extract body tag content (clean text)",
+            "Get all links with text and URLs",
+            "Take full page screenshot",
+            "Extract page title and meta description"
+        ]
     }

 @app.get("/scrape")
 async def scrape_page(
     url: str = Query(..., description="URL to scrape"),
-    screenshot: bool = Query(True, description="Take a screenshot"),
-    get_links: bool = Query(True, description="Extract links"),
-    get_content: bool = Query(False, description="Get page content (can be large)")
+    screenshot: bool = Query(True, description="Take a full page screenshot"),
+    get_links: bool = Query(True, description="Extract all links from the page"),
+    get_body: bool = Query(True, description="Extract body tag content")
 ):
     logger.info(f"Starting scrape for URL: {url}")
     try:
@@ -62,28 +70,60 @@ async def scrape_page(
         await page.goto(url, wait_until="networkidle")
         response = ScrapeResponse()

-        # Get page content
-        if get_content:
-            logger.info("Getting page content...")
-            response.content = await page.content()
+        # Always get page title and meta description
+        logger.info("Getting page metadata...")
+        response.page_title = await page.title()
+
+        meta_desc = await page.evaluate("""
+            () => {
+                const meta = document.querySelector('meta[name="description"]');
+                return meta ? meta.getAttribute('content') : null;
+            }
+        """)
+        response.meta_description = meta_desc
+
+        # Get body content (clean text)
+        if get_body:
+            logger.info("Extracting body content...")
+            body_content = await page.evaluate("""
+                () => {
+                    const body = document.querySelector('body');
+                    if (!body) return null;
+
+                    // Remove script and style elements
+                    const scripts = body.querySelectorAll('script, style, noscript');
+                    scripts.forEach(el => el.remove());
+
+                    // Get clean text content
+                    return body.innerText.trim();
+                }
+            """)
+            response.body_content = body_content

-        # Get screenshot
+        # Get screenshot (full page)
         if screenshot:
-            logger.info("Taking screenshot...")
-            screenshot_bytes = await page.screenshot()
+            logger.info("Taking full page screenshot...")
+            screenshot_bytes = await page.screenshot(full_page=True)
             response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')

-        # Get links
+        # Get links with better filtering
         if get_links:
             logger.info("Extracting links...")
             links = await page.evaluate("""
                 () => {
-                    return Array.from(document.querySelectorAll('a')).map(a => {
-                        return {
-                            text: a.innerText.trim(),
-                            href: a.href
+                    return Array.from(document.querySelectorAll('a[href]')).map(a => {
+                        const text = a.innerText.trim();
+                        const href = a.href;
+
+                        // Only include links with meaningful text and valid URLs
+                        if (text && href && href.startsWith('http')) {
+                            return {
+                                text: text.substring(0, 200), // Limit text length
+                                href: href
+                            }
                         }
-                    });
+                        return null;
+                    }).filter(link => link !== null);
                 }
             """)
             response.links = [LinkInfo(**link) for link in links]
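For illustration, the body-cleaning strategy introduced here can be exercised standalone with Playwright's sync API (a sketch assuming playwright is installed and browsers were provisioned with playwright install; the inline HTML is an invented fixture, not from this repo):

from playwright.sync_api import sync_playwright

HTML = """<html><head><style>body { color: red; }</style></head>
<body><p>Visible text</p><script>console.log('hidden');</script></body></html>"""

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.set_content(HTML)
    # Same cleaning as the /scrape endpoint: drop script/style/noscript,
    # then read the rendered innerText of the body.
    text = page.evaluate("""
        () => {
            const body = document.querySelector('body');
            if (!body) return null;
            body.querySelectorAll('script, style, noscript').forEach(el => el.remove());
            return body.innerText.trim();
        }
    """)
    print(text)  # prints "Visible text"; script and style content is excluded
    browser.close()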