apexherbert200 commited on
Commit
d382ddb
·
1 Parent(s): 8b5a84c

Fix Python syntax errors and add requirements.txt

Browse files

- Fixed all syntax errors in scrape.py (imports, indentation, class definitions)
- Created requirements.txt with necessary dependencies
- Cleaned up Dockerfile to remove redundant installations
- Ready for Hugging Face deployment

Files changed (3) hide show
  1. Dockerfile +18 -1
  2. requirements.txt +4 -0
  3. scrape.py +87 -0
Dockerfile CHANGED
@@ -1,4 +1,21 @@
1
- FROM herbert400/web-scraper
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  EXPOSE 7860
 
 
4
  CMD ["python", "-m", "uvicorn", "scrape:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
+ FROM zenika/alpine-chrome:with-playwright
2
+
3
+ WORKDIR /app
4
+
5
+ # Install Python and pip
6
+ RUN apk add --no-cache python3 py3-pip
7
+
8
+ # Copy requirements first for better caching
9
+ COPY requirements.txt .
10
+ RUN pip install --no-cache-dir -r requirements.txt
11
+
12
+ # Copy your code
13
+ COPY . .
14
+
15
+ # FastAPI and Uvicorn are already installed via requirements.txt
16
+ # Playwright browsers are already available in the base image
17
 
18
  EXPOSE 7860
19
+
20
+ # Run the FastAPI application
21
  CMD ["python", "-m", "uvicorn", "scrape:app", "--host", "0.0.0.0", "--port", "7860"]
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fastapi==0.104.1
2
+ uvicorn[standard]==0.24.0
3
+ playwright==1.40.0
4
+ pydantic==2.5.0
scrape.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ from playwright.async_api import async_playwright
4
+ import asyncio
5
+ import base64
6
+ from typing import List, Optional
7
+
8
+ app = FastAPI()
9
+
10
+ class ScrapeRequest(BaseModel):
11
+ url: str
12
+ screenshot: bool = True
13
+ get_links: bool = True
14
+ get_content: bool = True
15
+
16
+ class LinkInfo(BaseModel):
17
+ text: str
18
+ href: str
19
+
20
+ class ScrapeResponse(BaseModel):
21
+ content: Optional[str] = None
22
+ screenshot: Optional[str] = None
23
+ links: Optional[List[LinkInfo]] = None
24
+
25
+ @app.post("/scrape")
26
+ async def scrape_page(request: ScrapeRequest):
27
+ async with async_playwright() as p:
28
+ browser = await p.chromium.launch()
29
+ page = await browser.new_page()
30
+
31
+ try:
32
+ await page.goto(request.url, wait_until="networkidle")
33
+ response = ScrapeResponse()
34
+
35
+ # Get page content
36
+ if request.get_content:
37
+ response.content = await page.content()
38
+
39
+ # Get screenshot
40
+ if request.screenshot:
41
+ screenshot_bytes = await page.screenshot()
42
+ response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')
43
+
44
+ # Get links
45
+ if request.get_links:
46
+ links = await page.evaluate("""
47
+ () => {
48
+ return Array.from(document.querySelectorAll('a')).map(a => {
49
+ return {
50
+ text: a.innerText.trim(),
51
+ href: a.href
52
+ }
53
+ });
54
+ }
55
+ """)
56
+ response.links = [LinkInfo(**link) for link in links]
57
+
58
+ await browser.close()
59
+ return response
60
+
61
+ except Exception as e:
62
+ await browser.close()
63
+ raise HTTPException(status_code=500, detail=str(e))
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
80
+
81
+
82
+
83
+
84
+
85
+
86
+
87
+