apexherbert200 committed on
Commit
c577586
·
1 Parent(s): 366b9dd

Making changes to old webrify

Browse files
Files changed (2) hide show
  1. Dockerfile +1 -1
  2. test1.py +32 -79
Dockerfile CHANGED
@@ -53,4 +53,4 @@ RUN python -m playwright install chromium
53
  EXPOSE 7860
54
 
55
  # Run the FastAPI application
56
- CMD ["python", "-m", "uvicorn", "test1:app", "--host", "0.0.0.0", "--port", "7860"]
 
53
  EXPOSE 7860
54
 
55
  # Run the FastAPI application
56
+ CMD ["python", "-m", "uvicorn", "webrify2:app", "--host", "0.0.0.0", "--port", "7860"]
test1.py CHANGED
@@ -1,95 +1,48 @@
1
- # from fastapi import FastAPI
2
- # from playwright.async_api import async_playwright, TimeoutError
3
- # import re
4
-
5
- # app = FastAPI()
6
-
7
- # async def scrape_google(query: str):
8
- # url = f"https://www.google.com/search?q={query}"
9
- # async with async_playwright() as pw:
10
- # browser = await pw.chromium.launch(headless=True)
11
- # context = await browser.new_context()
12
- # page = await context.new_page()
13
-
14
- # await page.goto(url, wait_until="domcontentloaded", timeout=60000)
15
- # try:
16
- # await page.wait_for_selector("div#search", timeout=10000)
17
- # except TimeoutError:
18
- # pass
19
-
20
- # links = []
21
- # for h in await page.query_selector_all("h3"):
22
- # try:
23
- # a = await h.evaluate_handle("e => e.closest('a')")
24
- # href = await a.get_attribute("href")
25
- # title = await h.inner_text()
26
- # links.append({"title": title, "link": href})
27
- # except:
28
- # continue
29
-
30
- # results = []
31
- # for item in links[:5]:
32
- # await page.goto(item["link"], wait_until="domcontentloaded", timeout=30000)
33
- # html = await page.content()
34
- # emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", html)
35
- # phones = re.findall(r"\+?\d[\d\s\-/]{7,}\d", html)
36
- # results.append({
37
- # **item,
38
- # "emails": list(set(emails))[:2],
39
- # "phones": list(set(phones))[:2]
40
- # })
41
-
42
- # await browser.close()
43
- # return results
44
-
45
- # @app.get("/search")
46
- # async def search(query: str):
47
- # data = await scrape_google(query.replace(" ", "+"))
48
- # return {"query": query, "results": data}
49
-
50
-
51
-
52
-
53
-
54
-
55
-
56
  from fastapi import FastAPI
57
  from playwright.async_api import async_playwright, TimeoutError
 
58
 
59
  app = FastAPI()
60
 
61
- async def scrape_full_page(url: str):
 
62
  async with async_playwright() as pw:
63
  browser = await pw.chromium.launch(headless=True)
64
  context = await browser.new_context()
65
  page = await context.new_page()
66
-
67
  await page.goto(url, wait_until="domcontentloaded", timeout=60000)
68
  try:
69
- await page.wait_for_selector("body", timeout=10000)
70
  except TimeoutError:
71
  pass
72
 
73
- html = await page.content()
74
- # Extract headings & paragraphs as structured JSON
75
- items = await page.evaluate("""
76
- () => {
77
- const data = [];
78
- document.querySelectorAll('h1,h2,h3,h4,h5,h6,p').forEach(el => {
79
- data.push({ tag: el.tagName.toLowerCase(), text: el.innerText.trim() });
80
- });
81
- return data;
82
- }
83
- """)
 
 
 
 
 
 
 
 
 
 
 
84
  await browser.close()
85
- return {"html": html, "content": items}
86
 
87
- @app.get("/scrape")
88
- async def scrape(url: str):
89
- """
90
- Fetches the full page and returns:
91
- - raw HTML
92
- - an array of objects: { tag: 'h1'|'p'|..., text: '...' }
93
- """
94
- result = await scrape_full_page(url)
95
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from fastapi import FastAPI
2
  from playwright.async_api import async_playwright, TimeoutError
3
+ import re
4
 
5
  app = FastAPI()
6
 
7
async def scrape_google(query: str):
    """Scrape the first Google results for *query* and extract contact info.

    Navigates to the Google search page, collects up to 5 result links
    (title + href from each <h3> heading's enclosing anchor), then visits
    each link and regex-extracts up to 2 emails and 2 phone numbers from
    the raw HTML.

    Args:
        query: Pre-encoded query string (spaces already replaced with '+').

    Returns:
        list[dict]: one dict per visited result with keys
        ``title``, ``link``, ``emails``, ``phones``.
    """
    url = f"https://www.google.com/search?q={query}"
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            try:
                # Best-effort wait for the results container; Google may
                # serve a consent/captcha page instead, so a timeout is OK.
                await page.wait_for_selector("div#search", timeout=10000)
            except TimeoutError:
                pass

            links = []
            for h in await page.query_selector_all("h3"):
                try:
                    a = await h.evaluate_handle("e => e.closest('a')")
                    href = await a.get_attribute("href")
                    title = await h.inner_text()
                    # Skip headings with no usable anchor — page.goto(None)
                    # would otherwise crash the whole request below.
                    if href:
                        links.append({"title": title, "link": href})
                except Exception:
                    # Narrowed from a bare `except:`; a detached/odd node
                    # should not abort the scrape.
                    continue

            results = []
            for item in links[:5]:
                try:
                    await page.goto(
                        item["link"], wait_until="domcontentloaded", timeout=30000
                    )
                    html = await page.content()
                except Exception:
                    # Best-effort: one unreachable result page must not
                    # discard the results collected so far.
                    continue
                emails = re.findall(
                    r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", html
                )
                phones = re.findall(r"\+?\d[\d\s\-/]{7,}\d", html)
                results.append({
                    **item,
                    # Dedupe, then cap at 2 of each.
                    "emails": list(set(emails))[:2],
                    "phones": list(set(phones))[:2],
                })

            return results
        finally:
            # Close even when navigation raises, so the Chromium process
            # does not linger until the playwright context tears down.
            await browser.close()
44
 
45
+ @app.get("/search")
46
+ async def search(query: str):
47
+ data = await scrape_google(query.replace(" ", "+"))
48
+ return {"query": query, "results": data}