apexherbert200 committed on
Commit
c577586
·
1 Parent(s): 366b9dd

Making changes to old webrify

Browse files
Files changed (2) hide show
  1. Dockerfile +1 -1
  2. test1.py +32 -79
Dockerfile CHANGED
@@ -53,4 +53,4 @@ RUN python -m playwright install chromium
53
  EXPOSE 7860
54
 
55
  # Run the FastAPI application
56
- CMD ["python", "-m", "uvicorn", "test1:app", "--host", "0.0.0.0", "--port", "7860"]
 
53
  EXPOSE 7860
54
 
55
  # Run the FastAPI application
56
+ CMD ["python", "-m", "uvicorn", "webrify2:app", "--host", "0.0.0.0", "--port", "7860"]
test1.py CHANGED
@@ -1,95 +1,48 @@
1
- # from fastapi import FastAPI
2
- # from playwright.async_api import async_playwright, TimeoutError
3
- # import re
4
-
5
- # app = FastAPI()
6
-
7
- # async def scrape_google(query: str):
8
- # url = f"https://www.google.com/search?q={query}"
9
- # async with async_playwright() as pw:
10
- # browser = await pw.chromium.launch(headless=True)
11
- # context = await browser.new_context()
12
- # page = await context.new_page()
13
-
14
- # await page.goto(url, wait_until="domcontentloaded", timeout=60000)
15
- # try:
16
- # await page.wait_for_selector("div#search", timeout=10000)
17
- # except TimeoutError:
18
- # pass
19
-
20
- # links = []
21
- # for h in await page.query_selector_all("h3"):
22
- # try:
23
- # a = await h.evaluate_handle("e => e.closest('a')")
24
- # href = await a.get_attribute("href")
25
- # title = await h.inner_text()
26
- # links.append({"title": title, "link": href})
27
- # except:
28
- # continue
29
-
30
- # results = []
31
- # for item in links[:5]:
32
- # await page.goto(item["link"], wait_until="domcontentloaded", timeout=30000)
33
- # html = await page.content()
34
- # emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", html)
35
- # phones = re.findall(r"\+?\d[\d\s\-/]{7,}\d", html)
36
- # results.append({
37
- # **item,
38
- # "emails": list(set(emails))[:2],
39
- # "phones": list(set(phones))[:2]
40
- # })
41
-
42
- # await browser.close()
43
- # return results
44
-
45
- # @app.get("/search")
46
- # async def search(query: str):
47
- # data = await scrape_google(query.replace(" ", "+"))
48
- # return {"query": query, "results": data}
49
-
50
-
51
-
52
-
53
-
54
-
55
-
56
  from fastapi import FastAPI
57
  from playwright.async_api import async_playwright, TimeoutError
 
58
 
59
  app = FastAPI()
60
 
61
- async def scrape_full_page(url: str):
 
62
  async with async_playwright() as pw:
63
  browser = await pw.chromium.launch(headless=True)
64
  context = await browser.new_context()
65
  page = await context.new_page()
66
-
67
  await page.goto(url, wait_until="domcontentloaded", timeout=60000)
68
  try:
69
- await page.wait_for_selector("body", timeout=10000)
70
  except TimeoutError:
71
  pass
72
 
73
- html = await page.content()
74
- # Extract headings & paragraphs as structured JSON
75
- items = await page.evaluate("""
76
- () => {
77
- const data = [];
78
- document.querySelectorAll('h1,h2,h3,h4,h5,h6,p').forEach(el => {
79
- data.push({ tag: el.tagName.toLowerCase(), text: el.innerText.trim() });
80
- });
81
- return data;
82
- }
83
- """)
 
 
 
 
 
 
 
 
 
 
 
84
  await browser.close()
85
- return {"html": html, "content": items}
86
 
87
- @app.get("/scrape")
88
- async def scrape(url: str):
89
- """
90
- Fetches the full page and returns:
91
- - raw HTML
92
- - an array of objects: { tag: 'h1'|'p'|..., text: '...' }
93
- """
94
- result = await scrape_full_page(url)
95
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from fastapi import FastAPI
2
  from playwright.async_api import async_playwright, TimeoutError
3
+ import re
4
 
5
  app = FastAPI()
6
 
7
async def scrape_google(query: str):
    """Scrape the first Google results for *query* and extract contact info.

    Navigates to the Google search page, collects up to 5 result links
    (title + href from each <h3> heading's enclosing anchor), then visits
    each link and regex-extracts up to 2 emails and 2 phone numbers from
    the raw HTML.

    Args:
        query: Pre-encoded query string (spaces already replaced with '+').

    Returns:
        list[dict]: one dict per visited result with keys
        ``title``, ``link``, ``emails``, ``phones``.
    """
    url = f"https://www.google.com/search?q={query}"
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            try:
                # Best-effort wait for the results container; Google may
                # serve a consent/captcha page instead, so a timeout is OK.
                await page.wait_for_selector("div#search", timeout=10000)
            except TimeoutError:
                pass

            links = []
            for h in await page.query_selector_all("h3"):
                try:
                    a = await h.evaluate_handle("e => e.closest('a')")
                    href = await a.get_attribute("href")
                    title = await h.inner_text()
                    # Skip headings with no usable anchor — page.goto(None)
                    # would otherwise crash the whole request below.
                    if href:
                        links.append({"title": title, "link": href})
                except Exception:
                    # Narrowed from a bare `except:`; a detached/odd node
                    # should not abort the scrape.
                    continue

            results = []
            for item in links[:5]:
                try:
                    await page.goto(
                        item["link"], wait_until="domcontentloaded", timeout=30000
                    )
                    html = await page.content()
                except Exception:
                    # Best-effort: one unreachable result page must not
                    # discard the results collected so far.
                    continue
                emails = re.findall(
                    r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", html
                )
                phones = re.findall(r"\+?\d[\d\s\-/]{7,}\d", html)
                results.append({
                    **item,
                    # Dedupe, then cap at 2 of each.
                    "emails": list(set(emails))[:2],
                    "phones": list(set(phones))[:2],
                })

            return results
        finally:
            # Close even when navigation raises, so the Chromium process
            # does not linger until the playwright context tears down.
            await browser.close()
44
 
45
+ @app.get("/search")
46
+ async def search(query: str):
47
+ data = await scrape_google(query.replace(" ", "+"))
48
+ return {"query": query, "results": data}