apexherbert200 committed on
Commit
36092a0
·
1 Parent(s): be7cc52

Using google search

Browse files
Files changed (1) hide show
  1. test1.py +22 -16
test1.py CHANGED
@@ -1,38 +1,44 @@
1
  from fastapi import FastAPI
2
- from playwright.async_api import async_playwright
 
3
 
4
  app = FastAPI()
5
 
6
async def scrape_google(query: str):
    """Scrape Google search result headings for *query*.

    Returns a list of ``{"title", "link"}`` dicts, one per ``<h3>`` result
    heading that sits inside an anchor. Uses the module-level
    ``async_playwright`` import.
    """
    # Local import keeps this fix self-contained; encode the query so
    # spaces and special characters don't produce a malformed URL.
    from urllib.parse import quote_plus

    # NOTE(review): the original URL embedded a hard-coded `sxsrf` session
    # token, which is stale and session-specific — dropped here.
    url = f"https://www.google.com/search?q={quote_plus(query)}"

    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            # Navigate FIRST. The original waited for the consent button
            # before goto(), so it polled about:blank and always burned
            # the full 5 s timeout.
            await page.goto(url, wait_until="domcontentloaded")

            # Accept cookie/consent pop-ups; the button being absent is
            # the normal case, so a timeout here is not an error.
            try:
                btn = await page.wait_for_selector(
                    'button:has-text("I agree")', timeout=5000
                )
                await btn.click()
            except Exception:
                pass

            await page.wait_for_selector("h3")

            results = []
            for h in await page.query_selector_all("h3"):
                try:
                    # Result headings are wrapped in an <a>; closest('a')
                    # raises in-page if there is none (e.g. "People also
                    # ask" headings), which we treat as "skip".
                    link = await h.evaluate("(e) => e.closest('a').href")
                    title = await h.inner_text()
                    results.append({"title": title, "link": link})
                except Exception:
                    continue
            return results
        finally:
            # Always release the browser, even when scraping raised —
            # the original leaked it on any exception.
            await browser.close()
 
1
  from fastapi import FastAPI
2
+ from playwright.async_api import async_playwright, TimeoutError
3
+ import re
4
 
5
  app = FastAPI()
6
 
7
async def scrape_google(query: str):
    """Search Google for *query* and scrape contact info from top results.

    Collects the result headings, visits up to the first 5 result links,
    and regex-extracts up to two e-mail addresses and two phone numbers
    from each page. Returns a list of
    ``{"title", "link", "emails", "phones"}`` dicts.

    Uses the module-level ``async_playwright``, ``TimeoutError`` (the
    playwright one) and ``re`` imports.
    """
    # Local import keeps this fix self-contained; encode the query so
    # spaces and special characters don't produce a malformed URL.
    from urllib.parse import quote_plus, urljoin

    url = f"https://www.google.com/search?q={quote_plus(query)}"

    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            try:
                # Main results container; tolerate Google layout changes
                # by falling through and scraping whatever h3s exist.
                await page.wait_for_selector("div#search", timeout=10000)
            except TimeoutError:
                pass

            links = []
            for h in await page.query_selector_all("h3"):
                try:
                    anchor = await h.evaluate_handle("e => e.closest('a')")
                    href = await anchor.get_attribute("href")
                    title = await h.inner_text()
                except Exception:
                    # Detached node or similar — skip this heading only.
                    continue
                if not href:
                    # Some h3s (e.g. "People also ask") have no anchor;
                    # the original would later call page.goto(None).
                    continue
                # Google sometimes emits relative hrefs; make absolute.
                links.append({"title": title, "link": urljoin(url, href)})

            results = []
            for item in links[:5]:
                try:
                    await page.goto(
                        item["link"], wait_until="domcontentloaded", timeout=30000
                    )
                    html = await page.content()
                except Exception:
                    # One slow/broken link must not abort the whole
                    # scrape (the original crashed here on any timeout).
                    continue
                emails = re.findall(
                    r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", html
                )
                phones = re.findall(r"\+?\d[\d\s\-/]{7,}\d", html)
                results.append({
                    **item,
                    # sorted() makes the "first two" deterministic; the
                    # original's list(set(...))[:2] picked arbitrarily.
                    "emails": sorted(set(emails))[:2],
                    "phones": sorted(set(phones))[:2],
                })
            return results
        finally:
            # Always release the browser, even when scraping raised —
            # the original leaked it on any exception.
            await browser.close()