adpro committed
Commit 58c22b3 · verified · 1 Parent(s): 54811ce

Update app.py

Files changed (1)
app.py +57 -2
app.py CHANGED
@@ -8,12 +8,13 @@ from googlesearch import search
from urllib.parse import parse_qs

def run_lora(prompt,site,start,end):
-
+    jurl = make_url("inurl:" + site + "+" + prompt, start, end)
    j = "https://www.google.com/search?q=inurl:"+site +"+"+ prompt + "&tbs=cdr%3A1%2Ccd_min%3A"+start+"%2Ccd_max%3A" + end
    links=""
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)\
    AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36'}
-    page = requests.get(j, headers = headers)
+    page = requests.get(jurl, headers = headers)
+
    soup = BeautifulSoup(page.content, "html.parser")
    import re
    links = soup.findAll("a")
@@ -33,6 +34,60 @@ def extract_href(href):
        return None
    return query['q'][0]

+def make_url(query, start, end):
+    return f"https://www.google.com/search?q={query}&rlz=1C1CHBF_enUS1024US1025&biw=1564&bih=932&sxsrf=ALiCzsaGPneyPAo-kyllnxBBtXe-FGWorQ%3A1665448856808&source=lnt&tbs=sbd%3A1%2Ccdr%3A1%2Ccd_min%3A{start}%2Ccd_max%3A{end}&tbm=nws"
+
+# find all HTML elements matching the given CSS selector and return their text values
+def parse_html_text(html, css_selector):
+    soup = BeautifulSoup(html, 'html.parser')
+    html_elements = soup.select(css_selector)
+    text = [element.text for element in html_elements]
+    return text
+
+
+# find all article links
+def parse_html_urls(html, css_selector):
+    soup = BeautifulSoup(html, 'html.parser')
+    html_elements = soup.select(css_selector)
+    href_values = [element.get('href') for element in html_elements]
+    return href_values
+
+# send a GET request and execute the site's JavaScript with a Chromium driver
+def get_and_render_html(url):
+    driver = webdriver.Chrome()
+    driver.get(url)
+
+    # wait for all HTML to render
+    sleep(3)
+
+    page_source = driver.page_source
+
+    # end the Selenium session
+    driver.quit()
+    return page_source
+
+if __name__ == "__main__":
+
+    # make the query string
+    query = "coca+cola"
+
+    # set the date range
+    m_min, d_min, y_min = 11, 26, 2018
+    m_max, d_max, y_max = 12, 26, 2018
+
+    url = make_url(query, f"{m_min}/{d_min}/{y_min}", f"{m_max}/{d_max}/{y_max}")
+    pg_source = get_and_render_html(url)
+
+    # use CSS selectors to parse the HTML for the desired information
+    date_selector = "div.OSrXXb.ZE0LJd.YsWzw"
+    publish_dates = parse_html_text(pg_source, date_selector)
+
+    url_selector = "a.WlydOe"
+    article_urls = parse_html_urls(pg_source, url_selector)
+
+    print(publish_dates)
+    print(article_urls)
+
with gr.Blocks() as app:
    gr.HTML("""<html>
    <head>
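
A note on the new make_url helper: its start and end parameters land in Google's custom date-range filter (tbs=cdr:1,cd_min,cd_max), which expects M/D/YYYY date strings, and that is why the __main__ demo formats the numeric date parts into strings before the call. An illustrative invocation using the demo's values:

url = make_url("coca+cola", "11/26/2018", "12/26/2018")
# produces a Google News search URL whose tbs parameter ends in
# ...tbs=sbd%3A1%2Ccdr%3A1%2Ccd_min%3A11/26/2018%2Ccd_max%3A12/26/2018&tbm=nws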
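
Note that get_and_render_html uses webdriver and sleep, neither of which is imported anywhere in this diff; assuming the usual packages, the top of app.py would also need imports along the lines of:

from time import sleep          # used for the 3-second render wait
from selenium import webdriver  # provides webdriver.Chrome()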
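
For context, extract_href (named in the second hunk header but mostly outside this diff) evidently unwraps Google's /url?q=... redirect links with parse_qs, given the visible return query['q'][0]. A minimal sketch, with the unseen body reconstructed by assumption:

from urllib.parse import urlparse, parse_qs

def extract_href(href):
    # hypothetical reconstruction: pull the target URL out of a
    # Google redirect link such as "/url?q=https://example.com/&sa=..."
    query = parse_qs(urlparse(href).query)
    if 'q' not in query:
        return None
    return query['q'][0]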