Spaces:
Runtime error
Update app.py
app.py CHANGED
@@ -8,12 +8,13 @@ from googlesearch import search
 from urllib.parse import parse_qs
 
 def run_lora(prompt,site,start,end):
-
+    jurl = make_url(inurl:"+site +"+"+ prompt)
     j = "https://www.google.com/search?q=inurl:"+site +"+"+ prompt + "&tbs=cdr%3A1%2Ccd_min%3A"+start+"%2Ccd_max%3A" + end
     links=""
     headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)\
     AppleWebKit/537.36 (KHTML, like Gecko) Cafari/537.36'}
-    page = requests.get(
+    page = requests.get(jurl, headers = headers)
+
     soup = BeautifulSoup(page.content, "html.parser")
     import re
     links = soup.findAll("a")
@@ -33,6 +34,60 @@ def extract_href(href):
         return None
     return query['q'][0]
 
+def make_url(query,start,end):
+    return f"https://www.google.com/search?q={query}&rlz=1C1CHBF_enUS1024US1025&biw=1564&bih=932&sxsrf=ALiCzsaGPneyPAo-kyllnxBBtXe-FGWorQ%3A1665448856808&source=lnt&tbs=sbd%3A1%2Ccdr%3A1%2Ccd_min%3A{start}%2Ccd_max%3A{end}&tbm=nws"
+
+# find all html elements with given css selector and get their text values
+def parse_html_text(html, css_selector):
+    soup = BeautifulSoup(html, 'html.parser')
+    html_elements = soup.select(css_selector)
+    text = [element.text for element in html_elements]
+    return text
+
+
+# find all article links
+def parse_html_urls(html, css_selector):
+    soup = BeautifulSoup(html, 'html.parser')
+    html_elements = soup.select(css_selector)
+    href_values = [element.get('href') for element in html_elements]
+    return href_values
+
+# send GET request and execute the website's JavaScript with a Chromium driver
+def get_and_render_html(url):
+    driver = webdriver.Chrome()
+    driver.get(url)
+
+    # wait for all HTML to render
+    sleep(3)
+
+    page_source = driver.page_source
+
+    # end the selenium session
+    driver.quit()
+    return page_source
+
+if __name__ == "__main__":
+
+    # make query string
+    query = "coca+cola"
+
+    # set date range
+    m_min, d_min, y_min = 11, 26, 2018
+    m_max, d_max, y_max = 12, 26, 2018
+
+    url = make_url(query, m_min, d_min, y_min, m_max, d_max, y_max)
+    pg_source = get_and_render_html(url)
+
+    # use css selectors to parse the html for the desired information
+    date_selector = "div.OSrXXb.ZE0LJd.YsWzw"
+    publish_dates = parse_html_text(pg_source, date_selector)
+
+    url_selector = "a.WlydOe"
+    article_urls = parse_html_urls(pg_source, url_selector)
+
+    print(publish_dates)
+    print(article_urls)
+
 with gr.Blocks() as app:
     gr.HTML("""<html>
     <head>
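The added helpers call webdriver.Chrome(), sleep(), requests.get(), and BeautifulSoup(), but neither hunk adds the matching imports, and make_url is defined with three parameters while the __main__ block calls it with seven. Below is a minimal sketch of what a version consistent with that call could look like, assuming requests and BeautifulSoup are the intended libraries; the seven-parameter make_url and everything else in the sketch are assumptions, not the committed code.

# Sketch only: nothing here is taken from the commit above. The new helpers
# also appear to need "from selenium import webdriver" and "from time import
# sleep" for get_and_render_html, which the hunks shown do not add.
import requests
from bs4 import BeautifulSoup


def make_url(query, m_min, d_min, y_min, m_max, d_max, y_max):
    # Hypothetical variant matching the seven-argument call in __main__:
    # restrict a Google News search to the cd_min..cd_max date range.
    start = f"{m_min}%2F{d_min}%2F{y_min}"
    end = f"{m_max}%2F{d_max}%2F{y_max}"
    return (
        "https://www.google.com/search?q=" + query
        + "&tbs=sbd%3A1%2Ccdr%3A1%2Ccd_min%3A" + start
        + "%2Ccd_max%3A" + end
        + "&tbm=nws"
    )


if __name__ == "__main__":
    # Same query and date range as the committed __main__ block.
    url = make_url("coca+cola", 11, 26, 2018, 12, 26, 2018)
    page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(page.content, "html.parser")
    print(url)
    print(len(soup.find_all("a")), "links in the static HTML")

Whether Google's news results render without JavaScript varies, which is presumably why the commit also adds the Selenium-based get_and_render_html rather than relying on requests alone.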