adpro committed
Commit 58c22b3 · verified · 1 Parent(s): 54811ce

Update app.py

Files changed (1)
app.py +57 -2
app.py CHANGED
@@ -8,12 +8,13 @@ from googlesearch import search
from urllib.parse import parse_qs

def run_lora(prompt,site,start,end):
-
+    jurl = make_url("inurl:" + site + "+" + prompt, start, end)
    j = "https://www.google.com/search?q=inurl:"+site +"+"+ prompt + "&tbs=cdr%3A1%2Ccd_min%3A"+start+"%2Ccd_max%3A" + end
    links=""
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)\
    AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36'}
-    page = requests.get(j, headers = headers)
+    page = requests.get(jurl, headers = headers)
+
    soup = BeautifulSoup(page.content, "html.parser")
    import re
    links = soup.findAll("a")
@@ -33,6 +34,60 @@ def extract_href(href):
        return None
    return query['q'][0]

+def make_url(query, start, end):
+    return f"https://www.google.com/search?q={query}&rlz=1C1CHBF_enUS1024US1025&biw=1564&bih=932&sxsrf=ALiCzsaGPneyPAo-kyllnxBBtXe-FGWorQ%3A1665448856808&source=lnt&tbs=sbd%3A1%2Ccdr%3A1%2Ccd_min%3A{start}%2Ccd_max%3A{end}&tbm=nws"
+
+# find all HTML elements matching the given CSS selector and return their text values
+def parse_html_text(html, css_selector):
+    soup = BeautifulSoup(html, 'html.parser')
+    html_elements = soup.select(css_selector)
+    text = [element.text for element in html_elements]
+    return text
+
+
+# find all article links
+def parse_html_urls(html, css_selector):
+    soup = BeautifulSoup(html, 'html.parser')
+    html_elements = soup.select(css_selector)
+    href_values = [element.get('href') for element in html_elements]
+    return href_values
+
+# send a GET request and execute the site's JavaScript with a Chromium driver
+def get_and_render_html(url):
+    driver = webdriver.Chrome()
+    driver.get(url)
+
+    # wait for all HTML to render
+    sleep(3)
+
+    page_source = driver.page_source
+
+    # end the Selenium session
+    driver.quit()
+    return page_source
+
+if __name__ == "__main__":
+
+    # make the query string
+    query = "coca+cola"
+
+    # set the date range
+    m_min, d_min, y_min = 11, 26, 2018
+    m_max, d_max, y_max = 12, 26, 2018
+
+    url = make_url(query, f"{m_min}/{d_min}/{y_min}", f"{m_max}/{d_max}/{y_max}")
+    pg_source = get_and_render_html(url)
+
+    # use CSS selectors to parse the HTML for the desired information
+    date_selector = "div.OSrXXb.ZE0LJd.YsWzw"
+    publish_dates = parse_html_text(pg_source, date_selector)
+
+    url_selector = "a.WlydOe"
+    article_urls = parse_html_urls(pg_source, url_selector)
+
+    print(publish_dates)
+    print(article_urls)
+
with gr.Blocks() as app:
    gr.HTML("""<html>
    <head>
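
A note on the new make_url helper: its start and end parameters land in Google's custom date-range filter (tbs=cdr:1,cd_min,cd_max), which expects M/D/YYYY date strings, and that is why the __main__ demo formats the numeric date parts into strings before the call. An illustrative invocation using the demo's values:

url = make_url("coca+cola", "11/26/2018", "12/26/2018")
# produces a Google News search URL whose tbs parameter ends in
# ...tbs=sbd%3A1%2Ccdr%3A1%2Ccd_min%3A11/26/2018%2Ccd_max%3A12/26/2018&tbm=nws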
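
Note that get_and_render_html uses webdriver and sleep, neither of which is imported anywhere in this diff; assuming the usual packages, the top of app.py would also need imports along the lines of:

from time import sleep          # used for the 3-second render wait
from selenium import webdriver  # provides webdriver.Chrome()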
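
For context, extract_href (named in the second hunk header but mostly outside this diff) evidently unwraps Google's /url?q=... redirect links with parse_qs, given the visible return query['q'][0]. A minimal sketch, with the unseen body reconstructed by assumption:

from urllib.parse import urlparse, parse_qs

def extract_href(href):
    # hypothetical reconstruction: pull the target URL out of a
    # Google redirect link such as "/url?q=https://example.com/&sa=..."
    query = parse_qs(urlparse(href).query)
    if 'q' not in query:
        return None
    return query['q'][0]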