mattritchey committed
Commit ee63323 · verified · 1 Parent(s): 1c99527

Update app.py

Files changed (1): app.py (+36, -22)
app.py CHANGED
@@ -1,3 +1,5 @@
+
+
 import streamlit as st
 import pandas as pd
 import numpy as np
@@ -8,6 +10,7 @@ from bs4 import BeautifulSoup
 import time
 from joblib import Parallel, delayed
 from nltk import ngrams
+from googlesearch import search
 
 @st.cache_data
 def convert_df(df):
@@ -55,33 +58,44 @@ def extract_website_domain(url):
 def google_address(address):
     # address_number = re.findall(r'\b\d+\b', address)[0]
     # address_zip =re.search(r'(\d{5})$', address).group()[:2]
+    all_data=[i for i in search(address, ssl_verify=False, advanced=True,
+                                num_results=11)]
 
-    search_query = quote(address)
-    url=f'https://www.google.com/search?q={search_query}'
-    response = requests.get(url)
-    soup = BeautifulSoup(response.content, "html.parser")
+
+    # search_query = quote(address)
+    # url=f'https://www.google.com/search?q={search_query}'
+    # response = requests.get(url)
+    # soup = BeautifulSoup(response.content, "html.parser")
+
+    # texts_links = []
+    # for link in soup.find_all("a"):
+    #     t,l=link.get_text(), link.get("href")
+    #     if (l[:11]=='/url?q=http') and (len(t)>20 ):
+    #         texts_links.append((t,l))
+
+    # text = soup.get_text()
+
+    # texts_links_des=[]
+    # for i,t_l in enumerate(texts_links):
+    #     start=text.find(texts_links[i][0][:50])
+    #     try:
+    #         end=text.find(texts_links[i+1][0][:50])
+    #     except:
+    #         end=text.find('Related searches')
+
+    #     description=text[start:end]
+    #     texts_links_des.append((t_l[0],t_l[1],description))
 
-    texts_links = []
-    for link in soup.find_all("a"):
-        t,l=link.get_text(), link.get("href")
-        if (l[:11]=='/url?q=http') and (len(t)>20 ):
-            texts_links.append((t,l))
+    # df=pd.DataFrame(texts_links_des,columns=['Title','Link','Description'])
 
-    text = soup.get_text()
+    df=pd.DataFrame({'Title':[i.title for i in all_data],
+                     'Link':[i.url for i in all_data],
+                     'Description':[i.description for i in all_data],})
 
-    texts_links_des=[]
-    for i,t_l in enumerate(texts_links):
-        start=text.find(texts_links[i][0][:50])
-        try:
-            end=text.find(texts_links[i+1][0][:50])
-        except:
-            end=text.find('Related searches')
-
-        description=text[start:end]
-        texts_links_des.append((t_l[0],t_l[1],description))
+    df=df.query("Title==Title")
+    df['Link']=df['Link'].str.replace('/www.','www.')
 
-    df=pd.DataFrame(texts_links_des,columns=['Title','Link','Description'])
-    df['Description']=df['Description'].bfill()
+    # df['Description']=df['Description'].bfill()
     df['Address Output']=df['Title'].str.extract(r'(.+? \d{5})').fillna("**DID NOT EXTRACT ADDRESS**")
 
     df['Link']=[i[7:i.find('&sa=')] for i in df['Link']]
81
+ # try:
82
+ # end=text.find(texts_links[i+1][0][:50])
83
+ # except:
84
+ # end=text.find('Related searches')
85
+
86
+ # description=text[start:end]
87
+ # texts_links_des.append((t_l[0],t_l[1],description))
88
 
89
+ # df=pd.DataFrame(texts_links_des,columns=['Title','Link','Description'])
 
 
 
 
90
 
91
+ df=pd.DataFrame({'Title':[i.title for i in all_data],
92
+ 'Link':[i.url for i in all_data],
93
+ 'Description':[i.description for i in all_data],})
94
 
95
+ df=df.query("Title==Title")
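The df.query("Title==Title") line added here is the pandas NaN-filter idiom: NaN compares unequal to itself, so the expression is False only for rows whose Title is missing. A small sketch with made-up data:

import numpy as np
import pandas as pd

df = pd.DataFrame({'Title': ['123 Main St 55555', np.nan],
                   'Link': ['https://a.example', 'https://b.example']})

# NaN != NaN, so "Title==Title" keeps only rows with a real title.
kept = df.query("Title==Title")

# Equivalent, more explicit spelling:
assert kept.equals(df.dropna(subset=['Title']))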
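One caveat the diff leaves in place: df['Link']=[i[7:i.find('&sa=')] for i in df['Link']] was written for the old scraper's Google redirect links (/url?q=...&sa=...). The library now returns direct URLs, where find('&sa=') is -1, so the slice reduces to i[7:-1], trimming the scheme plus the URL's last character. A quick illustration (the example.com URLs are placeholders):

# Old scraper format: the slice strips the '/url?q=' prefix and '&sa=' suffix.
redirect = '/url?q=https://www.example.com/page&sa=U'
print(redirect[7:redirect.find('&sa=')])  # https://www.example.com/page

# Direct URL from the library, after the commit's str.replace('/www.', 'www.'):
link = 'https://www.example.com/page'.replace('/www.', 'www.')
print(link[7:link.find('&sa=')])          # www.example.com/pag -- find() is -1, last char lost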