mattritchey committed
Commit b42ca14 · 1 Parent(s): 29b1a8f

Update app.py

Files changed (1)
  1. app.py +52 -17
app.py CHANGED
@@ -12,17 +12,47 @@ from bs4 import BeautifulSoup
 from joblib import Parallel, delayed
 import gradio as gr
 from io import StringIO
+from nltk import ngrams
 
-default_csv = "addresses 100 generated.csv"
 
+def normalize_string(string):
+    normalized_string = string.lower()
+    normalized_string = re.sub(r'[^\w\s]', '', normalized_string)
+
+    return normalized_string
+
+
+def jaccard_similarity(string1, string2,n = 2, normalize=True):
+    try:
+        if normalize:
+            string1,string2= normalize_string(string1),normalize_string(string2)
+
+        grams1 = set(ngrams(string1, n))
+        grams2 = set(ngrams(string2, n))
+        similarity = len(grams1.intersection(grams2)) / len(grams1.union(grams2))
+    except:
+        similarity=0
+
+    if string2=='did not extract address':
+        similarity=0
+
+    return similarity
+
+def jaccard_sim_split_word_number(string1,string2):
+    numbers1 = ' '.join(re.findall(r'\d+', string1))
+    words1 = ' '.join(re.findall(r'\b[A-Za-z]+\b', string1))
+
+    numbers2 = ' '.join(re.findall(r'\d+', string2))
+    words2 = ' '.join(re.findall(r'\b[A-Za-z]+\b', string2))
+
+    number_similarity=jaccard_similarity(numbers1,numbers2)
+    words_similarity=jaccard_similarity(words1,words2)
+    return (number_similarity+words_similarity)/2
 def extract_website_domain(url):
     parsed_url = urlparse(url)
     return parsed_url.netloc
 
 def google_address(address):
-
-    address_number = re.findall(r'\b\d+\b', address)[0]
-    address_zip =re.search(r'(\d{5})$', address).group()[:2]
 
     search_query = quote(address)
     url=f'https://www.google.com/search?q={search_query}'
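The three helpers added above replace the old exact-match filter (house number plus the first two ZIP digits) with a fuzzy score: strings are lowercased and stripped of punctuation, broken into character bigrams, and compared by Jaccard set overlap; jaccard_sim_split_word_number scores the digits and the words separately and averages the two, so a well-matched street name cannot mask a wrong house number. A quick sanity check of the new helpers (a sketch, assuming nltk is installed and the functions above are in scope; the addresses are made up):

# Sketch: exercising the committed helpers on made-up strings.
print(jaccard_similarity('123 Main St', '123 main street'))
# normalized bigrams share 10 of 14 -> ~0.71

print(jaccard_similarity('123 Main St', '**DID NOT EXTRACT ADDRESS**'))
# 0: the sentinel normalizes to 'did not extract address' and is forced to zero

print(jaccard_sim_split_word_number('123 Main St 55555', '123 Main Street, 55555'))
# digits identical (1.0), words share 6 of 10 bigrams (0.6) -> (1.0 + 0.6) / 2 = 0.8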
@@ -35,7 +65,6 @@ def google_address(address):
         if (l[:11]=='/url?q=http') and (len(t)>20 ):
             texts_links.append((t,l))
 
-
     text = soup.get_text()
 
     texts_links_des=[]
@@ -51,13 +80,16 @@ def google_address(address):
 
     df=pd.DataFrame(texts_links_des,columns=['Title','Link','Description'])
     df['Description']=df['Description'].bfill()
-    df['Address']=df['Title'].str.extract(r'(.+? \d{5})')
+    df['Address Output']=df['Title'].str.extract(r'(.+? \d{5})').fillna("**DID NOT EXTRACT ADDRESS**")
+
     df['Link']=[i[7:i.find('&sa=')] for i in df['Link']]
     df['Website'] = df['Link'].apply(extract_website_domain)
 
     df['Square Footage']=df['Description'].str.extract(r"((\d+) Square Feet|(\d+) sq. ft.|(\d+) sqft|(\d+) Sq. Ft.|(\d+) sq|(\d+(?:,\d+)?) Sq\. Ft\.|(\d+(?:,\d+)?) sq)")[0]
-    df['Square Footage']=df['Square Footage'].replace({',':''},regex=True).str.replace(r'\D', '')
-
+    try:
+        df['Square Footage']=df['Square Footage'].replace({',':''},regex=True).str.replace(r'\D', '')
+    except:
+        pass
     df['Beds']=df['Description'].replace({'-':' ','total':''},regex=True).str.extract(r"(\d+) bed")
 
 
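Two changes land in this hunk: the Title column is now parsed into 'Address Output' with a loud sentinel instead of NaN whenever no '... 12345' pattern is found, keeping the column string-typed for the scorer; and the Square Footage cleanup is wrapped in try/except, presumably because .str.replace raises once the extract yields an all-NaN (non-string) column. A small illustration of the sentinel behavior (hypothetical titles, not real search results):

# Sketch of the Title -> Address Output extraction with the new sentinel.
import pandas as pd

titles = pd.Series([
    '123 Main St, Anytown, OH 55555 | Zillow',  # has a 5-digit token -> extracted
    'Anytown Homes For Sale',                   # no 5-digit token -> sentinel
])
out = titles.str.extract(r'(.+? \d{5})')[0].fillna('**DID NOT EXTRACT ADDRESS**')
print(out.tolist())
# ['123 Main St, Anytown, OH 55555', '**DID NOT EXTRACT ADDRESS**']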
@@ -66,18 +98,21 @@ def google_address(address):
 
     df['Year Built']=df['Description'].str.extract(r"built in (\d{4})")
 
-    df_final=df[df['Address'].notnull()]
-    df_final=df_final[(df_final['Address'].str.contains(str(address_number))) & (df_final['Address'].str.contains(str(address_zip)))]
+    df['Match Percent']=[jaccard_sim_split_word_number(address,i)*100 for i in df['Address Output']]
+    df['Google Search Result']=[*range(1,df.shape[0]+1)]
+
+    # df_final=df[df['Address Output'].notnull()]
+    # df_final=df_final[(df_final['Address Output'].str.contains(str(address_number))) & (df_final['Address Output'].str.contains(str(address_zip)))]
 
-    df_final.insert(0,'Address Input',address)
-    return df_final
+    df.insert(0,'Address Input',address)
+    return df
 
 
 def process_csv_text(temp_file):
     if isinstance(temp_file, str):
-        df = pd.read_csv(StringIO(temp_file))
+        df = pd.read_csv(StringIO(temp_file))
     else:
-        df = pd.read_csv(temp_file.name)
+        df = pd.read_csv(temp_file.name)
 
     address_cols=list(df.columns[:4])
     df[address_cols[-1]]=df[address_cols[-1]].astype(str).str[:5].astype(int).astype(str)
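With the hard notnull/contains filter commented out, google_address now returns every scraped row, scored by 'Match Percent' and ordered by 'Google Search Result', leaving the keep-or-drop decision to the caller. A toy ranking example (static candidate strings rather than a live scrape; assumes the commit's helpers are in scope):

# Sketch: ranking candidates the way a caller of google_address might.
import pandas as pd

candidates = pd.DataFrame({'Address Output': [
    '123 Main St, Anytown, OH 55555',     # close match
    '500 Oak Ave, Otherville, OH 44444',  # different address
    '**DID NOT EXTRACT ADDRESS**',        # scored 0 by construction
]})
query = '123 Main Street, Anytown, OH 55555'
candidates['Match Percent'] = [jaccard_sim_split_word_number(query, c) * 100
                               for c in candidates['Address Output']]
print(candidates.sort_values('Match Percent', ascending=False))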
@@ -86,11 +121,11 @@ def process_csv_text(temp_file):
     df['Address All']=df[address_cols[0]]+', '+df[address_cols[1]]+', '+df[address_cols[2]]+' '+df[address_cols[3]]
     return df
 
-def catch_errors(addresses):
+def catch_errors(address):
     try:
-        return google_address(addresses)
+        return google_address(address)
     except:
-        return pd.DataFrame({'Address Input':[addresses]})
+        return pd.DataFrame({'Address Input':[address]})
 
 def process_multiple_address(addresses):
     results=Parallel(n_jobs=32, prefer="threads")(delayed(catch_errors)(i) for i in addresses)
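Renaming addresses to address in catch_errors matches what the function actually receives: a single address per joblib task. The bare except still means any failure (network, parsing, scoring) degrades to a one-row stub frame instead of aborting the whole Parallel batch. A sketch of that degradation path (made-up inputs; in the real app the first call performs a live Google search):

# Sketch: a failed lookup contributes a stub row instead of an exception.
import pandas as pd

frames = [catch_errors(a) for a in ['123 Main St, Anytown, OH 55555',
                                    'not a real address']]
combined = pd.concat(frames, ignore_index=True)
# Stub rows carry only 'Address Input'; scraped columns align as NaN.
print(combined.head())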