mattritchey committed on
Commit
f0b8b67
·
1 Parent(s): 8eaf256

Upload 3 files

Browse files
Files changed (3) hide show
  1. addresses 100 generated.csv +80 -0
  2. app.py +110 -0
  3. requirements.txt +5 -0
addresses 100 generated.csv ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Address1,City,State,Zipcode
2
+ 123 Main St,Chicago,IL,60001
3
+ 245 Oak Ave,New York,NY,10001
4
+ 356 Pine Rd,Los Angeles,CA,90001
5
+ 467 Elm St,Houston,TX,70001
6
+ 578 Maple Ave,Philadelphia,PA,19101
7
+ 689 Cherry St,Phoenix,AZ,85001
8
+ 178 Oak Dr,San Diego,CA,92101
9
+ 286 Pine Crt,San Jose,CA,95101
10
+ 395 Elm Ln,Austin,TX,78751
11
+ 504 Maple Way,Jacksonville,FL,32204
12
+ 613 Cherry Pl,Fort Worth,TX,76104
13
+ 722 Oak Blvd,Dallas,TX,75201
14
+ 831 Pine St,San Antonio,TX,78201
15
+ 940 Elm Rd,Indianapolis,IN,46201
16
+ 1051 Maple Cir,San Francisco,CA,94101
17
+ 1162 Cherry Dr,Columbus,OH,43201
18
+ 1273 Oak Way,Charlotte,NC,28201
19
+ 1384 Pine Ct,Detroit,MI,48201
20
+ 1495 Elm Pl,El Paso,TX,79901
21
+ 1606 Maple St,Memphis,TN,38101
22
+ 1717 Cherry Ave,Boston,MA,02110
23
+ 1828 Oak Pl,Nashville,TN,37201
24
+ 1939 Pine Dr,Baltimore,MD,21201
25
+ 2050 Elm St,Louisville,KY,40201
26
+ 2161 Maple Pl,Portland,OR,97201
27
+ 2272 Cherry Way,Oklahoma City,OK,73101
28
+ 2383 Oak Rd,Milwaukee,WI,53201
29
+ 2494 Pine Cir,Albuquerque,NM,87101
30
+ 2605 Elm Dr,Tucson,AZ,85704
31
+ 2716 Maple Ave,Fresno,CA,93701
32
+ 2827 Cherry Rd,Sacramento,CA,95814
33
+ 2938 Oak St,Kansas City,MO,64105
34
+ 3049 Pine Ln,Mesa,AZ,85201
35
+ 3160 Elm Way,Virginia Beach,VA,23450
36
+ 3271 Maple St,Atlanta,GA,30301
37
+ 3382 Cherry Cir,Colorado Springs,CO,80903
38
+ 3493 Oak Pl,Raleigh,NC,27601
39
+ 3604 Pine Dr,Omaha,NE,68101
40
+ 3715 Elm Way,Miami,FL,33101
41
+ 3826 Maple Ln,Tulsa,OK,74101
42
+ 3937 Cherry Ct,Minneapolis,MN,55401
43
+ 4048 Oak Rd,Cleveland,OH,44101
44
+ 4159 Pine St,Wichita,KS,67201
45
+ 4270 Elm St,Arlington,TX,76001
46
+ 4381 Maple Pl,New Orleans,LA,70101
47
+ 4492 Cherry Ct,Bakersfield,CA,93301
48
+ 4603 Oak Ave,Honolulu,HI,96801
49
+ 4714 Pine Dr,Tampa,FL,33602
50
+ 4825 Elm Way,Anaheim,CA,92801
51
+ 4936 Maple St,Aurora,CO,80011
52
+ 5047 Cherry Rd,Santa Ana,CA,92703
53
+ 5158 Oak Dr,St. Louis,MO,63108
54
+ 5269 Pine St,Riverside,CA,92501
55
+ 5380 Elm Way,Corpus Christi,TX,78401
56
+ 5491 Maple Pl,Lexington,KY,40516
57
+ 6602 Cherry Ct,Lincoln,NE,68521
58
+ 6713 Oak Rd,Stockton,CA,95201
59
+ 6824 Pine Cir,Henderson,NV,89011
60
+ 6935 Elm Dr,Cincinnati,OH,45202
61
+ 7046 Maple St,Pittsburgh,PA,15203
62
+ 7157 Cherry Ave,Saint Paul,MN,55101
63
+ 7268 Oak Pl,Toledo,OH,43604
64
+ 7379 Pine Dr,Newark,NJ,07101
65
+ 7490 Elm Way,Greensboro,NC,27401
66
+ 7501 Maple Ln,Plano,TX,75023
67
+ 7612 Cherry Ct,Richmond,VA,23219
68
+ 7723 Oak Rd,Buffalo,NY,14202
69
+ 7834 Pine St,Fort Wayne,IN,46802
70
+ 7945 Elm St,Jersey City,NJ,07301
71
+ 8056 Maple Pl,Chula Vista,CA,91911
72
+ 8167 Cherry Way,Norfolk,VA,23501
73
+ 8278 Oak Rd,Orlando,FL,32801
74
+ 8389 Pine Cir,Akron,OH,44301
75
+ 8500 Elm Dr,Huntington Beach,CA,92648
76
+ 8611 Maple St,Durham,NC,27701
77
+ 8722 Cherry Rd,Rochester,NY,14601
78
+ 8833 Oak St,Garland,TX,75041
79
+ 8944 Pine Ln,Eugene,OR,97401
80
+ 8055 Elm Way,Chandler,AZ,85225
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Tue Oct 24 12:51:42 2023
4
+
5
+ @author: mritchey
6
+ """
7
+ import pandas as pd
8
+ import requests
9
+ from urllib.parse import urlparse, quote
10
+ import re
11
+ from bs4 import BeautifulSoup
12
+ from joblib import Parallel, delayed
13
+ import gradio as gr
14
+ from io import StringIO
15
+
16
+ default_csv = "addresses 100 generated.csv"  # Name of the bundled sample address CSV; not referenced elsewhere in the visible code.
17
+
18
def extract_website_domain(url):
    """Return the network-location (host) component of *url*.

    The URL is split with ``urllib.parse.urlparse`` and its ``netloc``
    field is handed back unchanged, so a URL without a scheme yields an
    empty string.
    """
    return urlparse(url).netloc
21
+
22
def google_address(address):
    """Search Google for *address* and extract property details from the results.

    The result page is fetched with ``requests`` and parsed with
    BeautifulSoup. Every result link (an ``<a>`` whose href starts with
    ``/url?q=http`` and whose text is longer than 20 characters) becomes one
    row. The text between one result title and the next is used as that
    row's description, from which square footage, beds, baths and year built
    are pulled with regular expressions.

    Parameters
    ----------
    address : str
        Full address. It must contain a house number and end with a
        five-digit ZIP code, e.g. ``"123 Main St, Chicago, IL 60001"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Address Input, Title, Link, Description, Address, Website,
        Square Footage, Beds, Baths, Year Built``. Only rows whose extracted
        ``Address`` contains both the house number and the first two digits
        of the ZIP code are kept.

    Raises
    ------
    ValueError
        If *address* has no house number or does not end with five digits.
    """
    # The house number and the ZIP prefix are used at the end to keep only
    # results that appear to describe the same property.
    number_match = re.search(r'\b\d+\b', address)
    zip_match = re.search(r'(\d{5})$', address)
    if number_match is None or zip_match is None:
        raise ValueError(
            f"Address must contain a house number and end with a 5-digit ZIP code: {address!r}"
        )
    address_number = number_match.group()
    address_zip = zip_match.group()[:2]

    search_query = quote(address)
    url = f'https://www.google.com/search?q={search_query}'
    # A timeout keeps one stalled request from blocking its worker forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    texts_links = []
    for link in soup.find_all("a"):
        title = link.get_text()
        href = link.get("href")
        # Anchors without an href return None; skip them instead of slicing None.
        if href and href.startswith('/url?q=http') and len(title) > 20:
            texts_links.append((title, href))

    text = soup.get_text()

    texts_links_des = []
    for i, (title, href) in enumerate(texts_links):
        start = text.find(title[:50])
        if i + 1 < len(texts_links):
            # A description runs up to the start of the next result title.
            end = text.find(texts_links[i + 1][0][:50])
        else:
            # The last description runs up to the "Related searches" footer.
            end = text.find('Related searches')
        if end == -1:
            # Marker missing: take the rest of the page rather than slicing
            # with -1, which would silently drop the final character.
            end = len(text)
        texts_links_des.append((title, href, text[start:end]))

    df = pd.DataFrame(texts_links_des, columns=['Title', 'Link', 'Description'])
    # NOTE(review): descriptions are always strings here, so this only has an
    # effect if missing values are introduced upstream.
    df['Description'] = df['Description'].bfill()
    df['Address'] = df['Title'].str.extract(r'(.+? \d{5})', expand=False)
    # Drop the '/url?q=' prefix (7 characters) and Google's '&sa=' tracking
    # suffix. split() leaves the link intact when the suffix is absent.
    df['Link'] = [href[7:].split('&sa=', 1)[0] for href in df['Link']]
    df['Website'] = df['Link'].apply(extract_website_domain)

    df['Square Footage'] = df['Description'].str.extract(r"((\d+) Square Feet|(\d+) sq. ft.|(\d+) sqft|(\d+) Sq. Ft.|(\d+) sq|(\d+(?:,\d+)?) Sq\. Ft\.|(\d+(?:,\d+)?) sq)")[0]
    # regex=True is explicit because the default of Series.str.replace
    # differs between pandas versions.
    df['Square Footage'] = df['Square Footage'].replace({',': ''}, regex=True).str.replace(r'\D', '', regex=True)

    df['Beds'] = df['Description'].replace({'-': ' ', 'total': ''}, regex=True).str.extract(r"(\d+) bed")

    df['Baths'] = df['Description'].replace({'-': ' ', 'total': ''}, regex=True).str.extract(r"((\d+) bath|(\d+(?:\.\d+)?) bath)")[0]
    df['Baths'] = df['Baths'].str.extract(r'([\d.]+)', expand=False).astype(float)

    df['Year Built'] = df['Description'].str.extract(r"built in (\d{4})")

    df_final = df[df['Address'].notnull()]
    same_number = df_final['Address'].str.contains(str(address_number))
    same_zip = df_final['Address'].str.contains(str(address_zip))
    # Copy so the insert below modifies an independent frame, not a slice of df.
    df_final = df_final[same_number & same_zip].copy()

    df_final.insert(0, 'Address Input', address)
    return df_final
74
+
75
+
76
def process_csv_text(temp_file):
    """Load an address CSV and build a single-line ``Address All`` column.

    The first four columns are taken, in order, as street address, city,
    state and ZIP code, whatever their header names are.

    Parameters
    ----------
    temp_file : str or object with a ``name`` attribute
        Either the CSV content itself as a string, or an uploaded-file
        object whose ``name`` is the path of the CSV on disk.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV with the ZIP column normalised to a five-character,
        zero-padded string and an added ``Address All`` column formatted as
        ``"<street>, <city>, <state> <zip>"``. A row with a missing address
        part gets a missing ``Address All`` value.

    Raises
    ------
    ValueError
        If the CSV has fewer than four columns.
    """
    if isinstance(temp_file, str):
        df = pd.read_csv(StringIO(temp_file))
    else:
        df = pd.read_csv(temp_file.name)

    address_cols = list(df.columns[:4])
    if len(address_cols) < 4:
        raise ValueError(
            "CSV must have at least four columns: street address, city, state, ZIP code"
        )
    street_col, city_col, state_col, zip_col = address_cols

    # Take the first run of digits before truncating. This copes with ZIPs
    # parsed as floats ("2110.0"), ZIP+4 values ("60001-1234") and stray
    # whitespace, all of which break or corrupt a plain str[:5] -> int cast.
    zip_digits = df[zip_col].astype(str).str.extract(r'(\d+)', expand=False)
    # Restore leading zeros lost when the ZIP was stored as a number.
    df[zip_col] = zip_digits.str[:5].str.zfill(5)

    df['Address All'] = (
        df[street_col] + ', ' + df[city_col] + ', ' + df[state_col] + ' ' + df[zip_col]
    )
    return df
88
+
89
def process_multiple_address(addresses):
    """Run ``google_address`` over every entry of *addresses* in parallel.

    The calls are dispatched through joblib with ``n_jobs=-1`` and a
    preference for the threading backend. The value returned is whatever
    the ``Parallel`` call yields for the submitted tasks.
    """
    lookups = (delayed(google_address)(single_address) for single_address in addresses)
    runner = Parallel(n_jobs=-1, prefer="threads")
    return runner(lookups)
92
+
93
def feed_process_multiple(temp_file):
    """Scrape every address in an uploaded CSV and return one combined table.

    Parameters
    ----------
    temp_file : str or uploaded-file object
        Passed straight to ``process_csv_text``.

    Returns
    -------
    pandas.DataFrame
        The per-address result frames concatenated together. When the CSV
        yields no usable addresses, an empty frame with the expected result
        columns is returned instead.
    """
    df = process_csv_text(temp_file)
    # A row with any missing address part produces a missing 'Address All'
    # value; skip it rather than passing a non-string to the scraper.
    addresses = df['Address All'].dropna().to_list()
    results = process_multiple_address(addresses)
    if not results:
        # pd.concat raises ValueError on an empty list, so return an empty
        # table with the same columns the scraper produces.
        return pd.DataFrame(columns=[
            'Address Input', 'Title', 'Link', 'Description', 'Address',
            'Website', 'Square Footage', 'Beds', 'Baths', 'Year Built',
        ])
    return pd.concat(results)
99
+
100
+
101
# Gradio UI: one CSV upload button feeding one results table.
with gr.Blocks() as demo:
    upload_button = gr.UploadButton(
        label="Upload Addresses",
        file_types=['.csv'],
        live=True,
        file_count="single",
    )
    # The ten headers mirror the columns of the frame returned by
    # feed_process_multiple.
    table = gr.Dataframe(
        headers=[
            'Address Input',
            'Title',
            'Link',
            'Description',
            'Address',
            'Website',
            'Square Footage',
            'Beds',
            'Baths',
            'Year Built',
        ],
        type="pandas",
        col_count=10,
    )
    # On upload, pass the uploaded file to feed_process_multiple and show
    # the returned frame in the table.
    upload_button.upload(
        fn=feed_process_multiple,
        inputs=upload_button,
        outputs=table,
        api_name="Address Scrap",
    )


demo.launch()
110
+
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ beautifulsoup4==4.12.2
2
+ gradio==3.50.2
3
+ joblib==1.1.0
4
+ pandas==1.5.2
5
+ requests==2.27.1