Commit · f0b8b67
Parent(s): 8eaf256
Upload 3 files
- addresses 100 generated.csv +80 -0
- app.py +110 -0
- requirements.txt +5 -0
addresses 100 generated.csv
ADDED
@@ -0,0 +1,80 @@
+Address1,City,State,Zipcode
+123 Main St,Chicago,IL,60001
+245 Oak Ave,New York,NY,10001
+356 Pine Rd,Los Angeles,CA,90001
+467 Elm St,Houston,TX,70001
+578 Maple Ave,Philadelphia,PA,19101
+689 Cherry St,Phoenix,AZ,85001
+178 Oak Dr,San Diego,CA,92101
+286 Pine Crt,San Jose,CA,95101
+395 Elm Ln,Austin,TX,78751
+504 Maple Way,Jacksonville,FL,32204
+613 Cherry Pl,Fort Worth,TX,76104
+722 Oak Blvd,Dallas,TX,75201
+831 Pine St,San Antonio,TX,78201
+940 Elm Rd,Indianapolis,IN,46201
+1051 Maple Cir,San Francisco,CA,94101
+1162 Cherry Dr,Columbus,OH,43201
+1273 Oak Way,Charlotte,NC,28201
+1384 Pine Ct,Detroit,MI,48201
+1495 Elm Pl,El Paso,TX,79901
+1606 Maple St,Memphis,TN,38101
+1717 Cherry Ave,Boston,MA,2110
+1828 Oak Pl,Nashville,TN,37201
+1939 Pine Dr,Baltimore,MD,21201
+2050 Elm St,Louisville,KY,40201
+2161 Maple Pl,Portland,OR,97201
+2272 Cherry Way,Oklahoma City,OK,73101
+2383 Oak Rd,Milwaukee,WI,53201
+2494 Pine Cir,Albuquerque,NM,87101
+2605 Elm Dr,Tucson,AZ,85704
+2716 Maple Ave,Fresno,CA,93701
+2827 Cherry Rd,Sacramento,CA,95814
+2938 Oak St,Kansas City,MO,64105
+3049 Pine Ln,Mesa,AZ,85201
+3160 Elm Way,Virginia Beach,VA,23450
+3271 Maple St,Atlanta,GA,30301
+3382 Cherry Cir,Colorado Springs,CO,80903
+3493 Oak Pl,Raleigh,NC,27601
+3604 Pine Dr,Omaha,NE,68101
+3715 Elm Way,Miami,FL,33101
+3826 Maple Ln,Tulsa,OK,74101
+3937 Cherry Ct,Minneapolis,MN,55401
+4048 Oak Rd,Cleveland,OH,44101
+4159 Pine St,Wichita,KS,67201
+4270 Elm St,Arlington,TX,76001
+4381 Maple Pl,New Orleans,LA,70101
+4492 Cherry Ct,Bakersfield,CA,93301
+4603 Oak Ave,Honolulu,HI,96801
+4714 Pine Dr,Tampa,FL,33602
+4825 Elm Way,Anaheim,CA,92801
+4936 Maple St,Aurora,CO,80011
+5047 Cherry Rd,Santa Ana,CA,92703
+5158 Oak Dr,St. Louis,MO,63108
+5269 Pine St,Riverside,CA,92501
+5380 Elm Way,Corpus Christi,TX,78401
+5491 Maple Pl,Lexington,KY,40516
+6602 Cherry Ct,Lincoln,NE,68521
+6713 Oak Rd,Stockton,CA,95201
+6824 Pine Cir,Henderson,NV,89011
+6935 Elm Dr,Cincinnati,OH,45202
+7046 Maple St,Pittsburgh,PA,15203
+7157 Cherry Ave,Saint Paul,MN,55101
+7268 Oak Pl,Toledo,OH,43604
+7379 Pine Dr,Newark,NJ,7101
+7490 Elm Way,Greensboro,NC,27401
+7501 Maple Ln,Plano,TX,75023
+7612 Cherry Ct,Richmond,VA,23219
+7723 Oak Rd,Buffalo,NY,14202
+7834 Pine St,Fort Wayne,IN,46802
+7945 Elm St,Jersey City,NJ,7301
+8056 Maple Pl,Chula Vista,CA,91911
+8167 Cherry Way,Norfolk,VA,23501
+8278 Oak Rd,Orlando,FL,32801
+8389 Pine Cir,Akron,OH,44301
+8500 Elm Dr,Huntington Beach,CA,92648
+8611 Maple St,Durham,NC,27701
+8722 Cherry Rd,Rochester,NY,14601
+8833 Oak St,Garland,TX,75041
+8944 Pine Ln,Eugene,OR,97401
+8055 Elm Way,Chandler,AZ,85225
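
Note that several Zipcode values above have lost their leading zeros (Boston's 2110 should be 02110, Newark's 7101 should be 07101), as happens when ZIP codes are stored as integers; app.py below compensates by re-padding to five digits. A minimal sketch of that normalization, using a hypothetical two-row frame mirroring the CSV:

import pandas as pd

# Hypothetical sample; ZIPs read as integers drop leading zeros (02110 -> 2110).
df = pd.DataFrame({"Address1": ["1717 Cherry Ave", "7379 Pine Dr"],
                   "City": ["Boston", "Newark"],
                   "State": ["MA", "NJ"],
                   "Zipcode": [2110, 7101]})

# Re-pad to five digits, as process_csv_text in app.py does with zfill(5).
df["Zipcode"] = df["Zipcode"].astype(str).str.zfill(5)
print(df["Zipcode"].tolist())  # ['02110', '07101']
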
app.py
ADDED
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Oct 24 12:51:42 2023
+
+@author: mritchey
+"""
+import pandas as pd
+import requests
+from urllib.parse import urlparse, quote
+import re
+from bs4 import BeautifulSoup
+from joblib import Parallel, delayed
+import gradio as gr
+from io import StringIO
+
+default_csv = "addresses 100 generated.csv"
+
+
+def extract_website_domain(url):
+    parsed_url = urlparse(url)
+    return parsed_url.netloc
+
+
+def google_address(address):
+    # First street number and first two ZIP digits, used below to filter results
+    address_number = re.findall(r'\b\d+\b', address)[0]
+    address_zip = re.search(r'(\d{5})$', address).group()[:2]
+
+    search_query = quote(address)
+    url = f'https://www.google.com/search?q={search_query}'
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, "html.parser")
+
+    # Collect result links of the form /url?q=http... with a non-trivial title
+    texts_links = []
+    for link in soup.find_all("a"):
+        t, l = link.get_text(), link.get("href")
+        if l and l[:11] == '/url?q=http' and len(t) > 20:
+            texts_links.append((t, l))
+
+    text = soup.get_text()
+
+    # Slice the page text between consecutive result titles to get each description
+    texts_links_des = []
+    for i, t_l in enumerate(texts_links):
+        start = text.find(texts_links[i][0][:50])
+        try:
+            end = text.find(texts_links[i + 1][0][:50])
+        except IndexError:
+            end = text.find('Related searches')
+
+        description = text[start:end]
+        texts_links_des.append((t_l[0], t_l[1], description))
+
+    df = pd.DataFrame(texts_links_des, columns=['Title', 'Link', 'Description'])
+    df['Description'] = df['Description'].bfill()
+    df['Address'] = df['Title'].str.extract(r'(.+? \d{5})')
+    df['Link'] = [i[7:i.find('&sa=')] for i in df['Link']]  # strip '/url?q=' and tracking suffix
+    df['Website'] = df['Link'].apply(extract_website_domain)
+
+    df['Square Footage'] = df['Description'].str.extract(
+        r"((\d+) Square Feet|(\d+) sq\. ft\.|(\d+) sqft|(\d+) Sq\. Ft\.|(\d+) sq|(\d+(?:,\d+)?) Sq\. Ft\.|(\d+(?:,\d+)?) sq)")[0]
+    df['Square Footage'] = df['Square Footage'].replace({',': ''}, regex=True).str.replace(r'\D', '', regex=True)
+
+    df['Beds'] = df['Description'].replace({'-': ' ', 'total': ''}, regex=True).str.extract(r"(\d+) bed")
+
+    df['Baths'] = df['Description'].replace({'-': ' ', 'total': ''}, regex=True).str.extract(
+        r"((\d+) bath|(\d+(?:\.\d+)?) bath)")[0]
+    df['Baths'] = df['Baths'].str.extract(r'([\d.]+)').astype(float)
+
+    df['Year Built'] = df['Description'].str.extract(r"built in (\d{4})")
+
+    # Keep only rows whose extracted address matches the queried street number and ZIP prefix
+    df_final = df[df['Address'].notnull()]
+    df_final = df_final[df_final['Address'].str.contains(str(address_number))
+                        & df_final['Address'].str.contains(str(address_zip))]
+
+    df_final.insert(0, 'Address Input', address)
+    return df_final
+
+
+def process_csv_text(temp_file):
+    if isinstance(temp_file, str):
+        df = pd.read_csv(StringIO(temp_file))
+    else:
+        df = pd.read_csv(temp_file.name)
+
+    address_cols = list(df.columns[:4])
+    # Re-pad ZIP codes that lost leading zeros when stored as integers (2110 -> 02110)
+    df[address_cols[-1]] = df[address_cols[-1]].astype(str).str[:5].astype(int).astype(str)
+    df[address_cols[-1]] = df[address_cols[-1]].apply(lambda x: x.zfill(5))
+
+    df['Address All'] = df[address_cols[0]] + ', ' + df[address_cols[1]] + ', ' + df[address_cols[2]] + ' ' + df[address_cols[3]]
+    return df
+
+
+def process_multiple_address(addresses):
+    # Network-bound work, so threads are preferred over processes
+    results = Parallel(n_jobs=-1, prefer="threads")(delayed(google_address)(i) for i in addresses)
+    return results
+
+
+def feed_process_multiple(temp_file):
+    df = process_csv_text(temp_file)
+    addresses = df['Address All'].to_list()
+    results = process_multiple_address(addresses)
+    return pd.concat(results)
+
+
+with gr.Blocks() as demo:
+    upload_button = gr.UploadButton(label="Upload Addresses", file_types=['.csv'], live=True, file_count="single")
+    table = gr.Dataframe(headers=['Address Input', 'Title', 'Link', 'Description', 'Address', 'Website',
+                                  'Square Footage', 'Beds', 'Baths', 'Year Built'], type="pandas", col_count=10)
+    upload_button.upload(fn=feed_process_multiple, inputs=upload_button,
+                         outputs=table, api_name="Address Scrap")
+
+demo.launch()
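
As a quick smoke test, one could call google_address directly before exercising the Gradio UI. A minimal sketch, assuming the function above is in scope (importing app.py would also start the server, since demo.launch() runs at import) and that Google serves its usual /url?q= redirect links to an unauthenticated request; scraping may be blocked or met with a consent page, in which case the returned frame is simply empty:

# Hypothetical single-address run against the first CSV row.
result = google_address("123 Main St, Chicago, IL 60001")
print(result[['Address Input', 'Address', 'Website', 'Beds', 'Baths']].head())

Using prefer="threads" in process_multiple_address is a sensible choice here: the work is network-bound, so threads give concurrency without joblib's process-spawning overhead.
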
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+beautifulsoup4==4.12.2
+gradio==3.50.2
+joblib==1.1.0
+pandas==1.5.2
+requests==2.27.1
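
To reproduce the Space locally, the usual steps would be pip install -r requirements.txt followed by python app.py; Gradio then serves the interface on http://127.0.0.1:7860 by default. The pins are conservative (gradio 3.50.2, pandas 1.5.2), so newer major versions of either library may require revisiting the UploadButton wiring and the str.replace calls.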