# -*- coding: utf-8 -*-
"""
Created on Tue Oct 24 12:51:42 2023

@author: mritchey
"""
import re
from io import StringIO
from urllib.parse import urlparse, quote

import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
from joblib import Parallel, delayed

# Bundled sample file (defined for reference; not wired into the UI below).
default_csv = "addresses 100 generated.csv"

def extract_website_domain(url):
    """Return the domain (netloc) portion of a URL."""
    return urlparse(url).netloc

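# Example: extract_website_domain("https://www.zillow.com/homedetails/") -> "www.zillow.com"
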
def google_address(address):
    """Google an address and parse property details from the results page."""
    # The house number and the first two digits of the ZIP are used later
    # to confirm that a parsed result actually matches the input address.
    address_number = re.findall(r'\b\d+\b', address)[0]
    address_zip = re.search(r'(\d{5})$', address).group()[:2]

    search_query = quote(address)
    url = f'https://www.google.com/search?q={search_query}'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Collect (anchor text, href) pairs for organic results; the
    # no-JavaScript results page renders them as '/url?q=...' redirects.
    texts_links = []
    for link in soup.find_all("a"):
        t, l = link.get_text(), link.get("href")
        # get("href") can return None for anchors without an href.
        if l and l[:11] == '/url?q=http' and len(t) > 20:
            texts_links.append((t, l))
    # Recover each result's snippet: the text between this result's title
    # and the next result's title in the page's flattened text.
    text = soup.get_text()
    texts_links_des = []
    for i, t_l in enumerate(texts_links):
        start = text.find(texts_links[i][0][:50])
        try:
            end = text.find(texts_links[i + 1][0][:50])
        except IndexError:
            # Last result: read up to the "Related searches" section.
            end = text.find('Related searches')
        description = text[start:end]
        texts_links_des.append((t_l[0], t_l[1], description))

    df = pd.DataFrame(texts_links_des, columns=['Title', 'Link', 'Description'])
    df['Description'] = df['Description'].bfill()
    df['Address'] = df['Title'].str.extract(r'(.+? \d{5})', expand=False)
    # Strip the '/url?q=' prefix and the '&sa=' tracking suffix from links.
    df['Link'] = [i[7:i.find('&sa=')] for i in df['Link']]
    df['Website'] = df['Link'].apply(extract_website_domain)

    df['Square Footage'] = df['Description'].str.extract(
        r"((\d+) Square Feet|(\d+) sq\. ft\.|(\d+) sqft|(\d+) Sq\. Ft\."
        r"|(\d+) sq|(\d+(?:,\d+)?) Sq\. Ft\.|(\d+(?:,\d+)?) sq)")[0]
    df['Square Footage'] = (df['Square Footage']
                            .replace({',': ''}, regex=True)
                            .str.replace(r'\D', '', regex=True))
    df['Beds'] = (df['Description']
                  .replace({'-': ' ', 'total': ''}, regex=True)
                  .str.extract(r'(\d+) bed', expand=False))
    df['Baths'] = (df['Description']
                   .replace({'-': ' ', 'total': ''}, regex=True)
                   .str.extract(r'((\d+) bath|(\d+(?:\.\d+)?) bath)')[0])
    df['Baths'] = df['Baths'].str.extract(r'([\d.]+)', expand=False).astype(float)
    df['Year Built'] = df['Description'].str.extract(r'built in (\d{4})', expand=False)
    # Keep only rows whose parsed address echoes the input's house number
    # and ZIP prefix.
    df_final = df[df['Address'].notnull()]
    df_final = df_final[df_final['Address'].str.contains(str(address_number))
                        & df_final['Address'].str.contains(str(address_zip))]
    df_final.insert(0, 'Address Input', address)
    return df_final

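# Quick smoke test with a hypothetical address (not from the bundled CSV):
# google_address("123 Main St, Springfield, IL 62704")
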
def process_csv_text(temp_file):
    """Read the uploaded CSV and build a combined 'Address All' column."""
    if isinstance(temp_file, str):
        df = pd.read_csv(StringIO(temp_file))
    else:
        df = pd.read_csv(temp_file.name)
    # The first four columns are taken, in order, as street, city, state, ZIP.
    address_cols = list(df.columns[:4])
    # Normalize ZIPs to 5-digit strings, restoring leading zeros lost to
    # numeric parsing (e.g. 4401 -> '04401').
    zip_col = address_cols[-1]
    df[zip_col] = df[zip_col].astype(str).str[:5].astype(int).astype(str).str.zfill(5)
    df['Address All'] = (df[address_cols[0]] + ', ' + df[address_cols[1]] + ', '
                         + df[address_cols[2]] + ' ' + df[address_cols[3]])
    return df

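# A minimal sketch of the expected upload, assuming the column names below
# (only the column order matters to process_csv_text):
# Street,City,State,Zip
# 123 Main St,Springfield,IL,62704
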
def catch_errors(addresses):
    """Wrap google_address so one bad address doesn't abort the whole batch."""
    try:
        return google_address(addresses)
    except Exception:
        # Return a stub row so the failed input still shows up in the output.
        return pd.DataFrame({'Address Input': [addresses]})

def process_multiple_address(addresses):
    # Threads rather than processes: the work is network-bound, not CPU-bound.
    results = Parallel(n_jobs=-1, prefer="threads")(
        delayed(catch_errors)(i) for i in addresses)
    return results

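# Each element of the returned list is a DataFrame, one per input address;
# failures come back as single-row stubs from catch_errors, so the
# downstream pd.concat never sees a missing entry.
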
def feed_process_multiple(temp_file):
    """Pipeline behind the upload button: CSV -> addresses -> scraped results."""
    df = process_csv_text(temp_file)
    addresses = df['Address All'].to_list()
    results = process_multiple_address(addresses)
    return pd.concat(results)

with gr.Blocks() as demo:
    upload_button = gr.UploadButton(label="Upload Addresses",
                                    file_types=['.csv'], file_count="single")
    table = gr.Dataframe(headers=['Address Input', 'Title', 'Link', 'Description',
                                  'Address', 'Website', 'Square Footage',
                                  'Beds', 'Baths', 'Year Built'],
                         type="pandas", col_count=10)
    upload_button.upload(fn=feed_process_multiple, inputs=upload_button,
                         outputs=table, api_name="address_scrap")

demo.launch()
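
# When running locally, a temporary public URL can be requested via Gradio's
# built-in tunnel: demo.launch(share=True)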