|
from fastapi import FastAPI |
|
import uvicorn |
|
|
|
|
|
|
|
import pandas as pd |
|
import numpy as np |
|
import requests |
|
from urllib.parse import urlparse, quote |
|
import re |
|
from bs4 import BeautifulSoup |
|
import time |
|
from joblib import Parallel, delayed |
|
from nltk import ngrams |
|
from googlesearch import search |
|
|
|
|
|
# Application object; the @app.get(...) decorators in this file register
# their route handlers on it.
app = FastAPI()
|
|
|
|
|
|
|
|
|
@app.get("/")
def root():
    """Return a static payload that names this API."""
    payload = {"API": "Adress Scrap"}
    return payload
|
|
|
def normalize_string(string):
    """Lower-case *string* and drop every character that is neither a word
    character (letter, digit, underscore) nor whitespace.
    """
    return re.sub(r'[^\w\s]', '', string.lower())
|
|
|
|
|
def jaccard_similarity(string1, string2, n=2, normalize=True):
    """Return the Jaccard similarity of the character n-gram sets of two strings.

    Args:
        string1: First string to compare.
        string2: Second string to compare.
        n: Size of the character n-grams (default 2).
        normalize: When true, both strings are passed through
            ``normalize_string`` before the n-grams are built.

    Returns:
        ``len(intersection) / len(union)`` of the two n-gram sets, a float in
        [0, 1].  ``0.0`` is returned when the inputs cannot be processed
        (e.g. a non-string value), when neither string yields any n-gram, or
        when ``string2`` equals the placeholder ``'did not extract address'``
        (compared after normalization, if normalization was requested).
    """
    try:
        if normalize:
            string1, string2 = normalize_string(string1), normalize_string(string2)
        grams1 = set(ngrams(string1, n))
        grams2 = set(ngrams(string2, n))
    except (AttributeError, TypeError, ValueError):
        # Non-string input (None, NaN, ...) or an unusable ``n``: treat the
        # pair as having nothing in common instead of propagating the error.
        return 0.0

    # Placeholder text for "no address found" must never count as a match.
    if string2 == 'did not extract address':
        return 0.0

    union = grams1 | grams2
    if not union:
        # Both strings are shorter than ``n``; avoid dividing by zero.
        return 0.0

    return len(grams1 & grams2) / len(union)
|
|
|
def jaccard_sim_split_word_number(string1, string2):
    """Average the Jaccard similarity of the numeric parts and of the
    alphabetic parts of two strings.

    Digit runs (``\\d+``) and standalone ASCII-letter words are pulled out of
    each string, re-joined with single spaces, and each pair is scored with
    ``jaccard_similarity``; the two scores are then averaged.
    """
    def _joined(pattern, text):
        # Space-separated concatenation of every match of *pattern* in *text*.
        return ' '.join(re.findall(pattern, text))

    digit_pattern = r'\d+'
    word_pattern = r'\b[A-Za-z]+\b'

    first_digits, first_words = _joined(digit_pattern, string1), _joined(word_pattern, string1)
    second_digits, second_words = _joined(digit_pattern, string2), _joined(word_pattern, string2)

    digit_score = jaccard_similarity(first_digits, second_digits)
    word_score = jaccard_similarity(first_words, second_words)
    return (digit_score + word_score) / 2
|
|
|
def extract_website_domain(url):
    """Return the network-location component (``netloc``) of *url*."""
    return urlparse(url).netloc
|
|
|
|
|
def _clean_result_link(link):
    """Return *link* without a leading ``/url?q=`` and a trailing ``&sa=...``.

    Links that carry neither marker are returned unchanged; non-string values
    (e.g. a missing URL) become an empty string.
    """
    if not isinstance(link, str):
        return ''
    prefix = '/url?q='
    if link.startswith(prefix):
        link = link[len(prefix):]
    # str.find returns -1 when the marker is absent; slicing with that value
    # would silently drop the last character, so only cut on a real hit.
    cut = link.find('&sa=')
    if cut != -1:
        link = link[:cut]
    return link


def google_address(address):
    """Search Google for *address* and tabulate what the result snippets say.

    Args:
        address: Free-text address used both as the search query and as the
            reference string for the match score.

    Returns:
        A ``pandas.DataFrame`` with one row per search result that has a
        title, and the columns ``Address Input``, ``Title``, ``Link``,
        ``Description``, ``Address Output``, ``Website``, ``Square Footage``,
        ``Beds``, ``Baths``, ``Year Built``, ``Match Percent`` and
        ``Google Search Result`` (1-based position among the kept rows).
    """
    # NOTE(review): ssl_verify=False presumably disables TLS certificate
    # verification for the search request -- confirm this is intentional.
    hits = list(search(address, ssl_verify=False, advanced=True,
                       num_results=11))

    df = pd.DataFrame({
        'Title': [hit.title for hit in hits],
        'Link': [hit.url for hit in hits],
        'Description': [hit.description for hit in hits],
    })

    # Keep only rows that have a title.  The explicit copy makes the column
    # assignments below operate on an independent frame rather than on a
    # filtered view of the original one.
    df = df[df['Title'].notna()].copy()

    # Everything in the title up to and including the first 5-digit group is
    # taken as the address; titles without one get a placeholder.
    df['Address Output'] = (
        df['Title'].str.extract(r'(.+? \d{5})', expand=False)
        .fillna("**DID NOT EXTRACT ADDRESS**")
    )

    # NOTE(review): assumes links may arrive wrapped as
    # '/url?q=<target>&sa=...'; confirm against the search library in use.
    df['Link'] = [_clean_result_link(link) for link in df['Link']]
    df['Website'] = df['Link'].apply(extract_website_domain)

    df['Square Footage'] = df['Description'].str.extract(r"((\d+) Square Feet|(\d+) sq. ft.|(\d+) sqft|(\d+) Sq. Ft.|(\d+) sq|(\d+(?:,\d+)?) Sq\. Ft\.|(\d+(?:,\d+)?) sq)")[0]
    try:
        # Keep digits only (drops the unit text and thousands separators).
        # regex=True is explicit because the default differs across pandas
        # versions.
        df['Square Footage'] = df['Square Footage'].str.replace(r'\D', '', regex=True)
    except AttributeError:
        # Best effort: if the column offers no string accessor, leave the raw
        # match in place.
        pass

    # Hyphens become spaces and the word "total" is removed so that phrases
    # such as "3-bed" or "2 total baths" match the patterns below.
    listing_text = df['Description'].replace({'-': ' ', 'total': ''}, regex=True)

    df['Beds'] = listing_text.str.extract(r"(\d+) bed", expand=False)

    bath_phrase = listing_text.str.extract(r"((\d+) bath|(\d+(?:\.\d+)?) bath)")[0]
    df['Baths'] = bath_phrase.str.extract(r'([\d.]+)', expand=False).astype(float)

    df['Year Built'] = df['Description'].str.extract(r"built in (\d{4})", expand=False)

    df['Match Percent'] = [
        jaccard_sim_split_word_number(address, candidate) * 100
        for candidate in df['Address Output']
    ]
    df['Google Search Result'] = list(range(1, len(df) + 1))

    df.insert(0, 'Address Input', address)

    return df
|
|
|
|
|
|
|
@app.get('/Address_Scrap')
async def predict(address: str):
    """Look up *address* via ``google_address`` and return the table as JSON.

    Args:
        address: Address to search for (query parameter).

    Returns:
        The string produced by ``DataFrame.to_json()``.  On success it holds
        the selected result columns; if the lookup fails for any reason it
        holds a single ``Address Input`` column with the requested address.
        The value is returned as a string, exactly as before, so existing
        clients that decode it keep working.
    """
    import asyncio
    import logging

    output_columns = [
        'Address Input', 'Address Output', 'Match Percent', 'Website',
        'Square Footage', 'Beds', 'Baths', 'Year Built',
        'Link', 'Google Search Result', 'Description',
    ]

    try:
        # google_address performs blocking network I/O; run it in the default
        # executor so this coroutine does not stall the event loop.
        loop = asyncio.get_running_loop()
        results = await loop.run_in_executor(None, google_address, address)
        results = results[output_columns]
    except Exception:
        # Endpoint boundary: degrade to an address-only response, but record
        # the failure.  Request cancellation is not an ``Exception`` subclass
        # and therefore still propagates.
        logging.getLogger(__name__).exception(
            "Address lookup failed for %r", address
        )
        results = pd.DataFrame({'Address Input': [address]})

    return results.to_json()
|
|
|
|