|
import gradio as gr |
|
import requests |
|
import bs4 |
|
|
|
def link_find(url):
    """Fetch *url* and extract its anchor links into two tree nodes.

    Returns a tuple ``(node1, node2)``:
      * ``node1`` — verbose node: URL, page title, raw text, the list of
        discovered link URLs, and one stub child node per anchor.
      * ``node2`` — compact node carrying only URL/LINKS/TREE.

    Relative hrefs are resolved against *url*: scheme-relative
    (``//host/...``) links get the page's scheme; site-relative (``/path``)
    links get ``scheme://host``.  A non-200 response returns both nodes
    with empty link lists.
    """
    # Build both nodes up front so a failed fetch still returns a valid
    # (node1, node2) pair instead of raising NameError (original bug: the
    # nodes were only created inside the 200 branch).
    node1 = {"URL": url, "TITLE": None, "STRING": None, "TEXT": "",
             "LINKS": [], "TREE": []}
    node2 = {"URL": url, "LINKS": [], "TREE": []}

    source = requests.get(url)
    if source.status_code == 200:
        print("YES")
        soup = bs4.BeautifulSoup(source.content, 'html.parser')
        node1["TITLE"] = soup.title
        # NOTE(review): soup.description looks up a <description> tag,
        # which plain HTML pages usually lack (-> None); kept as-is for
        # output compatibility — confirm intent (maybe a meta tag was meant).
        node1["STRING"] = soup.description
        node1["TEXT"] = soup.text

        for anchor in soup.find_all("a"):
            url0 = anchor.get('href')
            if not url0:
                continue  # <a> without href: .startswith(None) would crash
            if url0.startswith("//"):
                # Scheme-relative link: reuse the scheme of the fetched page.
                # (Original used a second plain `if` below, so "//..." fell
                # through and the good URI was overwritten; `elif` fixes it.)
                scheme = url.split("//")[0]
                uri = f'{scheme}{url0}'
                print(uri)
            elif url0.startswith("/"):
                # Site-relative link: prepend scheme://host of the page.
                scheme = url.split("//")[0]
                host = url.split("//")[1].split("/")[0]
                uri = f'{scheme}//{host}{url0}'
                print(uri)
            else:
                uri = url0
            node1['LINKS'].append(uri)
            node1['TREE'].append({"URL": uri, "TITLE": anchor.get('title'),
                                  "STRING": anchor.string, "TEXT": "",
                                  "LINKS": [], "TREE": []})
            node2['LINKS'].append(uri)
            node2['TREE'].append({"URL": uri, "LINKS": [], "TREE": []})
    else:
        print("NO")
    return node1, node2
|
|
|
|
|
def sitemap(url, level):
    """Build a link tree for *url*, up to three levels deep.

    Returns the ``(verbose, compact)`` node pair produced by ``link_find``.
    When ``level`` >= 2 every child link is fetched and expanded in place;
    when ``level`` >= 3 the grandchildren are expanded the same way.
    A fetch error on any child is printed and that child stays a stub.
    """
    prefix_child = ""
    prefix_grand = ""
    if url != "" and url is not None:
        full_tree, slim_tree = link_find(url)
        if level >= 2:
            for child_idx, child in enumerate(full_tree['TREE']):
                print(child)
                try:
                    child_full, child_slim = link_find(f"{prefix_child}{child['URL']}")
                    full_tree['TREE'][child_idx] = child_full
                    slim_tree['TREE'][child_idx] = child_slim
                    if level >= 3:
                        for grand_idx, grand in enumerate(full_tree['TREE'][child_idx]['TREE']):
                            print(grand)
                            try:
                                grand_full, grand_slim = link_find(f"{prefix_grand}{grand['URL']}")
                                full_tree['TREE'][child_idx]['TREE'][grand_idx] = grand_full
                                slim_tree['TREE'][child_idx]['TREE'][grand_idx] = grand_slim
                            except Exception as err:
                                print(err)
                except Exception as err:
                    print(err)
    return full_tree, slim_tree
|
|
|
|
|
|
|
def sitemap_OG(url,level):
    """Legacy crawler kept for reference; the UI is wired to ``sitemap``.

    NOTE(review): ``link_find`` now returns a ``(node1, node2)`` tuple, so
    the single-value bind below makes ``link1`` that 2-tuple, and the loop
    iterates the pair itself rather than the child TREE entries.  This
    function looks out of date with the current ``link_find`` contract —
    confirm before reusing.
    """
    uri=""
    if url != "" and url != None:
        # NOTE(review): tuple (node1, node2), not a single node — see above.
        link1=link_find(url)
        if level >=2:
            for i,ea in enumerate(link1):
                print(ea)
                try:
                    # link_find already absolutizes hrefs, so this branch
                    # presumably rarely fires for child entries — TODO confirm.
                    if not ea['URL'].startswith("http"):
                        # Rebuild scheme://host from the root url.
                        uri1=url.split("//")[0]
                        uri2=url.split("//")[1]
                        uri3=uri2.split("/")[0]
                        uri=f'{uri1}//{uri3}'
                        print(uri)
                    out_list=link_find(f"{uri}{ea['URL']}")
                    link1[i]['TREE']=out_list
                    if level>=3:
                        for n,na in enumerate(link1[i]['TREE']):
                            print(na)
                            try:
                                if not na['URL'].startswith("http"):
                                    uri11=url.split("//")[0]
                                    uri22=url.split("//")[1]
                                    uri33=uri22.split("/")[0]
                                    uri0=f'{uri11}//{uri33}'
                                    print(uri0)
                                # NOTE(review): uri0 is never initialized, so
                                # if the branch above is skipped this raises
                                # NameError, swallowed by the except below.
                                out_list1=link_find(f"{uri0}{na['URL']}")
                                link1[i]['TREE'][n]['TREE']=out_list1
                            except Exception as e:
                                print (e)
                except Exception as e:
                    print (e)
    # Raises NameError when url is empty/None (link1 never bound) —
    # same shape as the active sitemap().
    return link1
|
# Gradio UI: URL box + crawl-depth slider + button in the wide left column
# (verbose JSON tree below them); compact JSON tree in the narrow right column.
with gr.Blocks() as app:
    with gr.Row():
        with gr.Column(scale=3):
            with gr.Row():
                inp = gr.Textbox(label="URL")
                # sitemap() implements levels 1-3, but the original slider
                # capped the depth at 2, leaving the level-3 crawl
                # unreachable from the UI.
                level = gr.Slider(minimum=1, maximum=3, step=1, value=1,
                                  label="Crawl depth")
            btn = gr.Button()
            outp = gr.JSON()
        with gr.Column(scale=1):
            outmap = gr.JSON()
    # sitemap returns (verbose_tree, compact_tree) -> (outp, outmap)
    btn.click(sitemap, [inp, level], [outp, outmap])
app.launch()