Update app.py
Browse files
app.py
CHANGED
@@ -302,6 +302,60 @@ def link_find(url):
|
|
302 |
return node1,node2
|
303 |
#https://huggingface.co/spaces/Omnibus/crawl
|
304 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
def sitemap(url,file_state,level):
|
306 |
uri=""
|
307 |
uri0=""
|
@@ -416,16 +470,18 @@ def sitemap_OG(url,level):
|
|
416 |
return link1
|
417 |
|
418 |
def test():
|
|
|
419 |
with open("./seed.txt") as f:
|
420 |
this = f.readlines()
|
421 |
f.close()
|
422 |
for ea in this:
|
423 |
ea=ea.strip().strip("\n")
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
|
|
429 |
|
430 |
with gr.Blocks() as app:
|
431 |
file_state=gr.State()
|
|
|
302 |
return node1,node2
|
303 |
#https://huggingface.co/spaces/Omnibus/crawl
|
304 |
|
305 |
+
def sitemap_test(url,file_state,level):
    """Crawl each seed URL in *url*, expand its link tree to *level* deep,
    then persist a sorted URL-front index as JSON and upload it to the
    Hugging Face dataset repo.

    Args:
        url: list of seed URL strings (test() passes the lines of seed.txt).
        file_state: opaque state value forwarded to sort_doc() — presumably
            a gr.State payload; TODO confirm against the Blocks wiring.
        level: crawl depth; >=2 expands first-level links, >=3 second-level.

    Returns:
        tuple (link1, link2, uri_key): the link trees of the last crawled
        seed plus the index produced by sort_doc().

    NOTE(review): if *url* is empty or contains only blank entries, link1/
    link2/uri_key are never bound and the return raises NameError — same as
    the original; left unchanged to preserve the caller-visible contract.
    """
    url_page=[]    # NOTE(review): url_page and url_json are never used — TODO confirm before removing
    url_front=[]
    url_json=[]
    for each_url in url:
        uri=""
        uri0=""
        # FIX: the original tested and crawled the whole list `url` here
        # instead of the loop item `each_url`, so `each_url` was never used
        # and every iteration re-crawled the entire argument.
        if each_url != "" and each_url != None:
            link1,link2=link_find(each_url)
            if level >=2:
                for i,ea in enumerate(link1['TREE']):
                    print(ea)
                    try:
                        out_list1,out_list2=link_find(f"{uri}{ea['URL']}")
                        link1['TREE'][i]=out_list1
                        link2['TREE'][i]=out_list2
                        #link1['TREE'].append(out_list)

                        if level>=3:
                            for n,na in enumerate(link1['TREE'][i]['TREE']):
                                print(na)
                                try:
                                    out_list1,out_list2=link_find(f"{uri0}{na['URL']}")
                                    link1['TREE'][i]['TREE'][n]=out_list1
                                    link2['TREE'][i]['TREE'][n]=out_list2
                                    #link1['TREE'][i]['TREE'].append(out_list1)
                                except Exception as e:
                                    print (e)
                    except Exception as e:
                        print (e)

            # Build "scheme//host"-style prefixes from each crawled URL.
            # NOTE(review): for "https://host/..." split("/") yields
            # ['https:','','host',...], so indices 1 and 3 give "//<path>";
            # indices 0 and 2 would give "https://host" — TODO confirm which
            # sort_doc() expects before changing.
            for ea_link in link2['TREE']:
                url_list=ea_link['URL'].split("/")
                url_front.append(f'{url_list[1]}//{url_list[3]}')
            uri_key=sort_doc(url_front,file_state,8)

    ######## Save Database ########
    uid=uuid.uuid4()
    with open(f'{uid}.json', 'w') as f:
        json_hist=json.dumps(uri_key, indent=4)
        f.write(json_hist)
        # redundant f.close() removed: `with` closes the file

    upload_file(
        path_or_fileobj =f"{uid}.json",
        # FIX: source showed a "(unknown)" placeholder here (extraction
        # artifact); name the repo copy after the same uid as the local file.
        path_in_repo = f"crawl/{uid}.json",
        repo_id =f"{username}/{dataset_name}",
        repo_type = "dataset",
        token=token,
    )
    #################################
    return link1,link2,uri_key
|
359 |
def sitemap(url,file_state,level):
|
360 |
uri=""
|
361 |
uri0=""
|
|
|
470 |
return link1
|
471 |
|
472 |
def test():
    """Read seed URLs from ./seed.txt and run a depth-1 crawl over them.

    Collects one stripped URL per line into seed_box, then hands the whole
    list to sitemap_test(); any crawl failure is printed, not raised.
    """
    seed_box=[]
    # `with` closes the file automatically; the original's explicit
    # f.close() inside the block was redundant and has been dropped.
    with open("./seed.txt") as f:
        this = f.readlines()
    for ea in this:
        # strip() already removes trailing newlines, so the original's
        # extra .strip("\n") was a no-op.
        ea=ea.strip()
        seed_box.append(ea)
        #print(ea)
    try:
        a,b,c = sitemap_test(seed_box,None,1)
    except Exception as e:
        print (e)
|
485 |
|
486 |
with gr.Blocks() as app:
|
487 |
file_state=gr.State()
|