Omnibus committed on
Commit
3c9e629
·
verified ·
1 Parent(s): 5f5c3c1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -5
app.py CHANGED
@@ -302,6 +302,60 @@ def link_find(url):
302
  return node1,node2
303
  #https://huggingface.co/spaces/Omnibus/crawl
304
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  def sitemap(url,file_state,level):
306
  uri=""
307
  uri0=""
@@ -416,16 +470,18 @@ def sitemap_OG(url,level):
416
  return link1
417
 
418
  def test():
 
419
  with open("./seed.txt") as f:
420
  this = f.readlines()
421
  f.close()
422
  for ea in this:
423
  ea=ea.strip().strip("\n")
424
- print(ea)
425
- try:
426
- a,b = sitemap(ea,None,1)
427
- except Exception as e:
428
- print (e)
 
429
 
430
  with gr.Blocks() as app:
431
  file_state=gr.State()
 
302
  return node1,node2
303
  #https://huggingface.co/spaces/Omnibus/crawl
304
 
305
def sitemap_test(url, file_state, level):
    """Crawl each seed URL, collect link fronts and upload them as JSON.

    Args:
        url: A list of seed URLs. A single URL string is also accepted and
            treated as a one-element list.
        file_state: Passed through unchanged to ``sort_doc``.
        level: Crawl depth. Below 2 only the seed itself is passed to
            ``link_find``; ``>= 2`` also passes every entry of the seed's
            ``TREE``; ``>= 3`` descends one level further.

    Returns:
        ``(link1, link2, uri_key)``: the two trees returned by ``link_find``
        for the last seed that was crawled, and the result of ``sort_doc``
        over the fronts collected from all seeds. Returns
        ``(None, None, None)`` when *url* holds no usable seed; nothing is
        written or uploaded in that case.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(url, str):
        url = [url]

    uri = ""
    uri0 = ""
    url_front = []
    link1 = None
    link2 = None

    for each_url in url or []:
        if not each_url:
            continue

        # Crawl the current seed rather than the whole seed list.
        link1, link2 = link_find(each_url)

        if level >= 2:
            for i, ea in enumerate(link1['TREE']):
                print(ea)
                try:
                    out_list1, out_list2 = link_find(f"{uri}{ea['URL']}")
                    link1['TREE'][i] = out_list1
                    link2['TREE'][i] = out_list2

                    if level >= 3:
                        for n, na in enumerate(link1['TREE'][i]['TREE']):
                            print(na)
                            try:
                                out_list1, out_list2 = link_find(f"{uri0}{na['URL']}")
                                link1['TREE'][i]['TREE'][n] = out_list1
                                link2['TREE'][i]['TREE'][n] = out_list2
                            except Exception as e:
                                # Best effort: one failing link must not stop the crawl.
                                print(e)
                except Exception as e:
                    print(e)

        for ea_link in link2['TREE']:
            url_list = ea_link['URL'].split("/")
            # Too few segments to build a front from; skip instead of
            # raising IndexError.
            if len(url_list) < 4:
                continue
            # NOTE(review): for "https://host/page", split("/") yields
            # ['https:', '', 'host', 'page'], so indices 1 and 3 produce
            # "//page" while 0 and 2 would produce "https://host". Left
            # unchanged -- confirm against the URL format link_find emits.
            url_front.append(f'{url_list[1]}//{url_list[3]}')

    if link2 is None:
        # No seed was crawled, so there is nothing to sort or upload.
        return None, None, None

    uri_key = sort_doc(url_front, file_state, 8)

    ######## Save Database ########
    uid = uuid.uuid4()
    # The with-block closes the file; no explicit close() is needed.
    with open(f'{uid}.json', 'w') as f:
        f.write(json.dumps(uri_key, indent=4))

    upload_file(
        path_or_fileobj=f"{uid}.json",
        path_in_repo=f"crawl/{filename}.json",
        repo_id=f"{username}/{dataset_name}",
        repo_type="dataset",
        token=token,
    )
    #################################
    return link1, link2, uri_key
357
+
358
+
359
  def sitemap(url,file_state,level):
360
  uri=""
361
  uri0=""
 
470
  return link1
471
 
472
  def test():
473
+ seed_box=[]
474
  with open("./seed.txt") as f:
475
  this = f.readlines()
476
  f.close()
477
  for ea in this:
478
  ea=ea.strip().strip("\n")
479
+ seed_box.append(ea)
480
+ #print(ea)
481
+ try:
482
+ a,b,c = sitemap_test(seed_box,None,1)
483
+ except Exception as e:
484
+ print (e)
485
 
486
  with gr.Blocks() as app:
487
  file_state=gr.State()