Update app.py
Browse files
app.py
CHANGED
@@ -12,17 +12,14 @@ username="omnibus"
|
|
12 |
dataset_name="tmp"
|
13 |
save_data=f'https://huggingface.co/datasets/{username}/{dataset_name}/raw/main/'
|
14 |
api=HfApi(token="")
|
15 |
-
filename="
|
16 |
-
|
17 |
|
18 |
def init():
|
19 |
r = requests.get(f'{save_data}crawl/{filename}.json')
|
20 |
print(f'status code main:: {r.status_code}')
|
21 |
if r.status_code==200:
|
22 |
lod = json.loads(r.text)
|
23 |
-
#print(f'lod:: {lod}')
|
24 |
-
#lod[0]['comment']=lod[0]['comment']+1
|
25 |
-
#lod[0]['comment_list'].append({'user':persona[persona2]['name'],'datetime':'','comment':output,'reply_list':[]})
|
26 |
else:
|
27 |
lod={}
|
28 |
return lod
|
@@ -338,7 +335,14 @@ def sitemap(url,file_state,level):
|
|
338 |
print (e)
|
339 |
except Exception as e:
|
340 |
print (e)
|
341 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
342 |
######## Save Database ########
|
343 |
uid=uuid.uuid4()
|
344 |
#for ea in list(uri_key.keys()):
|
|
|
12 |
dataset_name="tmp"
|
13 |
save_data=f'https://huggingface.co/datasets/{username}/{dataset_name}/raw/main/'
|
14 |
api=HfApi(token="")
|
15 |
+
filename="urls"
|
16 |
+
filename2="pages"
|
17 |
|
18 |
def init():
|
19 |
r = requests.get(f'{save_data}crawl/{filename}.json')
|
20 |
print(f'status code main:: {r.status_code}')
|
21 |
if r.status_code==200:
|
22 |
lod = json.loads(r.text)
|
|
|
|
|
|
|
23 |
else:
|
24 |
lod={}
|
25 |
return lod
|
|
|
335 |
print (e)
|
336 |
except Exception as e:
|
337 |
print (e)
|
338 |
+
#url_page=[]
|
339 |
+
url_front=[]
|
340 |
+
for ea_link in link2['TREE']:
|
341 |
+
url_list=ea_link.split("/")
|
342 |
+
url_front.append(url_list[0])
|
343 |
+
print(f'URL_FRONT:: {url_front}')
|
344 |
+
#url_key=sort
|
345 |
+
uri_key=sort_doc(url_front,file_state,8)
|
346 |
######## Save Database ########
|
347 |
uid=uuid.uuid4()
|
348 |
#for ea in list(uri_key.keys()):
|