Update app.py
Browse files
app.py
CHANGED
@@ -2,13 +2,33 @@ import gradio as gr
|
|
2 |
import requests
|
3 |
import bs4
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
def sort_doc(in_list,steps_in=0,control=None):
|
6 |
control_json={'control':'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ','char':'','leng':62}
|
7 |
text=str(in_list)
|
8 |
-
|
9 |
-
########################################
|
10 |
-
sen_list=in_list
|
11 |
-
######################################
|
12 |
key_cnt=len(in_list)
|
13 |
print(key_cnt)
|
14 |
control_char=list(control_json['control'])
|
@@ -76,9 +96,6 @@ def sort_doc(in_list,steps_in=0,control=None):
|
|
76 |
print(j)
|
77 |
out_js = out_js+control_char[j]
|
78 |
sen_obj=in_list[i]
|
79 |
-
#sen_obj=proc_sen(sen_list,i)
|
80 |
-
|
81 |
-
#json_out[out_js]={'nouns':ea}
|
82 |
json_out[out_js]=sen_obj
|
83 |
print ("#################")
|
84 |
print (out_js)
|
@@ -186,7 +203,25 @@ def sitemap(url,level):
|
|
186 |
except Exception as e:
|
187 |
print (e)
|
188 |
uri_key=sort_doc(link_box,8)
|
|
|
|
|
|
|
|
|
|
|
189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
return link1,link2,uri_key
|
191 |
|
192 |
|
|
|
2 |
import requests
|
3 |
import bs4
|
4 |
|
5 |
+
######## Load Database ########
# Module-level setup: read the Hugging Face credentials/config and fetch the
# crawl index JSON stored in the dataset repo. The result is kept in `lod`
# (an empty dict when nothing usable could be fetched).

import json
import os  # needed for os.environ below
import uuid

from huggingface_hub import HfApi, upload_file

token = os.environ.get("HF_TOKEN")
username = "omnibus"
dataset_name = "tmp"
save_data = f'https://huggingface.co/datasets/{username}/{dataset_name}/raw/main/'
# Hand the client the token read from the environment rather than an empty string.
api = HfApi(token=token)
filename = "test"

try:
    # `filename` is the name defined above (the original referenced an
    # undefined `file_n`). The timeout keeps a stalled request from hanging
    # the import forever.
    r = requests.get(f'{save_data}crawl/{filename}.json', timeout=30)
    print(f'status code main:: {r.status_code}')
    if r.status_code == 200:
        lod = json.loads(r.text)
    else:
        lod = {}
except (requests.RequestException, ValueError) as e:
    # Network failure or a body that is not valid JSON: start from an empty index.
    print(e)
    lod = {}

#############################
|
28 |
+
|
29 |
def sort_doc(in_list,steps_in=0,control=None):
|
30 |
control_json={'control':'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ','char':'','leng':62}
|
31 |
text=str(in_list)
|
|
|
|
|
|
|
|
|
32 |
key_cnt=len(in_list)
|
33 |
print(key_cnt)
|
34 |
control_char=list(control_json['control'])
|
|
|
96 |
print(j)
|
97 |
out_js = out_js+control_char[j]
|
98 |
sen_obj=in_list[i]
|
|
|
|
|
|
|
99 |
json_out[out_js]=sen_obj
|
100 |
print ("#################")
|
101 |
print (out_js)
|
|
|
203 |
except Exception as e:
|
204 |
print (e)
|
205 |
uri_key=sort_doc(link_box,8)
|
206 |
+
######## Save Database ########
# Merge the freshly built `uri_key` entries into the loaded index `lod`,
# write the merged index to a uniquely named local JSON file, and upload it
# to crawl/{filename}.json in the dataset repo.
uid = uuid.uuid4()
for ea in uri_key:
    # Only add values that are not already stored under some key.
    if not any(uri_key[ea] == x for x in lod.values()):
        # NOTE(review): an existing entry under the same key `ea` is replaced here;
        # confirm keys from sort_doc cannot collide across runs.
        lod[ea] = uri_key[ea]

# Write the merged index (not just this run's `uri_key`) so entries from
# earlier runs survive the upload. The `with` block closes the file itself.
with open(f'{uid}.json', 'w') as f:
    f.write(json.dumps(lod, indent=4))

upload_file(
    path_or_fileobj=f"{uid}.json",
    path_in_repo=f"crawl/{filename}.json",
    repo_id=f"{username}/{dataset_name}",
    repo_type="dataset",
    token=token,
)
#################################
|
225 |
return link1,link2,uri_key
|
226 |
|
227 |
|