Update app.py
app.py
CHANGED
@@ -1,68 +1,177 @@
(Removed: the previous 68-line version, which survives only in fragments. It imported `json` and `gradio`, loaded a `valhalla/t5-` question-generation model via `AutoModelForSeq2SeqLM` with a tokenizer and a `pipeline`, deduplicated the generated pairs (`seen.add(key)`; `return json.dumps(unique, indent=2, ensure_ascii=False)`), and served a `gr.Blocks()` interface from `main()`.)

Added: the new version of app.py, reconstructed below in sections.
```python
import json
import re
import os
import spacy
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
import gradio as gr
from huggingface_hub import Repository
from datetime import datetime

# spaCy pipeline used for noun-phrase (answer-span) extraction
nlp = spacy.load("en_core_web_sm")

# T5 checkpoint fine-tuned for answer-aware question generation
qg_model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-base-qa-qg-hl")
qg_tokenizer = AutoTokenizer.from_pretrained("valhalla/t5-base-qa-qg-hl", use_fast=True)
qg_pipeline = pipeline("text2text-generation", model=qg_model, tokenizer=qg_tokenizer)
```
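The valhalla/t5-base-qa-qg-hl checkpoint generates a question about whatever span is wrapped in `<hl>` tokens, which is exactly the prompt the helper functions below construct. A minimal sketch of that format (the sentence is hypothetical):

```python
# Hypothetical input: the <hl>...<hl> markers flag the answer span the question should target.
example = "generate question: The library opens at <hl>8 a.m.<hl> on weekdays."
print(qg_pipeline(example, max_length=64)[0]["generated_text"])
```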
```python
def extract_paragraph_facts(raw_text):
    # Each blank-line-separated paragraph is treated as one "fact"
    return [p.strip() for p in raw_text.strip().split("\n\n") if p.strip()]

def extract_noun_phrases(text):
    doc = nlp(text)
    return [np.text for np in doc.noun_chunks]

def auto_highlight_noun_phrase(text):
    # Prefer the longest noun chunk that is multi-word or rooted in a noun
    doc = nlp(text)
    noun_phrases = sorted(doc.noun_chunks, key=lambda np: len(np.text), reverse=True)
    for np in noun_phrases:
        if len(np.text.split()) > 1 or np.root.pos_ == "NOUN":
            return np.text
    return text

def highlight_selected_phrase(fact, selected_np):
    # Wrap the chosen answer span in the model's <hl> highlight tokens
    return fact.replace(selected_np, f"<hl>{selected_np}<hl>", 1)
```
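A quick sketch of what these helpers return, on a made-up two-paragraph input (the exact noun chunks depend on the spaCy model):

```python
text = "The cafeteria opens at 7 a.m.\n\nFinal exams run for two weeks."
print(extract_paragraph_facts(text))
# ['The cafeteria opens at 7 a.m.', 'Final exams run for two weeks.']
print(extract_noun_phrases(text.split("\n\n")[0]))
# e.g. ['The cafeteria', '7 a.m.']
print(highlight_selected_phrase("The cafeteria opens at 7 a.m.", "7 a.m."))
# 'The cafeteria opens at <hl>7 a.m.<hl>'
```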
```python
def generate_single_qna(fact, noun_phrase, min_len, max_len, temperature, top_k, top_p):
    hl_fact = highlight_selected_phrase(fact, noun_phrase)
    try:
        prompt = f"generate question: {hl_fact}"
        output = qg_pipeline(
            prompt,
            min_length=min_len,
            max_length=max_len,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            do_sample=True
        )[0]
        question = output.get("generated_text", "").strip()
        if not question.endswith("?"):
            question += "?"
    except Exception as e:
        question = f"Error generating question: {str(e)}"

    return {"question": question, "answer": fact}

def generate_qna_all(input_text, selected_fact, selected_np, min_len, max_len, temperature, top_k, top_p):
    facts = extract_paragraph_facts(input_text)
    results = []

    if selected_fact:
        # Generate for the one fact the user picked, honoring a chosen noun phrase
        noun_phrase = selected_np if selected_np else auto_highlight_noun_phrase(selected_fact)
        result = generate_single_qna(selected_fact, noun_phrase, min_len, max_len, temperature, top_k, top_p)
        results.append(result)
    else:
        # Otherwise generate one Q&A pair per paragraph
        for fact in facts:
            noun_phrase = auto_highlight_noun_phrase(fact)
            result = generate_single_qna(fact, noun_phrase, min_len, max_len, temperature, top_k, top_p)
            results.append(result)

    return json.dumps(results, indent=2, ensure_ascii=False)
```
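Both entry points return a JSON string rather than Python objects, so the UI textbox can display the result directly. A hypothetical call with no fact selected, which generates one pair per paragraph (question wording varies because `do_sample=True`):

```python
# Hypothetical invocation of the function defined above.
print(generate_qna_all(
    "The cafeteria opens at 7 a.m.\n\nFinal exams run for two weeks.",
    None, None,          # no selected fact / noun phrase -> process every paragraph
    10, 64, 1.0, 50, 0.95,
))
# [
#   {"question": "When does the cafeteria open?", "answer": "The cafeteria opens at 7 a.m."},
#   ...
# ]
```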
```python
def save_json_to_dataset(json_str):
    try:
        # The HF token is stored in a Space secret named "QandA_Generator"
        hf_token = os.environ.get("QandA_Generator")
        if not hf_token:
            return "❌ HF token not found in environment."

        repo_id = "University_Inquiries_AI_Chatbot"
        dataset_file = "dataset.json"
        local_dir = "hf_repo"

        repo = Repository(
            local_dir=local_dir,
            clone_from=f"datasets/{repo_id}",
            use_auth_token=hf_token
        )
        repo.git_pull()

        full_path = os.path.join(local_dir, dataset_file)

        if os.path.exists(full_path):
            with open(full_path, "r", encoding="utf-8") as f:
                existing_data = json.load(f)
        else:
            existing_data = []

        new_data = json.loads(json_str)

        # Stamp each new entry with the current month and year
        now = datetime.now()
        for entry in new_data:
            entry["month"] = now.strftime("%B")
            entry["year"] = now.year

        updated_data = existing_data + new_data

        with open(full_path, "w", encoding="utf-8") as f:
            json.dump(updated_data, f, indent=2, ensure_ascii=False)

        repo.push_to_hub(commit_message="📥 Add new Q&A with timestamp")

        return "✅ Data with timestamp successfully pushed to HF dataset!"
    except Exception as e:
        return f"❌ Error: {str(e)}"
```
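`Repository` and its `use_auth_token` argument are deprecated in recent `huggingface_hub` releases. A sketch of the same push using `HfApi.upload_file`, assuming the same repo and secret (untested against this Space; the repo_id may need a namespace prefix):

```python
from huggingface_hub import HfApi

def push_dataset_file(local_path, hf_token):
    # Uploads one file in a single commit; no local git clone or pull needed.
    api = HfApi(token=hf_token)
    api.upload_file(
        path_or_fileobj=local_path,
        path_in_repo="dataset.json",
        repo_id="University_Inquiries_AI_Chatbot",  # same repo as save_json_to_dataset above
        repo_type="dataset",
        commit_message="📥 Add new Q&A with timestamp",
    )
```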
```python
def on_extract_facts(text):
    facts = extract_paragraph_facts(text)
    default_fact = facts[0] if facts else None
    # Populate the fact dropdown; clear the noun-phrase dropdown
    return gr.update(choices=facts, value=default_fact), gr.update(choices=[], value=None)

def on_select_fact(fact):
    noun_phrases = extract_noun_phrases(fact)
    return gr.update(choices=noun_phrases, value=noun_phrases[0] if noun_phrases else None)

def main():
    with gr.Blocks() as demo:
        gr.Markdown("## Paragraph-to-Question Generator (Auto Q&A for HF Dataset)")

        input_text = gr.Textbox(lines=10, label="Enter Data (separated by paragraph, one per question)")

        with gr.Accordion("⚙️ Customize Question Generation", open=False):
            extract_btn = gr.Button("Extract & Customize")
            fact_dropdown = gr.Dropdown(label="Select a Fact", interactive=True)
            np_dropdown = gr.Dropdown(label="Select Noun Phrase to Highlight (optional)", interactive=True)

            extract_btn.click(fn=on_extract_facts, inputs=input_text, outputs=[fact_dropdown, np_dropdown])
            fact_dropdown.change(fn=on_select_fact, inputs=fact_dropdown, outputs=np_dropdown)

            gr.Markdown("🔽 **Min Length**: Minimum number of tokens in the generated question.")
            min_len = gr.Slider(5, 50, value=10, step=1, label="Min Length")

            gr.Markdown("🔼 **Max Length**: Maximum number of tokens in the generated question.")
            max_len = gr.Slider(20, 100, value=64, step=1, label="Max Length")

            gr.Markdown("🌡️ **Temperature**: Controls randomness. Lower = more predictable, higher = more creative.")
            temperature = gr.Slider(0.1, 1.5, value=1.0, step=0.1, label="Temperature")

            gr.Markdown("🎯 **Top-k Sampling**: Limits sampling to the top-k most likely words.")
            top_k = gr.Slider(0, 100, value=50, step=1, label="Top-k")

            gr.Markdown("🎲 **Top-p (Nucleus Sampling)**: Selects from the smallest set of words with a cumulative probability > p.")
            top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")

        gr.Markdown("✏️ You can manually edit the generated JSON here or paste your own in the same format.")
        output_json = gr.Textbox(
            lines=14,
            label="Q&A JSON",
            interactive=True,
            placeholder='{\n"question": "Your question?",\n"answer": "Your answer."\n},'
        )

        with gr.Row():
            generate_btn = gr.Button("Generate Q&A")
            send_btn = gr.Button("📤 Send to Dataset")

        generate_btn.click(
            fn=generate_qna_all,
            inputs=[input_text, fact_dropdown, np_dropdown, min_len, max_len, temperature, top_k, top_p],
            outputs=output_json
        )

        send_status = gr.Textbox(label="Save Status", interactive=False)
        send_btn.click(fn=save_json_to_dataset, inputs=output_json, outputs=send_status)

    demo.launch()

if __name__ == "__main__":
    main()
```
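One deployment note: `spacy.load("en_core_web_sm")` raises `OSError` if the model package is not installed in the Space. A hedged fallback sketch (an assumption about this Space's environment, not part of the diff):

```python
# Fallback download if en_core_web_sm is missing from the environment.
import spacy
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
```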