oceddyyy committed
Commit dc2c5df · verified · 1 parent: 6c7b457

Update app.py

Files changed (1):
  1. app.py (+168, -59)
app.py CHANGED
@@ -1,68 +1,177 @@
  import json
- from transformers import pipeline, AutoModelForSeq2SeqLM, T5Tokenizer, AutoTokenizer
  import gradio as gr

- # Load question-generation and question-answering pipelines
- # Use T5Tokenizer with use_fast=False to avoid tiktoken dependency
- qg_model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-small-qa-qg-hl")
- qg_tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-small-qa-qg-hl", use_fast=False)
- qg_pipeline = pipeline(
-     "text2text-generation",
-     model=qg_model,
-     tokenizer=qg_tokenizer
- )
- qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", tokenizer="distilbert-base-cased-distilled-squad")
-
- # Simple chunking: split on paragraphs (for demo)
- def split_chunks(text, max_len=200):
-     paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
-     chunks = []
-     for p in paragraphs:
-         words = p.split()
-         if len(words) <= max_len:
-             chunks.append(p)
          else:
-             for i in range(0, len(words), max_len):
-                 chunk = " ".join(words[i : i + max_len])
-                 chunks.append(chunk)
-     return chunks
-
- # Conversion function
- def convert_text(raw_text):
-     chunks = split_chunks(raw_text)
-     qna_list = []
-     for chunk in chunks:
-         try:
-             prompt = f"generate question: {chunk}"
-             outputs = qg_pipeline(prompt, max_length=64, clean_up_tokenization_spaces=True)
-         except Exception:
-             continue
-         for out in outputs:
-             question = out.get("generated_text", out.get("text", "")).strip()
-             if not question.endswith("?"):
-                 question += "?"
-             # Refine answer using QA pipeline
-             ans = qa_pipeline({"question": question, "context": chunk})
-             answer = ans.get("answer", "").strip()
-             qna_list.append({"question": question, "answer": answer})
-     # Deduplicate
-     unique = []
-     seen = set()
-     for qa in qna_list:
-         key = (qa['question'], qa['answer'])
-         if key not in seen:
-             unique.append(qa)
-             seen.add(key)
-     return json.dumps(unique, indent=2, ensure_ascii=False)
-
- # Gradio interface
  def main():
      with gr.Blocks() as demo:
-         gr.Markdown("# Handbook Text to Q&A Converter")
-         input_text = gr.Textbox(lines=10, placeholder="Paste handbook text here...", label="Raw Text")
-         output_json = gr.Textbox(lines=10, label="Generated Q&A JSON")
-         convert_btn = gr.Button("Convert")
-         convert_btn.click(fn=convert_text, inputs=input_text, outputs=output_json)
      demo.launch()

  if __name__ == "__main__":
 
  import json
+ import re
+ import os
+ import spacy
+ from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
  import gradio as gr
+ from huggingface_hub import Repository
+ from datetime import datetime

+ nlp = spacy.load("en_core_web_sm")
+
+ qg_model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-base-qa-qg-hl")
+ qg_tokenizer = AutoTokenizer.from_pretrained("valhalla/t5-base-qa-qg-hl", use_fast=True)
+ qg_pipeline = pipeline("text2text-generation", model=qg_model, tokenizer=qg_tokenizer)
+
+ def extract_paragraph_facts(raw_text):
+     return [p.strip() for p in raw_text.strip().split("\n\n") if p.strip()]
+
+ def extract_noun_phrases(text):
+     doc = nlp(text)
+     return [np.text for np in doc.noun_chunks]
+
+ def auto_highlight_noun_phrase(text):
+     doc = nlp(text)
+     noun_phrases = sorted(doc.noun_chunks, key=lambda np: len(np.text), reverse=True)
+     for np in noun_phrases:
+         if len(np.text.split()) > 1 or np.root.pos_ == "NOUN":
+             return np.text
+     return text
+
+ def highlight_selected_phrase(fact, selected_np):
+     return fact.replace(selected_np, f"<hl>{selected_np}<hl>", 1)
+
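
# A minimal sketch of the prompt format built by the helpers above (the fact
# and phrase are made-up examples; the valhalla qa-qg-hl checkpoints expect
# the answer span wrapped in <hl> markers inside a "generate question:" prompt):
#
#   fact = "The library opens at 8 AM on weekdays."
#   highlight_selected_phrase(fact, "8 AM")
#   # -> "The library opens at <hl>8 AM<hl> on weekdays."
#   qg_pipeline("generate question: The library opens at <hl>8 AM<hl> on weekdays.")
#   # -> [{"generated_text": "..."}]  (a question about the highlighted span)
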
+ def generate_single_qna(fact, noun_phrase, min_len, max_len, temperature, top_k, top_p):
+     hl_fact = highlight_selected_phrase(fact, noun_phrase)
+     try:
+         prompt = f"generate question: {hl_fact}"
+         output = qg_pipeline(
+             prompt,
+             min_length=min_len,
+             max_length=max_len,
+             temperature=temperature,
+             top_k=top_k,
+             top_p=top_p,
+             do_sample=True
+         )[0]
+         question = output.get("generated_text", "").strip()
+         if not question.endswith("?"):
+             question += "?"
+     except Exception as e:
+         question = f"Error generating question: {str(e)}"
+
+     return {"question": question, "answer": fact}
+
+ def generate_qna_all(input_text, selected_fact, selected_np, min_len, max_len, temperature, top_k, top_p):
+     facts = extract_paragraph_facts(input_text)
+     results = []
+
+     if selected_fact:
+         noun_phrase = selected_np if selected_np else auto_highlight_noun_phrase(selected_fact)
+         result = generate_single_qna(selected_fact, noun_phrase, min_len, max_len, temperature, top_k, top_p)
+         results.append(result)
+     else:
+         for fact in facts:
+             noun_phrase = auto_highlight_noun_phrase(fact)
+             result = generate_single_qna(fact, noun_phrase, min_len, max_len, temperature, top_k, top_p)
+             results.append(result)
+
+     return json.dumps(results, indent=2, ensure_ascii=False)
+
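
# generate_qna_all returns a JSON string: a list of {"question", "answer"}
# objects, one per paragraph (or a single object when a specific fact is
# selected in the UI). Shape sketch with placeholder values:
#
#   [
#     {
#       "question": "Generated question?",
#       "answer": "The source paragraph, kept verbatim as the answer."
#     }
#   ]
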
+ def save_json_to_dataset(json_str):
+     try:
+         hf_token = os.environ.get("QandA_Generator")
+         if not hf_token:
+             return "❌ QandA_Generator token not found in environment."
+
+         repo_id = "University_Inquiries_AI_Chatbot"
+         dataset_file = "dataset.json"
+         local_dir = "hf_repo"
+
+         repo = Repository(
+             local_dir=local_dir,
+             clone_from=f"datasets/{repo_id}",
+             use_auth_token=hf_token
+         )
+         repo.git_pull()
+
+         full_path = os.path.join(local_dir, dataset_file)
+
+         if os.path.exists(full_path):
+             with open(full_path, "r", encoding="utf-8") as f:
+                 existing_data = json.load(f)
          else:
+             existing_data = []
+
+         new_data = json.loads(json_str)
+
+         now = datetime.now()
+         for entry in new_data:
+             entry["month"] = now.strftime("%B")
+             entry["year"] = now.year
+
+         updated_data = existing_data + new_data
+
+         with open(full_path, "w", encoding="utf-8") as f:
+             json.dump(updated_data, f, indent=2, ensure_ascii=False)
+
+         repo.push_to_hub(commit_message="📥 Add new Q&A with timestamp")
+
+         return "✅ Data with timestamp successfully pushed to HF dataset!"
+     except Exception as e:
+         return f"❌ Error: {str(e)}"
+
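
# After save_json_to_dataset runs, each entry appended to dataset.json also
# carries the month and year it was pushed (placeholder values shown):
#
#   {
#     "question": "Generated question?",
#     "answer": "Source paragraph.",
#     "month": "January",
#     "year": 2025
#   }
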
+ def on_extract_facts(text):
+     facts = extract_paragraph_facts(text)
+     default_fact = facts[0] if facts else None
+     return gr.update(choices=facts, value=default_fact), gr.update(choices=[], value=None)
+
+ def on_select_fact(fact):
+     noun_phrases = extract_noun_phrases(fact)
+     return gr.update(choices=noun_phrases, value=noun_phrases[0] if noun_phrases else None)
+
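
# How these callbacks are wired up in main() below: "Extract & Customize"
# runs on_extract_facts to fill the fact dropdown; picking a fact runs
# on_select_fact to fill the noun-phrase dropdown; "Generate Q&A" then uses
# the selection, or falls back to auto_highlight_noun_phrase on every
# paragraph when nothing is selected.
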
  def main():
      with gr.Blocks() as demo:
+         gr.Markdown("## Paragraph-to-Question Generator (Auto Q&A for HF Dataset)")
+
+         input_text = gr.Textbox(lines=10, label="Enter Data (separated by paragraph, one per question)")
+
+         with gr.Accordion("⚙️ Customize Question Generation", open=False):
+             extract_btn = gr.Button("Extract & Customize")
+             fact_dropdown = gr.Dropdown(label="Select a Fact", interactive=True)
+             np_dropdown = gr.Dropdown(label="Select Noun Phrase to Highlight (optional)", interactive=True)
+
+             extract_btn.click(fn=on_extract_facts, inputs=input_text, outputs=[fact_dropdown, np_dropdown])
+             fact_dropdown.change(fn=on_select_fact, inputs=fact_dropdown, outputs=np_dropdown)
+
+             gr.Markdown("🔽 **Min Length**: Minimum number of tokens in the generated question.")
+             min_len = gr.Slider(5, 50, value=10, step=1, label="Min Length")
+
+             gr.Markdown("🔼 **Max Length**: Maximum number of tokens in the generated question.")
+             max_len = gr.Slider(20, 100, value=64, step=1, label="Max Length")
+
+             gr.Markdown("🌡️ **Temperature**: Controls randomness. Lower = more predictable, higher = more creative.")
+             temperature = gr.Slider(0.1, 1.5, value=1.0, step=0.1, label="Temperature")
+
+             gr.Markdown("🎯 **Top-k Sampling**: Limits sampling to the top-k most likely words.")
+             top_k = gr.Slider(0, 100, value=50, step=1, label="Top-k")
+
+             gr.Markdown("🎲 **Top-p (Nucleus Sampling)**: Selects from the smallest set of words with a cumulative probability > p.")
+             top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")
+
+         gr.Markdown("✏️ You can manually edit the generated JSON here or paste your own in the same format.")
+         output_json = gr.Textbox(
+             lines=14,
+             label="Q&A JSON",
+             interactive=True,
+             placeholder='{\n"question": "Your question?",\n"answer": "Your answer."\n},'
+         )
+
+         with gr.Row():
+             generate_btn = gr.Button("Generate Q&A")
+             send_btn = gr.Button("📤 Send to Dataset")
+
+         generate_btn.click(
+             fn=generate_qna_all,
+             inputs=[input_text, fact_dropdown, np_dropdown, min_len, max_len, temperature, top_k, top_p],
+             outputs=output_json
+         )
+
+         send_status = gr.Textbox(label="Save Status", interactive=False)
+         send_btn.click(fn=save_json_to_dataset, inputs=output_json, outputs=send_status)
+
      demo.launch()

  if __name__ == "__main__":