# Hugging Face Spaces page banner captured during scraping ("Spaces: Sleeping");
# kept as a comment so the file remains valid Python.
import json

from transformers import pipeline
import gradio as gr

# Question-generation pipeline. The valhalla T5 QA/QG model is loaded via the
# generic "text2text-generation" task because "e2e-qg" is not a registered
# pipeline task name in transformers.
qg_pipeline = pipeline(
    "text2text-generation",
    model="valhalla/t5-small-qa-qg-hl",
    tokenizer="valhalla/t5-small-qa-qg-hl",
)

# Extractive question-answering pipeline used to refine each generated
# question's answer against the source chunk.
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
# Simple chunking: split on paragraphs (for demo) | |
def split_chunks(text, max_len=200):
    """Split raw text into chunks of at most ``max_len`` words.

    The text is first split on newlines into paragraphs (blank/whitespace-only
    lines are dropped). A paragraph with ``max_len`` words or fewer is kept
    verbatim; a longer one is broken into consecutive ``max_len``-word pieces
    joined by single spaces.

    Args:
        text: Raw input text.
        max_len: Maximum number of words per chunk.

    Returns:
        A list of non-empty text chunks (empty list for empty input).
    """
    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
    chunks = []
    for p in paragraphs:
        words = p.split()
        if len(words) <= max_len:
            # Short paragraph: keep its original (stripped) text unchanged.
            chunks.append(p)
        else:
            # Over-long paragraph: fixed-size word windows.
            for i in range(0, len(words), max_len):
                chunks.append(" ".join(words[i : i + max_len]))
    return chunks
# Conversion function | |
def convert_text(raw_text):
    """Convert raw handbook text into a JSON string of Q&A pairs.

    Each chunk produced by :func:`split_chunks` is fed to the question-
    generation model; every generated question is then answered extractively
    against the same chunk by the QA model. Duplicate (question, answer)
    pairs are removed while preserving first-seen order.

    Args:
        raw_text: Free-form text pasted by the user.

    Returns:
        A pretty-printed JSON array of ``{"question", "answer"}`` objects.
    """
    qna_list = []
    for chunk in split_chunks(raw_text):
        try:
            # The T5 QG model expects this task prefix in its prompt.
            prompt = f"generate question: {chunk}"
            outputs = qg_pipeline(prompt, max_length=64, clean_up_tokenization_spaces=True)
        except Exception:
            # Best effort: skip chunks the generator cannot handle.
            continue
        for out in outputs:
            question = out["generated_text"].strip()
            if not question.endswith("?"):
                question += "?"
            # Refine the answer by extracting a span from the source chunk.
            ans = qa_pipeline({"question": question, "context": chunk})
            answer = ans.get("answer", "").strip()
            qna_list.append({"question": question, "answer": answer})
    # Deduplicate on (question, answer), keeping first occurrences in order.
    seen = set()
    unique = []
    for qa in qna_list:
        key = (qa["question"], qa["answer"])
        if key not in seen:
            seen.add(key)
            unique.append(qa)
    return json.dumps(unique, indent=2, ensure_ascii=False)
# Gradio interface | |
def main():
    """Build and launch the Gradio UI for the text-to-Q&A converter."""
    with gr.Blocks() as demo:
        gr.Markdown("# Handbook Text to Q&A Converter")
        input_text = gr.Textbox(lines=10, placeholder="Paste handbook text here...", label="Raw Text")
        output_json = gr.Textbox(lines=10, label="Generated Q&A JSON")
        convert_btn = gr.Button("Convert")
        # Wire the button to the conversion function: textbox in, JSON out.
        convert_btn.click(fn=convert_text, inputs=input_text, outputs=output_json)
    demo.launch()
# Launch the app only when run as a script, not on import.
if __name__ == "__main__":
    main()