File size: 2,520 Bytes
9762129
 
28b240b
 
9762129
a6db1b9
 
 
 
 
 
9762129
28b240b
9762129
 
 
 
 
 
 
 
 
 
 
 
 
 
28b240b
9762129
 
 
 
 
 
 
a6db1b9
 
 
9762129
 
a6db1b9
 
 
 
9762129
 
 
 
a6db1b9
9762129
 
 
 
 
 
 
 
 
28b240b
9762129
 
 
 
 
 
 
 
 
28b240b
 
9762129
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import json
from transformers import pipeline
import gradio as gr

# Load question-generation and question-answering pipelines
# Use 'text2text-generation' for QG since 'e2e-qg' is not a recognized task
# NOTE: these run at import time and will download model weights on first use —
# expect a slow first start and network access here.
qg_pipeline = pipeline(
    "text2text-generation", 
    model="valhalla/t5-small-qa-qg-hl",
    tokenizer="valhalla/t5-small-qa-qg-hl"
)
# Extractive QA model used to refine/ground each generated question's answer.
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Simple chunking: split on paragraphs (for demo)
def split_chunks(text, max_len=200):
    """Split *text* into chunks of at most *max_len* words.

    The text is first broken on newlines into paragraphs (blank/whitespace-only
    lines are dropped). A paragraph short enough is kept verbatim (stripped);
    a longer one is re-split into word groups of *max_len*, which normalizes
    its internal whitespace to single spaces.
    """
    chunks = []
    for para in (line.strip() for line in text.split("\n")):
        if not para:
            continue
        words = para.split()
        if len(words) <= max_len:
            chunks.append(para)
            continue
        # Long paragraph: emit fixed-size word windows.
        chunks.extend(
            " ".join(words[start : start + max_len])
            for start in range(0, len(words), max_len)
        )
    return chunks

# Conversion function
def convert_text(raw_text):
    """Convert raw handbook text into a JSON array of Q&A pairs.

    Each chunk of *raw_text* is fed to the question-generation pipeline,
    every generated question is answered against its own chunk by the QA
    pipeline, and duplicate (question, answer) pairs are dropped while
    preserving first-seen order. Returns a pretty-printed JSON string.
    """
    pairs = []
    seen = set()
    for segment in split_chunks(raw_text):
        try:
            # The model expects a prompt prefix for QG.
            generated = qg_pipeline(
                f"generate question: {segment}",
                max_length=64,
                clean_up_tokenization_spaces=True,
            )
        except Exception:
            # Best-effort: skip segments the generator cannot handle.
            continue
        for item in generated:
            question = item["generated_text"].strip()
            if not question.endswith("?"):
                question += "?"
            # Ground the answer in the same chunk via extractive QA.
            answer = qa_pipeline(
                {"question": question, "context": segment}
            ).get("answer", "").strip()
            # Deduplicate on (question, answer), keeping first occurrence.
            key = (question, answer)
            if key in seen:
                continue
            seen.add(key)
            pairs.append({"question": question, "answer": answer})
    return json.dumps(pairs, indent=2, ensure_ascii=False)

# Gradio interface
def main():
    """Build and launch the Gradio UI for the text-to-Q&A converter."""
    with gr.Blocks() as app:
        gr.Markdown("# Handbook Text to Q&A Converter")
        text_in = gr.Textbox(lines=10, placeholder="Paste handbook text here...", label="Raw Text")
        json_out = gr.Textbox(lines=10, label="Generated Q&A JSON")
        run_button = gr.Button("Convert")
        # Wire the button to the conversion function.
        run_button.click(fn=convert_text, inputs=text_in, outputs=json_out)
    # Blocking call: starts the local web server.
    app.launch()

# Launch the app only when run as a script, not when imported as a module.
if __name__ == "__main__":
    main()