# Hugging Face Space page header (scrape residue): "Spaces: Sleeping" —
# kept as a comment so the file remains valid Python.
import json

import gradio as gr
from transformers import pipeline, AutoModelForSeq2SeqLM, T5Tokenizer, AutoTokenizer

# Question-generation model (highlight-style T5 fine-tuned for QA/QG).
# Use the slow T5Tokenizer with use_fast=False to avoid the tiktoken dependency.
qg_model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-small-qa-qg-hl")
qg_tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-small-qa-qg-hl", use_fast=False)
qg_pipeline = pipeline(
    "text2text-generation",
    model=qg_model,
    tokenizer=qg_tokenizer,
)

# Extractive question-answering pipeline (DistilBERT distilled on SQuAD);
# used to refine/ground the answers for generated questions.
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert-base-cased-distilled-squad",
    tokenizer="distilbert-base-cased-distilled-squad",
)
# Simple chunking: split on paragraphs (for demo) | |
def split_chunks(text, max_len=200):
    """Split *text* into chunks of at most *max_len* words.

    The input is broken into non-empty, whitespace-stripped paragraphs
    (newline-separated). Paragraphs within the word budget are kept whole;
    longer ones are sliced into consecutive *max_len*-word windows.

    Args:
        text: Free-form text; paragraphs are separated by newlines.
        max_len: Maximum number of words allowed per chunk.

    Returns:
        A list of chunk strings in original order.
    """
    chunks = []
    for paragraph in (line.strip() for line in text.split("\n")):
        if not paragraph:
            continue  # drop blank / whitespace-only lines
        words = paragraph.split()
        if len(words) <= max_len:
            chunks.append(paragraph)
            continue
        # Oversized paragraph: emit fixed-size word windows.
        chunks.extend(
            " ".join(words[start:start + max_len])
            for start in range(0, len(words), max_len)
        )
    return chunks
# Conversion function | |
def convert_text(raw_text):
    """Convert raw handbook text into a JSON list of Q&A pairs.

    Each chunk of the input is passed to the question-generation pipeline;
    every generated question is then answered against its source chunk by
    the extractive QA pipeline. Duplicate (question, answer) pairs are
    dropped, preserving first-seen order.

    Args:
        raw_text: Free-form text (paragraphs separated by newlines).

    Returns:
        A JSON-formatted string: a list of {"question", "answer"} objects.
    """
    qna_list = []
    for chunk in split_chunks(raw_text):
        try:
            prompt = f"generate question: {chunk}"
            outputs = qg_pipeline(prompt, max_length=64, clean_up_tokenization_spaces=True)
        except Exception:
            # Best-effort: one failing chunk must not abort the whole run.
            continue
        for out in outputs:
            question = out.get("generated_text", out.get("text", "")).strip()
            if not question:
                # Skip empty generations instead of emitting a bare "?".
                continue
            if not question.endswith("?"):
                question += "?"
            # Refine the answer with the QA pipeline. Guard it the same way
            # as generation: previously an exception here (e.g. a degenerate
            # question/context pair) crashed the entire conversion.
            try:
                ans = qa_pipeline({"question": question, "context": chunk})
            except Exception:
                continue
            answer = ans.get("answer", "").strip()
            qna_list.append({"question": question, "answer": answer})
    # Deduplicate on (question, answer) while preserving order.
    seen = set()
    unique = []
    for qa in qna_list:
        key = (qa["question"], qa["answer"])
        if key not in seen:
            seen.add(key)
            unique.append(qa)
    return json.dumps(unique, indent=2, ensure_ascii=False)
# Gradio interface | |
def main():
    """Build and launch the Gradio UI for the handbook Q&A converter."""
    with gr.Blocks() as demo:
        gr.Markdown("# Handbook Text to Q&A Converter")
        raw_box = gr.Textbox(lines=10, placeholder="Paste handbook text here...", label="Raw Text")
        result_box = gr.Textbox(lines=10, label="Generated Q&A JSON")
        run_button = gr.Button("Convert")
        # Wire the button: raw text in, pretty-printed JSON out.
        run_button.click(fn=convert_text, inputs=raw_box, outputs=result_box)
        demo.launch()
# Run the app only when executed as a script, not on import.
if __name__ == "__main__":
    main()