Update app.py
app.py
CHANGED
@@ -1,22 +1,23 @@
 import json
-from transformers import pipeline
+from transformers import pipeline, AutoModelForSeq2SeqLM, T5Tokenizer, AutoTokenizer
 import gradio as gr
 
 # Load question-generation and question-answering pipelines
-# Use
+# Use T5Tokenizer with use_fast=False to avoid tiktoken dependency
+qg_model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-small-qa-qg-hl")
+qg_tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-small-qa-qg-hl", use_fast=False)
 qg_pipeline = pipeline(
-    "text2text-generation",
-    model=
-    tokenizer=
+    "text2text-generation",
+    model=qg_model,
+    tokenizer=qg_tokenizer
 )
-qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
+qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", tokenizer="distilbert-base-cased-distilled-squad")
 
 # Simple chunking: split on paragraphs (for demo)
 def split_chunks(text, max_len=200):
     paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
     chunks = []
     for p in paragraphs:
-        # further split long paragraphs
         words = p.split()
         if len(words) <= max_len:
             chunks.append(p)
@@ -31,21 +32,18 @@ def convert_text(raw_text):
     chunks = split_chunks(raw_text)
     qna_list = []
     for chunk in chunks:
-        # Generate raw Q&A pairs
         try:
-            # The model expects a prompt prefix for QG
             prompt = f"generate question: {chunk}"
             outputs = qg_pipeline(prompt, max_length=64, clean_up_tokenization_spaces=True)
         except Exception:
             continue
         for out in outputs:
-            question = out
+            question = out.get("generated_text", out.get("text", "")).strip()
             if not question.endswith("?"):
                 question += "?"
             # Refine answer using QA pipeline
             ans = qa_pipeline({"question": question, "context": chunk})
             answer = ans.get("answer", "").strip()
-            # Append result
             qna_list.append({"question": question, "answer": answer})
     # Deduplicate
     unique = []
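
For context, a minimal standalone sketch of the updated question-generation and answer-refinement path. The model names and the out.get("generated_text", ...) parsing are taken from the diff above; the input sentence is purely illustrative, and output quality will depend on the model and prompt format.

from transformers import pipeline, AutoModelForSeq2SeqLM, T5Tokenizer

# Load the QG pipeline the same way the updated app.py does
qg_model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-small-qa-qg-hl")
qg_tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-small-qa-qg-hl", use_fast=False)
qg_pipeline = pipeline("text2text-generation", model=qg_model, tokenizer=qg_tokenizer)
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

chunk = "The Eiffel Tower was completed in 1889 and stands in Paris."  # illustrative input

# text2text-generation returns a list of dicts, normally keyed by "generated_text",
# which is why the update reads out.get("generated_text", ...) instead of using out directly
outputs = qg_pipeline(f"generate question: {chunk}", max_length=64)
for out in outputs:
    question = out.get("generated_text", out.get("text", "")).strip()
    if not question.endswith("?"):
        question += "?"
    # question-answering returns a dict with "answer", "score", "start", "end"
    ans = qa_pipeline({"question": question, "context": chunk})
    print({"question": question, "answer": ans.get("answer", "").strip()})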