oceddyyy committed
Commit 6c7b457 · verified · 1 Parent(s): a6db1b9

Update app.py

Files changed (1): app.py (+9 -11)
app.py CHANGED
@@ -1,22 +1,23 @@
 import json
-from transformers import pipeline
+from transformers import pipeline, AutoModelForSeq2SeqLM, T5Tokenizer, AutoTokenizer
 import gradio as gr
 
 # Load question-generation and question-answering pipelines
-# Use 'text2text-generation' for QG since 'e2e-qg' is not a recognized task
+# Use T5Tokenizer with use_fast=False to avoid tiktoken dependency
+qg_model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-small-qa-qg-hl")
+qg_tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-small-qa-qg-hl", use_fast=False)
 qg_pipeline = pipeline(
-    "text2text-generation",
-    model="valhalla/t5-small-qa-qg-hl",
-    tokenizer="valhalla/t5-small-qa-qg-hl"
+    "text2text-generation",
+    model=qg_model,
+    tokenizer=qg_tokenizer
 )
-qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
+qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", tokenizer="distilbert-base-cased-distilled-squad")
 
 # Simple chunking: split on paragraphs (for demo)
 def split_chunks(text, max_len=200):
     paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
     chunks = []
     for p in paragraphs:
-        # further split long paragraphs
         words = p.split()
         if len(words) <= max_len:
             chunks.append(p)
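
Note on this hunk: T5Tokenizer is the slow, SentencePiece-based implementation, so loading it directly never goes through the fast-tokenizer conversion path. A minimal sketch of the new loading code in isolation (model name as in the diff; the printed shape is illustrative, not an actual run):

from transformers import AutoModelForSeq2SeqLM, T5Tokenizer, pipeline

# Mirrors the commit: T5Tokenizer is the slow SentencePiece tokenizer;
# use_fast=False makes the no-fast-conversion intent explicit.
qg_tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-small-qa-qg-hl", use_fast=False)
qg_model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-small-qa-qg-hl")
qg = pipeline("text2text-generation", model=qg_model, tokenizer=qg_tokenizer)

print(qg("generate question: The Eiffel Tower was completed in 1889.", max_length=64))
# Expected output shape: [{'generated_text': '...'}]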
 
@@ -31,21 +32,18 @@ def convert_text(raw_text):
     chunks = split_chunks(raw_text)
     qna_list = []
     for chunk in chunks:
-        # Generate raw Q&A pairs
         try:
-            # The model expects a prompt prefix for QG
             prompt = f"generate question: {chunk}"
             outputs = qg_pipeline(prompt, max_length=64, clean_up_tokenization_spaces=True)
         except Exception:
             continue
         for out in outputs:
-            question = out["generated_text"].strip()
+            question = out.get("generated_text", out.get("text", "")).strip()
             if not question.endswith("?"):
                 question += "?"
             # Refine answer using QA pipeline
             ans = qa_pipeline({"question": question, "context": chunk})
             answer = ans.get("answer", "").strip()
-            # Append result
             qna_list.append({"question": question, "answer": answer})
     # Deduplicate
     unique = []
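
For context, a minimal sketch of how the changed lines fit together end to end, mirroring the generation loop in convert_text (assumes it runs inside the updated app.py so split_chunks, qg_pipeline, and qa_pipeline are in scope; the sample text is illustrative):

sample = "The Nile is the longest river in Africa.\nIt flows through eleven countries."

for chunk in split_chunks(sample):
    outputs = qg_pipeline(f"generate question: {chunk}", max_length=64)
    for out in outputs:
        # The .get fallback covers pipelines that return 'text' instead of 'generated_text'
        question = out.get("generated_text", out.get("text", "")).strip()
        if not question.endswith("?"):
            question += "?"
        answer = qa_pipeline({"question": question, "context": chunk}).get("answer", "")
        print(f"Q: {question}  A: {answer}")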