pritamdeka committed
Commit 965aebe · verified · 1 Parent(s): f93074a

Update app.py

Files changed (1):
  1. app.py +45 -21
app.py CHANGED
@@ -16,9 +16,16 @@ import torch
 PUBMED_N = 100 # Number of abstracts to retrieve initially
 TOP_ABSTRACTS = 10 # Number of top semantic abstracts to keep per claim
 NLI_MODEL_NAME = "pritamdeka/PubMedBERT-MNLI-MedNLI"
-SBERT_MODEL_NAME = "pritamdeka/S-BioBert-snli-multinli-stsb"
+SBERT_MODEL_NAME = "pritamdeka/S-PubMedBert-MS-MARCO"
 NLI_LABELS = ['CONTRADICTION', 'NEUTRAL', 'ENTAILMENT']
 
+# --------- Summarizer model options ---------
+model_options = {
+    "Llama-3.2-1B-Instruct (Meta, gated)": "meta-llama/Llama-3.2-1B-Instruct",
+    "Gemma-3-1B-it (Google, gated)": "google/gemma-3-1b-it",
+    "TinyLlama-1.1B-Chat (Open)": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+}
+
 # --------- Indicator Phrases for Claim Extraction ---------
 indicator_phrases = [
     "found that", "findings suggest", "shows that", "showed that", "demonstrated", "demonstrates",
@@ -48,16 +55,7 @@ indicator_phrases = [
 nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)
 nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)
 sbert_model = SentenceTransformer(SBERT_MODEL_NAME)
-
-# --- Load fast Llama-3.2-1B-Instruct summarizer pipeline ---
-model_id = "meta-llama/Llama-3.2-1B-Instruct"
-pipe = pipeline(
-    "text-generation",
-    model=model_id,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto",
-    max_new_tokens=128,
-)
+pipe_cache = {}  # cache summarization pipelines
 
 def extract_claims_pattern(article_text):
     sentences = sent_tokenize(article_text)
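
With the eager Llama-only pipeline removed, `pipe_cache` lets each summarizer load lazily on first use (see `get_summarizer` below). The abstract-ranking step is untouched by this commit and not visible here; with a retrieval-tuned SBERT model it presumably looks something like this sketch (the function name and details are assumptions, not code from app.py):

```python
from sentence_transformers import util

# Hypothetical sketch: rank retrieved abstracts against a claim and keep
# the TOP_ABSTRACTS most similar ones by cosine similarity.
def rank_abstracts(claim, abstracts, top_k=TOP_ABSTRACTS):
    claim_emb = sbert_model.encode(claim, convert_to_tensor=True)
    abs_embs = sbert_model.encode(abstracts, convert_to_tensor=True)
    scores = util.cos_sim(claim_emb, abs_embs)[0]  # one score per abstract
    top_idx = scores.topk(min(top_k, len(abstracts))).indices
    return [abstracts[int(i)] for i in top_idx]
```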
@@ -130,11 +128,29 @@ def extract_evidence_nli(claim, title, abstract):
         })
     return evidence
 
-def summarize_evidence_llm(claim, evidence_list):
+def get_summarizer(model_choice):
+    model_id = model_options[model_choice]
+    if model_id in pipe_cache:
+        return pipe_cache[model_id]
+    kwargs = {
+        "model": model_id,
+        "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
+        "device_map": "auto",
+        "max_new_tokens": 128
+    }
+    # Add token for gated models (Gemma, Llama)
+    if any(gated in model_id for gated in ["meta-llama", "gemma"]):
+        hf_token = os.environ.get("HF_TOKEN", None)
+        if hf_token:
+            kwargs["token"] = hf_token
+        else:
+            raise RuntimeError(f"Model '{model_choice}' requires a Hugging Face access token. Please set 'HF_TOKEN' as a Space secret or environment variable.")
+    pipe_cache[model_id] = pipeline("text-generation", **kwargs)
+    return pipe_cache[model_id]
+
+def summarize_evidence_llm(claim, evidence_list, model_choice):
     support = [ev['sentence'] for ev in evidence_list if ev['label'] == 'ENTAILMENT']
     contradict = [ev['sentence'] for ev in evidence_list if ev['label'] == 'CONTRADICTION']
-
-    # Compose prompt for summarization.
     messages = [
         {"role": "system", "content": "You are a helpful biomedical assistant. Summarize scientific evidence in plain English for the general public."},
         {"role": "user", "content":
@@ -145,6 +161,7 @@ def summarize_evidence_llm(claim, evidence_list):
         }
     ]
     try:
+        pipe = get_summarizer(model_choice)
         outputs = pipe(
             messages,
             max_new_tokens=96,
@@ -152,7 +169,6 @@ def summarize_evidence_llm(claim, evidence_list):
             temperature=0.1,
         )
         out = outputs[0]["generated_text"]
-        # If the model returns all messages, just take the last message (often the answer).
         if isinstance(out, list) and "content" in out[-1]:
             return out[-1]["content"].strip()
         return out.strip()
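
The `isinstance` check is needed because the text-generation pipeline's output shape depends on its input: a plain string prompt yields a string in `generated_text`, while a chat-style messages list yields the whole conversation back, with the assistant's reply as the last message. A small sketch of both cases (assumed behavior of recent transformers releases):

```python
out = pipe(messages, max_new_tokens=96)[0]["generated_text"]
# Chat input   -> out is a list of {"role", "content"} dicts; the reply is last.
# String input -> out is just the generated string.
reply = out[-1]["content"].strip() if isinstance(out, list) else out.strip()
```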
@@ -171,7 +187,7 @@ def format_evidence_html(evidence_list):
     )
     return html
 
-def factcheck_app(article_url):
+def factcheck_app(article_url, model_choice):
     try:
         art = Article(article_url)
         art.download()
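
`factcheck_app` now takes the model choice alongside the URL, matching the second Gradio input added further down. The article fetch follows the standard newspaper3k flow, roughly (a sketch; the URL is hypothetical):

```python
from newspaper import Article

art = Article("https://example.com/biomed-story")  # hypothetical URL
art.download()  # fetch the page HTML
art.parse()     # extract the title and main body text
claims = extract_claims_pattern(art.text)
```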
@@ -204,7 +220,7 @@ def factcheck_app(article_url):
             control_ev = extract_evidence_nli(claim, titles[idx_non_top], abstracts[idx_non_top])
             evidence_results.append({"title": f"(Control) {titles[idx_non_top]}", "evidence": control_ev})
         all_evidence_sentences = [ev for abs_res in evidence_results for ev in abs_res["evidence"]]
-        summary = summarize_evidence_llm(claim, all_evidence_sentences)
+        summary = summarize_evidence_llm(claim, all_evidence_sentences, model_choice)
         results_html += f"<hr><b>Claim:</b> {claim}<br><b>Layman summary:</b> {summary}<br>"
         for abs_res in evidence_results:
             results_html += f"<br><b>Abstract:</b> {abs_res['title']}<br>{format_evidence_html(abs_res['evidence'])}"
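
Only the summarizer call changes here; it now forwards `model_choice`. The per-sentence NLI scoring in `extract_evidence_nli` is untouched by this commit, and with the loaded `nli_model` it presumably reduces to something like the following (a sketch assumed from standard MNLI usage, not code from app.py):

```python
import torch

# Hypothetical sketch: label one (abstract sentence, claim) pair.
def nli_label(premise, hypothesis):
    inputs = nli_tokenizer(premise, hypothesis, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = nli_model(**inputs).logits
    return NLI_LABELS[int(logits.argmax(dim=-1))]
```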
@@ -216,8 +232,9 @@ description = """
 This app extracts key scientific claims from a news article, finds the most relevant PubMed biomedical research papers, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
 <b>How to use it:</b><br>
 1. Paste the link to a biomedical news article.<br>
-2. Wait for the results.<br>
-3. For each claim, you will see:<br>
+2. Choose an AI summarizer model below. If you have no special access, use 'TinyLlama' (works for everyone).<br>
+3. Wait for the results.<br>
+4. For each claim, you will see:<br>
 - A plain summary of what research says.<br>
 - Color-coded evidence sentences (green=support, red=contradict, gray=neutral).<br>
 - The titles of the most relevant PubMed articles.<br><br>
@@ -226,11 +243,18 @@ This app extracts key scientific claims from a news article, finds the most rele
 
 iface = gr.Interface(
     fn=factcheck_app,
-    inputs=gr.Textbox(lines=2, label="Paste a news article URL"),
+    inputs=[
+        gr.Textbox(lines=2, label="Paste a news article URL"),
+        gr.Dropdown(
+            choices=list(model_options.keys()),
+            value="TinyLlama-1.1B-Chat (Open)",
+            label="Choose summarizer model"
+        )
+    ],
     outputs=[gr.HTML(label="Fact-Check Results (Summary & Evidence)"), gr.JSON(label="All Results (JSON)")],
     title="BioMedical News Fact-Checking & Research Evidence Finder",
     description=description,
-    examples=[["https://www.medicalnewstoday.com/articles/omicron-what-do-we-know-about-the-stealth-variant"]],
+    examples=[["https://www.medicalnewstoday.com/articles/omicron-what-do-we-know-about-the-stealth-variant", "TinyLlama-1.1B-Chat (Open)"]],
     allow_flagging="never"
 )
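
With two inputs, each `examples` row must supply one value per input, which is why the sample URL now carries a default model choice. The interface is launched as usual; on a Space where NLI scoring and generation can take a while, enabling the request queue is a common choice (sketch, not part of the commit):

```python
if __name__ == "__main__":
    iface.queue().launch()  # queue() helps with long-running requests on Spaces
```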
 
 