pritamdeka committed
Commit 8b87b6e · verified · 1 Parent(s): 578ab00

Update app.py

Files changed (1):
  1. app.py +237 -70
app.py CHANGED
@@ -1,62 +1,58 @@
  import os
  import re
  import random
- import gradio as gr
  import requests
  import numpy as np
- import nltkmodule
  from nltk.tokenize import sent_tokenize
- from newspaper import Article
- import nltk
- from sentence_transformers import SentenceTransformer, util
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
  import torch

- # --------- App settings ---------
- PUBMED_N = 100  # Number of abstracts to retrieve initially
- TOP_ABSTRACTS = 10  # Number of top semantic abstracts to keep per claim
  NLI_MODEL_NAME = "pritamdeka/PubMedBERT-MNLI-MedNLI"
- SBERT_MODEL_NAME = "pritamdeka/S-PubMedBert-MS-MARCO"
  NLI_LABELS = ['CONTRADICTION', 'NEUTRAL', 'ENTAILMENT']

- # --------- Summarizer model options ---------
  model_options = {
      "Llama-3.2-1B-Instruct (Meta, gated)": "meta-llama/Llama-3.2-1B-Instruct",
      "Gemma-3-1B-it (Google, gated)": "google/gemma-3-1b-it",
      "TinyLlama-1.1B-Chat (Open)": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
  }

- # --------- Indicator Phrases for Claim Extraction ---------
  indicator_phrases = [
      "found that", "findings suggest", "shows that", "showed that", "demonstrated", "demonstrates",
-     "revealed", "reveals", "suggests", "suggested", "indicated", "indicates", "reported", "reports",
-     "was reported", "concluded", "concludes", "conclusion", "authors state", "stated", "data suggest",
-     "observed", "observes", "study suggests", "study shows", "study found", "researchers found",
-     "results indicate", "results show", "confirmed", "confirm", "confirming", "point to",
-     "documented", "document", "evidence of", "evidence suggests",
-     "associated with", "correlated with", "link between", "linked to", "relationship between",
-     "was linked", "connected to", "relationship with", "tied to", "association with",
-     "increase", "increases", "increased", "decrease", "decreases", "decreased",
-     "greater risk", "lower risk", "higher risk", "reduced risk", "raises the risk", "reduces the risk",
-     "risk of", "risk for", "likelihood of", "probability of", "chance of", "rate of", "incidence of",
-     "prevalence of", "mortality", "survival rate", "death rate", "odds of", "number of", "percentage of", "percent of",
-     "caused by", "causes", "cause", "resulted in", "results in", "leads to", "led to", "contributed to", "responsible for",
-     "due to", "as a result", "because of",
-     "randomized controlled trial", "RCT", "clinical trial", "participants", "enrolled", "sample size", "statistically significant",
-     "compared to", "compared with", "versus", "compared against",
-     "more than", "less than", "greater than", "lower than", "higher than", "significantly higher", "significantly lower",
-     "significantly increased", "significantly decreased", "significant difference",
-     "effect of", "impact of", "influence of", "predictor of", "predicts", "predictive of", "factor for", "determinant of",
-     "plays a role in", "contributes to", "related to", "affects", "influences", "difference between",
      "according to", "a recent study", "researchers from"
  ]

- # --------- Load models (global, once) ---------
- nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)
- nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)
- sbert_model = SentenceTransformer(SBERT_MODEL_NAME)
- pipe_cache = {}  # cache summarization pipelines

  def extract_claims_pattern(article_text):
      sentences = sent_tokenize(article_text)
      claims = [
@@ -66,42 +62,141 @@ def extract_claims_pattern(article_text):
      ]
      return list(dict.fromkeys(claims))  # deduplicate, preserve order

- def match_claims_to_headline(claims, headline, threshold=0.6):
      headline_emb = sbert_model.encode([headline])
      claim_embs = sbert_model.encode(claims)
      sims = util.pytorch_cos_sim(headline_emb, claim_embs)[0]
-     matched_claims = [claim for claim, sim in zip(claims, sims) if sim >= threshold]
-     # fallback: top 3 by similarity
      if not matched_claims and claims:
          idxs = np.argsort(-sims.cpu().numpy())[:min(3, len(claims))]
          matched_claims = [claims[i] for i in idxs]
      return matched_claims

- def retrieve_pubmed_abstracts(claim, n=PUBMED_N):
      ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
-     query = '+'.join(re.findall(r'\w+', claim))
-     search_url = f"{ncbi_url}esearch.fcgi?db=pubmed&term={query}&retmax={n}&sort=relevance"
-     r = requests.get(search_url)
-     pmids = re.findall(r"<Id>(\d+)</Id>", r.text)
-     if not pmids:
-         return [], []
-     ids = ','.join(pmids)
-     fetch_url = f"{ncbi_url}efetch.fcgi?db=pubmed&id={ids}&rettype=xml&retmax={n}"
-     resp = requests.get(fetch_url)
-     titles = re.findall(r"<ArticleTitle>(.*?)</ArticleTitle>", resp.text, flags=re.DOTALL)
-     abstracts = re.findall(r"<AbstractText.*?>(.*?)</AbstractText>", resp.text, flags=re.DOTALL)
-     if not abstracts:
-         abstracts = [""] * len(titles)
-     titles = [re.sub(r"\s+", " ", t).strip() for t in titles]
-     abstracts = [re.sub(r"\s+", " ", a).strip() for a in abstracts]
-     return titles, abstracts

- def semantic_rerank_claim_abstracts(claim, titles, abstracts, top_k=TOP_ABSTRACTS):
      doc_texts = [f"{t}. {a}" for t, a in zip(titles, abstracts)]
-     doc_embs = sbert_model.encode(doc_texts)
-     claim_emb = sbert_model.encode([claim])
      sims = util.pytorch_cos_sim(claim_emb, doc_embs)[0]
-     idxs = np.argsort(-sims.cpu().numpy())[:top_k]
      return [titles[i] for i in idxs], [abstracts[i] for i in idxs]

  def extract_evidence_nli(claim, title, abstract):
@@ -138,7 +233,6 @@ def get_summarizer(model_choice):
          "device_map": "auto",
          "max_new_tokens": 128
      }
-     # Add token for gated models (Gemma, Llama)
      if any(gated in model_id for gated in ["meta-llama", "gemma"]):
          hf_token = os.environ.get("HF_TOKEN", None)
          if hf_token:
@@ -187,7 +281,7 @@ def format_evidence_html(evidence_list):
          )
      return html

- def factcheck_app(article_url, model_choice):
      try:
          art = Article(article_url)
          art.download()
@@ -197,20 +291,26 @@ def factcheck_app(article_url, model_choice):
      except Exception as e:
          return f"<b>Error downloading or reading article:</b> {e}", None

      claims = extract_claims_pattern(text)
-     matched_claims = match_claims_to_headline(claims, headline)
      if not matched_claims:
          return "<b>No check-worthy claims found that match the headline.</b>", None

      results_html = ""
      all_results = []
      for claim in matched_claims:
-         titles, abstracts = retrieve_pubmed_abstracts(claim)
          if not titles:
              results_html += f"<hr><b>Claim:</b> {claim}<br><i>No PubMed results found.</i><br>"
              all_results.append({"claim": claim, "summary": "No PubMed results found.", "evidence": []})
              continue
-         top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts)
          idx_non_top = random.choice([i for i in range(len(titles)) if i not in [titles.index(t) for t in top_titles]]) if len(titles) > len(top_titles) else None
          evidence_results = []
          for title, abstract in zip(top_titles, top_abstracts):
@@ -220,21 +320,23 @@ def factcheck_app(article_url, model_choice):
              control_ev = extract_evidence_nli(claim, titles[idx_non_top], abstracts[idx_non_top])
              evidence_results.append({"title": f"(Control) {titles[idx_non_top]}", "evidence": control_ev})
          all_evidence_sentences = [ev for abs_res in evidence_results for ev in abs_res["evidence"]]
-         summary = summarize_evidence_llm(claim, all_evidence_sentences, model_choice)
          results_html += f"<hr><b>Claim:</b> {claim}<br><b>Layman summary:</b> {summary}<br>"
          for abs_res in evidence_results:
              results_html += f"<br><b>Abstract:</b> {abs_res['title']}<br>{format_evidence_html(abs_res['evidence'])}"
          all_results.append({"claim": claim, "summary": summary, "evidence": evidence_results})
      return results_html, all_results

  description = """
  <b>What does this app do?</b><br>
- This app extracts key scientific claims from a news article, finds the most relevant PubMed biomedical research papers, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
  <b>How to use it:</b><br>
  1. Paste the link to a biomedical news article.<br>
- 2. Choose an AI summarizer model below. If you have no special access, use 'TinyLlama' (works for everyone).<br>
- 3. Wait for the results.<br>
- 4. For each claim, you will see:<br>
  - A plain summary of what research says.<br>
  - Color-coded evidence sentences (green=support, red=contradict, gray=neutral).<br>
  - The titles of the most relevant PubMed articles.<br><br>
@@ -245,6 +347,63 @@ iface = gr.Interface(
      fn=factcheck_app,
      inputs=[
          gr.Textbox(lines=2, label="Paste a news article URL"),
          gr.Dropdown(
              choices=list(model_options.keys()),
              value="TinyLlama-1.1B-Chat (Open)",
@@ -254,7 +413,15 @@ iface = gr.Interface(
      outputs=[gr.HTML(label="Fact-Check Results (Summary & Evidence)"), gr.JSON(label="All Results (JSON)")],
      title="BioMedical News Fact-Checking & Research Evidence Finder",
      description=description,
-     examples=[["https://www.medicalnewstoday.com/articles/omicron-what-do-we-know-about-the-stealth-variant", "TinyLlama-1.1B-Chat (Open)"]],
      allow_flagging="never"
  )
 
app.py (new version)
  import os
  import re
+ import itertools
  import random
  import requests
  import numpy as np
+ import gradio as gr
+ from newspaper import Article, fulltext
  from nltk.tokenize import sent_tokenize
+ from sentence_transformers import SentenceTransformer, util, models
+ from sklearn.cluster import KMeans
+ from sklearn.metrics.pairwise import cosine_similarity
+ from sklearn.metrics import silhouette_score
+ import spacy
+ import en_core_sci_lg
+ import inflect
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
  import torch

+ import networkx as nx
+
+ import xml.etree.ElementTree as ET
+
+ # --- Global settings ---
  NLI_MODEL_NAME = "pritamdeka/PubMedBERT-MNLI-MedNLI"
  NLI_LABELS = ['CONTRADICTION', 'NEUTRAL', 'ENTAILMENT']
+ PUBMED_N = 100
+ TOP_ABSTRACTS = 10

+ # --- Summarizer model options ---
  model_options = {
      "Llama-3.2-1B-Instruct (Meta, gated)": "meta-llama/Llama-3.2-1B-Instruct",
      "Gemma-3-1B-it (Google, gated)": "google/gemma-3-1b-it",
      "TinyLlama-1.1B-Chat (Open)": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
  }
+ pipe_cache = {}
+
+ # --- Load static models ---
+ nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)
+ nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)
+ p = inflect.engine()
+ nlp = en_core_sci_lg.load()
+ sp = en_core_sci_lg.load()
+ all_stopwords = sp.Defaults.stop_words

  indicator_phrases = [
+     # ... (keep your full list from above)
      "found that", "findings suggest", "shows that", "showed that", "demonstrated", "demonstrates",
+     # ... [trimmed for brevity]
      "according to", "a recent study", "researchers from"
  ]

+ os.environ["TOKENIZERS_PARALLELISM"] = "false"

+ # --- Claim extraction ---
  def extract_claims_pattern(article_text):
      sentences = sent_tokenize(article_text)
      claims = [
          ...
      ]
      return list(dict.fromkeys(claims))  # deduplicate, preserve order

+ def match_claims_to_headline(claims, headline, sbert_model):
      headline_emb = sbert_model.encode([headline])
      claim_embs = sbert_model.encode(claims)
      sims = util.pytorch_cos_sim(headline_emb, claim_embs)[0]
+     matched_claims = [claim for claim, sim in zip(claims, sims) if sim >= 0.6]
      if not matched_claims and claims:
          idxs = np.argsort(-sims.cpu().numpy())[:min(3, len(claims))]
          matched_claims = [claims[i] for i in idxs]
      return matched_claims
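
A quick usage illustration for the matcher above (the headline, claims, and model choice here are invented for the example; any SentenceTransformer checkpoint works):

from sentence_transformers import SentenceTransformer

sbert = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
claims = ["The variant spreads faster than Delta.", "The trial enrolled 120 participants."]
# Keeps claims with cosine similarity >= 0.6 to the headline;
# if none pass, falls back to the three most similar claims.
print(match_claims_to_headline(claims, "Omicron spreads more quickly", sbert))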

+ def keyphrase_groups_and_query(article_text, max_num_keywords, model_1, model_2, model_3):
+     # TextRank with SBERT model_1
+     corpus = sent_tokenize(article_text)
+     indicator_list = indicator_phrases
+     score_list, count_dict = [], {}
+     for l in corpus:
+         c = 0
+         for l2 in indicator_list:
+             if l.find(l2) != -1:
+                 c = 1
+                 break
+         count_dict[l] = c
+     for sent, score in count_dict.items():
+         score_list.append(score)
+     clean_sentences_new = [re.sub("[^a-zA-Z]", " ", s) for s in corpus]
+     corpus_embeddings = model_1.encode(clean_sentences_new)
+     sim_mat = np.zeros([len(clean_sentences_new), len(clean_sentences_new)])
+     for i in range(len(clean_sentences_new)):
+         len_embeddings = len(corpus_embeddings[i])
+         for j in range(len(clean_sentences_new)):
+             if i != j:
+                 sim_mat[i][j] = cosine_similarity(
+                     corpus_embeddings[i].reshape(1, len_embeddings),
+                     corpus_embeddings[j].reshape(1, len_embeddings)
+                 )[0, 0]
+     nx_graph = nx.from_numpy_array(sim_mat)
+     scores = nx.pagerank(nx_graph, max_iter=1500)
+     element = [scores[i] for i in range(len(corpus))]
+     sum_list = [sc + lst for sc, lst in zip(score_list, element)]
+     x = sorted(((sum_list[i], s) for i, s in enumerate(corpus)), reverse=True)
+     final_textrank_list = [elem[1] for elem in x]
+     a = int((10 * len(final_textrank_list)) / 100.0)
+     total = max(a, 5)
+     document = [final_textrank_list[i] for i in range(total)]
+     doc = " ".join(document)
+     text_doc = []
+     for i in document:
+         doc_1 = nlp(i)
+         text_doc.append([X.text for X in doc_1.ents])
+     entity_list = [item for sublist in text_doc for item in sublist]
+     entity_list = [word for word in entity_list if word not in all_stopwords]
+     entity_list = [word_entity for word_entity in entity_list if not p.singular_noun(word_entity)]
+     entity_list = list(dict.fromkeys(entity_list))
+     doc_embedding = model_2.encode([doc])
+     candidates = entity_list
+     if not candidates:
+         return "", []
+     candidate_embeddings = model_2.encode(candidates)
+     distances = cosine_similarity(doc_embedding, candidate_embeddings)
+     top_n = min(max_num_keywords, len(candidates))
+     keyword_list = [candidates[index] for index in distances.argsort()[0][-top_n:]]
+     # Clustering with model_3
+     word_embedding_model = models.Transformer(model_3)
+     pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
+                                    pooling_mode_mean_tokens=True,
+                                    pooling_mode_cls_token=False,
+                                    pooling_mode_max_tokens=False)
+     embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+     c_len = len(keyword_list)
+     if c_len < 2:
+         return " OR ".join(keyword_list), keyword_list
+     keyword_embeddings = embedder.encode(keyword_list)
+     silhouette_score_list = []
+     cluster_list_final = []
+     for num_clusters in range(1, top_n):
+         clustering_model = KMeans(n_clusters=num_clusters)
+         clustering_model.fit(keyword_embeddings)
+         cluster_assignment = clustering_model.labels_
+         clustered_sentences = [[] for _ in range(num_clusters)]
+         for sentence_id, cluster_id in enumerate(cluster_assignment):
+             clustered_sentences[cluster_id].append(keyword_list[sentence_id])
+         cl_sent_len = len(clustered_sentences)
+         list_cluster = list(clustered_sentences)
+         cluster_list_final.append(list_cluster)
+         if (c_len == cl_sent_len and c_len >= 3) or cl_sent_len == 1:
+             silhouette_avg = 0
+         elif c_len == cl_sent_len == 2:
+             silhouette_avg = 1
+         else:
+             silhouette_avg = silhouette_score(keyword_embeddings, cluster_assignment)
+         silhouette_score_list.append(silhouette_avg)
+     res_dict = dict(zip(silhouette_score_list, cluster_list_final))
+     cluster_items = res_dict[max(res_dict)]
+     comb = []
+     for i in cluster_items:
+         z = ' OR '.join(i)
+         comb.append("(" + z + ")")
+     combinations = []
+     for subset in itertools.combinations(comb, 2):
+         combinations.append(subset)
+     f1_list = []
+     for s in combinations:
+         final = ' AND '.join(s)
+         f1_list.append("(" + final + ")")
+     f_1 = ' OR '.join(f1_list)
+     return f_1, keyword_list
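
To make the query shape concrete, here is the Boolean string the tail of keyphrase_groups_and_query assembles, run on invented clusters (the cluster contents stand in for the KMeans output; the joining logic mirrors the code above):

import itertools

cluster_items = [["omicron", "BA.2"], ["transmissibility"], ["vaccine effectiveness"]]
comb = ["(" + " OR ".join(c) + ")" for c in cluster_items]
f1_list = ["(" + " AND ".join(pair) + ")" for pair in itertools.combinations(comb, 2)]
print(" OR ".join(f1_list))
# ((omicron OR BA.2) AND (transmissibility)) OR ((omicron OR BA.2) AND (vaccine effectiveness))
#   OR ((transmissibility) AND (vaccine effectiveness))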
+
+ def retrieve_pubmed_abstracts(article_text, headline, max_num_keywords, model_1, model_2, model_3):
+     query, _ = keyphrase_groups_and_query(article_text, max_num_keywords, model_1, model_2, model_3)
      ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
+     for q in [query, headline, article_text]:
+         if not q:
+             continue
+         search_url = f"{ncbi_url}esearch.fcgi?db=pubmed&term={q}&retmax={PUBMED_N}&sort=relevance"
+         r = requests.get(search_url)
+         pmids = re.findall(r"<Id>(\d+)</Id>", r.text)
+         if pmids:
+             ids = ','.join(pmids)
+             fetch_url = f"{ncbi_url}efetch.fcgi?db=pubmed&id={ids}&rettype=xml&retmax={PUBMED_N}"
+             resp = requests.get(fetch_url)
+             titles = re.findall(r"<ArticleTitle>(.*?)</ArticleTitle>", resp.text, flags=re.DOTALL)
+             abstracts = re.findall(r"<AbstractText.*?>(.*?)</AbstractText>", resp.text, flags=re.DOTALL)
+             if not abstracts:
+                 abstracts = [""] * len(titles)
+             titles = [re.sub(r"\s+", " ", t).strip() for t in titles]
+             abstracts = [re.sub(r"\s+", " ", a).strip() for a in abstracts]
+             return titles, abstracts
+     return [], []
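
Note that the new imports include xml.etree.ElementTree, yet retrieve_pubmed_abstracts still parses the efetch response with regexes. A hedged sketch of ElementTree-based parsing (not part of this commit; element names follow PubMed's efetch XML schema, and itertext() flattens nested inline markup):

import re
import xml.etree.ElementTree as ET

def parse_pubmed_xml(xml_text):
    # One title/abstract pair per PubmedArticle element.
    titles, abstracts = [], []
    for article in ET.fromstring(xml_text).iter("PubmedArticle"):
        title_el = article.find(".//ArticleTitle")
        titles.append("".join(title_el.itertext()).strip() if title_el is not None else "")
        parts = ["".join(el.itertext()) for el in article.findall(".//AbstractText")]
        abstracts.append(re.sub(r"\s+", " ", " ".join(parts)).strip())
    return titles, abstracts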

+ def semantic_rerank_claim_abstracts(claim, titles, abstracts, model_4):
      doc_texts = [f"{t}. {a}" for t, a in zip(titles, abstracts)]
+     doc_embs = model_4.encode(doc_texts)
+     claim_emb = model_4.encode([claim])
      sims = util.pytorch_cos_sim(claim_emb, doc_embs)[0]
+     idxs = np.argsort(-sims.cpu().numpy())[:TOP_ABSTRACTS]
      return [titles[i] for i in idxs], [abstracts[i] for i in idxs]

  def extract_evidence_nli(claim, title, abstract):
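
The body of extract_evidence_nli is untouched by this commit, so the diff collapses it. For orientation, a plausible minimal sketch of per-sentence NLI scoring using the globals loaded above (nli_tokenizer, nli_model, NLI_LABELS); the exact logic and return shape in app.py may differ:

def extract_evidence_nli_sketch(claim, title, abstract):
    # Score each abstract sentence (premise) against the claim (hypothesis).
    evidence = []
    for sent in sent_tokenize(f"{title}. {abstract}"):
        inputs = nli_tokenizer(sent, claim, return_tensors="pt", truncation=True)
        with torch.no_grad():
            probs = torch.softmax(nli_model(**inputs).logits, dim=-1)[0]
        evidence.append({"sentence": sent,
                         "label": NLI_LABELS[int(probs.argmax())],
                         "score": float(probs.max())})
    return evidence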
 
      ...
          "device_map": "auto",
          "max_new_tokens": 128
      }
      if any(gated in model_id for gated in ["meta-llama", "gemma"]):
          hf_token = os.environ.get("HF_TOKEN", None)
          if hf_token:
              ...

      )
      return html

+ def factcheck_app(article_url, model_1_name, model_2_name, max_num_keywords, model_3_name, model_4_name, summarizer_choice):
      try:
          art = Article(article_url)
          art.download()
          ...
      except Exception as e:
          return f"<b>Error downloading or reading article:</b> {e}", None

+     # Load all selected models
+     model_1 = SentenceTransformer(model_1_name)
+     model_2 = SentenceTransformer(model_2_name)
+     model_3 = model_3_name  # used as model id string
+     model_4 = SentenceTransformer(model_4_name)
+
      claims = extract_claims_pattern(text)
+     matched_claims = match_claims_to_headline(claims, headline, model_1)
      if not matched_claims:
          return "<b>No check-worthy claims found that match the headline.</b>", None

      results_html = ""
      all_results = []
      for claim in matched_claims:
+         titles, abstracts = retrieve_pubmed_abstracts(claim, headline, max_num_keywords, model_1, model_2, model_3)
          if not titles:
              results_html += f"<hr><b>Claim:</b> {claim}<br><i>No PubMed results found.</i><br>"
              all_results.append({"claim": claim, "summary": "No PubMed results found.", "evidence": []})
              continue
+         top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts, model_4)
          idx_non_top = random.choice([i for i in range(len(titles)) if i not in [titles.index(t) for t in top_titles]]) if len(titles) > len(top_titles) else None
          evidence_results = []
          for title, abstract in zip(top_titles, top_abstracts):
              ...
              control_ev = extract_evidence_nli(claim, titles[idx_non_top], abstracts[idx_non_top])
              evidence_results.append({"title": f"(Control) {titles[idx_non_top]}", "evidence": control_ev})
          all_evidence_sentences = [ev for abs_res in evidence_results for ev in abs_res["evidence"]]
+         summary = summarize_evidence_llm(claim, all_evidence_sentences, summarizer_choice)
          results_html += f"<hr><b>Claim:</b> {claim}<br><b>Layman summary:</b> {summary}<br>"
          for abs_res in evidence_results:
              results_html += f"<br><b>Abstract:</b> {abs_res['title']}<br>{format_evidence_html(abs_res['evidence'])}"
          all_results.append({"claim": claim, "summary": summary, "evidence": evidence_results})
      return results_html, all_results
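
summarize_evidence_llm is called above but its body is unchanged and not shown in this diff. A minimal sketch, assuming get_summarizer returns a transformers text-generation pipeline (per the get_summarizer fragment earlier) and that evidence items carry the hypothetical 'sentence' and 'label' keys from the NLI sketch:

def summarize_evidence_llm_sketch(claim, evidence_sentences, model_choice):
    pipe = get_summarizer(model_choice)  # assumed: a text-generation pipeline
    bullets = "\n".join(f"- {ev['sentence']} ({ev['label']})" for ev in evidence_sentences)
    prompt = (f"Claim: {claim}\nEvidence:\n{bullets}\n"
              "In one short paragraph of plain English, say whether the evidence "
              "supports, contradicts, or is neutral about the claim:")
    out = pipe(prompt, max_new_tokens=128, do_sample=False)
    # Pipelines return the prompt plus the completion; strip the prompt off.
    return out[0]["generated_text"][len(prompt):].strip()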

+ # --- Gradio UI ---
  description = """
  <b>What does this app do?</b><br>
+ This app extracts key scientific claims from a news article, finds the most relevant PubMed biomedical research papers using advanced keyphrase grouping and Boolean queries, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
  <b>How to use it:</b><br>
  1. Paste the link to a biomedical news article.<br>
+ 2. Choose your models for each stage (or use the defaults for best results).<br>
+ 3. Pick a summarizer for the layperson summary.<br>
+ 4. Wait for the results.<br>
+ 5. For each claim, you will see:<br>
  - A plain summary of what research says.<br>
  - Color-coded evidence sentences (green=support, red=contradict, gray=neutral).<br>
  - The titles of the most relevant PubMed articles.<br><br>
  ...
      fn=factcheck_app,
      inputs=[
          gr.Textbox(lines=2, label="Paste a news article URL"),
+         gr.Dropdown(
+             choices=[
+                 'sentence-transformers/all-mpnet-base-v2',
+                 'sentence-transformers/all-mpnet-base-v1',
+                 'sentence-transformers/all-distilroberta-v1',
+                 'sentence-transformers/gtr-t5-large',
+                 'pritamdeka/S-Bluebert-snli-multinli-stsb',
+                 'pritamdeka/S-Biomed-Roberta-snli-multinli-stsb',
+                 'pritamdeka/S-BioBert-snli-multinli-stsb',
+                 'sentence-transformers/stsb-mpnet-base-v2',
+                 'sentence-transformers/stsb-roberta-base-v2',
+                 'sentence-transformers/stsb-distilroberta-base-v2',
+                 'sentence-transformers/sentence-t5-large',
+                 'sentence-transformers/sentence-t5-base'
+             ],
+             value='sentence-transformers/all-mpnet-base-v2',
+             label="SBERT model for TextRank"
+         ),
+         gr.Dropdown(
+             choices=[
+                 'sentence-transformers/paraphrase-mpnet-base-v2',
+                 'sentence-transformers/all-mpnet-base-v1',
+                 'sentence-transformers/paraphrase-distilroberta-base-v1',
+                 'sentence-transformers/paraphrase-xlm-r-multilingual-v1',
+                 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
+                 'sentence-transformers/paraphrase-albert-small-v2',
+                 'sentence-transformers/paraphrase-albert-base-v2',
+                 'sentence-transformers/paraphrase-MiniLM-L12-v2',
+                 'sentence-transformers/paraphrase-MiniLM-L6-v2',
+                 'sentence-transformers/all-MiniLM-L12-v2',
+                 'sentence-transformers/all-distilroberta-v1',
+                 'sentence-transformers/paraphrase-TinyBERT-L6-v2',
+                 'sentence-transformers/paraphrase-MiniLM-L3-v2',
+                 'sentence-transformers/all-MiniLM-L6-v2'
+             ],
+             value='sentence-transformers/paraphrase-mpnet-base-v2',
+             label="SBERT model for keyphrases"
+         ),
+         gr.Slider(minimum=5, maximum=20, step=1, value=10, label="Max Keywords"),
+         gr.Dropdown(
+             choices=[
+                 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
+                 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext-mean-token'
+             ],
+             value='cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
+             label="SapBERT model for clustering"
+         ),
+         gr.Dropdown(
+             choices=[
+                 'pritamdeka/S-Bluebert-snli-multinli-stsb',
+                 'pritamdeka/S-BioBert-snli-multinli-stsb',
+                 'pritamdeka/S-Biomed-Roberta-snli-multinli-stsb',
+                 'sentence-transformers/all-mpnet-base-v2'
+             ],
+             value='pritamdeka/S-BioBert-snli-multinli-stsb',
+             label="SBERT model for abstracts"
+         ),
          gr.Dropdown(
              choices=list(model_options.keys()),
              value="TinyLlama-1.1B-Chat (Open)",
      ...
      outputs=[gr.HTML(label="Fact-Check Results (Summary & Evidence)"), gr.JSON(label="All Results (JSON)")],
      title="BioMedical News Fact-Checking & Research Evidence Finder",
      description=description,
+     examples=[[
+         "https://www.medicalnewstoday.com/articles/omicron-what-do-we-know-about-the-stealth-variant",
+         'sentence-transformers/all-mpnet-base-v2',
+         'sentence-transformers/paraphrase-mpnet-base-v2',
+         10,
+         'cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
+         'pritamdeka/S-BioBert-snli-multinli-stsb',
+         "TinyLlama-1.1B-Chat (Open)"
+     ]],
      allow_flagging="never"
  )