pritamdeka committed
Commit fc63a12 · verified · Parent: 50833a1

Update app.py

Files changed (1):
  1. app.py +86 -248
app.py CHANGED
@@ -1,31 +1,28 @@
import os
import re
- import itertools
import random
import requests
- import nltkmodule
- import nltk
- import numpy as np
import gradio as gr
- from newspaper import Article, fulltext
+ import numpy as np
+ import nltk
+ import nltkmodule
+ from newspaper import Article
from nltk.tokenize import sent_tokenize
- from sentence_transformers import SentenceTransformer, util, models
- from sklearn.cluster import KMeans
- from sklearn.metrics.pairwise import cosine_similarity
- from sklearn.metrics import silhouette_score
+ from sentence_transformers import SentenceTransformer, util
import spacy
import en_core_sci_lg
- import inflect
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

- import networkx as nx
-
- import xml.etree.ElementTree as ET
-
- # --- Global settings ---
+ # --- Models (load once, globally) ---
+ scispacy = en_core_sci_lg.load()
+ sbert_keybert = SentenceTransformer("pritamdeka/S-BioBert-snli-multinli-stsb")  # for keybert query
+ sbert_rerank = SentenceTransformer("pritamdeka/s-pubmedbert-msmarco")  # for abstract reranking
NLI_MODEL_NAME = "pritamdeka/PubMedBERT-MNLI-MedNLI"
+ nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)
+ nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)
NLI_LABELS = ['CONTRADICTION', 'NEUTRAL', 'ENTAILMENT']
+
PUBMED_N = 100
TOP_ABSTRACTS = 10

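The NLI tokenizer and model now live at module scope beside the sentence encoders. The body of extract_evidence_nli is unchanged by this commit and elided from the diff, so purely as a hedged sketch, this is how such a premise/hypothesis scorer is typically wired to these globals (the helper name below is illustrative, not from the file):

import torch

def nli_label(premise, hypothesis):
    # Sketch only, not the app's actual extract_evidence_nli body:
    # encode the sentence pair and map the argmax logit onto NLI_LABELS.
    inputs = nli_tokenizer(premise, hypothesis, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = nli_model(**inputs).logits
    return NLI_LABELS[int(logits.argmax(dim=1))]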
@@ -37,41 +34,70 @@ model_options = {
}
pipe_cache = {}

- # --- Load static models ---
- nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)
- nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)
- p = inflect.engine()
- nlp = en_core_sci_lg.load()
- sp = en_core_sci_lg.load()
- all_stopwords = sp.Defaults.stop_words
+ # --- Utility: get robust keybert-style query ---
+ def get_keybert_query(text, top_n=10):
+     doc = scispacy(text)
+     phrases = [ent.text for ent in doc.ents]
+     if not phrases:
+         phrases = [chunk.text for chunk in doc.noun_chunks]
+     phrases = list(set([ph.strip() for ph in phrases if len(ph) > 2]))
+     if not phrases:
+         return ""
+     doc_emb = sbert_keybert.encode([text])
+     phrase_embs = sbert_keybert.encode(phrases)
+     sims = np.array(util.pytorch_cos_sim(doc_emb, phrase_embs))[0]
+     top_idxs = sims.argsort()[-top_n:]
+     keywords = [phrases[i] for i in top_idxs]
+     query = " OR ".join(f'"{kw}"' for kw in keywords)
+     return query
+
+ # --- PubMed retrieval ---
+ def retrieve_pubmed_abstracts_simple(text, n=PUBMED_N, fallback_headline=None):
+     query = get_keybert_query(text, top_n=10)
+     ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
+     for q in [query, fallback_headline, text]:
+         if not q:
+             continue
+         search_url = f"{ncbi_url}esearch.fcgi?db=pubmed&term={q}&retmax={n}&sort=relevance"
+         r = requests.get(search_url)
+         pmids = re.findall(r"<Id>(\d+)</Id>", r.text)
+         if pmids:
+             ids = ','.join(pmids)
+             fetch_url = f"{ncbi_url}efetch.fcgi?db=pubmed&id={ids}&rettype=xml&retmax={n}"
+             resp = requests.get(fetch_url)
+             titles = re.findall(r"<ArticleTitle>(.*?)</ArticleTitle>", resp.text, flags=re.DOTALL)
+             abstracts = re.findall(r"<AbstractText.*?>(.*?)</AbstractText>", resp.text, flags=re.DOTALL)
+             if not abstracts:
+                 abstracts = [""] * len(titles)
+             titles = [re.sub(r"\s+", " ", t).strip() for t in titles]
+             abstracts = [re.sub(r"\s+", " ", a).strip() for a in abstracts]
+             return titles, abstracts
+     return [], []

+ # --- Claim extraction ---
indicator_phrases = [
    "found that", "findings suggest", "shows that", "showed that", "demonstrated", "demonstrates",
    "revealed", "reveals", "suggests", "suggested", "indicated", "indicates", "reported", "reports",
    "was reported", "concluded", "concludes", "conclusion", "authors state", "stated", "data suggest",
    "observed", "observes", "study suggests", "study shows", "study found", "researchers found",
    "results indicate", "results show", "confirmed", "confirm", "confirming", "point to",
-     "documented", "document", "evidence of", "evidence suggests",
-     "associated with", "correlated with", "link between", "linked to", "relationship between",
-     "was linked", "connected to", "relationship with", "tied to", "association with",
-     "increase", "increases", "increased", "decrease", "decreases", "decreased",
+     "documented", "document", "evidence of", "evidence suggests", "associated with", "correlated with",
+     "link between", "linked to", "relationship between", "was linked", "connected to", "relationship with",
+     "tied to", "association with", "increase", "increases", "increased", "decrease", "decreases", "decreased",
    "greater risk", "lower risk", "higher risk", "reduced risk", "raises the risk", "reduces the risk",
    "risk of", "risk for", "likelihood of", "probability of", "chance of", "rate of", "incidence of",
-     "prevalence of", "mortality", "survival rate", "death rate", "odds of", "number of", "percentage of", "percent of",
-     "caused by", "causes", "cause", "resulted in", "results in", "leads to", "led to", "contributed to", "responsible for",
-     "due to", "as a result", "because of",
-     "randomized controlled trial", "RCT", "clinical trial", "participants", "enrolled", "sample size", "statistically significant",
-     "compared to", "compared with", "versus", "compared against",
-     "more than", "less than", "greater than", "lower than", "higher than", "significantly higher", "significantly lower",
-     "significantly increased", "significantly decreased", "significant difference",
-     "effect of", "impact of", "influence of", "predictor of", "predicts", "predictive of", "factor for", "determinant of",
-     "plays a role in", "contributes to", "related to", "affects", "influences", "difference between",
-     "according to", "a recent study", "researchers from"
+     "prevalence of", "mortality", "survival rate", "death rate", "odds of", "number of", "percentage of",
+     "percent of", "caused by", "causes", "cause", "resulted in", "results in", "leads to", "led to",
+     "contributed to", "responsible for", "due to", "as a result", "because of",
+     "randomized controlled trial", "RCT", "clinical trial", "participants", "enrolled", "sample size",
+     "statistically significant", "compared to", "compared with", "versus", "compared against",
+     "more than", "less than", "greater than", "lower than", "higher than", "significantly higher",
+     "significantly lower", "significantly increased", "significantly decreased", "significant difference",
+     "effect of", "impact of", "influence of", "predictor of", "predicts", "predictive of", "factor for",
+     "determinant of", "plays a role in", "contributes to", "related to", "affects", "influences",
+     "difference between", "according to", "a recent study", "researchers from"
]

- os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
- # --- Claim extraction ---
def extract_claims_pattern(article_text):
    sentences = sent_tokenize(article_text)
    claims = [
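One review note on the new retrieval helper: the query (and, on the last fallback, the whole article text) is interpolated into the esearch URL unescaped, so spaces, quotes, and Boolean operators depend on requests' lenient encoding. A hedged sketch of the same call with explicit parameter encoding (the helper name is illustrative, not from the file):

import re
import requests

def esearch_pmids(query, n=100):
    # Same E-utilities endpoint as retrieve_pubmed_abstracts_simple, but
    # requests URL-encodes the term, and a timeout guards the worker.
    r = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params={"db": "pubmed", "term": query, "retmax": n, "sort": "relevance"},
        timeout=30,
    )
    return re.findall(r"<Id>(\d+)</Id>", r.text)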
@@ -79,11 +105,12 @@ def extract_claims_pattern(article_text):
        if any(phrase in s.lower() for phrase in indicator_phrases)
        or re.search(r"\b\d+(\.\d+)?%?\b", s)
    ]
-     return list(dict.fromkeys(claims)) # deduplicate, preserve order
+     return list(dict.fromkeys(claims))

- def match_claims_to_headline(claims, headline, sbert_model):
-     headline_emb = sbert_model.encode([headline])
-     claim_embs = sbert_model.encode(claims)
+ def match_claims_to_headline(claims, headline):
+     emb_model = sbert_keybert  # (or any SBERT for matching)
+     headline_emb = emb_model.encode([headline])
+     claim_embs = emb_model.encode(claims)
    sims = util.pytorch_cos_sim(headline_emb, claim_embs)[0]
    matched_claims = [claim for claim, sim in zip(claims, sims) if sim >= 0.6]
    if not matched_claims and claims:
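Claim-to-headline matching now reuses the global encoder, so a run is self-contained; a tiny invented example (the printed result depends on the SBERT weights, so the comment is an expectation, not a guarantee):

claims = extract_claims_pattern(
    "The study found that drug X reduced mortality by 30%. The weather in Oslo was mild."
)
matched = match_claims_to_headline(claims, "Drug X cuts death rates, study finds")
print(matched)  # ideally only the mortality sentence clears the 0.6 cosine threshold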
@@ -91,133 +118,16 @@ def match_claims_to_headline(claims, headline, sbert_model):
        matched_claims = [claims[i] for i in idxs]
    return matched_claims

- def keyphrase_groups_and_query(article_text, max_num_keywords, model_1, model_2, model_3):
-     # TextRank with SBERT model_1
-     corpus = sent_tokenize(article_text)
-     indicator_list = indicator_phrases
-     score_list, count_dict = [], {}
-     for l in corpus:
-         c = 0
-         for l2 in indicator_list:
-             if l.find(l2) != -1:
-                 c = 1
-                 break
-         count_dict[l] = c
-     for sent, score in count_dict.items():
-         score_list.append(score)
-     clean_sentences_new = [re.sub("[^a-zA-Z]", " ", s) for s in corpus]
-     corpus_embeddings = model_1.encode(clean_sentences_new)
-     sim_mat = np.zeros([len(clean_sentences_new), len(clean_sentences_new)])
-     for i in range(len(clean_sentences_new)):
-         len_embeddings = len(corpus_embeddings[i])
-         for j in range(len(clean_sentences_new)):
-             if i != j:
-                 sim_mat[i][j] = cosine_similarity(
-                     corpus_embeddings[i].reshape(1, len_embeddings),
-                     corpus_embeddings[j].reshape(1, len_embeddings)
-                 )[0, 0]
-     nx_graph = nx.from_numpy_array(sim_mat)
-     scores = nx.pagerank(nx_graph, max_iter=1500)
-     element = [scores[i] for i in range(len(corpus))]
-     sum_list = [sc + lst for sc, lst in zip(score_list, element)]
-     x = sorted(((sum_list[i], s) for i, s in enumerate(corpus)), reverse=True)
-     final_textrank_list = [elem[1] for elem in x]
-     a = int((10 * len(final_textrank_list)) / 100.0)
-     total = max(a, 5)
-     document = [final_textrank_list[i] for i in range(total)]
-     doc = " ".join(document)
-     text_doc = []
-     for i in document:
-         doc_1 = nlp(i)
-         text_doc.append([X.text for X in doc_1.ents])
-     entity_list = [item for sublist in text_doc for item in sublist]
-     entity_list = [word for word in entity_list if word not in all_stopwords]
-     entity_list = [word_entity for word_entity in entity_list if not p.singular_noun(word_entity)]
-     entity_list = list(dict.fromkeys(entity_list))
-     doc_embedding = model_2.encode([doc])
-     candidates = entity_list
-     if not candidates:
-         return "", []
-     candidate_embeddings = model_2.encode(candidates)
-     distances = cosine_similarity(doc_embedding, candidate_embeddings)
-     top_n = min(max_num_keywords, len(candidates))
-     keyword_list = [candidates[index] for index in distances.argsort()[0][-top_n:]]
-     # Clustering with model_3
-     word_embedding_model = models.Transformer(model_3)
-     pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
-                                    pooling_mode_mean_tokens=True,
-                                    pooling_mode_cls_token=False,
-                                    pooling_mode_max_tokens=False)
-     embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
-     c_len = len(keyword_list)
-     if c_len < 2:
-         return " OR ".join(keyword_list), keyword_list
-     keyword_embeddings = embedder.encode(keyword_list)
-     silhouette_score_list = []
-     cluster_list_final = []
-     for num_clusters in range(1, top_n):
-         clustering_model = KMeans(n_clusters=num_clusters)
-         clustering_model.fit(keyword_embeddings)
-         cluster_assignment = clustering_model.labels_
-         clustered_sentences = [[] for _ in range(num_clusters)]
-         for sentence_id, cluster_id in enumerate(cluster_assignment):
-             clustered_sentences[cluster_id].append(keyword_list[sentence_id])
-         cl_sent_len = len(clustered_sentences)
-         list_cluster = list(clustered_sentences)
-         cluster_list_final.append(list_cluster)
-         if (c_len == cl_sent_len and c_len >= 3) or cl_sent_len == 1:
-             silhouette_avg = 0
-         elif c_len == cl_sent_len == 2:
-             silhouette_avg = 1
-         else:
-             silhouette_avg = silhouette_score(keyword_embeddings, cluster_assignment)
-         silhouette_score_list.append(silhouette_avg)
-     res_dict = dict(zip(silhouette_score_list, cluster_list_final))
-     cluster_items = res_dict[max(res_dict)]
-     comb = []
-     for i in cluster_items:
-         z = ' OR '.join(i)
-         comb.append("(" + z + ")")
-     combinations = []
-     for subset in itertools.combinations(comb, 2):
-         combinations.append(subset)
-     f1_list = []
-     for s in combinations:
-         final = ' AND '.join(s)
-         f1_list.append("(" + final + ")")
-     f_1 = ' OR '.join(f1_list)
-     return f_1, keyword_list
-
- def retrieve_pubmed_abstracts(article_text, headline, max_num_keywords, model_1, model_2, model_3):
-     query, _ = keyphrase_groups_and_query(article_text, max_num_keywords, model_1, model_2, model_3)
-     ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
-     for q in [query, headline, article_text]:
-         if not q:
-             continue
-         search_url = f"{ncbi_url}esearch.fcgi?db=pubmed&term={q}&retmax={PUBMED_N}&sort=relevance"
-         r = requests.get(search_url)
-         pmids = re.findall(r"<Id>(\d+)</Id>", r.text)
-         if pmids:
-             ids = ','.join(pmids)
-             fetch_url = f"{ncbi_url}efetch.fcgi?db=pubmed&id={ids}&rettype=xml&retmax={PUBMED_N}"
-             resp = requests.get(fetch_url)
-             titles = re.findall(r"<ArticleTitle>(.*?)</ArticleTitle>", resp.text, flags=re.DOTALL)
-             abstracts = re.findall(r"<AbstractText.*?>(.*?)</AbstractText>", resp.text, flags=re.DOTALL)
-             if not abstracts:
-                 abstracts = [""] * len(titles)
-             titles = [re.sub(r"\s+", " ", t).strip() for t in titles]
-             abstracts = [re.sub(r"\s+", " ", a).strip() for a in abstracts]
-             return titles, abstracts
-     return [], []
-
- def semantic_rerank_claim_abstracts(claim, titles, abstracts, model_4):
+ # --- Semantic reranking of abstracts using s-pubmedbert-msmarco ---
+ def semantic_rerank_claim_abstracts(claim, titles, abstracts, top_k=TOP_ABSTRACTS):
    doc_texts = [f"{t}. {a}" for t, a in zip(titles, abstracts)]
-     doc_embs = model_4.encode(doc_texts)
-     claim_emb = model_4.encode([claim])
+     doc_embs = sbert_rerank.encode(doc_texts)
+     claim_emb = sbert_rerank.encode([claim])
    sims = util.pytorch_cos_sim(claim_emb, doc_embs)[0]
-     idxs = np.argsort(-sims.cpu().numpy())[:TOP_ABSTRACTS]
+     idxs = np.argsort(-sims.cpu().numpy())[:top_k]
    return [titles[i] for i in idxs], [abstracts[i] for i in idxs]

+ # --- NLI evidence extraction ---
def extract_evidence_nli(claim, title, abstract):
    sentences = sent_tokenize(abstract)
    evidence = []
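With the encoder fixed to s-pubmedbert-msmarco, retrieval and reranking now compose without any model plumbing; a short sketch with an invented claim:

claim = "Vitamin D supplementation lowers the risk of respiratory infections."  # invented example
titles, abstracts = retrieve_pubmed_abstracts_simple(claim)
if titles:
    top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts, top_k=5)
    for t in top_titles:
        print(t)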
@@ -242,6 +152,7 @@ def extract_evidence_nli(claim, title, abstract):
        })
    return evidence

+ # --- Summarizer model loading ---
def get_summarizer(model_choice):
    model_id = model_options[model_choice]
    if model_id in pipe_cache:
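get_summarizer's remaining body is unchanged and elided below the visible context, but the pipe_cache dict implies a memoized pipeline loader. A hedged completion consistent with those visible lines (the task name and arguments are assumptions, not taken from the file):

from transformers import pipeline

def get_summarizer(model_choice):
    model_id = model_options[model_choice]
    if model_id in pipe_cache:
        return pipe_cache[model_id]
    # Assumption: one transformers pipeline per model id; the real
    # task and kwargs are not shown in this diff.
    pipe_cache[model_id] = pipeline("text-generation", model=model_id)
    return pipe_cache[model_id]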
@@ -300,7 +211,7 @@ def format_evidence_html(evidence_list):
    )
    return html

- def factcheck_app(article_url, model_1_name, model_2_name, max_num_keywords, model_3_name, model_4_name, summarizer_choice):
+ def factcheck_app(article_url, summarizer_choice):
    try:
        art = Article(article_url)
        art.download()
@@ -310,26 +221,20 @@ def factcheck_app(article_url, model_1_name, model_2_name, max_num_keywords, model_3_name, model_4_name, summarizer_choice):
    except Exception as e:
        return f"<b>Error downloading or reading article:</b> {e}", None

-     # Load all selected models
-     model_1 = SentenceTransformer(model_1_name)
-     model_2 = SentenceTransformer(model_2_name)
-     model_3 = model_3_name # used as model id string
-     model_4 = SentenceTransformer(model_4_name)
-
    claims = extract_claims_pattern(text)
-     matched_claims = match_claims_to_headline(claims, headline, model_1)
+     matched_claims = match_claims_to_headline(claims, headline)
    if not matched_claims:
        return "<b>No check-worthy claims found that match the headline.</b>", None

    results_html = ""
    all_results = []
    for claim in matched_claims:
-         titles, abstracts = retrieve_pubmed_abstracts(claim, headline, max_num_keywords, model_1, model_2, model_3)
+         titles, abstracts = retrieve_pubmed_abstracts_simple(claim, fallback_headline=headline)
        if not titles:
            results_html += f"<hr><b>Claim:</b> {claim}<br><i>No PubMed results found.</i><br>"
            all_results.append({"claim": claim, "summary": "No PubMed results found.", "evidence": []})
            continue
-         top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts, model_4)
+         top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts)
        idx_non_top = random.choice([i for i in range(len(titles)) if i not in [titles.index(t) for t in top_titles]]) if len(titles) > len(top_titles) else None
        evidence_results = []
        for title, abstract in zip(top_titles, top_abstracts):
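One pre-existing context line worth flagging while this function is touched: idx_non_top recovers positions of the reranked titles via titles.index(t), which returns only the first occurrence and so mis-selects when PubMed returns duplicate titles. A hedged membership-based alternative (same intent, not a drop-in from the file):

# Inside factcheck_app, replacing the idx_non_top expression:
top_set = set(top_titles)
non_top = [i for i, t in enumerate(titles) if t not in top_set]
idx_non_top = random.choice(non_top) if non_top else None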
@@ -346,16 +251,14 @@ def factcheck_app(article_url, model_1_name, model_2_name, max_num_keywords, model_3_name, model_4_name, summarizer_choice):
        all_results.append({"claim": claim, "summary": summary, "evidence": evidence_results})
    return results_html, all_results

- # --- Gradio UI ---
description = """
<b>What does this app do?</b><br>
- This app extracts key scientific claims from a news article, finds the most relevant PubMed biomedical research papers using advanced keyphrase grouping and Boolean queries, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
+ This app extracts key scientific claims from a news article, finds the most relevant PubMed biomedical research papers using robust keyphrase extraction and semantic reranking, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
<b>How to use it:</b><br>
1. Paste the link to a biomedical news article.<br>
- 2. Choose your models for each stage (or use defaults for best results).<br>
- 3. Pick a summarizer for layperson summary.<br>
- 4. Wait for the results.<br>
- 5. For each claim, you will see:<br>
+ 2. Choose an AI summarizer model below. If you have no special access, use 'TinyLlama' (works for everyone).<br>
+ 3. Wait for the results.<br>
+ 4. For each claim, you will see:<br>
- A plain summary of what research says.<br>
- Color-coded evidence sentences (green=support, red=contradict, gray=neutral).<br>
- The titles of the most relevant PubMed articles.<br><br>
@@ -366,63 +269,6 @@ iface = gr.Interface(
    fn=factcheck_app,
    inputs=[
        gr.Textbox(lines=2, label="Paste a news article URL"),
-         gr.Dropdown(
-             choices=[
-                 'sentence-transformers/all-mpnet-base-v2',
-                 'sentence-transformers/all-mpnet-base-v1',
-                 'sentence-transformers/all-distilroberta-v1',
-                 'sentence-transformers/gtr-t5-large',
-                 'pritamdeka/S-Bluebert-snli-multinli-stsb',
-                 'pritamdeka/S-Biomed-Roberta-snli-multinli-stsb',
-                 'pritamdeka/S-BioBert-snli-multinli-stsb',
-                 'sentence-transformers/stsb-mpnet-base-v2',
-                 'sentence-transformers/stsb-roberta-base-v2',
-                 'sentence-transformers/stsb-distilroberta-base-v2',
-                 'sentence-transformers/sentence-t5-large',
-                 'sentence-transformers/sentence-t5-base'
-             ],
-             value='sentence-transformers/all-mpnet-base-v2',
-             label="SBERT model for TextRank"
-         ),
-         gr.Dropdown(
-             choices=[
-                 'sentence-transformers/paraphrase-mpnet-base-v2',
-                 'sentence-transformers/all-mpnet-base-v1',
-                 'sentence-transformers/paraphrase-distilroberta-base-v1',
-                 'sentence-transformers/paraphrase-xlm-r-multilingual-v1',
-                 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
-                 'sentence-transformers/paraphrase-albert-small-v2',
-                 'sentence-transformers/paraphrase-albert-base-v2',
-                 'sentence-transformers/paraphrase-MiniLM-L12-v2',
-                 'sentence-transformers/paraphrase-MiniLM-L6-v2',
-                 'sentence-transformers/all-MiniLM-L12-v2',
-                 'sentence-transformers/all-distilroberta-v1',
-                 'sentence-transformers/paraphrase-TinyBERT-L6-v2',
-                 'sentence-transformers/paraphrase-MiniLM-L3-v2',
-                 'sentence-transformers/all-MiniLM-L6-v2'
-             ],
-             value='sentence-transformers/paraphrase-mpnet-base-v2',
-             label="SBERT model for keyphrases"
-         ),
-         gr.Slider(minimum=5, maximum=20, step=1, value=10, label="Max Keywords"),
-         gr.Dropdown(
-             choices=[
-                 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
-                 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext-mean-token'
-             ],
-             value='cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
-             label="SapBERT model for clustering"
-         ),
-         gr.Dropdown(
-             choices=[
-                 'pritamdeka/S-Bluebert-snli-multinli-stsb',
-                 'pritamdeka/S-BioBert-snli-multinli-stsb',
-                 'pritamdeka/S-Biomed-Roberta-snli-multinli-stsb',
-                 'sentence-transformers/all-mpnet-base-v2'
-             ],
-             value='pritamdeka/S-BioBert-snli-multinli-stsb',
-             label="SBERT model for abstracts"
-         ),
        gr.Dropdown(
            choices=list(model_options.keys()),
            value="TinyLlama-1.1B-Chat (Open)",
@@ -432,15 +278,7 @@ iface = gr.Interface(
    outputs=[gr.HTML(label="Fact-Check Results (Summary & Evidence)"), gr.JSON(label="All Results (JSON)")],
    title="BioMedical News Fact-Checking & Research Evidence Finder",
    description=description,
-     examples=[[
-         "https://www.medicalnewstoday.com/articles/omicron-what-do-we-know-about-the-stealth-variant",
-         'sentence-transformers/all-mpnet-base-v2',
-         'sentence-transformers/paraphrase-mpnet-base-v2',
-         10,
-         'cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
-         'pritamdeka/S-BioBert-snli-multinli-stsb',
-         "TinyLlama-1.1B-Chat (Open)"
-     ]],
+     examples=[["https://www.medicalnewstoday.com/articles/omicron-what-do-we-know-about-the-stealth-variant", "TinyLlama-1.1B-Chat (Open)"]],
    allow_flagging="never"
)
 
284