|
import html
import os
import random
import re

import gradio as gr
import numpy as np
import requests
import torch
from newspaper import Article
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
|
|
|
|
|
# --- Retrieval and model configuration ---------------------------------------
# Default number of PubMed hits requested per claim.
PUBMED_N = 100
# Default number of abstracts kept after semantic re-ranking.
TOP_ABSTRACTS = 10
# Hugging Face model identifiers for the NLI classifier and sentence encoder.
NLI_MODEL_NAME = "pritamdeka/PubMedBERT-MNLI-MedNLI"
SBERT_MODEL_NAME = "pritamdeka/S-BioBert-snli-multinli-stsb"
# Label names, indexed by the NLI model's predicted class id.
# NOTE(review): the order is assumed to match the checkpoint's id2label
# mapping - confirm against the model config.
NLI_LABELS = ['CONTRADICTION', 'NEUTRAL', 'ENTAILMENT']
|
|
|
|
|
# Substrings whose presence marks a sentence as a candidate claim.
# Entries are kept in their original order; the grouping comments are only a
# reading aid.
indicator_phrases = [
    # Reporting / finding verbs
    "found that", "findings suggest", "shows that", "showed that",
    "demonstrated", "demonstrates",
    "revealed", "reveals", "suggests", "suggested", "indicated", "indicates",
    "reported", "reports",
    "was reported", "concluded", "concludes", "conclusion", "authors state",
    "stated", "data suggest",
    "observed", "observes", "study suggests", "study shows", "study found",
    "researchers found",
    "results indicate", "results show", "confirmed", "confirm", "confirming",
    "point to",
    "documented", "document", "evidence of", "evidence suggests",
    # Association
    "associated with", "correlated with", "link between", "linked to",
    "relationship between",
    "was linked", "connected to", "relationship with", "tied to",
    "association with",
    # Change and risk
    "increase", "increases", "increased", "decrease", "decreases", "decreased",
    "greater risk", "lower risk", "higher risk", "reduced risk",
    "raises the risk", "reduces the risk",
    "risk of", "risk for", "likelihood of", "probability of", "chance of",
    "rate of", "incidence of",
    "prevalence of", "mortality", "survival rate", "death rate", "odds of",
    "number of", "percentage of", "percent of",
    # Causation
    "caused by", "causes", "cause", "resulted in", "results in", "leads to",
    "led to", "contributed to", "responsible for",
    "due to", "as a result", "because of",
    # Study design
    "randomized controlled trial", "RCT", "clinical trial", "participants",
    "enrolled", "sample size", "statistically significant",
    # Comparison
    "compared to", "compared with", "versus", "compared against",
    "more than", "less than", "greater than", "lower than", "higher than",
    "significantly higher", "significantly lower",
    "significantly increased", "significantly decreased",
    "significant difference",
    # Effect / influence
    "effect of", "impact of", "influence of", "predictor of", "predicts",
    "predictive of", "factor for", "determinant of",
    "plays a role in", "contributes to", "related to", "affects", "influences",
    "difference between",
    # Attribution
    "according to", "a recent study", "researchers from",
]
|
|
|
|
|
# --- Models loaded once at import time ---------------------------------------
# NLI classifier (tokenizer + sequence-classification head) used to label each
# evidence sentence against a claim.
nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)
nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)

# Sentence encoder used for headline/claim matching and abstract re-ranking.
sbert_model = SentenceTransformer(SBERT_MODEL_NAME)
|
|
|
|
|
# --- Text-generation pipeline used for the plain-English summaries -----------
model_id = "meta-llama/Llama-3.2-1B-Instruct"

pipe = pipeline(
    "text-generation",
    model=model_id,
    # Half precision when a CUDA device is available, full precision otherwise.
    torch_dtype=(
        torch.float16 if torch.cuda.is_available() else torch.float32
    ),
    device_map="auto",
    # Pipeline-level default; individual calls may pass their own value.
    max_new_tokens=128,
)
|
|
|
# A standalone number, optionally decimal and optionally followed by "%".
_CLAIM_NUMBER_RE = re.compile(r"\b\d+(\.\d+)?%?\b")


def extract_claims_pattern(article_text):
    """Return the sentences of *article_text* that look like check-worthy claims.

    A sentence is kept when it contains any entry of ``indicator_phrases`` or
    a standalone number. Duplicate sentences are dropped, keeping the first
    occurrence, so the result preserves article order.

    Args:
        article_text: Full article text. ``None`` or an empty string yields
            an empty list.

    Returns:
        A list of sentence strings.
    """
    if not article_text:
        return []

    # All-lowercase phrases are matched case-insensitively against the
    # lowercased sentence. Phrases containing capitals (e.g. the acronym
    # "RCT") are matched case-sensitively against the original sentence:
    # compared with lowercased text they could never match, and lowercasing
    # the phrase instead would hit unrelated words such as "infarction".
    folded_phrases = [p for p in indicator_phrases if p == p.lower()]
    exact_phrases = [p for p in indicator_phrases if p != p.lower()]

    claims = []
    for sentence in sent_tokenize(article_text):
        lowered = sentence.lower()  # computed once per sentence
        if (
            any(phrase in lowered for phrase in folded_phrases)
            or any(phrase in sentence for phrase in exact_phrases)
            or _CLAIM_NUMBER_RE.search(sentence)
        ):
            claims.append(sentence)

    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(claims))
|
|
|
def match_claims_to_headline(claims, headline, threshold=0.6):
    """Select the claims that are semantically closest to the headline.

    Claims and headline are embedded with ``sbert_model`` and compared by
    cosine similarity. Every claim scoring at least *threshold* is returned
    in its original order. If none reaches the threshold, the (up to) three
    highest-scoring claims are returned instead, best first.

    Args:
        claims: List of claim sentences.
        headline: Article headline; ``None`` is treated as an empty string.
        threshold: Minimum cosine similarity for a direct match.

    Returns:
        A list of claim strings; empty only when *claims* is empty.
    """
    # Nothing to compare: return early rather than handing an empty batch to
    # the encoder and the similarity computation.
    if not claims:
        return []

    headline_emb = sbert_model.encode([headline or ""])
    claim_embs = sbert_model.encode(claims)
    scores = util.pytorch_cos_sim(headline_emb, claim_embs)[0].cpu().numpy()

    matched = [
        claim for claim, score in zip(claims, scores) if score >= threshold
    ]
    if matched:
        return matched

    # Fallback: no claim cleared the threshold, so keep the best few.
    best_first = np.argsort(-scores)[:3]
    return [claims[i] for i in best_first]
|
|
|
_NCBI_EUTILS_URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
# Upper bound, in seconds, for each E-utilities HTTP request.
_PUBMED_TIMEOUT_S = 30


def _parse_pubmed_records(xml_text):
    """Extract aligned ``(titles, abstracts)`` lists from efetch XML text.

    Each article record is parsed on its own so that a record without an
    abstract, or with a multi-section abstract, still contributes exactly
    one title and one abstract string at the same index.
    """
    titles, abstracts = [], []
    record_re = r"<(PubmedArticle|PubmedBookArticle)\b.*?</\1>"
    for record_match in re.finditer(record_re, xml_text, flags=re.DOTALL):
        record = record_match.group(0)
        # `(?<!/)>` skips self-closing (empty) elements.
        title_match = re.search(
            r"<ArticleTitle\b[^>]*(?<!/)>(.*?)</ArticleTitle>",
            record,
            flags=re.DOTALL,
        )
        if title_match is None:
            continue
        sections = re.findall(
            r"<AbstractText\b[^>]*(?<!/)>(.*?)</AbstractText>",
            record,
            flags=re.DOTALL,
        )
        titles.append(re.sub(r"\s+", " ", title_match.group(1)).strip())
        # Structured abstracts come as several sections; join them into one.
        abstracts.append(re.sub(r"\s+", " ", " ".join(sections)).strip())
    return titles, abstracts


def retrieve_pubmed_abstracts(claim, n=PUBMED_N):
    """Search PubMed for *claim* and return the matching titles and abstracts.

    The words of the claim are used as the search term (sorted by relevance),
    then the resulting records are fetched as XML and parsed.

    Args:
        claim: Claim sentence used to build the query.
        n: Maximum number of records to request.

    Returns:
        ``(titles, abstracts)``: two lists of equal length where index ``i``
        of both refers to the same article; the abstract is ``""`` when an
        article has none. Both lists are empty when the claim has no word
        characters, nothing is found, or a request fails or times out.
        Strings are whitespace-normalised fragments of the XML response, so
        XML entities and inline markup are not decoded.
    """
    query = ' '.join(re.findall(r'\w+', claim or ''))
    if not query:
        return [], []

    try:
        # `params` lets requests URL-encode the term (including non-ASCII).
        search = requests.get(
            f"{_NCBI_EUTILS_URL}esearch.fcgi",
            params={
                "db": "pubmed",
                "term": query,
                "retmax": n,
                "sort": "relevance",
            },
            timeout=_PUBMED_TIMEOUT_S,
        )
        search.raise_for_status()
        pmids = re.findall(r"<Id>(\d+)</Id>", search.text)
        if not pmids:
            return [], []

        fetch = requests.get(
            f"{_NCBI_EUTILS_URL}efetch.fcgi",
            params={
                "db": "pubmed",
                "id": ','.join(pmids),
                "retmode": "xml",
                "retmax": n,
            },
            timeout=_PUBMED_TIMEOUT_S,
        )
        fetch.raise_for_status()
    except requests.RequestException:
        # Best effort: a network/HTTP failure is reported as "no results" so
        # that one failing lookup does not abort the whole fact-check.
        return [], []

    return _parse_pubmed_records(fetch.text)
|
|
|
def semantic_rerank_claim_abstracts(claim, titles, abstracts, top_k=TOP_ABSTRACTS):
    """Re-rank retrieved documents by semantic similarity to *claim*.

    Each document is represented as ``"<title>. <abstract>"``, embedded with
    ``sbert_model`` and scored by cosine similarity against the claim.

    Args:
        claim: Claim sentence.
        titles: Document titles.
        abstracts: Document abstracts, paired with *titles* by position.
        top_k: Maximum number of documents to keep.

    Returns:
        ``(top_titles, top_abstracts)`` ordered from most to least similar.
    """
    combined = []
    for title, abstract in zip(titles, abstracts):
        combined.append(f"{title}. {abstract}")

    doc_vectors = sbert_model.encode(combined)
    claim_vector = sbert_model.encode([claim])
    scores = util.pytorch_cos_sim(claim_vector, doc_vectors)[0]

    # Negate so that argsort's ascending order yields highest similarity first.
    ranking = np.argsort(-scores.cpu().numpy())[:top_k]

    ranked_titles = [titles[pos] for pos in ranking]
    ranked_abstracts = [abstracts[pos] for pos in ranking]
    return ranked_titles, ranked_abstracts
|
|
|
def extract_evidence_nli(claim, title, abstract):
    """Label every sentence of *abstract* against *claim* with the NLI model.

    Each sentence is tokenized together with the claim as the pair
    ``(sentence, claim)`` and classified on its own.

    Args:
        claim: Claim sentence.
        title: Title of the abstract; accepted but not used here.
        abstract: Abstract text, split into sentences with ``sent_tokenize``.

    Returns:
        One dict per sentence with keys ``"sentence"``, ``"label"`` (an entry
        of ``NLI_LABELS``) and ``"score"`` (softmax probability of that label).
    """
    results = []
    for sentence in sent_tokenize(abstract):
        inputs = nli_tokenizer(
            sentence,
            claim,
            return_tensors='pt',
            truncation=True,
            max_length=256,
            padding=True,
        )
        with torch.no_grad():
            logits = nli_model(**inputs).logits
        distribution = torch.softmax(logits, dim=1).cpu().numpy().flatten()
        best = distribution.argmax()
        results.append(
            {
                "sentence": sentence,
                "label": NLI_LABELS[best],
                "score": float(distribution[best]),
            }
        )
    return results
|
|
|
def summarize_evidence_llm(claim, evidence_list):
    """Ask the text-generation pipeline for a lay summary of the evidence.

    Sentences labelled ``ENTAILMENT`` are presented as supporting evidence and
    those labelled ``CONTRADICTION`` as contradicting evidence; other labels
    are left out of the prompt.

    Args:
        claim: Claim sentence.
        evidence_list: Dicts with at least ``"sentence"`` and ``"label"`` keys.

    Returns:
        The generated summary, stripped of surrounding whitespace, or a
        ``"Summary could not be generated: ..."`` message if generation fails.
    """
    supporting = []
    contradicting = []
    for item in evidence_list:
        if item['label'] == 'ENTAILMENT':
            supporting.append(item['sentence'])
        elif item['label'] == 'CONTRADICTION':
            contradicting.append(item['sentence'])

    support_text = "\n".join(supporting) if supporting else "None"
    contradict_text = "\n".join(contradicting) if contradicting else "None"
    user_prompt = (
        f"Claim: {claim}\n"
        "Supporting evidence:\n"
        f"{support_text}\n"
        "Contradicting evidence:\n"
        f"{contradict_text}\n"
        "Explain to a layperson: Is this claim likely true, false, or uncertain based on the evidence above? Give a brief and simple explanation in 2-3 sentences."
    )
    messages = [
        {"role": "system", "content": "You are a helpful biomedical assistant. Summarize scientific evidence in plain English for the general public."},
        {"role": "user", "content": user_prompt},
    ]

    try:
        generated = pipe(
            messages,
            max_new_tokens=96,
            do_sample=False,
            temperature=0.1,
        )[0]["generated_text"]
        # Chat-style output is a list of messages; the reply is the last one.
        if isinstance(generated, list) and "content" in generated[-1]:
            return generated[-1]["content"].strip()
        return generated.strip()
    except Exception as e:
        return f"Summary could not be generated: {e}"
|
|
|
def format_evidence_html(evidence_list):
    """Render evidence items as colour-coded HTML ``<div>`` blocks.

    Args:
        evidence_list: Dicts with ``"label"``, ``"score"`` and ``"sentence"``
            keys.

    Returns:
        The concatenated HTML, or ``""`` for an empty list. Labels without a
        dedicated colour use the neutral background. Label and sentence text
        are HTML-escaped.
    """
    color_map = {"ENTAILMENT":"#e6ffe6", "CONTRADICTION":"#ffe6e6", "NEUTRAL":"#f8f8f8"}
    blocks = []
    for ev in evidence_list:
        label = str(ev["label"])
        color = color_map.get(label, color_map["NEUTRAL"])
        # Sentences originate from external documents, so they must not be
        # interpolated as markup. Unescape first so text that already carries
        # entities (e.g. "&lt;" from an XML source) is not double-escaped.
        sentence = html.escape(html.unescape(str(ev["sentence"])))
        blocks.append(
            f'<div style="background:{color};padding:6px;border-radius:6px;margin-bottom:3px">'
            f'<b>{html.escape(label)}</b> (confidence {ev["score"]:.2f}): {sentence}'
            '</div>'
        )
    return "".join(blocks)
|
|
|
def _pick_control_index(titles, abstracts, top_titles, top_abstracts):
    """Return a random index of a retrieved document outside the top set, or None."""
    top_pairs = set(zip(top_titles, top_abstracts))
    # zip() bounds the index by the shorter list, so the result is always a
    # valid position in both `titles` and `abstracts`.
    candidates = [
        idx
        for idx, pair in enumerate(zip(titles, abstracts))
        if pair not in top_pairs
    ]
    return random.choice(candidates) if candidates else None


def factcheck_app(article_url):
    """Fact-check the article at *article_url* against PubMed abstracts.

    Pipeline: download and parse the article, extract candidate claims, keep
    those closest to the headline, retrieve and re-rank PubMed abstracts per
    claim, label each abstract sentence with the NLI model, and summarise the
    evidence with the LLM. One randomly chosen retrieved document that did
    not make the top ranking is added per claim, titled ``"(Control) ..."``.

    Args:
        article_url: URL of the news article; surrounding whitespace is
            ignored.

    Returns:
        ``(results_html, all_results)``. ``all_results`` is a list of dicts
        with keys ``"claim"``, ``"summary"`` and ``"evidence"``. On a missing
        URL, a download/parse failure, or when no claim is found, the first
        element is an HTML message and the second is ``None``.
    """
    url = (article_url or "").strip()
    if not url:
        return "<b>Please provide a news article URL.</b>", None

    try:
        art = Article(url)
        art.download()
        art.parse()
        text = art.text
        headline = art.title
    except Exception as e:
        return f"<b>Error downloading or reading article:</b> {html.escape(str(e))}", None

    claims = extract_claims_pattern(text)
    # Skip headline matching when there is nothing to match.
    matched_claims = match_claims_to_headline(claims, headline or "") if claims else []
    if not matched_claims:
        return "<b>No check-worthy claims found that match the headline.</b>", None

    html_parts = []
    all_results = []
    for claim in matched_claims:
        # Claims come from the downloaded page: escape before embedding.
        claim_html = html.escape(claim)
        titles, abstracts = retrieve_pubmed_abstracts(claim)
        if not titles:
            html_parts.append(f"<hr><b>Claim:</b> {claim_html}<br><i>No PubMed results found.</i><br>")
            all_results.append({"claim": claim, "summary": "No PubMed results found.", "evidence": []})
            continue

        top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts)
        control_idx = _pick_control_index(titles, abstracts, top_titles, top_abstracts)

        evidence_results = [
            {"title": title, "evidence": extract_evidence_nli(claim, title, abstract)}
            for title, abstract in zip(top_titles, top_abstracts)
        ]
        if control_idx is not None:
            control_ev = extract_evidence_nli(claim, titles[control_idx], abstracts[control_idx])
            evidence_results.append({"title": f"(Control) {titles[control_idx]}", "evidence": control_ev})

        all_evidence_sentences = [ev for abs_res in evidence_results for ev in abs_res["evidence"]]
        summary = summarize_evidence_llm(claim, all_evidence_sentences)

        html_parts.append(
            f"<hr><b>Claim:</b> {claim_html}<br><b>Layman summary:</b> {html.escape(str(summary))}<br>"
        )
        for abs_res in evidence_results:
            # Titles may already carry XML entities; normalise, then escape.
            title_html = html.escape(html.unescape(abs_res['title']))
            html_parts.append(f"<br><b>Abstract:</b> {title_html}<br>{format_evidence_html(abs_res['evidence'])}")
        all_results.append({"claim": claim, "summary": summary, "evidence": evidence_results})

    return "".join(html_parts), all_results
|
|
|
# HTML blurb passed to the Gradio interface as its description.
description = """
<b>What does this app do?</b><br>
This app extracts key scientific claims from a news article, finds the most relevant PubMed biomedical research papers, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
<b>How to use it:</b><br>
1. Paste the link to a biomedical news article.<br>
2. Wait for the results.<br>
3. For each claim, you will see:<br>
- A plain summary of what research says.<br>
- Color-coded evidence sentences (green=support, red=contradict, gray=neutral).<br>
- The titles of the most relevant PubMed articles.<br><br>
<b>Everything is 100% open source and runs on this website—no personal info or cloud API needed.</b>
"""
|
|
|
# Gradio UI: a URL textbox in; the rendered HTML report and the raw results
# (as JSON) out.
iface = gr.Interface(
    fn=factcheck_app,
    inputs=gr.Textbox(lines=2, label="Paste a news article URL"),
    outputs=[
        gr.HTML(label="Fact-Check Results (Summary & Evidence)"),
        gr.JSON(label="All Results (JSON)"),
    ],
    title="BioMedical News Fact-Checking & Research Evidence Finder",
    description=description,
    examples=[
        ["https://www.medicalnewstoday.com/articles/omicron-what-do-we-know-about-the-stealth-variant"],
    ],
    allow_flagging="never",
)

# Start the server when the module is executed. The 0.0.0.0 bind address is
# presumably intended for container hosting - confirm for local use.
iface.launch(share=False, server_name='0.0.0.0', show_error=True)
|
|