Spaces:

pritamdeka
/

Biomedical-Fact-Checker

Running

App Files Files Community

pritamdeka committed on Jul 4

Commit

3358886

verified ·

1 Parent(s): a34738f

Create app.py

Browse files

Files changed (1) hide show

app.py +242 -0

app.py ADDED Viewed

	@@ -0,0 +1,242 @@

+import os
+import re
+import random
+import gradio as gr
+import requests
+import numpy as np
+from nltk.tokenize import sent_tokenize
+from newspaper import Article
+from sentence_transformers import SentenceTransformer, util
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
+# --- Download GGUF model from Hugging Face Hub at startup (if not present); the returned local path is stored in gguf_path ---
+from huggingface_hub import hf_hub_download
+GGUF_FILENAME = "gemma-3b-it-Q4_K_M.gguf"  # NOTE(review): name says "gemma-3b" while GGUF_REPO below is "gemma-3n-E4B" -- confirm this file exists in that repo
+GGUF_REPO = "unsloth/gemma-3n-E4B-it-GGUF"
+print("Checking for GGUF model...")
+gguf_path = hf_hub_download(
+    repo_id=GGUF_REPO,
+    filename=GGUF_FILENAME,
+    cache_dir="./"  # cache location is relative to the current working directory
+)
+print(f"GGUF model path: {gguf_path}")
+# Load the downloaded GGUF file via llama-cpp-python; `llm` is the module-level handle called by summarize_evidence_llm
+from llama_cpp import Llama
+llm = Llama(
+    model_path=gguf_path,
+    n_ctx=2048,  # context window size passed to llama-cpp; presumably prompt + generated tokens must fit -- confirm
+    n_threads=4  # or set to number of CPU cores
+)
+# --------- App settings (module-level constants read by the functions below) ---------
+PUBMED_N = 100    # Number of abstracts to retrieve initially
+TOP_ABSTRACTS = 10  # Number of top semantic abstracts to keep per claim
+NLI_MODEL_NAME = "pritamdeka/PubMedBERT-MNLI-MedNLI"  # checkpoint loaded into nli_tokenizer / nli_model
+SBERT_MODEL_NAME = "pritamdeka/S-BioBert-snli-multinli-stsb"  # checkpoint loaded into sbert_model
+NLI_LABELS = ['CONTRADICTION', 'NEUTRAL', 'ENTAILMENT']  # indexed by the argmax over the NLI logits; NOTE(review): assumes this order matches the checkpoint's label ids -- confirm against its config
+# --------- Indicator Phrases for Claim Extraction: a sentence containing any of these as a substring is kept as a candidate claim ---------
+indicator_phrases = [
+    "found that", "findings suggest", "shows that", "showed that", "demonstrated", "demonstrates",
+    "revealed", "reveals", "suggests", "suggested", "indicated", "indicates", "reported", "reports",
+    "was reported", "concluded", "concludes", "conclusion", "authors state", "stated", "data suggest",
+    "observed", "observes", "study suggests", "study shows", "study found", "researchers found",
+    "results indicate", "results show", "confirmed", "confirm", "confirming", "point to",
+    "documented", "document", "evidence of", "evidence suggests",
+    "associated with", "correlated with", "link between", "linked to", "relationship between",
+    "was linked", "connected to", "relationship with", "tied to", "association with",
+    "increase", "increases", "increased", "decrease", "decreases", "decreased",
+    "greater risk", "lower risk", "higher risk", "reduced risk", "raises the risk", "reduces the risk",
+    "risk of", "risk for", "likelihood of", "probability of", "chance of", "rate of", "incidence of",
+    "prevalence of", "mortality", "survival rate", "death rate", "odds of", "number of", "percentage of", "percent of",
+    "caused by", "causes", "cause", "resulted in", "results in", "leads to", "led to", "contributed to", "responsible for",
+    "due to", "as a result", "because of",
+    "randomized controlled trial", "RCT", "clinical trial", "participants", "enrolled", "sample size", "statistically significant",  # NOTE(review): extract_claims_pattern compares against the lower-cased sentence, so upper-case "RCT" cannot match as written
+    "compared to", "compared with", "versus", "compared against",
+    "more than", "less than", "greater than", "lower than", "higher than", "significantly higher", "significantly lower",
+    "significantly increased", "significantly decreased", "significant difference",
+    "effect of", "impact of", "influence of", "predictor of", "predicts", "predictive of", "factor for", "determinant of",
+    "plays a role in", "contributes to", "related to", "affects", "influences", "difference between",
+    "according to", "a recent study", "researchers from"
+]
+# --------- Load models (global, once): NLI tokenizer + classifier, and the sentence encoder used for similarity ---------
+nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)  # used by extract_evidence_nli to encode (sentence, claim) pairs
+nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)  # its logits are mapped to NLI_LABELS by argmax
+sbert_model = SentenceTransformer(SBERT_MODEL_NAME)  # used for headline/claim matching and abstract re-ranking
+def extract_claims_pattern(article_text):
+    sentences = sent_tokenize(article_text)
+    claims = [
+        s for s in sentences
+        if any(phrase in s.lower() for phrase in indicator_phrases)
+        or re.search(r"\b\d+(\.\d+)?%?\b", s)
+    ]
+    return list(dict.fromkeys(claims))  # deduplicate, preserve order
+def match_claims_to_headline(claims, headline, threshold=0.6):
+    headline_emb = sbert_model.encode([headline])
+    claim_embs = sbert_model.encode(claims)
+    sims = util.pytorch_cos_sim(headline_emb, claim_embs)[0]
+    matched_claims = [claim for claim, sim in zip(claims, sims) if sim >= threshold]
+    # fallback: top 3 by similarity
+    if not matched_claims and claims:
+        idxs = np.argsort(-sims.cpu().numpy())[:min(3, len(claims))]
+        matched_claims = [claims[i] for i in idxs]
+    return matched_claims
+def retrieve_pubmed_abstracts(claim, n=PUBMED_N):
+    ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
+    query = '+'.join(re.findall(r'\w+', claim))
+    search_url = f"{ncbi_url}esearch.fcgi?db=pubmed&term={query}&retmax={n}&sort=relevance"
+    r = requests.get(search_url)
+    pmids = re.findall(r"<Id>(\d+)</Id>", r.text)
+    if not pmids:
+        return [], []
+    ids = ','.join(pmids)
+    fetch_url = f"{ncbi_url}efetch.fcgi?db=pubmed&id={ids}&rettype=xml&retmax={n}"
+    resp = requests.get(fetch_url)
+    titles = re.findall(r"<ArticleTitle>(.*?)</ArticleTitle>", resp.text, flags=re.DOTALL)
+    abstracts = re.findall(r"<AbstractText.*?>(.*?)</AbstractText>", resp.text, flags=re.DOTALL)
+    if not abstracts:
+        abstracts = [""] * len(titles)
+    titles = [re.sub(r"\s+", " ", t).strip() for t in titles]
+    abstracts = [re.sub(r"\s+", " ", a).strip() for a in abstracts]
+    return titles, abstracts
+def semantic_rerank_claim_abstracts(claim, titles, abstracts, top_k=TOP_ABSTRACTS):
+    doc_texts = [f"{t}. {a}" for t, a in zip(titles, abstracts)]
+    doc_embs = sbert_model.encode(doc_texts)
+    claim_emb = sbert_model.encode([claim])
+    sims = util.pytorch_cos_sim(claim_emb, doc_embs)[0]
+    idxs = np.argsort(-sims.cpu().numpy())[:top_k]
+    return [titles[i] for i in idxs], [abstracts[i] for i in idxs]
+def extract_evidence_nli(claim, title, abstract):
+    sentences = sent_tokenize(abstract)
+    evidence = []
+    for sent in sentences:
+        encoding = nli_tokenizer(
+            sent, claim,
+            return_tensors='pt',
+            truncation=True,
+            max_length=256,
+            padding=True
+        )
+        with torch.no_grad():
+            outputs = nli_model(**encoding)
+            probs = torch.softmax(outputs.logits, dim=1).cpu().numpy().flatten()
+            max_idx = probs.argmax()
+            label = NLI_LABELS[max_idx]
+            score = float(probs[max_idx])
+        evidence.append({
+            "sentence": sent,
+            "label": label,
+            "score": score
+        })
+    return evidence
+def summarize_evidence_llm(claim, evidence_list):
+    support = [ev['sentence'] for ev in evidence_list if ev['label'] == 'ENTAILMENT']
+    contradict = [ev['sentence'] for ev in evidence_list if ev['label'] == 'CONTRADICTION']
+    prompt = (
+        f"Claim: {claim}\n"
+        f"Supporting evidence:\n" + ("\n".join(support) if support else "None") + "\n"
+        f"Contradicting evidence:\n" + ("\n".join(contradict) if contradict else "None") + "\n"
+        "Explain to a layperson: Is this claim likely true, false, or uncertain based on the evidence above? "
+        "Give a brief and simple explanation in 2-3 sentences."
+    )
+    try:
+        output = llm(
+            prompt,
+            max_tokens=128,
+            stop=["\n\n"],
+            temperature=0.4,
+            echo=False
+        )
+        summary = output['choices'][0]['text'].strip()
+        return summary
+    except Exception as e:
+        return f"Summary could not be generated: {e}"
+def format_evidence_html(evidence_list):
+    color_map = {"ENTAILMENT":"#e6ffe6", "CONTRADICTION":"#ffe6e6", "NEUTRAL":"#f8f8f8"}
+    html = ""
+    for ev in evidence_list:
+        color = color_map[ev["label"]]
+        html += (
+            f'<div style="background:{color};padding:6px;border-radius:6px;margin-bottom:3px">'
+            f'<b>{ev["label"]}</b> (confidence {ev["score"]:.2f}): {ev["sentence"]}'
+            '</div>'
+        )
+    return html
+def factcheck_app(article_url):
+    try:
+        art = Article(article_url)
+        art.download()
+        art.parse()
+        text = art.text
+        headline = art.title
+    except Exception as e:
+        return f"<b>Error downloading or reading article:</b> {e}", None
+    claims = extract_claims_pattern(text)
+    matched_claims = match_claims_to_headline(claims, headline)
+    if not matched_claims:
+        return "<b>No check-worthy claims found that match the headline.</b>", None
+    results_html = ""
+    all_results = []
+    for claim in matched_claims:
+        titles, abstracts = retrieve_pubmed_abstracts(claim)
+        if not titles:
+            results_html += f"<hr><b>Claim:</b> {claim}<br><i>No PubMed results found.</i><br>"
+            all_results.append({"claim": claim, "summary": "No PubMed results found.", "evidence": []})
+            continue
+        top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts)
+        idx_non_top = random.choice([i for i in range(len(titles)) if i not in [titles.index(t) for t in top_titles]]) if len(titles) > len(top_titles) else None
+        evidence_results = []
+        for title, abstract in zip(top_titles, top_abstracts):
+            evidence = extract_evidence_nli(claim, title, abstract)
+            evidence_results.append({"title": title, "evidence": evidence})
+        if idx_non_top is not None:
+            control_ev = extract_evidence_nli(claim, titles[idx_non_top], abstracts[idx_non_top])
+            evidence_results.append({"title": f"(Control) {titles[idx_non_top]}", "evidence": control_ev})
+        all_evidence_sentences = [ev for abs_res in evidence_results for ev in abs_res["evidence"]]
+        summary = summarize_evidence_llm(claim, all_evidence_sentences)
+        results_html += f"<hr><b>Claim:</b> {claim}<br><b>Layman summary:</b> {summary}<br>"
+        for abs_res in evidence_results:
+            results_html += f"<br><b>Abstract:</b> {abs_res['title']}<br>{format_evidence_html(abs_res['evidence'])}"
+        all_results.append({"claim": claim, "summary": summary, "evidence": evidence_results})
+    return results_html, all_results
+description = """
+<b>What does this app do?</b><br>
+This app extracts key scientific claims from a news article, finds the most relevant PubMed biomedical research papers, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
+<b>How to use it:</b><br>
+1. Paste the link to a biomedical news article.<br>
+2. Wait for the results.<br>
+3. For each claim, you will see:<br>
+- A plain summary of what research says.<br>
+- Color-coded evidence sentences (green=support, red=contradict, gray=neutral).<br>
+- Links to original PubMed research.<br><br>
+<b>Everything is 100% open source and runs on this website—no personal info or cloud API needed.</b>
+"""  # NOTE(review): factcheck_app renders abstract titles as plain text and builds no hyperlinks, so the "Links to original PubMed research" bullet is not backed by the output -- confirm intent
+iface = gr.Interface(
+    fn=factcheck_app,  # returns (html, results), mapped in order onto the two outputs below
+    inputs=gr.Textbox(lines=2, label="Paste a news article URL"),
+    outputs=[gr.HTML(label="Fact-Check Results (Summary & Evidence)"), gr.JSON(label="All Results (JSON)")],
+    title="BioMedical News Fact-Checking & Research Evidence Finder",
+    description=description,  # the HTML help text defined above
+    examples=[["https://www.medicalnewstoday.com/articles/omicron-what-do-we-know-about-the-stealth-variant"]],
+    allow_flagging="never"  # NOTE(review): newer Gradio releases appear to rename this option to flagging_mode -- confirm against the installed version
+)
+iface.launch(share=False, server_name='0.0.0.0', show_error=True)  # runs at import time (no __main__ guard); '0.0.0.0' listens on all network interfaces