Spaces:

pritamdeka
/

Biomedical-Fact-Checker

Sleeping

App Files Files Community

pritamdeka committed on Jul 4

Commit

81c18c4

verified ·

1 Parent(s): e80f4c1

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -20

app.py CHANGED Viewed

@@ -9,18 +9,17 @@ from newspaper import Article
 from nltk.tokenize import sent_tokenize
 from sentence_transformers import SentenceTransformer, util
 import spacy
 import en_core_sci_lg
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import torch
-import nltkmodule
-# Download NLTK punkt if not present
 #nltk.download('punkt')
 # --- Models (load once, globally) ---
 scispacy = en_core_sci_lg.load()
-sbert_keybert = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")  # for keybert query
-sbert_rerank = SentenceTransformer("pritamdeka/S-PubMedBert-MS-MARCO")           # for abstract reranking
 NLI_MODEL_NAME = "pritamdeka/PubMedBERT-MNLI-MedNLI"
 nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)
 nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)
@@ -110,7 +109,7 @@ def extract_claims_pattern(article_text):
     return list(dict.fromkeys(claims))
 def match_claims_to_headline(claims, headline):
-    emb_model = sbert_keybert  # (or any SBERT for matching)
     headline_emb = emb_model.encode([headline])
     claim_embs = emb_model.encode(claims)
     sims = util.pytorch_cos_sim(headline_emb, claim_embs)[0]
@@ -120,7 +119,7 @@ def match_claims_to_headline(claims, headline):
         matched_claims = [claims[i] for i in idxs]
     return matched_claims
-# --- Semantic reranking of abstracts using s-pubmedbert-msmarco ---
 def semantic_rerank_claim_abstracts(claim, titles, abstracts, top_k=TOP_ABSTRACTS):
     doc_texts = [f"{t}. {a}" for t, a in zip(titles, abstracts)]
     doc_embs = sbert_rerank.encode(doc_texts)
@@ -153,14 +152,38 @@ def extract_evidence_nli(claim, evidence_sentences):
         })
     return evidence
-# --- Summarizer model options ---
 model_options = {
     "Llama-3.2-1B-Instruct (Meta, gated)": "meta-llama/Llama-3.2-1B-Instruct",
     "Gemma-3-1B-it (Google, gated)": "google/gemma-3-1b-it",
     "TinyLlama-1.1B-Chat (Open)": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 }
 pipe_cache = {}
 def get_summarizer(model_choice):
     model_id = model_options[model_choice]
     if model_id in pipe_cache:
@@ -183,19 +206,26 @@ def get_summarizer(model_choice):
 def summarize_evidence_llm(claim, evidence_list, model_choice):
     support = [ev['sentence'] for ev in evidence_list if ev['label'] == 'ENTAILMENT']
     contradict = [ev['sentence'] for ev in evidence_list if ev['label'] == 'CONTRADICTION']
-    messages = [
-        {"role": "system", "content": "You are a helpful biomedical assistant. Summarize scientific evidence in plain English for the general public."},
-        {"role": "user", "content":
-            f"Claim: {claim}\n"
-            f"Supporting evidence:\n" + ("\n".join(support) if support else "None") + "\n"
-            f"Contradicting evidence:\n" + ("\n".join(contradict) if contradict else "None") + "\n"
-            "Explain to a layperson: Is this claim likely true, false, or uncertain based on the evidence above? Give a brief and simple explanation in 2-3 sentences."
-        }
-    ]
     try:
         pipe = get_summarizer(model_choice)
         outputs = pipe(
-            messages,
             max_new_tokens=128,
             do_sample=False,
             temperature=0.1,
@@ -245,9 +275,7 @@ def factcheck_app(article_url, summarizer_choice):
         top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts)
         evidence_results = []
         for title, abstract in zip(top_titles, top_abstracts):
-            # Extract evidence (results/conclusions) sentences from abstract
             ev_sents = extract_evidence_sentences_from_abstract(abstract)
-            # If none found, fallback to all sentences
             if ev_sents:
                 sent_list = [s for lbl, s in ev_sents]
             else:
@@ -267,7 +295,7 @@ description = """
 This app extracts key scientific claims from a news article, finds the most relevant biomedical research papers using robust keyphrase extraction and semantic reranking, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
 <b>How to use it:</b><br>
 1. Paste the link to a biomedical news article.<br>
-2. Choose an AI summarizer model below. If you have no special access, use 'TinyLlama' (works for everyone).<br>
 3. Wait for the results.<br>
 4. For each claim, you will see:<br>
 - A plain summary of what research says.<br>

 from nltk.tokenize import sent_tokenize
 from sentence_transformers import SentenceTransformer, util
 import spacy
+import nltkmodule
 import en_core_sci_lg
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import torch
 #nltk.download('punkt')
 # --- Models (load once, globally) ---
 scispacy = en_core_sci_lg.load()
+sbert_keybert = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
+sbert_rerank = SentenceTransformer("pritamdeka/S-PubMedBert-MS-MARCO")
 NLI_MODEL_NAME = "pritamdeka/PubMedBERT-MNLI-MedNLI"
 nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)
 nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)
     return list(dict.fromkeys(claims))
 def match_claims_to_headline(claims, headline):
+    emb_model = sbert_keybert
     headline_emb = emb_model.encode([headline])
     claim_embs = emb_model.encode(claims)
     sims = util.pytorch_cos_sim(headline_emb, claim_embs)[0]
         matched_claims = [claims[i] for i in idxs]
     return matched_claims
+# --- Semantic reranking ---
 def semantic_rerank_claim_abstracts(claim, titles, abstracts, top_k=TOP_ABSTRACTS):
     doc_texts = [f"{t}. {a}" for t, a in zip(titles, abstracts)]
     doc_embs = sbert_rerank.encode(doc_texts)
         })
     return evidence
+# --- Summarizer model options (now with Mistral API!) ---
 model_options = {
+    "Mistral Small (API, fast/free)": "mistral-small-2503",
     "Llama-3.2-1B-Instruct (Meta, gated)": "meta-llama/Llama-3.2-1B-Instruct",
     "Gemma-3-1B-it (Google, gated)": "google/gemma-3-1b-it",
     "TinyLlama-1.1B-Chat (Open)": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 }
 pipe_cache = {}
+# --- Mistral API summarization ---
+def summarize_with_mistral_api(prompt, model_name="mistral-small", max_tokens=128, temperature=0.1):
+    """Send ``prompt`` to the Mistral chat-completions endpoint and return the reply.
+
+    Every failure is reported as a human-readable string instead of an
+    exception, so the return value can always be displayed as-is.
+
+    Args:
+        prompt: Text sent as the single user message.
+        model_name: Model identifier placed in the request body.
+        max_tokens: Value sent as ``max_tokens`` in the request body.
+        temperature: Value sent as ``temperature`` in the request body.
+
+    Returns:
+        The stripped completion text on success; otherwise a message
+        describing the missing API key, the transport failure, the non-200
+        HTTP status, or the unexpected response payload.
+    """
+    api_key = os.getenv("MISTRAL_API_KEY")
+    if not api_key:
+        return "Missing MISTRAL_API_KEY secret/env variable!"
+    endpoint = "https://api.mistral.ai/v1/chat/completions"
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json"
+    }
+    data = {
+        "model": model_name,
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "temperature": temperature
+    }
+    # Timeouts and connection failures raise instead of yielding a response;
+    # convert them to the same string-based error reporting used below.
+    try:
+        response = requests.post(endpoint, headers=headers, json=data, timeout=30)
+    except requests.RequestException as exc:
+        return f"API Error (request failed): {exc}"
+    if response.status_code != 200:
+        return f"API Error ({response.status_code}): {response.text}"
+    # A 200 reply may still carry a non-JSON body, an empty "choices" list,
+    # or non-string content; report that instead of raising.
+    try:
+        content = response.json()["choices"][0]["message"]["content"]
+        return content.strip()
+    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as exc:
+        return f"API Error (unexpected response format): {exc!r}"
 def get_summarizer(model_choice):
     model_id = model_options[model_choice]
     if model_id in pipe_cache:
 def summarize_evidence_llm(claim, evidence_list, model_choice):
     support = [ev['sentence'] for ev in evidence_list if ev['label'] == 'ENTAILMENT']
     contradict = [ev['sentence'] for ev in evidence_list if ev['label'] == 'CONTRADICTION']
+    user_prompt = (
+        f"Claim: {claim}\n"
+        f"Supporting evidence:\n" + ("\n".join(support) if support else "None") + "\n"
+        f"Contradicting evidence:\n" + ("\n".join(contradict) if contradict else "None") + "\n"
+        "Explain to a layperson: Is this claim likely true, false, or uncertain based on the evidence above? Give a brief and simple explanation in 2-3 sentences."
+    )
+    if model_choice in [
+        "Mistral Small (API, fast/free)",
+        "Mistral Medium (API, free tier)",
+        "Mistral Large (API, may require paid)"
+    ]:
+        mistral_name = model_options[model_choice]
+        return summarize_with_mistral_api(user_prompt, model_name=mistral_name)
     try:
         pipe = get_summarizer(model_choice)
         outputs = pipe(
+            [
+                {"role": "system", "content": "You are a helpful biomedical assistant. Summarize scientific evidence in plain English for the general public."},
+                {"role": "user", "content": user_prompt}
+            ],
             max_new_tokens=128,
             do_sample=False,
             temperature=0.1,
         top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts)
         evidence_results = []
         for title, abstract in zip(top_titles, top_abstracts):
             ev_sents = extract_evidence_sentences_from_abstract(abstract)
             if ev_sents:
                 sent_list = [s for lbl, s in ev_sents]
             else:
 This app extracts key scientific claims from a news article, finds the most relevant biomedical research papers using robust keyphrase extraction and semantic reranking, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
 <b>How to use it:</b><br>
 1. Paste the link to a biomedical news article.<br>
+2. Choose an AI summarizer model below. If you have no special access, use 'TinyLlama' or 'Mistral Small' (works for everyone, free).<br>
 3. Wait for the results.<br>
 4. For each claim, you will see:<br>
 - A plain summary of what research says.<br>