Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -9,18 +9,17 @@ from newspaper import Article
|
|
9 |
from nltk.tokenize import sent_tokenize
|
10 |
from sentence_transformers import SentenceTransformer, util
|
11 |
import spacy
|
|
|
12 |
import en_core_sci_lg
|
13 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
|
14 |
import torch
|
15 |
-
import nltkmodule
|
16 |
|
17 |
-
# Download NLTK punkt if not present
|
18 |
#nltk.download('punkt')
|
19 |
|
20 |
# --- Models (load once, globally) ---
|
21 |
scispacy = en_core_sci_lg.load()
|
22 |
-
sbert_keybert = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
|
23 |
-
sbert_rerank = SentenceTransformer("pritamdeka/S-PubMedBert-MS-MARCO")
|
24 |
NLI_MODEL_NAME = "pritamdeka/PubMedBERT-MNLI-MedNLI"
|
25 |
nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)
|
26 |
nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)
|
@@ -110,7 +109,7 @@ def extract_claims_pattern(article_text):
|
|
110 |
return list(dict.fromkeys(claims))
|
111 |
|
112 |
def match_claims_to_headline(claims, headline):
|
113 |
-
emb_model = sbert_keybert
|
114 |
headline_emb = emb_model.encode([headline])
|
115 |
claim_embs = emb_model.encode(claims)
|
116 |
sims = util.pytorch_cos_sim(headline_emb, claim_embs)[0]
|
@@ -120,7 +119,7 @@ def match_claims_to_headline(claims, headline):
|
|
120 |
matched_claims = [claims[i] for i in idxs]
|
121 |
return matched_claims
|
122 |
|
123 |
-
# --- Semantic reranking
|
124 |
def semantic_rerank_claim_abstracts(claim, titles, abstracts, top_k=TOP_ABSTRACTS):
|
125 |
doc_texts = [f"{t}. {a}" for t, a in zip(titles, abstracts)]
|
126 |
doc_embs = sbert_rerank.encode(doc_texts)
|
@@ -153,14 +152,38 @@ def extract_evidence_nli(claim, evidence_sentences):
|
|
153 |
})
|
154 |
return evidence
|
155 |
|
156 |
-
# --- Summarizer model options ---
|
157 |
model_options = {
|
|
|
158 |
"Llama-3.2-1B-Instruct (Meta, gated)": "meta-llama/Llama-3.2-1B-Instruct",
|
159 |
"Gemma-3-1B-it (Google, gated)": "google/gemma-3-1b-it",
|
160 |
"TinyLlama-1.1B-Chat (Open)": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
|
161 |
}
|
162 |
pipe_cache = {}
|
163 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
def get_summarizer(model_choice):
|
165 |
model_id = model_options[model_choice]
|
166 |
if model_id in pipe_cache:
|
@@ -183,19 +206,26 @@ def get_summarizer(model_choice):
|
|
183 |
def summarize_evidence_llm(claim, evidence_list, model_choice):
|
184 |
support = [ev['sentence'] for ev in evidence_list if ev['label'] == 'ENTAILMENT']
|
185 |
contradict = [ev['sentence'] for ev in evidence_list if ev['label'] == 'CONTRADICTION']
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
|
|
|
|
|
|
|
|
195 |
try:
|
196 |
pipe = get_summarizer(model_choice)
|
197 |
outputs = pipe(
|
198 |
-
|
|
|
|
|
|
|
199 |
max_new_tokens=128,
|
200 |
do_sample=False,
|
201 |
temperature=0.1,
|
@@ -245,9 +275,7 @@ def factcheck_app(article_url, summarizer_choice):
|
|
245 |
top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts)
|
246 |
evidence_results = []
|
247 |
for title, abstract in zip(top_titles, top_abstracts):
|
248 |
-
# Extract evidence (results/conclusions) sentences from abstract
|
249 |
ev_sents = extract_evidence_sentences_from_abstract(abstract)
|
250 |
-
# If none found, fallback to all sentences
|
251 |
if ev_sents:
|
252 |
sent_list = [s for lbl, s in ev_sents]
|
253 |
else:
|
@@ -267,7 +295,7 @@ description = """
|
|
267 |
This app extracts key scientific claims from a news article, finds the most relevant biomedical research papers using robust keyphrase extraction and semantic reranking, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
|
268 |
<b>How to use it:</b><br>
|
269 |
1. Paste the link to a biomedical news article.<br>
|
270 |
-
2. Choose an AI summarizer model below. If you have no special access, use 'TinyLlama' (works for everyone).<br>
|
271 |
3. Wait for the results.<br>
|
272 |
4. For each claim, you will see:<br>
|
273 |
- A plain summary of what research says.<br>
|
|
|
9 |
from nltk.tokenize import sent_tokenize
|
10 |
from sentence_transformers import SentenceTransformer, util
|
11 |
import spacy
|
12 |
+
import nltkmodule
|
13 |
import en_core_sci_lg
|
14 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
|
15 |
import torch
|
|
|
16 |
|
|
|
17 |
#nltk.download('punkt')
|
18 |
|
19 |
# --- Models (load once, globally) ---
|
20 |
scispacy = en_core_sci_lg.load()
|
21 |
+
sbert_keybert = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
|
22 |
+
sbert_rerank = SentenceTransformer("pritamdeka/S-PubMedBert-MS-MARCO")
|
23 |
NLI_MODEL_NAME = "pritamdeka/PubMedBERT-MNLI-MedNLI"
|
24 |
nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)
|
25 |
nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)
|
|
|
109 |
return list(dict.fromkeys(claims))
|
110 |
|
111 |
def match_claims_to_headline(claims, headline):
|
112 |
+
emb_model = sbert_keybert
|
113 |
headline_emb = emb_model.encode([headline])
|
114 |
claim_embs = emb_model.encode(claims)
|
115 |
sims = util.pytorch_cos_sim(headline_emb, claim_embs)[0]
|
|
|
119 |
matched_claims = [claims[i] for i in idxs]
|
120 |
return matched_claims
|
121 |
|
122 |
+
# --- Semantic reranking ---
|
123 |
def semantic_rerank_claim_abstracts(claim, titles, abstracts, top_k=TOP_ABSTRACTS):
|
124 |
doc_texts = [f"{t}. {a}" for t, a in zip(titles, abstracts)]
|
125 |
doc_embs = sbert_rerank.encode(doc_texts)
|
|
|
152 |
})
|
153 |
return evidence
|
154 |
|
155 |
+
# --- Summarizer model options (now with Mistral API!) ---
|
156 |
model_options = {
|
157 |
+
"Mistral Small (API, fast/free)": "mistral-small-2503",
|
158 |
"Llama-3.2-1B-Instruct (Meta, gated)": "meta-llama/Llama-3.2-1B-Instruct",
|
159 |
"Gemma-3-1B-it (Google, gated)": "google/gemma-3-1b-it",
|
160 |
"TinyLlama-1.1B-Chat (Open)": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
|
161 |
}
|
162 |
pipe_cache = {}
|
163 |
|
164 |
+
# --- Mistral API summarization ---
|
165 |
+
def summarize_with_mistral_api(prompt, model_name="mistral-small", max_tokens=128, temperature=0.1):
    """Send ``prompt`` to the Mistral chat-completions endpoint and return the reply text.

    Every failure mode is reported as a human-readable string instead of an
    exception, so the caller can show the result directly to the user.

    Args:
        prompt: Text sent as the single ``user`` message of the conversation.
        model_name: Value placed in the request's ``model`` field.
        max_tokens: Value placed in the request's ``max_tokens`` field.
        temperature: Value placed in the request's ``temperature`` field.

    Returns:
        On HTTP 200 with a well-formed body, the stripped ``content`` of the
        first choice. Otherwise a message describing the problem: missing
        ``MISTRAL_API_KEY``, a transport failure, a non-200 status, or an
        unexpected response body.
    """
    api_key = os.getenv("MISTRAL_API_KEY")
    if not api_key:
        return "Missing MISTRAL_API_KEY secret/env variable!"

    endpoint = "https://api.mistral.ai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    data = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": temperature,
    }

    # Connection errors and the 30 s timeout raise from requests.post; report
    # them the same way as the other failure modes rather than letting them
    # escape to the caller.
    try:
        response = requests.post(endpoint, headers=headers, json=data, timeout=30)
    except requests.RequestException as exc:
        return f"API Error (request failed): {exc}"

    if response.status_code != 200:
        return f"API Error ({response.status_code}): {response.text}"

    # A 200 response can still carry a body that is not JSON or lacks the
    # expected choices/message/content structure (or has non-string content).
    try:
        content = response.json()["choices"][0]["message"]["content"]
        return content.strip()
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as exc:
        return f"API Error (unexpected response format): {exc}"
|
186 |
+
|
187 |
def get_summarizer(model_choice):
|
188 |
model_id = model_options[model_choice]
|
189 |
if model_id in pipe_cache:
|
|
|
206 |
def summarize_evidence_llm(claim, evidence_list, model_choice):
|
207 |
support = [ev['sentence'] for ev in evidence_list if ev['label'] == 'ENTAILMENT']
|
208 |
contradict = [ev['sentence'] for ev in evidence_list if ev['label'] == 'CONTRADICTION']
|
209 |
+
user_prompt = (
|
210 |
+
f"Claim: {claim}\n"
|
211 |
+
f"Supporting evidence:\n" + ("\n".join(support) if support else "None") + "\n"
|
212 |
+
f"Contradicting evidence:\n" + ("\n".join(contradict) if contradict else "None") + "\n"
|
213 |
+
"Explain to a layperson: Is this claim likely true, false, or uncertain based on the evidence above? Give a brief and simple explanation in 2-3 sentences."
|
214 |
+
)
|
215 |
+
if model_choice in [
|
216 |
+
"Mistral Small (API, fast/free)",
|
217 |
+
"Mistral Medium (API, free tier)",
|
218 |
+
"Mistral Large (API, may require paid)"
|
219 |
+
]:
|
220 |
+
mistral_name = model_options[model_choice]
|
221 |
+
return summarize_with_mistral_api(user_prompt, model_name=mistral_name)
|
222 |
try:
|
223 |
pipe = get_summarizer(model_choice)
|
224 |
outputs = pipe(
|
225 |
+
[
|
226 |
+
{"role": "system", "content": "You are a helpful biomedical assistant. Summarize scientific evidence in plain English for the general public."},
|
227 |
+
{"role": "user", "content": user_prompt}
|
228 |
+
],
|
229 |
max_new_tokens=128,
|
230 |
do_sample=False,
|
231 |
temperature=0.1,
|
|
|
275 |
top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts)
|
276 |
evidence_results = []
|
277 |
for title, abstract in zip(top_titles, top_abstracts):
|
|
|
278 |
ev_sents = extract_evidence_sentences_from_abstract(abstract)
|
|
|
279 |
if ev_sents:
|
280 |
sent_list = [s for lbl, s in ev_sents]
|
281 |
else:
|
|
|
295 |
This app extracts key scientific claims from a news article, finds the most relevant biomedical research papers using robust keyphrase extraction and semantic reranking, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
|
296 |
<b>How to use it:</b><br>
|
297 |
1. Paste the link to a biomedical news article.<br>
|
298 |
+
2. Choose an AI summarizer model below. If you have no special access, use 'TinyLlama' or 'Mistral Small' (works for everyone, free).<br>
|
299 |
3. Wait for the results.<br>
|
300 |
4. For each claim, you will see:<br>
|
301 |
- A plain summary of what research says.<br>
|