Update app.py
app.py CHANGED
@@ -16,9 +16,16 @@ import torch
 PUBMED_N = 100  # Number of abstracts to retrieve initially
 TOP_ABSTRACTS = 10  # Number of top semantic abstracts to keep per claim
 NLI_MODEL_NAME = "pritamdeka/PubMedBERT-MNLI-MedNLI"
-SBERT_MODEL_NAME = "pritamdeka/S-
+SBERT_MODEL_NAME = "pritamdeka/S-PubMedBert-MS-MARCO"
 NLI_LABELS = ['CONTRADICTION', 'NEUTRAL', 'ENTAILMENT']
 
+# --------- Summarizer model options ---------
+model_options = {
+    "Llama-3.2-1B-Instruct (Meta, gated)": "meta-llama/Llama-3.2-1B-Instruct",
+    "Gemma-3-1B-it (Google, gated)": "google/gemma-3-1b-it",
+    "TinyLlama-1.1B-Chat (Open)": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+}
+
 # --------- Indicator Phrases for Claim Extraction ---------
 indicator_phrases = [
     "found that", "findings suggest", "shows that", "showed that", "demonstrated", "demonstrates",
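
The new model_options mapping drives the UI dropdown: display labels map to Hugging Face model ids, and the "gated" labels mark checkpoints that require accepting a license and presenting an access token. A minimal sketch of how a label resolves to an id and to the token requirement; the resolve_model helper is hypothetical, not part of app.py, and its substring test mirrors the one get_summarizer uses below:

model_options = {
    "Llama-3.2-1B-Instruct (Meta, gated)": "meta-llama/Llama-3.2-1B-Instruct",
    "Gemma-3-1B-it (Google, gated)": "google/gemma-3-1b-it",
    "TinyLlama-1.1B-Chat (Open)": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
}

def resolve_model(label):
    # Hypothetical helper: map a dropdown label to its model id and whether
    # it is expected to need an HF token (same test as get_summarizer below).
    model_id = model_options[label]
    needs_token = any(g in model_id for g in ["meta-llama", "gemma"])
    return model_id, needs_token

print(resolve_model("TinyLlama-1.1B-Chat (Open)"))
# -> ('TinyLlama/TinyLlama-1.1B-Chat-v1.0', False)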
@@ -48,16 +55,7 @@ indicator_phrases = [
 nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)
 nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)
 sbert_model = SentenceTransformer(SBERT_MODEL_NAME)
-
-# --- Load fast Llama-3.2-1B-Instruct summarizer pipeline ---
-model_id = "meta-llama/Llama-3.2-1B-Instruct"
-pipe = pipeline(
-    "text-generation",
-    model=model_id,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto",
-    max_new_tokens=128,
-)
+pipe_cache = {}  # cache summarization pipelines
 
 def extract_claims_pattern(article_text):
     sentences = sent_tokenize(article_text)
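
This hunk replaces the eager, import-time Llama pipeline with an empty pipe_cache that is filled on demand by get_summarizer (next hunk), presumably so the Space can start without loading any gated weights. A minimal sketch of the load-once-then-reuse pattern, with a stand-in loader instead of a real transformers pipeline:

pipe_cache = {}

def get_or_load(model_id, loader):
    # Build the expensive object on first request only, then reuse it.
    if model_id not in pipe_cache:
        pipe_cache[model_id] = loader(model_id)
    return pipe_cache[model_id]

first = get_or_load("demo-model", lambda mid: object())
again = get_or_load("demo-model", lambda mid: object())
assert first is again  # the second call hits the cache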
@@ -130,11 +128,29 @@ def extract_evidence_nli(claim, title, abstract):
         })
     return evidence
 
-def summarize_evidence_llm(claim, evidence_list):
+def get_summarizer(model_choice):
+    model_id = model_options[model_choice]
+    if model_id in pipe_cache:
+        return pipe_cache[model_id]
+    kwargs = {
+        "model": model_id,
+        "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
+        "device_map": "auto",
+        "max_new_tokens": 128
+    }
+    # Add token for gated models (Gemma, Llama)
+    if any(gated in model_id for gated in ["meta-llama", "gemma"]):
+        hf_token = os.environ.get("HF_TOKEN", None)
+        if hf_token:
+            kwargs["token"] = hf_token
+        else:
+            raise RuntimeError(f"Model '{model_choice}' requires a Hugging Face access token. Please set 'HF_TOKEN' as a Space secret or environment variable.")
+    pipe_cache[model_id] = pipeline("text-generation", **kwargs)
+    return pipe_cache[model_id]
+
+def summarize_evidence_llm(claim, evidence_list, model_choice):
     support = [ev['sentence'] for ev in evidence_list if ev['label'] == 'ENTAILMENT']
     contradict = [ev['sentence'] for ev in evidence_list if ev['label'] == 'CONTRADICTION']
-
-    # Compose prompt for summarization.
     messages = [
         {"role": "system", "content": "You are a helpful biomedical assistant. Summarize scientific evidence in plain English for the general public."},
         {"role": "user", "content":
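
Note that get_summarizer reads os.environ, so app.py must import os somewhere above these hunks; the import section is not shown in this diff. The gating branch reduces to the following sketch (a hypothetical standalone helper with the same lookup-or-raise shape):

import os

def hf_token_or_error(model_choice):
    # Gated checkpoints (Meta Llama, Google Gemma) need an access token;
    # on a Space it is usually injected as the HF_TOKEN secret.
    token = os.environ.get("HF_TOKEN", None)
    if token is None:
        raise RuntimeError(
            f"Model '{model_choice}' requires a Hugging Face access token."
        )
    return token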
@@ -145,6 +161,7 @@ def summarize_evidence_llm(claim, evidence_list):
         }
     ]
     try:
+        pipe = get_summarizer(model_choice)
         outputs = pipe(
             messages,
             max_new_tokens=96,
@@ -152,7 +169,6 @@ def summarize_evidence_llm(claim, evidence_list):
             temperature=0.1,
         )
         out = outputs[0]["generated_text"]
-        # If the model returns all messages, just take the last message (often the answer).
         if isinstance(out, list) and "content" in out[-1]:
             return out[-1]["content"].strip()
         return out.strip()
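
The isinstance(out, list) branch exists because, when a text-generation pipeline is given chat messages, transformers versions with chat-template support return the whole conversation with the assistant's reply appended as the last message. A hedged sketch of that call shape using the app's open TinyLlama option:

from transformers import pipeline

pipe = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
messages = [
    {"role": "system", "content": "You are a helpful biomedical assistant."},
    {"role": "user", "content": "Summarize: the trial found aspirin reduced stroke risk."},
]
outputs = pipe(messages, max_new_tokens=64)
out = outputs[0]["generated_text"]  # list of {"role": ..., "content": ...} dicts
print(out[-1]["content"])           # the assistant's generated summary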
@@ -171,7 +187,7 @@ def format_evidence_html(evidence_list):
     )
     return html
 
-def factcheck_app(article_url):
+def factcheck_app(article_url, model_choice):
     try:
         art = Article(article_url)
         art.download()
@@ -204,7 +220,7 @@ def factcheck_app(article_url):
         control_ev = extract_evidence_nli(claim, titles[idx_non_top], abstracts[idx_non_top])
         evidence_results.append({"title": f"(Control) {titles[idx_non_top]}", "evidence": control_ev})
         all_evidence_sentences = [ev for abs_res in evidence_results for ev in abs_res["evidence"]]
-        summary = summarize_evidence_llm(claim, all_evidence_sentences)
+        summary = summarize_evidence_llm(claim, all_evidence_sentences, model_choice)
         results_html += f"<hr><b>Claim:</b> {claim}<br><b>Layman summary:</b> {summary}<br>"
         for abs_res in evidence_results:
             results_html += f"<br><b>Abstract:</b> {abs_res['title']}<br>{format_evidence_html(abs_res['evidence'])}"
@@ -216,8 +232,9 @@ description = """
 This app extracts key scientific claims from a news article, finds the most relevant PubMed biomedical research papers, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
 <b>How to use it:</b><br>
 1. Paste the link to a biomedical news article.<br>
-2. Wait for the results.<br>
-3. For each claim, you will see:<br>
+2. Choose an AI summarizer model below. If you have no special access, use 'TinyLlama' (works for everyone).<br>
+3. Wait for the results.<br>
+4. For each claim, you will see:<br>
 - A plain summary of what research says.<br>
 - Color-coded evidence sentences (green=support, red=contradict, gray=neutral).<br>
 - The titles of the most relevant PubMed articles.<br><br>
@@ -226,11 +243,18 @@ This app extracts key scientific claims from a news article, finds the most rele
 
 iface = gr.Interface(
     fn=factcheck_app,
-    inputs=gr.Textbox(lines=2, label="Paste a news article URL"),
+    inputs=[
+        gr.Textbox(lines=2, label="Paste a news article URL"),
+        gr.Dropdown(
+            choices=list(model_options.keys()),
+            value="TinyLlama-1.1B-Chat (Open)",
+            label="Choose summarizer model"
+        )
+    ],
     outputs=[gr.HTML(label="Fact-Check Results (Summary & Evidence)"), gr.JSON(label="All Results (JSON)")],
     title="BioMedical News Fact-Checking & Research Evidence Finder",
     description=description,
-    examples=[["https://www.medicalnewstoday.com/articles/omicron-what-do-we-know-about-the-stealth-variant"]],
+    examples=[["https://www.medicalnewstoday.com/articles/omicron-what-do-we-know-about-the-stealth-variant", "TinyLlama-1.1B-Chat (Open)"]],
     allow_flagging="never"
 )
 
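
With two input components, Gradio passes their values to fn positionally, which is why factcheck_app gained a second parameter and why each examples row now carries two values, one per input. A self-contained sketch of that wiring, with a generic demo function rather than the app's:

import gradio as gr

def demo_fn(url, model_choice):
    return f"url={url}, model={model_choice}"

gr.Interface(
    fn=demo_fn,  # called as demo_fn(textbox_value, dropdown_value)
    inputs=[gr.Textbox(label="URL"), gr.Dropdown(choices=["a", "b"], value="a")],
    outputs=gr.Textbox(),
    examples=[["https://example.com", "a"]],  # one value per input component
).launch()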