Update app.py
app.py CHANGED
@@ -5,15 +5,17 @@ import requests
 import gradio as gr
 import numpy as np
 import nltk
-import nltkmodule
 from newspaper import Article
 from nltk.tokenize import sent_tokenize
-import xml.etree.ElementTree as ET
 from sentence_transformers import SentenceTransformer, util
 import spacy
 import en_core_sci_lg
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import torch
+import nltkmodule
+
+# Download NLTK punkt if not present
+#nltk.download('punkt')

 # --- Models (load once, globally) ---
 scispacy = en_core_sci_lg.load()
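
The hunk above defers `import nltkmodule` to after the standard imports and leaves `nltk.download('punkt')` commented out. If `nltkmodule` does not itself fetch the punkt tokenizer, `sent_tokenize` will raise at runtime; a minimal guard, as a sketch (not part of the commit):

import nltk

# Hypothetical startup guard: fetch punkt once, skip if already installed.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")
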
@@ -27,13 +29,35 @@ NLI_LABELS = ['CONTRADICTION', 'NEUTRAL', 'ENTAILMENT']
 PUBMED_N = 100
 TOP_ABSTRACTS = 10

-# ---
-
-
-
-
-
-
+# --- Sentence section classifier model (BioBert-PubMed200kRCT) ---
+EVIDENCE_MODEL = "pritamdeka/BioBert-PubMed200kRCT"
+evidence_tokenizer = AutoTokenizer.from_pretrained(EVIDENCE_MODEL)
+evidence_model = AutoModelForSequenceClassification.from_pretrained(EVIDENCE_MODEL)
+label_map = {0: "BACKGROUND", 1: "OBJECTIVE", 2: "METHODS", 3: "RESULTS", 4: "CONCLUSIONS"}
+
+def extract_evidence_sentences_from_abstract(abstract, keep_labels=("RESULTS", "CONCLUSIONS")):
+    sents = sent_tokenize(abstract)
+    evidence_sents = []
+    for s in sents:
+        inputs = evidence_tokenizer(s, return_tensors="pt", truncation=True, padding=True)
+        with torch.no_grad():
+            logits = evidence_model(**inputs).logits
+        pred = torch.argmax(logits, dim=1).item()
+        label = label_map[pred]
+        if label in keep_labels:
+            evidence_sents.append((label, s))
+    return evidence_sents
+
+# --- Europe PMC retrieval ---
+def retrieve_europepmc_abstracts_simple(text, n=TOP_ABSTRACTS):
+    query = get_keybert_query(text, top_n=7)
+    print("Trying Europe PMC query:", query)
+    url = f'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query={query}&resulttype=core&format=json&pageSize={n}'
+    r = requests.get(url)
+    results = r.json().get('resultList', {}).get('result', [])
+    titles = [res.get('title', '') for res in results]
+    abstracts = [res.get('abstractText', '') for res in results]
+    return titles, abstracts

 # --- Utility: get robust keybert-style query ---
 def get_keybert_query(text, top_n=10):
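
One caveat with the new `retrieve_europepmc_abstracts_simple`: the keybert query (quoted phrases joined by " OR ") is interpolated into the URL unencoded. A sketch of the same request letting `requests` do the encoding via `params` (the function name here is illustrative, not from the commit):

import requests

def search_europepmc(query, n=10):
    # Same Europe PMC REST endpoint; `params` URL-encodes quotes and spaces.
    r = requests.get(
        "https://www.ebi.ac.uk/europepmc/webservices/rest/search",
        params={"query": query, "resulttype": "core",
                "format": "json", "pageSize": n},
        timeout=30,
    )
    r.raise_for_status()
    return r.json().get("resultList", {}).get("result", [])
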
@@ -52,48 +76,6 @@ def get_keybert_query(text, top_n=10):
     query = " OR ".join(f'"{kw}"' for kw in keywords)
     return query

-# --- PubMed retrieval ---
-
-
-def retrieve_pubmed_abstracts_simple(text, n=100, fallback_headline=None):
-    query = get_keybert_query(text, top_n=7)
-    if not query or query.strip() == '""':
-        query = fallback_headline
-    print("Trying PubMed query:", query)
-    ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
-    tried_queries = [q for q in [query, fallback_headline, text] if q]
-
-    for q in tried_queries:
-        # Always request XML, never parse as JSON or HTML
-        search_url = f"{ncbi_url}esearch.fcgi?db=pubmed&term={q}&retmax={n}&sort=relevance&retmode=xml"
-        r = requests.get(search_url)
-        try:
-            root = ET.fromstring(r.text)
-            pmids = [el.text for el in root.findall('.//Id')]
-        except Exception as e:
-            print(f"Failed to parse PMIDs for query '{q}': {e}")
-            pmids = []
-        print(f"Query: {q} => {len(pmids)} PMIDs")
-        if pmids:
-            ids = ','.join(pmids)
-            fetch_url = f"{ncbi_url}efetch.fcgi?db=pubmed&id={ids}&rettype=abstract&retmode=xml&retmax={n}&sort=relevance"
-            resp = requests.get(fetch_url)
-            try:
-                root2 = ET.fromstring(resp.text)
-                titles = [a.text for a in root2.findall('.//ArticleTitle')]
-                abstracts = [b.text for b in root2.findall('.//AbstractText')]
-            except Exception as e:
-                print(f"Failed to parse titles/abstracts for query '{q}': {e}")
-                titles, abstracts = [], []
-            # Sanitize output
-            if not abstracts:
-                abstracts = [""] * len(titles)
-            titles = [re.sub(r"\s+", " ", t).strip() if t else "" for t in titles]
-            abstracts = [re.sub(r"\s+", " ", a).strip() if a else "" for a in abstracts]
-            return titles, abstracts
-    return [], []
-
-
 # --- Claim extraction ---
 indicator_phrases = [
     "found that", "findings suggest", "shows that", "showed that", "demonstrated", "demonstrates",
@@ -147,11 +129,10 @@ def semantic_rerank_claim_abstracts(claim, titles, abstracts, top_k=TOP_ABSTRACTS):
     idxs = np.argsort(-sims.cpu().numpy())[:top_k]
     return [titles[i] for i in idxs], [abstracts[i] for i in idxs]

-# --- NLI evidence extraction ---
-def extract_evidence_nli(claim, title, abstract):
-    sentences = sent_tokenize(abstract)
+# --- NLI evidence extraction (run only on results/conclusion sentences) ---
+def extract_evidence_nli(claim, evidence_sentences):
     evidence = []
-    for sent in sentences:
+    for sent in evidence_sentences:
         encoding = nli_tokenizer(
             sent, claim,
             return_tensors='pt',
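
The unchanged loop body below this hunk scores each candidate sentence against the claim with the NLI cross-encoder. For reference, a compact sketch of that scoring step, assuming the file's `nli_tokenizer`, `nli_model`, and `NLI_LABELS` globals (the model variable name is an assumption):

import torch

def nli_label(sent, claim):
    # Premise = abstract sentence, hypothesis = claim.
    enc = nli_tokenizer(sent, claim, return_tensors="pt",
                        truncation=True, padding=True)
    with torch.no_grad():
        logits = nli_model(**enc).logits
    probs = torch.softmax(logits, dim=1).squeeze(0)
    idx = int(torch.argmax(probs))
    return NLI_LABELS[idx], float(probs[idx])
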
@@ -172,7 +153,14 @@ def extract_evidence_nli(claim, title, abstract):
         })
     return evidence

-# --- Summarizer model
+# --- Summarizer model options ---
+model_options = {
+    "Llama-3.2-1B-Instruct (Meta, gated)": "meta-llama/Llama-3.2-1B-Instruct",
+    "Gemma-3-1B-it (Google, gated)": "google/gemma-3-1b-it",
+    "TinyLlama-1.1B-Chat (Open)": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+}
+pipe_cache = {}
+
 def get_summarizer(model_choice):
     model_id = model_options[model_choice]
     if model_id in pipe_cache:
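
The new `model_options` dict maps display names to Hub ids, and `pipe_cache` memoizes loaded pipelines. The body of `get_summarizer` is outside this hunk; a plausible sketch of the caching pattern it implies (the `text-generation` task is an assumption, since all three models are chat LLMs):

from transformers import pipeline

def get_summarizer(model_choice):
    model_id = model_options[model_choice]
    if model_id not in pipe_cache:
        # Load once per Hub id; later calls reuse the cached pipeline.
        pipe_cache[model_id] = pipeline("text-generation", model=model_id)
    return pipe_cache[model_id]
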
@@ -249,20 +237,23 @@ def factcheck_app(article_url, summarizer_choice):
     results_html = ""
     all_results = []
     for claim in matched_claims:
-        titles, abstracts =
+        titles, abstracts = retrieve_europepmc_abstracts_simple(claim)
         if not titles:
-            results_html += f"<hr><b>Claim:</b> {claim}<br><i>No
-            all_results.append({"claim": claim, "summary": "No
+            results_html += f"<hr><b>Claim:</b> {claim}<br><i>No relevant abstracts found in Europe PMC.</i><br>"
+            all_results.append({"claim": claim, "summary": "No abstracts found.", "evidence": []})
             continue
         top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts)
-        idx_non_top = random.choice([i for i in range(len(titles)) if i not in [titles.index(t) for t in top_titles]]) if len(titles) > len(top_titles) else None
         evidence_results = []
         for title, abstract in zip(top_titles, top_abstracts):
-            evidence = extract_evidence_nli(claim, title, abstract)
+            # Extract evidence (results/conclusions) sentences from abstract
+            ev_sents = extract_evidence_sentences_from_abstract(abstract)
+            # If none found, fallback to all sentences
+            if ev_sents:
+                sent_list = [s for lbl, s in ev_sents]
+            else:
+                sent_list = sent_tokenize(abstract)
+            evidence = extract_evidence_nli(claim, sent_list)
             evidence_results.append({"title": title, "evidence": evidence})
-        if idx_non_top is not None:
-            control_ev = extract_evidence_nli(claim, titles[idx_non_top], abstracts[idx_non_top])
-            evidence_results.append({"title": f"(Control) {titles[idx_non_top]}", "evidence": control_ev})
         all_evidence_sentences = [ev for abs_res in evidence_results for ev in abs_res["evidence"]]
         summary = summarize_evidence_llm(claim, all_evidence_sentences, summarizer_choice)
         results_html += f"<hr><b>Claim:</b> {claim}<br><b>Layman summary:</b> {summary}<br>"
@@ -273,7 +264,7 @@ def factcheck_app(article_url, summarizer_choice):

 description = """
 <b>What does this app do?</b><br>
-This app extracts key scientific claims from a news article, finds the most relevant
+This app extracts key scientific claims from a news article, finds the most relevant biomedical research papers using robust keyphrase extraction and semantic reranking, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
 <b>How to use it:</b><br>
 1. Paste the link to a biomedical news article.<br>
 2. Choose an AI summarizer model below. If you have no special access, use 'TinyLlama' (works for everyone).<br>
@@ -281,7 +272,7 @@ This app extracts key scientific claims from a news article, finds the most rele
 4. For each claim, you will see:<br>
 - A plain summary of what research says.<br>
 - Color-coded evidence sentences (green=support, red=contradict, gray=neutral).<br>
-- The titles of the most relevant
+- The titles of the most relevant research articles.<br><br>
 <b>Everything is 100% open source and runs on this website—no personal info or cloud API needed.</b>
 """
