pritamdeka committed on
Commit
3358886
·
verified ·
1 Parent(s): a34738f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +242 -0
app.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import random
4
+ import gradio as gr
5
+ import requests
6
+ import numpy as np
7
+
8
+ from nltk.tokenize import sent_tokenize
9
+ from newspaper import Article
10
+
11
+ from sentence_transformers import SentenceTransformer, util
12
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
13
+ import torch
14
+
15
# --- Download GGUF model from Hugging Face Hub at startup (if not present) ---
from huggingface_hub import hf_hub_download

# The file name has to exist inside GGUF_REPO. The previous value
# ("gemma-3b-it-Q4_K_M.gguf") did not share the repo's "gemma-3n-E4B-it" stem,
# so the download could not resolve it.
# NOTE(review): confirm the exact quantization file name against the repo's
# file listing before deploying.
GGUF_FILENAME = "gemma-3n-E4B-it-Q4_K_M.gguf"
GGUF_REPO = "unsloth/gemma-3n-E4B-it-GGUF"
print("Checking for GGUF model...")
# hf_hub_download returns the local path of the requested file; files are
# stored under cache_dir (here: the working directory).
gguf_path = hf_hub_download(
    repo_id=GGUF_REPO,
    filename=GGUF_FILENAME,
    cache_dir="./",
)
print(f"GGUF model path: {gguf_path}")

# Load Llama GGUF model via llama-cpp-python
from llama_cpp import Llama
llm = Llama(
    model_path=gguf_path,
    n_ctx=2048,   # prompt + completion must fit in this many tokens
    n_threads=4,  # or set to number of CPU cores
)
35
+
36
# --------- App settings ---------
# PubMed retrieval sizes.
PUBMED_N = 100      # Number of abstracts to retrieve initially
TOP_ABSTRACTS = 10  # Number of top semantic abstracts to keep per claim

# Hugging Face model identifiers.
SBERT_MODEL_NAME = "pritamdeka/S-BioBert-snli-multinli-stsb"
NLI_MODEL_NAME = "pritamdeka/PubMedBERT-MNLI-MedNLI"

# Position i is the label reported when logit i of the NLI model is largest.
NLI_LABELS = ["CONTRADICTION", "NEUTRAL", "ENTAILMENT"]
42
+
43
+ # --------- Indicator Phrases for Claim Extraction ---------
44
# --------- Indicator Phrases for Claim Extraction ---------
# A sentence containing any of these phrases is treated as a candidate claim.
# Order is kept stable; entries are grouped loosely by theme for readability.
indicator_phrases = [
    # Reporting verbs and study findings
    "found that", "findings suggest", "shows that", "showed that",
    "demonstrated", "demonstrates",
    "revealed", "reveals", "suggests", "suggested", "indicated", "indicates",
    "reported", "reports",
    "was reported", "concluded", "concludes", "conclusion", "authors state",
    "stated", "data suggest",
    "observed", "observes", "study suggests", "study shows", "study found",
    "researchers found",
    "results indicate", "results show", "confirmed", "confirm", "confirming",
    "point to",
    "documented", "document", "evidence of", "evidence suggests",
    # Associations
    "associated with", "correlated with", "link between", "linked to",
    "relationship between",
    "was linked", "connected to", "relationship with", "tied to",
    "association with",
    # Direction of change and risk
    "increase", "increases", "increased", "decrease", "decreases", "decreased",
    "greater risk", "lower risk", "higher risk", "reduced risk",
    "raises the risk", "reduces the risk",
    "risk of", "risk for", "likelihood of", "probability of", "chance of",
    "rate of", "incidence of",
    "prevalence of", "mortality", "survival rate", "death rate", "odds of",
    "number of", "percentage of", "percent of",
    # Causation
    "caused by", "causes", "cause", "resulted in", "results in", "leads to",
    "led to", "contributed to", "responsible for",
    "due to", "as a result", "because of",
    # Study design
    "randomized controlled trial", "RCT", "clinical trial", "participants",
    "enrolled", "sample size", "statistically significant",
    # Comparisons
    "compared to", "compared with", "versus", "compared against",
    "more than", "less than", "greater than", "lower than", "higher than",
    "significantly higher", "significantly lower",
    "significantly increased", "significantly decreased",
    "significant difference",
    # Effects and predictors
    "effect of", "impact of", "influence of", "predictor of", "predicts",
    "predictive of", "factor for", "determinant of",
    "plays a role in", "contributes to", "related to", "affects", "influences",
    "difference between",
    # Attribution
    "according to", "a recent study", "researchers from",
]
67
+
68
# --------- Load models (global, once) ---------
# Sentence encoder shared by headline matching and abstract re-ranking.
sbert_model = SentenceTransformer(SBERT_MODEL_NAME)

# Classifier and matching tokenizer used for sentence-level NLI.
nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)
nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)
72
+
73
def extract_claims_pattern(article_text):
    """Return the candidate claim sentences of an article, in order, without duplicates.

    A sentence is kept when it contains one of the module-level
    ``indicator_phrases`` or a standalone number (optionally with a decimal
    part and/or a trailing ``%``).

    Args:
        article_text: Plain text of the article body. ``None`` or an empty
            string yields an empty list.

    Returns:
        list[str]: Matching sentences in their original order, each at most once.
    """
    if not article_text:
        return []

    def has_indicator(sentence):
        lowered = sentence.lower()
        for phrase in indicator_phrases:
            if phrase.isupper():
                # Acronyms such as "RCT" can never occur in the lower-cased
                # sentence, and lower-casing the acronym instead would also
                # hit unrelated words ("infarction"). Match them
                # case-sensitively as whole words, allowing a plural "s".
                if re.search(rf"\b{re.escape(phrase)}s?\b", sentence):
                    return True
            elif phrase in lowered:
                return True
        return False

    claims = [
        s for s in sent_tokenize(article_text)
        if has_indicator(s) or re.search(r"\b\d+(\.\d+)?%?\b", s)
    ]
    return list(dict.fromkeys(claims))  # deduplicate, preserve order
81
+
82
def match_claims_to_headline(claims, headline, threshold=0.6):
    """Select the claims that are semantically close to the article headline.

    Claims and headline are embedded with the module-level ``sbert_model`` and
    compared by cosine similarity.

    Args:
        claims: Candidate claim sentences.
        headline: Article headline; ``None`` is treated as an empty string.
        threshold: Minimum cosine similarity for a claim to count as a match.

    Returns:
        list[str]: Claims with similarity >= ``threshold`` in their original
        order. When none reaches the threshold, the (up to) three most similar
        claims, best first. Empty when ``claims`` is empty.
    """
    if not claims:
        # Nothing to embed: an empty batch has no vectors to compare with the
        # headline, so answer directly instead of calling the encoder.
        return []

    headline_emb = sbert_model.encode([headline or ""])
    claim_embs = sbert_model.encode(claims)
    sims = util.pytorch_cos_sim(headline_emb, claim_embs)[0].cpu().numpy()

    matched_claims = [claim for claim, sim in zip(claims, sims) if sim >= threshold]
    if matched_claims:
        return matched_claims

    # fallback: top 3 by similarity
    best = np.argsort(-sims)[:3]
    return [claims[i] for i in best]
92
+
93
def retrieve_pubmed_abstracts(claim, n=PUBMED_N):
    """Search PubMed for a claim and return aligned article titles and abstracts.

    The words of ``claim`` are sent to the NCBI E-utilities ``esearch``
    endpoint; the resulting PMIDs are then fetched as XML through ``efetch``.

    Args:
        claim: Claim sentence used as the search query.
        n: Maximum number of records to request.

    Returns:
        tuple[list[str], list[str]]: ``(titles, abstracts)`` of equal length,
        where ``abstracts[i]`` belongs to ``titles[i]`` (empty string when the
        article has no abstract). Both lists are empty when the claim has no
        searchable words, the search returns nothing, a request is answered
        with an HTTP error status, or the response is not parseable XML.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    import xml.etree.ElementTree as ET

    ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    words = re.findall(r'\w+', claim or "")
    if not words:
        return [], []

    # Passing params lets requests percent-encode the query (non-ASCII words
    # included); the space-joined terms are equivalent to the "+"-joined URL.
    search = requests.get(
        f"{ncbi_url}esearch.fcgi",
        params={"db": "pubmed", "term": " ".join(words), "retmax": n, "sort": "relevance"},
        timeout=30,
    )
    if not search.ok:
        return [], []
    pmids = re.findall(r"<Id>(\d+)</Id>", search.text)
    if not pmids:
        return [], []

    fetch = requests.get(
        f"{ncbi_url}efetch.fcgi",
        params={"db": "pubmed", "id": ','.join(pmids), "rettype": "xml", "retmax": n},
        timeout=30,
    )
    if not fetch.ok:
        return [], []
    try:
        # Parse the raw bytes so the XML encoding declaration is honoured.
        root = ET.fromstring(fetch.content)
    except ET.ParseError:
        return [], []

    titles, abstracts = [], []
    # Walk article by article: an article may carry several AbstractText
    # sections (structured abstract) or none at all, so collecting titles and
    # abstract sections independently would pair them with the wrong article.
    for article in root.iter("PubmedArticle"):
        title_el = article.find(".//ArticleTitle")
        # itertext() also picks up text nested in inline markup (<i>, <sub>, ...).
        title = "".join(title_el.itertext()) if title_el is not None else ""
        sections = [
            "".join(section.itertext())
            for section in article.iterfind(".//Abstract/AbstractText")
        ]
        titles.append(re.sub(r"\s+", " ", title).strip())
        abstracts.append(re.sub(r"\s+", " ", " ".join(sections)).strip())
    return titles, abstracts
111
+
112
def semantic_rerank_claim_abstracts(claim, titles, abstracts, top_k=TOP_ABSTRACTS):
    """Keep the ``top_k`` documents most similar to the claim.

    Each document is embedded as ``"<title>. <abstract>"`` with the
    module-level ``sbert_model`` and ranked by cosine similarity to the claim.

    Args:
        claim: Claim sentence to compare against.
        titles: Document titles.
        abstracts: Document abstracts, paired with ``titles`` by position.
        top_k: Number of documents to keep.

    Returns:
        tuple[list[str], list[str]]: Titles and abstracts of the selected
        documents, most similar first.
    """
    documents = []
    for doc_title, doc_abstract in zip(titles, abstracts):
        documents.append(f"{doc_title}. {doc_abstract}")

    document_embeddings = sbert_model.encode(documents)
    claim_embedding = sbert_model.encode([claim])
    scores = util.pytorch_cos_sim(claim_embedding, document_embeddings)[0]

    # Negate so that argsort's ascending order puts the best score first.
    ranked = np.argsort(-scores.cpu().numpy())[:top_k]
    best_titles = [titles[pos] for pos in ranked]
    best_abstracts = [abstracts[pos] for pos in ranked]
    return best_titles, best_abstracts
119
+
120
def extract_evidence_nli(claim, title, abstract):
    """Classify every sentence of an abstract against a claim with the NLI model.

    Each abstract sentence is paired with the claim (sentence first, claim
    second) and labelled with the entry of ``NLI_LABELS`` whose probability is
    highest.

    Args:
        claim: Claim sentence used as the second element of each pair.
        title: Title of the abstract; accepted but not used by this function.
        abstract: Abstract text, split into sentences with ``sent_tokenize``.

    Returns:
        list[dict]: One ``{"sentence", "label", "score"}`` dict per sentence,
        in sentence order, where ``score`` is the probability of ``label``.
    """
    def classify(sentence):
        inputs = nli_tokenizer(
            sentence, claim,
            return_tensors='pt',
            truncation=True,
            max_length=256,
            padding=True
        )
        # Inference only, so no gradients need to be tracked.
        with torch.no_grad():
            logits = nli_model(**inputs).logits
        probabilities = torch.softmax(logits, dim=1).cpu().numpy().flatten()
        best = probabilities.argmax()
        return {
            "sentence": sentence,
            "label": NLI_LABELS[best],
            "score": float(probabilities[best]),
        }

    return [classify(sentence) for sentence in sent_tokenize(abstract)]
143
+
144
def summarize_evidence_llm(claim, evidence_list, max_sentences=5):
    """Ask the local LLM for a short lay summary of the evidence for a claim.

    Args:
        claim: Claim sentence being checked.
        evidence_list: Dicts with ``"sentence"``, ``"label"`` and ``"score"``
            keys, as produced by ``extract_evidence_nli``.
        max_sentences: Maximum number of supporting and of contradicting
            sentences placed in the prompt (highest score first). Without a
            cap, a claim with many evidence sentences produces a prompt larger
            than the model's context window and no summary is generated.

    Returns:
        str: The generated summary, or a message starting with
        ``"Summary could not be generated:"`` when the model call fails.
    """
    def strongest(label):
        # sorted() is stable, so equally scored sentences keep their order.
        ranked = sorted(
            (ev for ev in evidence_list if ev['label'] == label),
            key=lambda ev: ev.get('score', 0.0),
            reverse=True,
        )
        return [ev['sentence'] for ev in ranked[:max_sentences]]

    support = strongest('ENTAILMENT')
    contradict = strongest('CONTRADICTION')
    prompt = (
        f"Claim: {claim}\n"
        f"Supporting evidence:\n" + ("\n".join(support) if support else "None") + "\n"
        f"Contradicting evidence:\n" + ("\n".join(contradict) if contradict else "None") + "\n"
        "Explain to a layperson: Is this claim likely true, false, or uncertain based on the evidence above? "
        "Give a brief and simple explanation in 2-3 sentences."
    )
    # Top-level boundary for the model call: any failure is reported to the
    # user as text instead of aborting the whole fact-check.
    try:
        output = llm(
            prompt,
            max_tokens=128,
            stop=["\n\n"],
            temperature=0.4,
            echo=False
        )
        return output['choices'][0]['text'].strip()
    except Exception as e:
        return f"Summary could not be generated: {e}"
166
+
167
def format_evidence_html(evidence_list):
    """Render evidence sentences as colour-coded HTML blocks.

    Args:
        evidence_list: Dicts with ``"label"``, ``"score"`` and ``"sentence"``
            keys.

    Returns:
        str: One ``<div>`` per evidence item (green for ENTAILMENT, red for
        CONTRADICTION, gray for NEUTRAL and for any unrecognised label);
        an empty string for an empty list.
    """
    # Imported locally so the escaping helper is available without touching
    # the file-level imports.
    from html import escape

    color_map = {"ENTAILMENT": "#e6ffe6", "CONTRADICTION": "#ffe6e6", "NEUTRAL": "#f8f8f8"}
    blocks = []
    for ev in evidence_list:
        label = str(ev["label"])
        color = color_map.get(label, color_map["NEUTRAL"])
        # Sentences come from external documents and may contain markup
        # characters; escape them so they are shown as text, not interpreted.
        blocks.append(
            f'<div style="background:{color};padding:6px;border-radius:6px;margin-bottom:3px">'
            f'<b>{escape(label)}</b> (confidence {ev["score"]:.2f}): {escape(str(ev["sentence"]))}'
            '</div>'
        )
    return "".join(blocks)
178
+
179
def factcheck_app(article_url):
    """Fact-check the claims of a news article against PubMed abstracts.

    Pipeline: download and parse the article, extract candidate claims, keep
    those matching the headline, then for each claim retrieve PubMed
    abstracts, re-rank them, run sentence-level NLI and summarise the result.
    One randomly chosen abstract outside the top-ranked set is added per claim
    and labelled "(Control)".

    Args:
        article_url: URL of the article to check.

    Returns:
        tuple[str, list | None]: An HTML report and the structured results
        (one dict per claim with ``"claim"``, ``"summary"`` and ``"evidence"``
        keys). The second element is ``None`` when the URL is blank, the
        article cannot be read, or no claim matches the headline.
    """
    # Imported locally so the escaping helper is available without touching
    # the file-level imports.
    from html import escape

    article_url = (article_url or "").strip()
    if not article_url:
        return "<b>Please provide an article URL.</b>", None

    try:
        art = Article(article_url)
        art.download()
        art.parse()
        text = art.text
        headline = art.title
    except Exception as e:
        return f"<b>Error downloading or reading article:</b> {escape(str(e))}", None

    claims = extract_claims_pattern(text)
    # Headline matching embeds the claims; skip it when there is nothing to embed.
    matched_claims = match_claims_to_headline(claims, headline) if claims else []
    if not matched_claims:
        return "<b>No check-worthy claims found that match the headline.</b>", None

    sections = []
    all_results = []
    for claim in matched_claims:
        # Article text, abstract titles and model output are external text:
        # escape them for the HTML report, keep them raw in the JSON result.
        claim_html = escape(claim)
        try:
            titles, abstracts = retrieve_pubmed_abstracts(claim)
        except requests.RequestException as e:
            # A network failure for one claim should not discard the others.
            sections.append(
                f"<hr><b>Claim:</b> {claim_html}<br><i>PubMed lookup failed: {escape(str(e))}</i><br>"
            )
            all_results.append({"claim": claim, "summary": "PubMed lookup failed.", "evidence": []})
            continue
        if not titles:
            sections.append(f"<hr><b>Claim:</b> {claim_html}<br><i>No PubMed results found.</i><br>")
            all_results.append({"claim": claim, "summary": "No PubMed results found.", "evidence": []})
            continue

        top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts)

        # Pick the control among documents that were not selected. Comparing
        # (title, abstract) pairs avoids title-only lookups (ambiguous for
        # duplicate titles) and never indexes past the shorter of the two lists.
        selected = set(zip(top_titles, top_abstracts))
        leftovers = [doc for doc in zip(titles, abstracts) if doc not in selected]
        control = random.choice(leftovers) if leftovers else None

        evidence_results = [
            {"title": title, "evidence": extract_evidence_nli(claim, title, abstract)}
            for title, abstract in zip(top_titles, top_abstracts)
        ]
        if control is not None:
            control_title, control_abstract = control
            control_ev = extract_evidence_nli(claim, control_title, control_abstract)
            evidence_results.append({"title": f"(Control) {control_title}", "evidence": control_ev})

        all_evidence_sentences = [ev for abs_res in evidence_results for ev in abs_res["evidence"]]
        summary = summarize_evidence_llm(claim, all_evidence_sentences)

        sections.append(
            f"<hr><b>Claim:</b> {claim_html}<br><b>Layman summary:</b> {escape(summary)}<br>"
        )
        for abs_res in evidence_results:
            sections.append(
                f"<br><b>Abstract:</b> {escape(abs_res['title'])}<br>"
                f"{format_evidence_html(abs_res['evidence'])}"
            )
        all_results.append({"claim": claim, "summary": summary, "evidence": evidence_results})

    return "".join(sections), all_results
218
+
219
# Introductory HTML shown above the input box.
description = """
<b>What does this app do?</b><br>
This app extracts key scientific claims from a news article, finds the most relevant PubMed biomedical research papers, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
<b>How to use it:</b><br>
1. Paste the link to a biomedical news article.<br>
2. Wait for the results.<br>
3. For each claim, you will see:<br>
- A plain summary of what research says.<br>
- Color-coded evidence sentences (green=support, red=contradict, gray=neutral).<br>
- Links to original PubMed research.<br><br>
<b>Everything is 100% open source and runs on this website—no personal info or cloud API needed.</b>
"""

# Input: a single URL. Outputs: the HTML report and the raw results as JSON,
# in the same order as the tuple returned by factcheck_app.
_url_input = gr.Textbox(lines=2, label="Paste a news article URL")
_result_outputs = [
    gr.HTML(label="Fact-Check Results (Summary & Evidence)"),
    gr.JSON(label="All Results (JSON)"),
]
_example_inputs = [
    ["https://www.medicalnewstoday.com/articles/omicron-what-do-we-know-about-the-stealth-variant"],
]

iface = gr.Interface(
    fn=factcheck_app,
    inputs=_url_input,
    outputs=_result_outputs,
    title="BioMedical News Fact-Checking & Research Evidence Finder",
    description=description,
    examples=_example_inputs,
    allow_flagging="never",
)

# Listen on all interfaces without a public share link; show_error surfaces
# exceptions in the UI.
iface.launch(share=False, server_name='0.0.0.0', show_error=True)