pritamdeka committed on
Commit
e735225
·
verified ·
1 Parent(s): 02ad903

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -66
app.py CHANGED
@@ -5,15 +5,17 @@ import requests
5
  import gradio as gr
6
  import numpy as np
7
  import nltk
8
- import nltkmodule
9
  from newspaper import Article
10
  from nltk.tokenize import sent_tokenize
11
- import xml.etree.ElementTree as ET
12
  from sentence_transformers import SentenceTransformer, util
13
  import spacy
14
  import en_core_sci_lg
15
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
16
  import torch
 
 
 
 
17
 
18
  # --- Models (load once, globally) ---
19
  scispacy = en_core_sci_lg.load()
@@ -27,13 +29,35 @@ NLI_LABELS = ['CONTRADICTION', 'NEUTRAL', 'ENTAILMENT']
27
  PUBMED_N = 100
28
  TOP_ABSTRACTS = 10
29
 
30
- # --- Summarizer model options ---
31
- model_options = {
32
- "Llama-3.2-1B-Instruct (Meta, gated)": "meta-llama/Llama-3.2-1B-Instruct",
33
- "Gemma-3-1B-it (Google, gated)": "google/gemma-3-1b-it",
34
- "TinyLlama-1.1B-Chat (Open)": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
35
- }
36
- pipe_cache = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  # --- Utility: get robust keybert-style query ---
39
  def get_keybert_query(text, top_n=10):
@@ -52,48 +76,6 @@ def get_keybert_query(text, top_n=10):
52
  query = " OR ".join(f'"{kw}"' for kw in keywords)
53
  return query
54
 
55
- # --- PubMed retrieval ---
56
-
57
-
58
def retrieve_pubmed_abstracts_simple(text, n=100, fallback_headline=None):
    """Search PubMed for *text* and return index-aligned titles and abstracts.

    A keyphrase query is built with ``get_keybert_query``; if that yields
    nothing usable, ``fallback_headline`` is used instead. The keyphrase
    query, the headline and the raw text are then tried in that order and
    the first query that produces parseable records wins.

    Args:
        text: Claim or passage to search for.
        n: Maximum number of PubMed records to request.
        fallback_headline: Optional headline used as a fallback query.

    Returns:
        A ``(titles, abstracts)`` tuple of equal-length lists of
        whitespace-normalised strings. ``abstracts[i]`` belongs to
        ``titles[i]`` and is ``""`` when the record has no abstract.
        ``([], [])`` is returned when no query produced any record.
    """
    # Local import so the block does not rely on a file-level import.
    import xml.etree.ElementTree as ET

    def _flat_text(element):
        # itertext() keeps text that follows inline markup such as <i>/<sub>,
        # which plain ``.text`` silently drops.
        if element is None:
            return ""
        return re.sub(r"\s+", " ", "".join(element.itertext())).strip()

    query = get_keybert_query(text, top_n=7)
    if not query or query.strip() == '""':
        query = fallback_headline
    print("Trying PubMed query:", query)
    ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    # dict.fromkeys de-duplicates while preserving order, so the headline is
    # not queried twice when it already replaced an empty keyphrase query.
    tried_queries = list(dict.fromkeys(q for q in (query, fallback_headline, text) if q))

    for q in tried_queries:
        try:
            # ``params`` lets requests URL-encode the quotes and spaces in q.
            r = requests.get(
                f"{ncbi_url}esearch.fcgi",
                params={"db": "pubmed", "term": q, "retmax": n,
                        "sort": "relevance", "retmode": "xml"},
                timeout=30,
            )
            r.raise_for_status()
            pmids = [el.text for el in ET.fromstring(r.text).findall('.//Id') if el.text]
        except (requests.RequestException, ET.ParseError) as e:
            print(f"Failed to parse PMIDs for query '{q}': {e}")
            pmids = []
        print(f"Query: {q} => {len(pmids)} PMIDs")
        if not pmids:
            continue

        try:
            resp = requests.get(
                f"{ncbi_url}efetch.fcgi",
                params={"db": "pubmed", "id": ','.join(pmids), "rettype": "abstract",
                        "retmode": "xml", "retmax": n, "sort": "relevance"},
                timeout=30,
            )
            resp.raise_for_status()
            root = ET.fromstring(resp.text)
        except (requests.RequestException, ET.ParseError) as e:
            print(f"Failed to parse titles/abstracts for query '{q}': {e}")
            continue

        titles, abstracts = [], []
        # Walk record by record so each title stays paired with its own
        # abstract; structured abstracts have several AbstractText sections
        # and some records have none at all.
        for record in root:
            title_el = record.find('.//ArticleTitle')
            if title_el is None:
                continue
            sections = (_flat_text(s) for s in record.findall('.//Abstract/AbstractText'))
            titles.append(_flat_text(title_el))
            abstracts.append(" ".join(s for s in sections if s))
        if titles:
            return titles, abstracts
    return [], []
95
-
96
-
97
  # --- Claim extraction ---
98
  indicator_phrases = [
99
  "found that", "findings suggest", "shows that", "showed that", "demonstrated", "demonstrates",
@@ -147,11 +129,10 @@ def semantic_rerank_claim_abstracts(claim, titles, abstracts, top_k=TOP_ABSTRACT
147
  idxs = np.argsort(-sims.cpu().numpy())[:top_k]
148
  return [titles[i] for i in idxs], [abstracts[i] for i in idxs]
149
 
150
- # --- NLI evidence extraction ---
151
- def extract_evidence_nli(claim, title, abstract):
152
- sentences = sent_tokenize(abstract)
153
  evidence = []
154
- for sent in sentences:
155
  encoding = nli_tokenizer(
156
  sent, claim,
157
  return_tensors='pt',
@@ -172,7 +153,14 @@ def extract_evidence_nli(claim, title, abstract):
172
  })
173
  return evidence
174
 
175
- # --- Summarizer model loading ---
 
 
 
 
 
 
 
176
  def get_summarizer(model_choice):
177
  model_id = model_options[model_choice]
178
  if model_id in pipe_cache:
@@ -249,20 +237,23 @@ def factcheck_app(article_url, summarizer_choice):
249
  results_html = ""
250
  all_results = []
251
  for claim in matched_claims:
252
- titles, abstracts = retrieve_pubmed_abstracts_simple(claim, fallback_headline=headline)
253
  if not titles:
254
- results_html += f"<hr><b>Claim:</b> {claim}<br><i>No PubMed results found.</i><br>"
255
- all_results.append({"claim": claim, "summary": "No PubMed results found.", "evidence": []})
256
  continue
257
  top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts)
258
- idx_non_top = random.choice([i for i in range(len(titles)) if i not in [titles.index(t) for t in top_titles]]) if len(titles) > len(top_titles) else None
259
  evidence_results = []
260
  for title, abstract in zip(top_titles, top_abstracts):
261
- evidence = extract_evidence_nli(claim, title, abstract)
 
 
 
 
 
 
 
262
  evidence_results.append({"title": title, "evidence": evidence})
263
- if idx_non_top is not None:
264
- control_ev = extract_evidence_nli(claim, titles[idx_non_top], abstracts[idx_non_top])
265
- evidence_results.append({"title": f"(Control) {titles[idx_non_top]}", "evidence": control_ev})
266
  all_evidence_sentences = [ev for abs_res in evidence_results for ev in abs_res["evidence"]]
267
  summary = summarize_evidence_llm(claim, all_evidence_sentences, summarizer_choice)
268
  results_html += f"<hr><b>Claim:</b> {claim}<br><b>Layman summary:</b> {summary}<br>"
@@ -273,7 +264,7 @@ def factcheck_app(article_url, summarizer_choice):
273
 
274
  description = """
275
  <b>What does this app do?</b><br>
276
- This app extracts key scientific claims from a news article, finds the most relevant PubMed biomedical research papers using robust keyphrase extraction and semantic reranking, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
277
  <b>How to use it:</b><br>
278
  1. Paste the link to a biomedical news article.<br>
279
  2. Choose an AI summarizer model below. If you have no special access, use 'TinyLlama' (works for everyone).<br>
@@ -281,7 +272,7 @@ This app extracts key scientific claims from a news article, finds the most rele
281
  4. For each claim, you will see:<br>
282
  - A plain summary of what research says.<br>
283
  - Color-coded evidence sentences (green=support, red=contradict, gray=neutral).<br>
284
- - The titles of the most relevant PubMed articles.<br><br>
285
  <b>Everything is 100% open source and runs on this website—no personal info or cloud API needed.</b>
286
  """
287
 
 
5
  import gradio as gr
6
  import numpy as np
7
  import nltk
 
8
  from newspaper import Article
9
  from nltk.tokenize import sent_tokenize
 
10
  from sentence_transformers import SentenceTransformer, util
11
  import spacy
12
  import en_core_sci_lg
13
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
14
  import torch
15
+ import nltkmodule
16
+
17
+ # Download NLTK punkt if not present
18
+ #nltk.download('punkt')
19
 
20
  # --- Models (load once, globally) ---
21
  scispacy = en_core_sci_lg.load()
 
29
  PUBMED_N = 100
30
  TOP_ABSTRACTS = 10
31
 
32
+ # --- Sentence section classifier model (BioBert-PubMed200kRCT) ---
33
+ EVIDENCE_MODEL = "pritamdeka/BioBert-PubMed200kRCT"
34
+ evidence_tokenizer = AutoTokenizer.from_pretrained(EVIDENCE_MODEL)
35
+ evidence_model = AutoModelForSequenceClassification.from_pretrained(EVIDENCE_MODEL)
36
+ label_map = {0: "BACKGROUND", 1: "OBJECTIVE", 2: "METHODS", 3: "RESULTS", 4: "CONCLUSIONS"}
37
+
38
def extract_evidence_sentences_from_abstract(abstract, keep_labels=("RESULTS", "CONCLUSIONS")):
    """Return the sentences of *abstract* whose predicted section is kept.

    The abstract is split with ``sent_tokenize`` and the sentences are
    classified by the module-level ``evidence_model``; each predicted class
    index is translated to a section name through ``label_map``.

    Args:
        abstract: Abstract text. ``None``, empty or whitespace-only input
            yields an empty list without touching the tokenizer or model.
        keep_labels: Section names to keep.

    Returns:
        A list of ``(label, sentence)`` tuples in original sentence order.
    """
    # The retrieval step substitutes '' for a missing abstract, so an empty
    # value is a normal input here rather than an error.
    if not abstract or not abstract.strip():
        return []
    sents = sent_tokenize(abstract)
    if not sents:
        return []
    # One padded batch instead of one forward pass per sentence.
    inputs = evidence_tokenizer(sents, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = evidence_model(**inputs).logits
    labels = (label_map[idx] for idx in torch.argmax(logits, dim=1).tolist())
    return [(label, s) for label, s in zip(labels, sents) if label in keep_labels]
50
+
51
+ # --- Europe PMC retrieval ---
52
def retrieve_europepmc_abstracts_simple(text, n=TOP_ABSTRACTS):
    """Search Europe PMC for *text* and return index-aligned titles and abstracts.

    A keyphrase query is built with ``get_keybert_query`` and sent to the
    Europe PMC REST search endpoint, asking for JSON ``core`` results.

    Args:
        text: Claim or passage to search for.
        n: Page size, i.e. the maximum number of records requested.

    Returns:
        A ``(titles, abstracts)`` tuple of equal-length lists of strings; a
        missing or null title/abstract becomes ``""``. ``([], [])`` is
        returned when the query is empty or the request or JSON decoding
        fails, which callers can treat the same as "no results".
    """
    query = get_keybert_query(text, top_n=7)
    print("Trying Europe PMC query:", query)
    if not query:
        return [], []
    try:
        # ``params`` URL-encodes the quotes and spaces of the OR-joined query;
        # the timeout stops a stalled connection from hanging the request.
        r = requests.get(
            'https://www.ebi.ac.uk/europepmc/webservices/rest/search',
            params={'query': query, 'resulttype': 'core', 'format': 'json', 'pageSize': n},
            timeout=30,
        )
        r.raise_for_status()
        payload = r.json()
    except (requests.RequestException, ValueError) as e:
        # Best-effort retrieval: report the failure and yield no results.
        print(f"Europe PMC request failed for query '{query}': {e}")
        return [], []
    results = (payload.get('resultList') or {}).get('result') or []
    # ``or ''`` also covers keys that are present but null.
    titles = [res.get('title') or '' for res in results]
    abstracts = [res.get('abstractText') or '' for res in results]
    return titles, abstracts
61
 
62
  # --- Utility: get robust keybert-style query ---
63
  def get_keybert_query(text, top_n=10):
 
76
  query = " OR ".join(f'"{kw}"' for kw in keywords)
77
  return query
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  # --- Claim extraction ---
80
  indicator_phrases = [
81
  "found that", "findings suggest", "shows that", "showed that", "demonstrated", "demonstrates",
 
129
  idxs = np.argsort(-sims.cpu().numpy())[:top_k]
130
  return [titles[i] for i in idxs], [abstracts[i] for i in idxs]
131
 
132
+ # --- NLI evidence extraction (run only on results/conclusion sentences) ---
133
+ def extract_evidence_nli(claim, evidence_sentences):
 
134
  evidence = []
135
+ for sent in evidence_sentences:
136
  encoding = nli_tokenizer(
137
  sent, claim,
138
  return_tensors='pt',
 
153
  })
154
  return evidence
155
 
156
+ # --- Summarizer model options ---
157
+ model_options = {
158
+ "Llama-3.2-1B-Instruct (Meta, gated)": "meta-llama/Llama-3.2-1B-Instruct",
159
+ "Gemma-3-1B-it (Google, gated)": "google/gemma-3-1b-it",
160
+ "TinyLlama-1.1B-Chat (Open)": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
161
+ }
162
+ pipe_cache = {}
163
+
164
  def get_summarizer(model_choice):
165
  model_id = model_options[model_choice]
166
  if model_id in pipe_cache:
 
237
  results_html = ""
238
  all_results = []
239
  for claim in matched_claims:
240
+ titles, abstracts = retrieve_europepmc_abstracts_simple(claim)
241
  if not titles:
242
+ results_html += f"<hr><b>Claim:</b> {claim}<br><i>No relevant abstracts found in Europe PMC.</i><br>"
243
+ all_results.append({"claim": claim, "summary": "No abstracts found.", "evidence": []})
244
  continue
245
  top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts)
 
246
  evidence_results = []
247
  for title, abstract in zip(top_titles, top_abstracts):
248
+ # Extract evidence (results/conclusions) sentences from abstract
249
+ ev_sents = extract_evidence_sentences_from_abstract(abstract)
250
+ # If none found, fallback to all sentences
251
+ if ev_sents:
252
+ sent_list = [s for lbl, s in ev_sents]
253
+ else:
254
+ sent_list = sent_tokenize(abstract)
255
+ evidence = extract_evidence_nli(claim, sent_list)
256
  evidence_results.append({"title": title, "evidence": evidence})
 
 
 
257
  all_evidence_sentences = [ev for abs_res in evidence_results for ev in abs_res["evidence"]]
258
  summary = summarize_evidence_llm(claim, all_evidence_sentences, summarizer_choice)
259
  results_html += f"<hr><b>Claim:</b> {claim}<br><b>Layman summary:</b> {summary}<br>"
 
264
 
265
  description = """
266
  <b>What does this app do?</b><br>
267
+ This app extracts key scientific claims from a news article, finds the most relevant biomedical research papers using robust keyphrase extraction and semantic reranking, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
268
  <b>How to use it:</b><br>
269
  1. Paste the link to a biomedical news article.<br>
270
  2. Choose an AI summarizer model below. If you have no special access, use 'TinyLlama' (works for everyone).<br>
 
272
  4. For each claim, you will see:<br>
273
  - A plain summary of what research says.<br>
274
  - Color-coded evidence sentences (green=support, red=contradict, gray=neutral).<br>
275
+ - The titles of the most relevant research articles.<br><br>
276
  <b>Everything is 100% open source and runs on this website—no personal info or cloud API needed.</b>
277
  """
278