pritamdeka committed on
Commit
81c18c4
·
verified ·
1 Parent(s): e80f4c1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -20
app.py CHANGED
@@ -9,18 +9,17 @@ from newspaper import Article
9
  from nltk.tokenize import sent_tokenize
10
  from sentence_transformers import SentenceTransformer, util
11
  import spacy
 
12
  import en_core_sci_lg
13
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
14
  import torch
15
- import nltkmodule
16
 
17
- # Download NLTK punkt if not present
18
  #nltk.download('punkt')
19
 
20
  # --- Models (load once, globally) ---
21
  scispacy = en_core_sci_lg.load()
22
- sbert_keybert = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb") # for keybert query
23
- sbert_rerank = SentenceTransformer("pritamdeka/S-PubMedBert-MS-MARCO") # for abstract reranking
24
  NLI_MODEL_NAME = "pritamdeka/PubMedBERT-MNLI-MedNLI"
25
  nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)
26
  nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)
@@ -110,7 +109,7 @@ def extract_claims_pattern(article_text):
110
  return list(dict.fromkeys(claims))
111
 
112
  def match_claims_to_headline(claims, headline):
113
- emb_model = sbert_keybert # (or any SBERT for matching)
114
  headline_emb = emb_model.encode([headline])
115
  claim_embs = emb_model.encode(claims)
116
  sims = util.pytorch_cos_sim(headline_emb, claim_embs)[0]
@@ -120,7 +119,7 @@ def match_claims_to_headline(claims, headline):
120
  matched_claims = [claims[i] for i in idxs]
121
  return matched_claims
122
 
123
- # --- Semantic reranking of abstracts using s-pubmedbert-msmarco ---
124
  def semantic_rerank_claim_abstracts(claim, titles, abstracts, top_k=TOP_ABSTRACTS):
125
  doc_texts = [f"{t}. {a}" for t, a in zip(titles, abstracts)]
126
  doc_embs = sbert_rerank.encode(doc_texts)
@@ -153,14 +152,38 @@ def extract_evidence_nli(claim, evidence_sentences):
153
  })
154
  return evidence
155
 
156
- # --- Summarizer model options ---
157
  model_options = {
 
158
  "Llama-3.2-1B-Instruct (Meta, gated)": "meta-llama/Llama-3.2-1B-Instruct",
159
  "Gemma-3-1B-it (Google, gated)": "google/gemma-3-1b-it",
160
  "TinyLlama-1.1B-Chat (Open)": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
161
  }
162
  pipe_cache = {}
163
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  def get_summarizer(model_choice):
165
  model_id = model_options[model_choice]
166
  if model_id in pipe_cache:
@@ -183,19 +206,26 @@ def get_summarizer(model_choice):
183
  def summarize_evidence_llm(claim, evidence_list, model_choice):
184
  support = [ev['sentence'] for ev in evidence_list if ev['label'] == 'ENTAILMENT']
185
  contradict = [ev['sentence'] for ev in evidence_list if ev['label'] == 'CONTRADICTION']
186
- messages = [
187
- {"role": "system", "content": "You are a helpful biomedical assistant. Summarize scientific evidence in plain English for the general public."},
188
- {"role": "user", "content":
189
- f"Claim: {claim}\n"
190
- f"Supporting evidence:\n" + ("\n".join(support) if support else "None") + "\n"
191
- f"Contradicting evidence:\n" + ("\n".join(contradict) if contradict else "None") + "\n"
192
- "Explain to a layperson: Is this claim likely true, false, or uncertain based on the evidence above? Give a brief and simple explanation in 2-3 sentences."
193
- }
194
- ]
 
 
 
 
195
  try:
196
  pipe = get_summarizer(model_choice)
197
  outputs = pipe(
198
- messages,
 
 
 
199
  max_new_tokens=128,
200
  do_sample=False,
201
  temperature=0.1,
@@ -245,9 +275,7 @@ def factcheck_app(article_url, summarizer_choice):
245
  top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts)
246
  evidence_results = []
247
  for title, abstract in zip(top_titles, top_abstracts):
248
- # Extract evidence (results/conclusions) sentences from abstract
249
  ev_sents = extract_evidence_sentences_from_abstract(abstract)
250
- # If none found, fallback to all sentences
251
  if ev_sents:
252
  sent_list = [s for lbl, s in ev_sents]
253
  else:
@@ -267,7 +295,7 @@ description = """
267
  This app extracts key scientific claims from a news article, finds the most relevant biomedical research papers using robust keyphrase extraction and semantic reranking, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
268
  <b>How to use it:</b><br>
269
  1. Paste the link to a biomedical news article.<br>
270
- 2. Choose an AI summarizer model below. If you have no special access, use 'TinyLlama' (works for everyone).<br>
271
  3. Wait for the results.<br>
272
  4. For each claim, you will see:<br>
273
  - A plain summary of what research says.<br>
 
9
  from nltk.tokenize import sent_tokenize
10
  from sentence_transformers import SentenceTransformer, util
11
  import spacy
12
+ import nltkmodule
13
  import en_core_sci_lg
14
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
15
  import torch
 
16
 
 
17
  #nltk.download('punkt')
18
 
19
  # --- Models (load once, globally) ---
20
  scispacy = en_core_sci_lg.load()
21
+ sbert_keybert = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
22
+ sbert_rerank = SentenceTransformer("pritamdeka/S-PubMedBert-MS-MARCO")
23
  NLI_MODEL_NAME = "pritamdeka/PubMedBERT-MNLI-MedNLI"
24
  nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)
25
  nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)
 
109
  return list(dict.fromkeys(claims))
110
 
111
  def match_claims_to_headline(claims, headline):
112
+ emb_model = sbert_keybert
113
  headline_emb = emb_model.encode([headline])
114
  claim_embs = emb_model.encode(claims)
115
  sims = util.pytorch_cos_sim(headline_emb, claim_embs)[0]
 
119
  matched_claims = [claims[i] for i in idxs]
120
  return matched_claims
121
 
122
+ # --- Semantic reranking ---
123
  def semantic_rerank_claim_abstracts(claim, titles, abstracts, top_k=TOP_ABSTRACTS):
124
  doc_texts = [f"{t}. {a}" for t, a in zip(titles, abstracts)]
125
  doc_embs = sbert_rerank.encode(doc_texts)
 
152
  })
153
  return evidence
154
 
155
+ # --- Summarizer model options (now with Mistral API!) ---
156
  model_options = {
157
+ "Mistral Small (API, fast/free)": "mistral-small-2503",
158
  "Llama-3.2-1B-Instruct (Meta, gated)": "meta-llama/Llama-3.2-1B-Instruct",
159
  "Gemma-3-1B-it (Google, gated)": "google/gemma-3-1b-it",
160
  "TinyLlama-1.1B-Chat (Open)": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
161
  }
162
  pipe_cache = {}
163
 
164
+ # --- Mistral API summarization ---
165
def summarize_with_mistral_api(prompt, model_name="mistral-small", max_tokens=128, temperature=0.1):
    """Send ``prompt`` to the Mistral chat-completions endpoint and return the reply text.

    The function never raises for API/network problems: every failure mode is
    reported as a human-readable string so the caller can display it directly
    in place of a summary.

    Args:
        prompt: Text sent as the single ``user`` message.
        model_name: Mistral model identifier placed in the request body.
        max_tokens: Value of the ``max_tokens`` request field.
        temperature: Value of the ``temperature`` request field.

    Returns:
        The stripped ``choices[0].message.content`` string on success, or a
        message describing the problem (missing key, transport failure,
        non-200 status, or unexpected response body).
    """
    api_key = os.getenv("MISTRAL_API_KEY")
    if not api_key:
        return "Missing MISTRAL_API_KEY secret/env variable!"

    endpoint = "https://api.mistral.ai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": temperature,
    }

    # Timeouts and connection errors raise instead of returning a response;
    # convert them to the same string-style error the other branches use.
    try:
        response = requests.post(endpoint, headers=headers, json=payload, timeout=30)
    except requests.RequestException as exc:
        return f"API Error (request failed): {exc}"

    if response.status_code != 200:
        return f"API Error ({response.status_code}): {response.text}"

    # A 200 reply can still carry a body that is not JSON or lacks the
    # expected choices/message/content structure.
    try:
        content = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as exc:
        return f"API Error (unexpected response format): {exc!r}"

    if not isinstance(content, str):
        return f"API Error (unexpected response format): content is {type(content).__name__}"
    return content.strip()
186
+
187
  def get_summarizer(model_choice):
188
  model_id = model_options[model_choice]
189
  if model_id in pipe_cache:
 
206
  def summarize_evidence_llm(claim, evidence_list, model_choice):
207
  support = [ev['sentence'] for ev in evidence_list if ev['label'] == 'ENTAILMENT']
208
  contradict = [ev['sentence'] for ev in evidence_list if ev['label'] == 'CONTRADICTION']
209
+ user_prompt = (
210
+ f"Claim: {claim}\n"
211
+ f"Supporting evidence:\n" + ("\n".join(support) if support else "None") + "\n"
212
+ f"Contradicting evidence:\n" + ("\n".join(contradict) if contradict else "None") + "\n"
213
+ "Explain to a layperson: Is this claim likely true, false, or uncertain based on the evidence above? Give a brief and simple explanation in 2-3 sentences."
214
+ )
215
+ if model_choice in [
216
+ "Mistral Small (API, fast/free)",
217
+ "Mistral Medium (API, free tier)",
218
+ "Mistral Large (API, may require paid)"
219
+ ]:
220
+ mistral_name = model_options[model_choice]
221
+ return summarize_with_mistral_api(user_prompt, model_name=mistral_name)
222
  try:
223
  pipe = get_summarizer(model_choice)
224
  outputs = pipe(
225
+ [
226
+ {"role": "system", "content": "You are a helpful biomedical assistant. Summarize scientific evidence in plain English for the general public."},
227
+ {"role": "user", "content": user_prompt}
228
+ ],
229
  max_new_tokens=128,
230
  do_sample=False,
231
  temperature=0.1,
 
275
  top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts)
276
  evidence_results = []
277
  for title, abstract in zip(top_titles, top_abstracts):
 
278
  ev_sents = extract_evidence_sentences_from_abstract(abstract)
 
279
  if ev_sents:
280
  sent_list = [s for lbl, s in ev_sents]
281
  else:
 
295
  This app extracts key scientific claims from a news article, finds the most relevant biomedical research papers using robust keyphrase extraction and semantic reranking, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
296
  <b>How to use it:</b><br>
297
  1. Paste the link to a biomedical news article.<br>
298
+ 2. Choose an AI summarizer model below. If you have no special access, use 'TinyLlama' or 'Mistral Small' (works for everyone, free).<br>
299
  3. Wait for the results.<br>
300
  4. For each claim, you will see:<br>
301
  - A plain summary of what research says.<br>