Update app.py
Browse files
app.py
CHANGED
@@ -8,6 +8,7 @@ import nltk
|
|
8 |
import nltkmodule
|
9 |
from newspaper import Article
|
10 |
from nltk.tokenize import sent_tokenize
|
|
|
11 |
from sentence_transformers import SentenceTransformer, util
|
12 |
import spacy
|
13 |
import en_core_sci_lg
|
@@ -52,23 +53,46 @@ def get_keybert_query(text, top_n=10):
|
|
52 |
return query
|
53 |
|
54 |
# --- PubMed retrieval ---
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
|
71 |
-
#return [], []
|
72 |
|
73 |
# --- Claim extraction ---
|
74 |
indicator_phrases = [
|
|
|
8 |
import nltkmodule
|
9 |
from newspaper import Article
|
10 |
from nltk.tokenize import sent_tokenize
|
11 |
+
import xml.etree.ElementTree as ET
|
12 |
from sentence_transformers import SentenceTransformer, util
|
13 |
import spacy
|
14 |
import en_core_sci_lg
|
|
|
53 |
return query
|
54 |
|
55 |
# --- PubMed retrieval ---
|
56 |
+
|
57 |
+
|
58 |
+
def retrieve_pubmed_abstracts_simple(text, n=100, fallback_headline=None):
    """Retrieve up to *n* PubMed titles and abstracts relevant to *text*.

    Builds a KeyBERT keyword query from *text* (falling back first to
    *fallback_headline*, then to the raw text) and hits the NCBI E-utilities
    endpoints — ``esearch.fcgi`` for PMIDs, then ``efetch.fcgi`` for the
    records — always in XML mode.

    Parameters
    ----------
    text : str
        Article text to derive the search query from.
    n : int, optional
        Maximum number of PMIDs / records to request (default 100).
    fallback_headline : str or None, optional
        Alternative query used when keyword extraction yields nothing useful.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(titles, abstracts)`` with whitespace collapsed; both lists empty
        when every candidate query fails or returns no PMIDs.
    """
    query = get_keybert_query(text, top_n=7)
    # KeyBERT can return an empty or quote-only query; fall back to the headline.
    if not query or query.strip() == '""':
        query = fallback_headline
    print("Trying PubMed query:", query)
    ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    # Try progressively broader queries until one yields PMIDs.
    tried_queries = [q for q in [query, fallback_headline, text] if q]

    for q in tried_queries:
        # Always request XML, never parse as JSON or HTML.
        # Pass the query via `params` so requests URL-encodes it (KeyBERT
        # queries contain spaces/quotes that would otherwise corrupt the raw
        # URL), and bound the call with a timeout so a stalled NCBI endpoint
        # cannot hang the app. Network failures are caught alongside parse
        # failures so a bad query degrades to "no PMIDs" instead of crashing.
        try:
            r = requests.get(
                f"{ncbi_url}esearch.fcgi",
                params={"db": "pubmed", "term": q, "retmax": n,
                        "sort": "relevance", "retmode": "xml"},
                timeout=30,
            )
            root = ET.fromstring(r.text)
            pmids = [el.text for el in root.findall('.//Id')]
        except (requests.RequestException, ET.ParseError) as e:
            print(f"Failed to parse PMIDs for query '{q}': {e}")
            pmids = []
        print(f"Query: {q} => {len(pmids)} PMIDs")
        if pmids:
            ids = ','.join(pmids)
            try:
                resp = requests.get(
                    f"{ncbi_url}efetch.fcgi",
                    params={"db": "pubmed", "id": ids, "rettype": "abstract",
                            "retmode": "xml", "retmax": n, "sort": "relevance"},
                    timeout=30,
                )
                root2 = ET.fromstring(resp.text)
                titles = [a.text for a in root2.findall('.//ArticleTitle')]
                abstracts = [b.text for b in root2.findall('.//AbstractText')]
            except (requests.RequestException, ET.ParseError) as e:
                print(f"Failed to parse titles/abstracts for query '{q}': {e}")
                titles, abstracts = [], []
            # Sanitize output: missing text nodes become "", and internal
            # whitespace is collapsed to single spaces.
            if not abstracts:
                abstracts = [""] * len(titles)
            # NOTE(review): structured abstracts emit several <AbstractText>
            # elements per article, so len(abstracts) can exceed len(titles);
            # callers that zip the two lists should be aware — TODO confirm
            # downstream usage.
            titles = [re.sub(r"\s+", " ", t).strip() if t else "" for t in titles]
            abstracts = [re.sub(r"\s+", " ", a).strip() if a else "" for a in abstracts]
            return titles, abstracts
    return [], []
|
95 |
|
|
|
96 |
|
97 |
# --- Claim extraction ---
|
98 |
indicator_phrases = [
|