pritamdeka committed on
Commit
02ad903
·
verified ·
1 Parent(s): 2855c3b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -15
app.py CHANGED
@@ -8,6 +8,7 @@ import nltk
8
  import nltkmodule
9
  from newspaper import Article
10
  from nltk.tokenize import sent_tokenize
 
11
  from sentence_transformers import SentenceTransformer, util
12
  import spacy
13
  import en_core_sci_lg
@@ -52,23 +53,46 @@ def get_keybert_query(text, top_n=10):
52
  return query
53
 
54
  # --- PubMed retrieval ---
55
def retrieve_pubmed_abstracts_simple(query, n=100):
    """Fetch up to *n* PubMed titles and abstracts matching *query*.

    Uses the NCBI E-utilities (esearch followed by efetch, both in XML
    mode) and returns a ``(titles, abstracts)`` pair of lists; both are
    empty when the search yields no PMIDs.
    """
    ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    # Step 1: esearch -> relevance-ranked list of PMIDs for the query.
    search_xml = requests.get(
        f"{ncbi_url}esearch.fcgi?db=pubmed&term={query}&retmax={n}&sort=relevance&retmode=xml"
    ).text
    pmids = [node.text for node in ET.fromstring(search_xml).findall('.//Id')]
    if not pmids:
        return [], []
    # Step 2: efetch -> abstract records for all matched PMIDs at once.
    joined_ids = ','.join(pmids)
    fetch_xml = requests.get(
        f"{ncbi_url}efetch.fcgi?db=pubmed&id={joined_ids}&rettype=abstract&retmode=xml&retmax={n}&sort=relevance"
    ).text
    record_root = ET.fromstring(fetch_xml)
    titles = [node.text for node in record_root.findall('.//ArticleTitle')]
    abstracts = [node.text for node in record_root.findall('.//AbstractText')]
    return titles, abstracts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
- #return [], []
72
 
73
  # --- Claim extraction ---
74
  indicator_phrases = [
 
8
  import nltkmodule
9
  from newspaper import Article
10
  from nltk.tokenize import sent_tokenize
11
+ import xml.etree.ElementTree as ET
12
  from sentence_transformers import SentenceTransformer, util
13
  import spacy
14
  import en_core_sci_lg
 
53
  return query
54
 
55
  # --- PubMed retrieval ---
56
+
57
+
58
def retrieve_pubmed_abstracts_simple(text, n=100, fallback_headline=None):
    """Retrieve up to *n* PubMed titles and abstracts relevant to *text*.

    Builds a KeyBERT keyword query from *text*; if that comes back empty,
    falls back to *fallback_headline*, and finally to the raw *text*.
    The first query that yields PMIDs wins.

    Returns a ``(titles, abstracts)`` pair of parallel, whitespace-normalized
    lists (one entry per article); both empty when nothing is found or
    every fetch/parse attempt fails.
    """
    query = get_keybert_query(text, top_n=7)
    if not query or query.strip() == '""':
        query = fallback_headline
    print("Trying PubMed query:", query)
    ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    # Progressively broader fallbacks: keyword query -> headline -> raw text.
    tried_queries = [q for q in [query, fallback_headline, text] if q]

    for q in tried_queries:
        # Always request XML; passing params= lets requests URL-encode the
        # query safely instead of interpolating raw text into the URL.
        search_params = {
            'db': 'pubmed', 'term': q, 'retmax': n,
            'sort': 'relevance', 'retmode': 'xml',
        }
        try:
            r = requests.get(ncbi_url + 'esearch.fcgi',
                             params=search_params, timeout=30)
            root = ET.fromstring(r.text)
            pmids = [el.text for el in root.findall('.//Id')]
        except Exception as e:
            print(f"Failed to parse PMIDs for query '{q}': {e}")
            pmids = []
        print(f"Query: {q} => {len(pmids)} PMIDs")
        if not pmids:
            continue

        fetch_params = {
            'db': 'pubmed', 'id': ','.join(pmids), 'rettype': 'abstract',
            'retmode': 'xml', 'retmax': n, 'sort': 'relevance',
        }
        titles, abstracts = [], []
        try:
            resp = requests.get(ncbi_url + 'efetch.fcgi',
                                params=fetch_params, timeout=30)
            root2 = ET.fromstring(resp.text)
            # Walk article-by-article so titles and abstracts stay aligned:
            # one article may carry several <AbstractText> sections, so two
            # independent findall('.//...') scans can misalign the lists.
            for art in root2.findall('.//PubmedArticle'):
                title_el = art.find('.//ArticleTitle')
                titles.append(
                    ''.join(title_el.itertext()) if title_el is not None else '')
                sections = art.findall('.//AbstractText')
                abstracts.append(
                    ' '.join(''.join(sec.itertext()) for sec in sections))
        except Exception as e:
            print(f"Failed to parse titles/abstracts for query '{q}': {e}")
            titles, abstracts = [], []

        # Sanitize output: collapse internal whitespace for downstream
        # sentence tokenization and embedding.
        titles = [re.sub(r"\s+", " ", t).strip() for t in titles]
        abstracts = [re.sub(r"\s+", " ", a).strip() for a in abstracts]
        return titles, abstracts
    return [], []
95
 
 
96
 
97
  # --- Claim extraction ---
98
  indicator_phrases = [