# Medapp/pubmed_rag.py
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import faiss
import nltk
import numpy as np
import requests
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# "punkt" is enough for older NLTK releases; newer ones (>= 3.8.2) also ship
# the sentence tokenizer tables as "punkt_tab", so fetch both quietly.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
from config import (
PUBMED_EMAIL,
MAX_PUBMED_RESULTS,
DEFAULT_SUMMARIZATION_CHUNK,
VECTORDB_PATH,
EMBEDDING_MODEL_NAME
)

###############################################################################
# SUMMARIZATION & EMBEDDINGS                                                  #
###############################################################################
summarizer = pipeline(
"summarization",
model="facebook/bart-large-cnn",
tokenizer="facebook/bart-large-cnn",
)
embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
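
# Note: both models above load eagerly at import time; the first run downloads
# their weights from the Hugging Face Hub, which can take a while.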

###############################################################################
# PUBMED UTIL FUNCTIONS                                                       #
###############################################################################
def search_pubmed(query, max_results=MAX_PUBMED_RESULTS):
"""
Search PubMed for PMIDs matching a query. Returns a list of PMIDs.
"""
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
params = {
"db": "pubmed",
"term": query,
"retmax": max_results,
"retmode": "json",
"tool": "AdvancedMedicalAI",
"email": PUBMED_EMAIL
}
    # A timeout prevents a slow NCBI response from hanging the caller forever.
    resp = requests.get(url, params=params, timeout=30)
resp.raise_for_status()
data = resp.json()
return data.get("esearchresult", {}).get("idlist", [])
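
# Minimal usage sketch (hypothetical query; requires network access and a
# valid PUBMED_EMAIL in config):
#   pmids = search_pubmed("metformin type 2 diabetes", max_results=3)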


def fetch_abstract(pmid):
"""
Fetches an abstract for a single PMID via EFetch.
"""
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
params = {
"db": "pubmed",
"id": pmid,
"retmode": "text",
"rettype": "abstract",
"tool": "AdvancedMedicalAI",
"email": PUBMED_EMAIL
}
    resp = requests.get(url, params=params, timeout=30)
resp.raise_for_status()
return resp.text.strip()


def fetch_pubmed_abstracts(pmids):
"""
Parallel fetch for multiple PMIDs. Returns dict {pmid: text}.
"""
    results = {}
    if not pmids:  # ThreadPoolExecutor rejects max_workers=0
        return results
    # Cap the pool at 5 workers to stay well under NCBI's rate limits.
    with ThreadPoolExecutor(max_workers=min(len(pmids), 5)) as executor:
future_to_pmid = {executor.submit(fetch_abstract, pmid): pmid for pmid in pmids}
for future in as_completed(future_to_pmid):
pmid = future_to_pmid[future]
try:
results[pmid] = future.result()
except Exception as e:
results[pmid] = f"Error fetching PMID {pmid}: {str(e)}"
return results
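
# Usage sketch chaining search and fetch (hypothetical query):
#   abstracts = fetch_pubmed_abstracts(search_pubmed("metformin", max_results=3))
#   for pmid, text in abstracts.items():
#       print(pmid, text[:80])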

###############################################################################
# SUMMARIZE & CHUNK TEXT                                                      #
###############################################################################
def chunk_and_summarize(raw_text, chunk_size=DEFAULT_SUMMARIZATION_CHUNK):
"""
Splits large text into chunks by sentences, then summarizes each chunk, merging results.
"""
sentences = sent_tokenize(raw_text)
chunks = []
current_chunk = []
current_length = 0
    for sent in sentences:
        token_count = len(sent.split())
        # Flush the current chunk before it overflows; the truthiness check
        # avoids emitting an empty chunk when one sentence alone exceeds chunk_size.
        if current_chunk and current_length + token_count > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sent)
        current_length += token_count
if current_chunk:
chunks.append(" ".join(current_chunk))
summary_list = []
for c in chunks:
summ = summarizer(c, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
summary_list.append(summ)
return " ".join(summary_list)

###############################################################################
# SIMPLE VECTOR STORE (FAISS) FOR RAG                                         #
###############################################################################
def create_or_load_faiss_index():
"""
Creates a new FAISS index or loads from disk if it exists.
"""
index_path = os.path.join(VECTORDB_PATH, "faiss_index.bin")
meta_path = os.path.join(VECTORDB_PATH, "faiss_meta.json")
    os.makedirs(VECTORDB_PATH, exist_ok=True)
if os.path.exists(index_path) and os.path.exists(meta_path):
# Load existing index
index = faiss.read_index(index_path)
with open(meta_path, "r") as f:
meta_data = json.load(f)
return index, meta_data
else:
# Create new index
index = faiss.IndexFlatL2(embed_model.get_sentence_embedding_dimension())
meta_data = {}
return index, meta_data
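
# Design note: IndexFlatL2 is an exact (brute-force) L2 index, which is fine
# for small corpora; the metadata keys are stringified row ids because JSON
# object keys must be strings.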


def save_faiss_index(index, meta_data):
"""
Saves the FAISS index and metadata to disk.
"""
index_path = os.path.join(VECTORDB_PATH, "faiss_index.bin")
meta_path = os.path.join(VECTORDB_PATH, "faiss_meta.json")
faiss.write_index(index, index_path)
with open(meta_path, "w") as f:
json.dump(meta_data, f)


def upsert_documents(docs):
"""
Takes in a dict of {pmid: text}, embeds and upserts them into the FAISS index.
Each doc is stored in 'meta_data' with pmid as key.
"""
    if not docs:
        return
    index, meta_data = create_or_load_faiss_index()
    texts = list(docs.values())
    pmids = list(docs.keys())
    embeddings = embed_model.encode(texts, convert_to_numpy=True)
    # FAISS expects float32 row vectors; cast defensively in case the model
    # returns another dtype.
    index.add(np.asarray(embeddings, dtype="float32"))
# Maintain a simple meta_data: { int_id: { 'pmid': X, 'text': Y } }
# Where int_id is the row in the index
start_id = len(meta_data)
for i, pmid in enumerate(pmids):
meta_data[str(start_id + i)] = {"pmid": pmid, "text": texts[i]}
save_faiss_index(index, meta_data)
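
# Usage sketch (hypothetical PMIDs and texts; persists the index and metadata
# under VECTORDB_PATH):
#   upsert_documents({"12345678": "Abstract one...", "23456789": "Abstract two..."})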


def semantic_search(query, top_k=3):
"""
Embeds 'query' and searches the FAISS index for top_k similar docs.
Returns a list of dict with 'pmid' and 'text'.
"""
    index, meta_data = create_or_load_faiss_index()
    if index.ntotal == 0:  # nothing indexed yet
        return []
    query_embedding = embed_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(
        np.asarray(query_embedding, dtype="float32"), min(top_k, index.ntotal)
    )
    results = []
    for dist, idx_list in zip(distances, indices):
        for d, i in zip(dist, idx_list):
            if i < 0:  # FAISS pads missing neighbors with -1
                continue
            # i is the row in the index; use it to look up the stored metadata
            doc_info = meta_data[str(i)]
            results.append({"pmid": doc_info["pmid"], "text": doc_info["text"], "score": float(d)})
# Sort by ascending distance => best match first
results.sort(key=lambda x: x["score"])
return results
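
# End-to-end RAG sketch (illustrative query and values; requires network access
# and the models/config above):
#   pmids = search_pubmed("statins cardiovascular outcomes", max_results=5)
#   docs = fetch_pubmed_abstracts(pmids)
#   summaries = {pmid: chunk_and_summarize(text) for pmid, text in docs.items()}
#   upsert_documents(summaries)
#   for hit in semantic_search("do statins reduce cardiovascular mortality?"):
#       print(hit["pmid"], round(hit["score"], 3))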