Spaces:
Runtime error
Runtime error
| from langchain.vectorstores import Qdrant | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.schema import Document | |
| import os | |
| import fitz # PyMuPDF | |
| from config import EMBEDDING_MODEL,QDRANT_HOST,QDRANT_API_KEY | |
| embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL) | |
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The text of all pages, in page order, separated by a newline.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"File not found: {pdf_path}")
    # Open the document as a context manager so its handle is released even
    # when text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)
def load_pdf_data(pdf_path):
    """Read a PDF and wrap its text chunks as ``Document`` objects.

    The text returned by ``extract_text_from_pdf`` is split with a
    ``RecursiveCharacterTextSplitter`` (``chunk_size=2000``,
    ``chunk_overlap=100``). Each resulting chunk becomes one ``Document``
    whose ``"source"`` metadata entry is ``pdf_path``.

    Args:
        pdf_path: Path of the PDF to load.

    Returns:
        A list of ``Document`` objects, one per chunk, in chunk order.
    """
    raw_text = extract_text_from_pdf(pdf_path)
    chunker = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)

    documents = []
    for piece in chunker.split_text(raw_text):
        documents.append(
            Document(page_content=piece, metadata={"source": pdf_path})
        )
    return documents
# Process-wide cache for the vector store; filled by the first
# get_vector_db() call and reused afterwards.
_VECTOR_DB = None


def get_vector_db():
    """Return the Qdrant vector store built from ``data/Explorer.pdf``.

    The store is built once per process and then reused. Building it loads
    and chunks the PDF, embeds every chunk with ``embedding_model`` and
    uploads the result to the ``expl_embeddings`` collection, so repeating
    that work on every call (for example once per query) is slow and sends
    the same documents to the collection again.

    NOTE(review): the cache only lives as long as the process; a restart
    still re-runs the ingestion. Confirm whether the collection should be
    reused or recreated in that case.

    Returns:
        The vector store returned by ``Qdrant.from_documents``.
    """
    global _VECTOR_DB
    if _VECTOR_DB is None:
        docs = load_pdf_data("data/Explorer.pdf")
        _VECTOR_DB = Qdrant.from_documents(
            docs,
            embedding_model,
            location=QDRANT_HOST,
            collection_name="expl_embeddings",
            api_key=QDRANT_API_KEY,
            timeout=500,
        )
    return _VECTOR_DB
def retrieve_info(query, k=20):
    """Return the content of the chunks most similar to ``query``.

    Args:
        query: Text passed to the vector store's ``similarity_search``.
        k: Number of results requested from the search (default 20).

    Returns:
        The ``page_content`` of each returned document, joined by newlines
        in the order the search returned them.
    """
    store = get_vector_db()
    hits = store.similarity_search(query, k=k)
    contents = (hit.page_content for hit in hits)
    return "\n".join(contents)