import os
import pickle

import nltk
import nest_asyncio
from dotenv import load_dotenv
from llama_parse import LlamaParse
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.qdrant import Qdrant
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings

# Setup
nltk.download('punkt')  # tokenizer data used by the unstructured .md loader
nest_asyncio.apply()  # LlamaParse runs an async client; allow nested event loops

# Load environment variables
load_dotenv()
# Environment keys
llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")
# Paths
data_dir = "data"
parsed_data_file = os.path.join(data_dir, "parsed_data.pkl")
output_md = os.path.join(data_dir, "output.md")
qdrant_dir = os.path.join(data_dir, "local_qdrant")
collection_name = "rag"
# Helper: Load or parse PDF
def load_or_parse_data(pdf_path):
    # Reuse a cached parse if one exists; otherwise call the LlamaParse API.
    # NOTE: the cache is keyed only on file existence, not on pdf_path, so
    # delete parsed_data.pkl when switching documents.
    if os.path.exists(parsed_data_file):
        with open(parsed_data_file, "rb") as f:
            parsed_data = pickle.load(f)
    else:
        parsing_instruction = """The provided document is a user guide or manual.
It contains many images and tables. Be precise while answering questions."""
        parser = LlamaParse(
            api_key=llamaparse_api_key,
            result_type="markdown",
            parsing_instruction=parsing_instruction,
        )  # type: ignore
        parsed_data = parser.load_data(pdf_path)
        with open(parsed_data_file, "wb") as f:
            pickle.dump(parsed_data, f)
    return parsed_data

# Main vector DB builder
def create_vector_database(pdf_path):
print("π§ Starting vector DB creation...")
# Ensure directories exist
os.makedirs(data_dir, exist_ok=True)
os.makedirs(qdrant_dir, exist_ok=True)
    # Parse PDF
    parsed_docs = load_or_parse_data(pdf_path)
    if not parsed_docs:
        raise ValueError("❌ No parsed documents returned from LlamaParse!")
    # Write Markdown content
    with open(output_md, 'w', encoding='utf-8') as f:
        for doc in parsed_docs:
            if hasattr(doc, "text") and doc.text.strip():
                f.write(doc.text.strip() + "\n\n")
    if not os.path.exists(output_md) or os.path.getsize(output_md) == 0:
        raise RuntimeError("❌ Markdown file was not created or is empty!")
    # Load .md as documents
    try:
        loader = DirectoryLoader(data_dir, glob="**/*.md", show_progress=True)
        documents = loader.load()
    except Exception as e:
        print(f"⚠️ DirectoryLoader failed: {e}. Falling back to TextLoader...")
        documents = TextLoader(output_md, encoding='utf-8').load()
    if not documents:
        raise RuntimeError("❌ No documents loaded from markdown!")
    # Chunk documents
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    docs = splitter.split_documents(documents)
    print(f"✅ Loaded and split {len(docs)} chunks.")
    # Embeddings (FastEmbed defaults to the BAAI/bge-small-en-v1.5 model)
    embeddings = FastEmbedEmbeddings()  # type: ignore
    # Create Qdrant vector DB; `path` uses Qdrant's on-disk local mode,
    # so no separate Qdrant server is required.
    print("📦 Creating Qdrant vector DB...")
    qdrant = Qdrant.from_documents(
        documents=docs,
        embedding=embeddings,
        path=qdrant_dir,
        collection_name=collection_name,
    )
    print("✅ Vector DB created successfully.")
    return qdrant
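
# Usage sketch: build the index, then run a quick similarity search to confirm
# the collection is queryable. The PDF path and query text below are placeholder
# assumptions; point them at your own document.
if __name__ == "__main__":
    vector_db = create_vector_database(os.path.join(data_dir, "manual.pdf"))  # hypothetical path
    # similarity_search embeds the query with the same FastEmbed model and
    # returns the k nearest chunks from the collection.
    for hit in vector_db.similarity_search("How do I reset the device?", k=3):
        print(hit.page_content[:200])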