import os
import tempfile
from typing import Optional

import chromadb
from chromadb.config import Settings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.base import VectorStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_openai import OpenAIEmbeddings


def process_file(file_data, file_type: Optional[str] = None) -> list:
    """
    Process a PDF file and split it into documents.

    Args:
        file_data: Either a file path (str) or file bytes.
        file_type: Optional MIME type; when given, must be "application/pdf".

    Returns:
        List of processed document chunks, each tagged with a
        ``source_<i>`` metadata key for later citation.

    Raises:
        TypeError: If file is not a PDF.
        ValueError: If PDF parsing produces no documents.
    """
    if file_type and file_type != "application/pdf":
        raise TypeError("Only PDF files are supported")

    if isinstance(file_data, bytes):
        # PDFPlumberLoader only accepts a path, so spill the bytes to a
        # temporary file and remove it once loading is done.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(file_data)
            tmp_file_path = tmp_file.name
        try:
            loader = PDFPlumberLoader(tmp_file_path)
            documents = loader.load()
        finally:
            # Clean up the temporary file even if loading raised.
            os.unlink(tmp_file_path)
    else:
        # Assume it's a file path.
        loader = PDFPlumberLoader(file_data)
        documents = loader.load()

    # Clean up extracted text to fix common PDF extraction issues:
    # collapse newlines and runs of whitespace into single spaces.
    # NOTE: after this pass the "\n\n"/"\n" splitter separators below can
    # never match; splitting effectively falls through to " " and "".
    for doc in documents:
        doc.page_content = doc.page_content.replace('\n', ' ')
        doc.page_content = ' '.join(doc.page_content.split())

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""],
    )
    docs = text_splitter.split_documents(documents)

    if not docs:
        raise ValueError("PDF file parsing failed.")

    # Tag each chunk with a stable per-chunk identifier used for citations.
    for i, doc in enumerate(docs):
        doc.metadata["source"] = f"source_{i}"

    return docs


def create_search_engine(
    file_data,
    file_type: Optional[str] = None,
    api_key: Optional[str] = None,
) -> tuple[VectorStore, list]:
    """
    Create a vector store search engine from a PDF file.

    Args:
        file_data: Either a file path (str) or file bytes.
        file_type: Optional file type for validation.
        api_key: OpenAI API key for embeddings.

    Returns:
        Tuple of (search_engine, docs) where:
            - search_engine: The Chroma vector store.
            - docs: The processed documents.
    """
    # Process the file into citation-tagged chunks.
    docs = process_file(file_data, file_type)

    encoder = OpenAIEmbeddings(model="text-embedding-3-small", api_key=api_key)

    # BUG FIX: the reset-enabling settings must be passed to the chromadb
    # client itself, not only to the Chroma wrapper. The original code
    # built EphemeralClient() with default settings (allow_reset=False),
    # so the subsequent reset() raised
    # "ValueError: Resetting is not allowed by this configuration".
    client_settings = Settings(
        allow_reset=True,
        anonymized_telemetry=False,
    )
    client = chromadb.EphemeralClient(settings=client_settings)

    # Reset directly on the client we already hold to guarantee a clean
    # store — no need for a throwaway Chroma instance or the private
    # `_client` attribute.
    client.reset()

    search_engine = Chroma.from_documents(
        client=client,
        documents=docs,
        embedding=encoder,
        client_settings=client_settings,
    )

    return search_engine, docs