File size: 3,465 Bytes
1855ec1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import chromadb
import tempfile
import os
from chromadb.config import Settings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_chroma import Chroma
from langchain.vectorstores.base import VectorStore
from langchain_openai import OpenAIEmbeddings


def _load_pdf_bytes(pdf_bytes) -> list:
    """Load in-memory PDF content by spilling it to a temporary file."""
    # delete=False because the loader reopens the file by name after we have
    # closed our own handle; removal is therefore done by hand below.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    tmp_file_path = tmp_file.name
    try:
        # The write sits inside the try so a failed write (e.g. disk full)
        # still removes the temporary file instead of leaking it.
        with tmp_file:
            tmp_file.write(pdf_bytes)
        return PDFPlumberLoader(tmp_file_path).load()
    finally:
        os.unlink(tmp_file_path)


def process_file(file_data, file_type: str = None) -> list:
    """
    Process a PDF file and split it into documents.

    Args:
        file_data: Either a file path or the PDF content as a bytes-like
            object (bytes, bytearray or memoryview)
        file_type: Optional MIME type; when given it must be
            "application/pdf"

    Returns:
        List of document chunks; each chunk's metadata["source"] is set to
        "source_<index>"

    Raises:
        TypeError: If file_type is given and is not "application/pdf"
        ValueError: If no chunks could be produced from the PDF

    Errors raised by the PDF loader itself are propagated unchanged.
    """
    if file_type and file_type != "application/pdf":
        raise TypeError("Only PDF files are supported")

    # Handle both in-memory content and a file path
    if isinstance(file_data, (bytes, bytearray, memoryview)):
        documents = _load_pdf_bytes(file_data)
    else:
        # Assume it's a file path
        documents = PDFPlumberLoader(file_data).load()

    # Collapse every run of whitespace (newlines included) into a single
    # space to smooth over line breaks introduced by PDF text extraction.
    for doc in documents:
        doc.page_content = " ".join(doc.page_content.split())

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""],
    )
    docs = text_splitter.split_documents(documents)
    if not docs:
        raise ValueError("PDF file parsing failed.")

    # Replace the loader-provided source with a stable per-chunk identifier.
    for i, doc in enumerate(docs):
        doc.metadata["source"] = f"source_{i}"
    return docs


def create_search_engine(file_data, file_type: str = None, api_key: str = None) -> tuple[VectorStore, list]:
    """
    Build a Chroma vector store over the chunks of a PDF file.

    Args:
        file_data: Either a file path (str) or file bytes, forwarded to
            process_file
        file_type: Optional file type, forwarded to process_file for
            validation
        api_key: OpenAI API key handed to the embeddings client

    Returns:
        Tuple of (search_engine, docs) where:
        - search_engine: The Chroma vector store built from docs
        - docs: The chunks returned by process_file
    """
    # Split the PDF into chunks first
    chunks = process_file(file_data, file_type)

    embedder = OpenAIEmbeddings(model="text-embedding-3-small", api_key=api_key)

    chroma_client = chromadb.EphemeralClient()
    chroma_settings = Settings(allow_reset=True, anonymized_telemetry=False)

    # Wrap the client in a throwaway store and reset through it, so the
    # store built below starts from a clean state.
    Chroma(client=chroma_client, client_settings=chroma_settings)._client.reset()

    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embedder,
        client=chroma_client,
        client_settings=chroma_settings,
    )
    return vector_store, chunks