|
import chromadb |
|
import tempfile |
|
import os |
|
from chromadb.config import Settings |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_community.document_loaders import PDFPlumberLoader |
|
from langchain_chroma import Chroma |
|
from langchain.vectorstores.base import VectorStore |
|
from langchain_openai import OpenAIEmbeddings |
|
|
|
|
|
def process_file(file_data, file_type: str | None = None) -> list:
    """
    Process a PDF file and split it into documents.

    Args:
        file_data: Either a file path (str) or raw file bytes
        file_type: Optional MIME type; when provided it must be
            "application/pdf", otherwise a TypeError is raised

    Returns:
        List of processed document chunks, each tagged with a
        ``source_<i>`` metadata entry

    Raises:
        TypeError: If file is not a PDF
        ValueError: If PDF parsing fails (no chunks produced)
    """
    if file_type and file_type != "application/pdf":
        raise TypeError("Only PDF files are supported")

    if isinstance(file_data, bytes):
        # PDFPlumberLoader only accepts file paths, so spill the bytes to a
        # temp file first. delete=False so the closed file remains readable
        # by the loader (required on Windows); cleaned up in `finally`.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(file_data)
            tmp_file_path = tmp_file.name
        try:
            loader = PDFPlumberLoader(tmp_file_path)
            documents = loader.load()
        finally:
            os.unlink(tmp_file_path)
    else:
        loader = PDFPlumberLoader(file_data)
        documents = loader.load()

    # Collapse every run of whitespace (including newlines) to a single
    # space. str.split() already splits on '\n', so the separate
    # replace('\n', ' ') pass the original did was redundant.
    for doc in documents:
        doc.page_content = ' '.join(doc.page_content.split())

    # NOTE(review): after normalization the text contains no newlines, so
    # the "\n\n" and "\n" separators never match and splitting always falls
    # through to " ". Kept as-is to preserve existing chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""]
    )
    docs = text_splitter.split_documents(documents)

    # Fail fast before tagging metadata; iterating an empty list was a
    # no-op anyway, so this reordering is behavior-preserving.
    if not docs:
        raise ValueError("PDF file parsing failed.")

    # Give each chunk a stable, unique source id (used for citations).
    for i, doc in enumerate(docs):
        doc.metadata["source"] = f"source_{i}"
    return docs
|
|
|
|
|
def create_search_engine(file_data, file_type: str | None = None, api_key: str | None = None) -> tuple[VectorStore, list]:
    """
    Create a vector store search engine from a PDF file.

    Args:
        file_data: Either a file path (str) or raw file bytes
        file_type: Optional file type for validation (must be
            "application/pdf" when provided)
        api_key: OpenAI API key for embeddings

    Returns:
        Tuple of (search_engine, docs) where:
            - search_engine: The Chroma vector store
            - docs: The processed documents

    Raises:
        TypeError: If the file is not a PDF (from process_file)
        ValueError: If PDF parsing fails (from process_file)
    """
    docs = process_file(file_data, file_type)

    encoder = OpenAIEmbeddings(model="text-embedding-3-small", api_key=api_key)

    # Configure the client itself with allow_reset=True. The original built
    # EphemeralClient() with default settings (allow_reset=False) and then
    # reset it through the private Chroma._client attribute — chromadb
    # rejects reset() on a client that was not created with allow_reset, and
    # passing client_settings to the Chroma wrapper does not reconfigure an
    # already-constructed client.
    client_settings = Settings(
        allow_reset=True,
        anonymized_telemetry=False
    )
    client = chromadb.EphemeralClient(settings=client_settings)

    # Reset directly on the client; no throwaway Chroma instance and no
    # private-attribute access needed.
    client.reset()

    search_engine = Chroma.from_documents(
        client=client,
        documents=docs,
        embedding=encoder,
        client_settings=client_settings
    )

    return search_engine, docs