import hashlib
import os
import sqlite3
from contextlib import closing

from langchain_community.document_loaders import PyPDFLoader


class DuplicateDetector:
    """Detect previously seen PDFs via a content fingerprint kept in SQLite.

    A fingerprint is the SHA-256 hex digest of the text extracted from the
    first ``max_pages`` pages of a PDF. Fingerprints are persisted in the
    ``documents`` table of the database at ``db_path``.
    """

    def __init__(self, db_path="persiststorage.db", max_pages=10):
        """Create the detector and make sure the backing table exists.

        Args:
            db_path: Path of the SQLite database file.
            max_pages: Number of leading pages whose text is hashed.
        """
        # Kept for backward compatibility; the methods of this class do not
        # consult this set.
        self.fingerprints_seen = set()
        self.db_path = db_path
        self.max_pages = max_pages
        self._init_db()

    def _init_db(self):
        """Create the ``documents`` table if it does not exist yet."""
        # closing() guarantees the connection is released even if the DDL
        # statement raises.
        with closing(sqlite3.connect(self.db_path)) as conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    filename TEXT,
                    fingerprint TEXT UNIQUE,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')
            conn.commit()

    def is_duplicate(self, pdf_path):
        """Return True if the fingerprint of ``pdf_path`` is already stored.

        Raises:
            ValueError: If the PDF cannot be fingerprinted.
        """
        fingerprint = self.generate_fingerprints(pdf_path)
        with closing(sqlite3.connect(self.db_path)) as conn:
            row = conn.execute(
                "select id from documents where fingerprint =?",
                (fingerprint,),
            ).fetchone()
        return row is not None

    def store_fingerprints(self, pdf_path):
        """Persist the fingerprint of ``pdf_path``; a repeat is a no-op.

        Raises:
            ValueError: If the PDF cannot be fingerprinted.
        """
        fingerprint = self.generate_fingerprints(pdf_path)
        with closing(sqlite3.connect(self.db_path)) as conn:
            try:
                conn.execute(
                    "INSERT INTO DOCUMENTS(filename, fingerprint) values(?,?)",
                    (os.path.basename(pdf_path), fingerprint),
                )
                conn.commit()
            except sqlite3.IntegrityError:
                # The UNIQUE constraint on ``fingerprint`` fired: this
                # document is already recorded, so there is nothing to do.
                pass

    def generate_fingerprints(self, pdf_path):
        """Return the SHA-256 hex digest of the PDF's leading page text.

        The text of the first ``max_pages`` pages is concatenated without a
        separator and hashed as UTF-8.

        Raises:
            ValueError: If loading or hashing raises ``ValueError``; the
                original exception is attached as ``__cause__``.
        """
        try:
            pages = PyPDFLoader(pdf_path).load()
            text = "".join(page.page_content for page in pages[:self.max_pages])
            # NOTE(review): when no text is extracted, ``text`` is empty and
            # every such PDF gets the same fingerprint.
            return hashlib.sha256(text.encode("utf-8")).hexdigest()
        except ValueError as e:
            raise ValueError(f"Failed to fingerprint PDF: {e}") from e