# Custom_RAG / duplicate_detector.py
# hoshoo21
# deployment
# 7a837d4
import hashlib
import os
import sqlite3
from langchain_community.document_loaders import PyPDFLoader
class DuplicateDetector:
    """Detect already-seen PDFs via a SHA-256 text fingerprint kept in SQLite.

    A fingerprint is the hex SHA-256 digest of the text taken from the first
    ``max_pages`` documents that ``PyPDFLoader`` returns for a file (one
    document per page in the loader's default mode). Fingerprints are stored
    in the ``documents`` table of the SQLite database at ``db_path``; the
    ``fingerprint`` column is ``UNIQUE``, so each one is stored at most once.
    """

    def __init__(self, db_path: str = "persiststorage.db", max_pages: int = 10) -> None:
        """Remember the settings and make sure the ``documents`` table exists.

        Args:
            db_path: Path of the SQLite database file.
            max_pages: How many leading loader documents are hashed per PDF.
        """
        # Not read by any method of this class; kept so external code that
        # touches the attribute keeps working.
        self.fingerprints_seen = set()
        self.db_path = db_path
        self.max_pages = max_pages
        self._init_db()

    def _init_db(self) -> None:
        """Create the ``documents`` table when it does not exist yet."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    filename TEXT,
                    fingerprint TEXT UNIQUE,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
                """
            )
            conn.commit()
        finally:
            # Close even when the DDL fails so the handle is never leaked.
            conn.close()

    def _fingerprint_exists(self, fingerprint: str) -> bool:
        """Return True when ``fingerprint`` is already in the ``documents`` table."""
        conn = sqlite3.connect(self.db_path)
        try:
            row = conn.execute(
                "select id from documents where fingerprint =?", (fingerprint,)
            ).fetchone()
            return row is not None
        finally:
            conn.close()

    def _insert_fingerprint(self, filename: str, fingerprint: str) -> None:
        """Insert one row; an already-stored fingerprint is silently ignored."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute(
                "INSERT INTO DOCUMENTS(filename, fingerprint) values(?,?)",
                (filename, fingerprint),
            )
            conn.commit()
        except sqlite3.IntegrityError:
            # The UNIQUE constraint on `fingerprint` fired: this content is
            # already recorded, so storing it again is a no-op.
            pass
        finally:
            conn.close()

    def is_duplicate(self, pdf_path: str) -> bool:
        """Tell whether the PDF's fingerprint is already stored.

        Args:
            pdf_path: Path of the PDF to check.

        Returns:
            True if a row with the same fingerprint exists, else False.

        Raises:
            ValueError: If ``generate_fingerprints`` fails for the file.
            sqlite3.Error: If the database lookup fails.
        """
        return self._fingerprint_exists(self.generate_fingerprints(pdf_path))

    def store_fingerprints(self, pdf_path: str) -> None:
        """Record the PDF's base filename and fingerprint in the database.

        Storing a PDF whose fingerprint is already present does nothing.

        Args:
            pdf_path: Path of the PDF to record; only its basename is stored.

        Raises:
            ValueError: If ``generate_fingerprints`` fails for the file.
            sqlite3.Error: On database errors other than a duplicate
                fingerprint.
        """
        fingerprint = self.generate_fingerprints(pdf_path)
        self._insert_fingerprint(os.path.basename(pdf_path), fingerprint)

    def generate_fingerprints(self, pdf_path: str) -> str:
        """Return the hex SHA-256 of the text of the PDF's leading pages.

        Args:
            pdf_path: Path handed to ``PyPDFLoader``.

        Returns:
            The 64-character hexadecimal digest of the concatenated
            ``page_content`` of the first ``max_pages`` loaded documents.

        Raises:
            ValueError: If loading or hashing raised ``ValueError``; the
                original error is chained as ``__cause__``.
        """
        try:
            docs = PyPDFLoader(pdf_path).load()
            text = "".join(doc.page_content for doc in docs[:self.max_pages])
            return hashlib.sha256(text.encode("utf-8")).hexdigest()
        except ValueError as e:
            raise ValueError(f"Failed to fingerprint PDF: {e}") from e