Spaces:
Sleeping
Sleeping
import hashlib | |
import os | |
import sqlite3 | |
from langchain_community.document_loaders import PyPDFLoader | |
class DuplicateDetector:
    """Detect previously seen PDFs by a SHA-256 fingerprint of their text.

    Fingerprints are persisted in the ``documents`` table of a SQLite
    database, so duplicates are recognised across process restarts.
    """

    def __init__(self, db_path: str = "persiststorage.db", max_pages: int = 10):
        """Remember the settings and make sure the backing table exists.

        Args:
            db_path: Path of the SQLite database file.
            max_pages: Number of leading pages whose text is hashed.
        """
        # Kept for backward compatibility; nothing in this class reads it.
        self.fingerprints_seen = set()
        self.db_path = db_path
        self.max_pages = max_pages
        self._init_db()

    def _init_db(self) -> None:
        """Create the ``documents`` table if it does not exist yet."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    filename TEXT,
                    fingerprint TEXT UNIQUE,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')
            conn.commit()
        finally:
            # Always release the handle, even if table creation fails.
            conn.close()

    def is_duplicate(self, pdf_path) -> bool:
        """Return True if the fingerprint of *pdf_path* is already stored.

        Raises:
            ValueError: If the PDF cannot be fingerprinted.
        """
        fingerprint = self.generate_fingerprints(pdf_path)
        conn = sqlite3.connect(self.db_path)
        try:
            row = conn.execute(
                "select id from documents where fingerprint =?",
                (fingerprint,),
            ).fetchone()
            return row is not None
        finally:
            # Close on every path so a failed query does not leak the handle.
            conn.close()

    def store_fingerprints(self, pdf_path) -> None:
        """Persist the fingerprint of *pdf_path* with its base filename.

        Storing a fingerprint that is already present is a silent no-op.

        Raises:
            ValueError: If the PDF cannot be fingerprinted.
        """
        fingerprint = self.generate_fingerprints(pdf_path)
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute(
                "INSERT INTO DOCUMENTS(filename, fingerprint) values(?,?)",
                (os.path.basename(pdf_path), fingerprint),
            )
            conn.commit()
        except sqlite3.IntegrityError:
            # The UNIQUE constraint on ``fingerprint`` fired: this document
            # is already recorded, so there is nothing more to do.
            pass
        finally:
            conn.close()

    def generate_fingerprints(self, pdf_path) -> str:
        """Return the SHA-256 hex digest of the PDF's leading page text.

        The text of the first ``max_pages`` pages is concatenated without
        a separator, encoded as UTF-8, and hashed.

        Raises:
            ValueError: If loading or encoding the PDF raises ValueError.
        """
        # NOTE(review): PDFs with no extractable text all hash the empty
        # string and would be reported as duplicates of each other --
        # confirm whether that matters for the expected inputs.
        try:
            pages = PyPDFLoader(pdf_path).load()
            text = "".join(page.page_content for page in pages[:self.max_pages])
            return hashlib.sha256(text.encode("utf-8")).hexdigest()
        except ValueError as e:
            raise ValueError(f"Failed to fingerprint PDF: {e}") from e