Spaces:
Sleeping
Sleeping
import re
from itertools import islice

from langchain_community.document_loaders import PyPDFLoader
class BookTitleExtractor:
    """Guess the title of a book from its PDF or from pre-loaded pages.

    Cheap text heuristics are tried first; the optional LLM is consulted only
    when the heuristics find nothing.

    Args:
        llm: Optional object exposing ``invoke(prompt)``. The result may be a
            plain string or an object that carries its text in ``.content``.
    """

    # Returned by the public methods when no title can be determined.
    UNKNOWN_TITLE = "Unknown Title"

    # Starts with a capital letter and contains only capitals, whitespace and
    # light punctuation. `\s` also matches newlines, so a title wrapped over
    # several all-caps lines is captured as one match.
    _ALL_CAPS_TITLE = re.compile(r'^[A-Z][A-Z\s\-:,]{5,}$', re.MULTILINE)

    # A line must be longer than this to be considered a title candidate.
    _MIN_LINE_LENGTH = 10

    # Upper bound on the amount of page text embedded in the LLM prompt.
    _LLM_SAMPLE_CHARS = 1000

    def __init__(self, llm=None):
        self.llm = llm

    def extract_title(self, pdf_path, max_pages=5):
        """Return the title of the PDF at ``pdf_path``.

        Args:
            pdf_path: Path handed to ``PyPDFLoader``.
            max_pages: Number of leading pages to inspect.

        Returns:
            The heuristic title if one is found, otherwise the LLM's answer
            when an LLM was supplied, otherwise ``"Unknown Title"``.
        """
        title = self._extract_with_heuristics(pdf_path, max_pages)
        if title:
            return title
        if self.llm:
            return self._extract_with_llm(pdf_path, max_pages)
        return self.UNKNOWN_TITLE

    @staticmethod
    def _leading_pages(pdf_path, max_pages):
        """Return an iterable over the first ``max_pages`` pages of the PDF."""
        loader = PyPDFLoader(pdf_path)
        if max_pages is not None and max_pages < 0:
            # A negative bound means "all but the last n" under slicing, which
            # cannot be answered lazily; keep the original slice semantics.
            return loader.load()[:max_pages]
        # Lazy loading avoids parsing the whole book to look at a few pages.
        return islice(loader.lazy_load(), max_pages)

    @staticmethod
    def _page_text(page):
        """Return the stripped text of a page; ``None`` content counts as empty."""
        return (page.page_content or "").strip()

    @classmethod
    def _all_caps_title(cls, text):
        """Return the first ALL-CAPS run in ``text`` on one line, or ``None``."""
        match = cls._ALL_CAPS_TITLE.search(text)
        if match is None:
            return None
        # The match may span several lines; collapse every whitespace run so
        # the returned title never contains embedded newlines.
        return " ".join(match.group(0).split())

    def _extract_with_heuristics(self, pdf_path, max_pages):
        """Apply the text heuristics to the leading pages of a PDF.

        Returns:
            The first ALL-CAPS title found on a page, else that page's first
            line longer than 10 characters; ``None`` when no page yields one.
        """
        for page in self._leading_pages(pdf_path, max_pages):
            text = self._page_text(page)
            if not text:
                continue

            # Heuristic 1: ALL CAPS title
            title = self._all_caps_title(text)
            if title:
                return title

            # Heuristic 2: First significant line
            for line in text.split("\n"):
                line = line.strip()
                if len(line) > self._MIN_LINE_LENGTH:
                    return line
        return None

    def extract_book_title_from_documents(self, documents, max_docs=5):
        """Return a title guessed from already-loaded documents.

        Args:
            documents: Sequence of objects with a ``page_content`` attribute.
            max_docs: Number of leading documents to inspect.

        Returns:
            The first ALL-CAPS title found, else the first title-cased line
            longer than 10 characters, else ``"Unknown Title"``.
        """
        for doc in documents[:max_docs]:
            text = self._page_text(doc)
            if not text:
                continue

            # Heuristic 1: Lines with ALL CAPS (title pages often use this)
            title = self._all_caps_title(text)
            if title:
                return title

            # Heuristic 2: First non-empty, title-cased line
            for line in text.split("\n"):
                line = line.strip()
                if len(line) > self._MIN_LINE_LENGTH and line.istitle():
                    return line
        return self.UNKNOWN_TITLE

    def _extract_with_llm(self, pdf_path, max_pages=5):
        """Ask the LLM for the title, using the first page that has text.

        Args:
            pdf_path: Path handed to ``PyPDFLoader``.
            max_pages: Number of leading pages searched for text.

        Returns:
            The LLM's stripped answer, or ``"Unknown Title"`` when the pages
            contain no text or the answer is blank.
        """
        page_texts = map(self._page_text, self._leading_pages(pdf_path, max_pages))
        # Cover pages are often images with no text; skip to the first page
        # that actually has some rather than prompting with nothing.
        sample_text = next((text for text in page_texts if text), "")
        sample_text = sample_text[: self._LLM_SAMPLE_CHARS]
        if not sample_text:
            return self.UNKNOWN_TITLE

        prompt = (
            "Identify the book title from the following text:\n\n"
            f"{sample_text}\n\nOnly return the book title."
        )
        response = self.llm.invoke(prompt)
        # Chat models return a message object with the text in `.content`;
        # completion-style LLMs return the string itself.
        answer = getattr(response, "content", response)
        if not isinstance(answer, str):
            answer = str(answer)
        return answer.strip() or self.UNKNOWN_TITLE