Spaces:
Sleeping
Sleeping
File size: 2,214 Bytes
7a837d4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import re
from itertools import islice

from langchain_community.document_loaders import PyPDFLoader
class BookTitleExtractor:
    """Guess a book's title from the text of its first pages.

    Cheap text heuristics are tried first (an ALL-CAPS line, then the first
    reasonably long line).  If they find nothing and an ``llm`` object was
    supplied, the LLM is asked to identify the title instead.
    """

    # Candidate ALL-CAPS title.  ``\s`` also matches newlines, so one match may
    # cover several consecutive capitalised lines (a title wrapped over lines).
    _CAPS_TITLE_RE = re.compile(r'^[A-Z][A-Z\s\-:,]{5,}$', re.MULTILINE)
    # Minimum visible length of a caps title: the first character plus the
    # five further characters the pattern asks for.
    _MIN_CAPS_TITLE_LEN = 6

    def __init__(self, llm=None):
        """Store the optional LLM; it must expose ``invoke(prompt)``."""
        self.llm = llm

    def extract_title(self, pdf_path, max_pages=5):
        """Return the title of the PDF at ``pdf_path``.

        Args:
            pdf_path: Path handed to ``PyPDFLoader``.
            max_pages: Number of leading pages to inspect.

        Returns:
            The detected title, or ``"Unknown Title"`` when the heuristics
            fail and no LLM is configured (or the LLM yields nothing).
        """
        title = self._extract_with_heuristics(pdf_path, max_pages)
        if title:
            return title
        if self.llm:
            return self._extract_with_llm(pdf_path, max_pages)
        return "Unknown Title"

    @staticmethod
    def _load_pages(pdf_path, max_pages):
        """Load at most ``max_pages`` leading pages of the PDF."""
        loader = PyPDFLoader(pdf_path)
        # lazy_load() yields pages one at a time, so islice() stops parsing
        # after max_pages instead of reading the whole book and slicing.
        return list(islice(loader.lazy_load(), max_pages))

    @classmethod
    def _find_caps_title(cls, text):
        """Return the first plausible ALL-CAPS title in ``text``, else None."""
        for candidate in cls._CAPS_TITLE_RE.findall(text):
            # Collapse newlines/runs of blanks so a wrapped title comes back
            # as one clean line rather than with embedded line breaks.
            title = " ".join(candidate.split())
            # A lone capital followed by blank lines satisfies the pattern
            # through ``\s``; re-check the length on the visible text.
            if len(title) >= cls._MIN_CAPS_TITLE_LEN:
                return title
        return None

    def _extract_with_heuristics(self, pdf_path, max_pages):
        """Apply the text heuristics to the first ``max_pages`` PDF pages.

        Returns:
            The title string, or ``None`` if no page yields a candidate.
        """
        for page in self._load_pages(pdf_path, max_pages):
            text = page.page_content.strip()
            if not text:
                continue

            # Heuristic 1: ALL CAPS title
            title = self._find_caps_title(text)
            if title:
                return title

            # Heuristic 2: First significant line
            for line in text.split('\n'):
                line = line.strip()
                if len(line) > 10:
                    return line
        return None

    def extract_book_title_from_documents(self, documents, max_docs=5):
        """Guess the title from already-loaded documents.

        Args:
            documents: Sequence of objects exposing ``page_content`` (str).
            max_docs: Number of leading documents to inspect.

        Returns:
            The detected title, or ``"Unknown Title"`` if nothing matches.
        """
        for doc in documents[:max_docs]:
            text = doc.page_content.strip()
            if not text:
                continue

            # Heuristic 1: Lines with ALL CAPS (title pages often use this)
            title = self._find_caps_title(text)
            if title:
                return title

            # Heuristic 2: First non-empty, title-cased line
            for line in text.split("\n"):
                line = line.strip()
                if len(line) > 10 and line.istitle():
                    return line
        return "Unknown Title"

    def _extract_with_llm(self, pdf_path, max_pages=1):
        """Ask the configured LLM for the title.

        The prompt is built from the first 1000 characters of the first
        non-empty page among the first ``max_pages`` pages.

        Returns:
            The LLM's answer stripped of surrounding whitespace, or
            ``"Unknown Title"`` when there is no text to send or the answer
            is blank.
        """
        sample_text = ""
        for page in self._load_pages(pdf_path, max_pages):
            sample_text = page.page_content.strip()[:1000]
            if sample_text:
                break
        if not sample_text:
            # Nothing to show the model; prompting with empty text would only
            # invite a made-up title.
            return "Unknown Title"

        prompt = (
            "Identify the book title from the following text:\n\n"
            f"{sample_text}\n\nOnly return the book title."
        )
        response = self.llm.invoke(prompt)
        # invoke() may return a plain string or a message-like object that
        # carries its text in ``.content``; accept both.
        content = getattr(response, "content", response)
        title = str(content).strip()
        return title or "Unknown Title"
|