import re
from itertools import islice

try:
    from langchain_community.document_loaders import PyPDFLoader
except ImportError:
    # Only the PDF-path methods need the loader; the document-based
    # heuristics below work without langchain_community installed.
    PyPDFLoader = None


class BookTitleExtractor:
    """Guess a book's title from the first pages of a PDF or loaded documents.

    Cheap text heuristics are tried first. When they find nothing and an
    ``llm`` object exposing ``invoke(prompt)`` was supplied, the model is
    asked to identify the title from a sample of the text.
    """

    # Returned whenever no title can be determined.
    UNKNOWN_TITLE = "Unknown Title"

    # A run of upper-case letters, whitespace and light punctuation that
    # starts and ends on line boundaries. ``\s`` also matches newlines, so a
    # title broken over several lines is captured as a single match.
    _ALL_CAPS_TITLE = re.compile(r'^[A-Z][A-Z\s\-:,]{5,}$', re.MULTILINE)

    # A line must be strictly longer than this to count as a candidate.
    _MIN_LINE_LENGTH = 10

    # Maximum number of characters of page text sent to the LLM.
    _LLM_SAMPLE_CHARS = 1000

    def __init__(self, llm=None):
        """Store the optional LLM used as a fallback by ``extract_title``."""
        self.llm = llm

    def extract_title(self, pdf_path: str, max_pages: int = 5) -> str:
        """Return the title of the PDF at ``pdf_path``.

        Args:
            pdf_path: Path of the PDF file to inspect.
            max_pages: Number of leading pages to examine.

        Returns:
            The heuristic title if one is found, otherwise the LLM's answer
            when an LLM was configured, otherwise ``"Unknown Title"``.

        Raises:
            ImportError: If ``langchain_community`` is not installed.
        """
        title = self._extract_with_heuristics(pdf_path, max_pages)
        if title:
            return title
        if self.llm is not None:
            return self._extract_with_llm(pdf_path, max_pages)
        return self.UNKNOWN_TITLE

    @staticmethod
    def _load_pages(pdf_path, max_pages):
        """Load at most ``max_pages`` leading pages of the PDF as documents."""
        if PyPDFLoader is None:
            raise ImportError(
                "langchain_community is required to read PDF files; install "
                "it or call extract_book_title_from_documents() instead."
            )
        loader = PyPDFLoader(pdf_path)
        if max_pages is not None and max_pages < 0:
            # islice rejects negative bounds; keep list-slice semantics here.
            return loader.load()[:max_pages]
        # Stream pages so that a whole book is not parsed just to read a few.
        return list(islice(loader.lazy_load(), max_pages))

    @classmethod
    def _find_all_caps_title(cls, text):
        """Return the first ALL-CAPS title in ``text`` on one line, or None."""
        match = cls._ALL_CAPS_TITLE.search(text)
        if match is None:
            return None
        # The match may span several lines; collapse every whitespace run so
        # the returned title never contains embedded newlines.
        return " ".join(match.group().split())

    @classmethod
    def _clean_llm_response(cls, response):
        """Convert an LLM reply (plain string or message object) to a title."""
        # Chat models return message objects carrying the text in ``content``;
        # completion-style models return the string directly.
        content = getattr(response, "content", response)
        if content is None:
            return cls.UNKNOWN_TITLE
        if not isinstance(content, str):
            content = str(content)
        return content.strip() or cls.UNKNOWN_TITLE

    def _extract_with_heuristics(self, pdf_path, max_pages):
        """Apply the text heuristics to the first ``max_pages`` PDF pages.

        Returns:
            The first ALL-CAPS title found on a non-empty page, else that
            page's first line longer than 10 characters, else ``None`` when
            no examined page yields a candidate.
        """
        for page in self._load_pages(pdf_path, max_pages):
            text = (page.page_content or "").strip()
            if not text:
                continue

            # Heuristic 1: ALL CAPS title
            title = self._find_all_caps_title(text)
            if title:
                return title

            # Heuristic 2: First significant line
            for line in text.split('\n'):
                line = line.strip()
                if len(line) > self._MIN_LINE_LENGTH:
                    return line
        return None

    def extract_book_title_from_documents(self, documents, max_docs=5):
        """Guess the title from already-loaded documents.

        Args:
            documents: Sliceable sequence of objects with a ``page_content``
                string attribute.
            max_docs: Number of leading documents to examine.

        Returns:
            The first ALL-CAPS title found, else the first title-cased line
            longer than 10 characters, else ``"Unknown Title"``.
        """
        for doc in documents[:max_docs]:
            text = (doc.page_content or "").strip()
            if not text:
                continue

            # Heuristic 1: Lines with ALL CAPS (title pages often use this)
            title = self._find_all_caps_title(text)
            if title:
                return title

            # Heuristic 2: First sufficiently long, title-cased line
            for line in text.split("\n"):
                line = line.strip()
                if len(line) > self._MIN_LINE_LENGTH and line.istitle():
                    return line
        return self.UNKNOWN_TITLE

    def _extract_with_llm(self, pdf_path, max_pages=5):
        """Ask the configured LLM for the title of the PDF.

        The prompt uses up to 1000 characters from the first non-empty page
        among the first ``max_pages`` pages.

        Returns:
            The stripped model answer, or ``"Unknown Title"`` when no page
            text is available or the model returns nothing.
        """
        sample_text = ""
        for page in self._load_pages(pdf_path, max_pages):
            sample_text = (page.page_content or "").strip()
            sample_text = sample_text[:self._LLM_SAMPLE_CHARS]
            if sample_text:
                break
        if not sample_text:
            # Nothing to show the model; asking would only invite a guess.
            return self.UNKNOWN_TITLE

        prompt = (
            "Identify the book title from the following text:\n\n"
            f"{sample_text}\n\nOnly return the book title."
        )
        return self._clean_llm_response(self.llm.invoke(prompt))