File size: 2,214 Bytes
7a837d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import re
from itertools import islice

from langchain_community.document_loaders import PyPDFLoader

class BookTitleExtractor:
    """Guess a book's title from a PDF file or from already-loaded pages.

    Cheap text heuristics are tried first (an ALL-CAPS heading, then the
    first reasonably long line). When they find nothing and an ``llm`` was
    supplied, the model is asked to identify the title from a text sample.

    Args:
        llm: Optional object exposing ``invoke(prompt)``. The result may be a
            plain string or a message-like object with a ``content``
            attribute. When omitted, only the heuristics are used.
    """

    # Sentinel returned by the public methods when no title can be found.
    _UNKNOWN_TITLE = "Unknown Title"

    # A run starting with a capital letter and made only of capitals,
    # whitespace and light punctuation. ``\s`` also matches newlines, so a
    # title wrapped over several lines is captured as a single match.
    _CAPS_TITLE_RE = re.compile(r'^[A-Z][A-Z\s\-:,]{5,}$', re.MULTILINE)

    # Minimum length of an ALL-CAPS candidate once its whitespace has been
    # collapsed; mirrors the six-character minimum encoded in the pattern so
    # that blank-line padding cannot make a stray capital look like a title.
    _MIN_CAPS_TITLE_LEN = 6

    # Number of characters of page text handed to the LLM.
    _LLM_SAMPLE_CHARS = 1000

    def __init__(self, llm=None):
        self.llm = llm

    def extract_title(self, pdf_path, max_pages=5):
        """Return the title of the PDF at ``pdf_path``.

        Args:
            pdf_path: Path of the PDF file to inspect.
            max_pages: Number of leading pages to examine.

        Returns:
            The heuristic title if one is found, otherwise the LLM's answer
            when an LLM is configured, otherwise ``"Unknown Title"``.
        """
        title = self._extract_with_heuristics(pdf_path, max_pages)
        if title:
            return title
        if self.llm:
            return self._extract_with_llm(pdf_path, max_pages)
        return self._UNKNOWN_TITLE

    def _extract_with_heuristics(self, pdf_path, max_pages):
        """Apply the text heuristics to the first ``max_pages`` pages.

        Returns:
            The first ALL-CAPS heading found, else the first line longer than
            ten characters, taken from the first page that yields either;
            ``None`` when no page yields a candidate.
        """
        for text in self._page_texts(self._load_pages(pdf_path, max_pages)):
            # Heuristic 1: ALL CAPS title
            title = self._find_caps_title(text)
            if title:
                return title
            # Heuristic 2: First significant line
            for line in text.split('\n'):
                line = line.strip()
                if len(line) > 10:
                    return line
        return None

    def extract_book_title_from_documents(self, documents, max_docs=5):
        """Return a title guessed from already-loaded page documents.

        Args:
            documents: Sequence of objects with a ``page_content`` attribute.
            max_docs: Number of leading documents to examine.

        Returns:
            The first ALL-CAPS heading found, else the first title-cased line
            longer than ten characters; ``"Unknown Title"`` when no examined
            document yields a candidate.
        """
        for text in self._page_texts(documents[:max_docs]):
            # Heuristic 1: Lines with ALL CAPS (title pages often use this)
            title = self._find_caps_title(text)
            if title:
                return title

            # Heuristic 2: First non-empty, title-cased line
            for line in text.split("\n"):
                line = line.strip()
                if len(line) > 10 and line.istitle():
                    return line
        return self._UNKNOWN_TITLE

    def _extract_with_llm(self, pdf_path, max_pages=5):
        """Ask the configured LLM for the title of the PDF at ``pdf_path``.

        The prompt sample is the text of the first ``max_pages`` pages,
        blank pages skipped, truncated to the first 1000 characters.

        Returns:
            The model's answer, or ``"Unknown Title"`` when the sampled pages
            contain no text or the model answers with nothing.
        """
        pages = self._load_pages(pdf_path, max_pages)
        sample_text = "\n\n".join(self._page_texts(pages))
        sample_text = sample_text[:self._LLM_SAMPLE_CHARS]
        if not sample_text:
            # Nothing to show the model; asking anyway would only invite a
            # made-up answer.
            return self._UNKNOWN_TITLE
        return self._ask_llm_for_title(sample_text)

    def _ask_llm_for_title(self, sample_text):
        """Send ``sample_text`` to the LLM and return its cleaned-up answer."""
        prompt = (
            "Identify the book title from the following text:\n\n"
            f"{sample_text}\n\nOnly return the book title."
        )
        title = self._response_text(self.llm.invoke(prompt))
        return title or self._UNKNOWN_TITLE

    @staticmethod
    def _response_text(response):
        """Normalise an ``invoke`` result to a stripped string.

        Accepts a plain string, or a message-like object whose ``content``
        attribute is a string or a list of strings / ``{"text": ...}`` parts.
        """
        content = getattr(response, "content", response)
        if content is None:
            return ""
        if isinstance(content, (list, tuple)):
            parts = []
            for part in content:
                if isinstance(part, dict):
                    part = part.get("text", "")
                parts.append(str(part))
            content = "".join(parts)
        return str(content).strip()

    @staticmethod
    def _load_pages(pdf_path, max_pages):
        """Load at most ``max_pages`` leading pages of the PDF.

        Pages are read lazily so the rest of the file is not parsed.
        ``None`` loads every page; a negative count loads none.
        """
        limit = None if max_pages is None else max(max_pages, 0)
        page_iter = PyPDFLoader(pdf_path).lazy_load()
        try:
            return list(islice(page_iter, limit))
        finally:
            # Stopping early leaves the underlying generator suspended;
            # close it explicitly so it can release what it holds.
            close = getattr(page_iter, "close", None)
            if close is not None:
                close()

    @staticmethod
    def _page_texts(documents):
        """Yield the stripped text of each document, skipping blank ones."""
        for doc in documents:
            text = (doc.page_content or "").strip()
            if text:
                yield text

    @classmethod
    def _find_caps_title(cls, text):
        """Return the first ALL-CAPS heading in ``text``, or ``None``.

        Whitespace inside a match (including line breaks of a wrapped title)
        is collapsed to single spaces, and candidates that are too short once
        collapsed are skipped.
        """
        for candidate in cls._CAPS_TITLE_RE.findall(text):
            title = " ".join(candidate.split())
            if len(title) >= cls._MIN_CAPS_TITLE_LEN:
                return title
        return None