File size: 1,243 Bytes
6a9c9f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import os
from langchain.document_loaders import (
    TextLoader,
    PyPDFLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredFileLoader
)



def load_and_split_resume(file_path: str):
    """
    Read a resume file from disk and return it as LangChain documents.

    The loader is chosen from the lower-cased file extension:
    ``.txt`` -> ``TextLoader`` (UTF-8), ``.pdf`` -> ``PyPDFLoader``,
    ``.docx``/``.doc`` -> ``UnstructuredWordDocumentLoader``, and any other
    extension -> ``UnstructuredFileLoader``.

    Note: despite the function name, no chunk splitting is performed here;
    the output of the loader's ``load()`` call is returned unchanged.

    Args:
        file_path (str): Path to the resume file (.txt, .pdf, .docx, etc.)

    Returns:
        The documents produced by the selected loader's ``load()`` method.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Each branch builds the matching loader and loads immediately.
    if suffix == ".txt":
        return TextLoader(file_path, encoding="utf-8").load()
    if suffix == ".pdf":
        return PyPDFLoader(file_path).load()
    if suffix in (".docx", ".doc"):
        return UnstructuredWordDocumentLoader(file_path).load()

    # Fallback for other common formats
    return UnstructuredFileLoader(file_path).load()