from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from backend.utils import logger as logger_utils

logger = logger_utils.get_logger()

# Shared sentence-embedding model; loaded once at import time.
model = SentenceTransformer("all-MiniLM-L6-v2")


def get_text_embedding(text):
    """Encode text into a list of floats using the shared embedding model."""
    try:
        # encode() returns a torch tensor (convert_to_tensor=True); move it to
        # CPU and convert to a plain Python list for serialization.
        return model.encode(text, convert_to_tensor=True).cpu().numpy().tolist()
    except Exception as e:
        logger.error(f"Error generating embedding: {e}")
        raise


def chunk_text(text, chunk_size=500, chunk_overlap=100):
    """Split text into overlapping character chunks for downstream embedding."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_text(text)
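

# Usage sketch (illustrative only, not part of the module's public surface):
# chunk a document and embed each chunk. The sample text is hypothetical.
if __name__ == "__main__":
    sample = "Retrieval-augmented generation splits documents into chunks. " * 20
    chunks = chunk_text(sample, chunk_size=200, chunk_overlap=40)
    embeddings = [get_text_embedding(chunk) for chunk in chunks]
    # all-MiniLM-L6-v2 produces 384-dimensional embeddings.
    print(f"{len(chunks)} chunks, embedding dim = {len(embeddings[0])}")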