File size: 2,055 Bytes
44870e3
 
 
971f0dc
44870e3
 
 
 
 
 
 
 
 
9772c46
44870e3
 
 
 
 
 
 
 
9772c46
 
44870e3
 
 
 
 
971f0dc
44870e3
971f0dc
44870e3
971f0dc
44870e3
971f0dc
 
 
44870e3
 
971f0dc
44870e3
 
971f0dc
 
 
 
 
 
 
 
44870e3
971f0dc
 
 
44870e3
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from google import genai
from google.genai import types
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from dotenv import load_dotenv
# Load environment variables from a local .env file; the Gemini key is
# expected under the (lowercase) name "api_key".
load_dotenv()

# Module-level Gemini API client shared by all RAG instances.
# NOTE(review): os.getenv returns None if "api_key" is unset — verify the
# client surfaces a clear error in that case.
client = genai.Client(api_key=os.getenv("api_key"))

class RAG:
    """Split text into overlapping chunks and embed them via the Gemini API.

    Chunking is done with LangChain's RecursiveCharacterTextSplitter; chunks
    are embedded in batches (the API accepts at most MAX_BATCH_SIZE contents
    per call) concurrently with a thread pool.
    """

    def __init__(self):
        # Chunking parameters (characters, not tokens).
        self.CHUNK_SIZE = 1024
        self.CHUNK_OVERLAP = 75
        # Maximum number of texts per embed_content request.
        self.MAX_BATCH_SIZE = 100
        self.MODEL = "text-embedding-004"
        self.TASK_TYPE = "SEMANTIC_SIMILARITY"

    def split_text(self, text):
        """Split *text* into overlapping chunks.

        Returns a list of chunk strings (possibly empty for empty input).
        Raises ValueError if the splitter fails for any reason.
        """
        try:
            return RecursiveCharacterTextSplitter(
                chunk_size=self.CHUNK_SIZE,
                chunk_overlap=self.CHUNK_OVERLAP,
                separators=["\n\n", "\n", ".", "!", "?", "。", " ", ""],
            ).split_text(text)
        except Exception as e:
            # Chain the original exception so the root cause stays visible.
            raise ValueError(f"an error occurred: {e}") from e

    def generate_embedding(self, text, task_type=None):
        """Chunk *text* and embed every chunk.

        Returns a ``({"embeddings": [...], "chunks": [...]}, 200)`` tuple on
        success — embeddings[i] corresponds to chunks[i] — or
        ``({"an error occurred": msg}, 500)`` on failure.
        """
        try:
            if not task_type:
                task_type = self.TASK_TYPE

            chunks = self.split_text(text)
            batches = [
                chunks[i:i + self.MAX_BATCH_SIZE]
                for i in range(0, len(chunks), self.MAX_BATCH_SIZE)
            ]

            def embed_batch(batch):
                response = client.models.embed_content(
                    model=self.MODEL,
                    contents=batch,
                    config=types.EmbedContentConfig(task_type=task_type),
                )
                return [embedding.values for embedding in response.embeddings]

            # BUG FIX: the previous implementation extended `embeddings` in
            # *completion* order via as_completed(), so with multiple batches
            # the embeddings could be misaligned with `chunks`. Collect results
            # by batch index instead, then flatten in the original order.
            batch_results = [None] * len(batches)
            # Size the pool to the actual work (pool must have >= 1 worker).
            with ThreadPoolExecutor(max_workers=max(1, min(100, len(batches)))) as executor:
                future_to_index = {
                    executor.submit(embed_batch, batch): i
                    for i, batch in enumerate(batches)
                }
                for future in as_completed(future_to_index):
                    batch_results[future_to_index[future]] = future.result()

            embeddings = [vec for batch in batch_results for vec in batch]
            return {"embeddings": embeddings, "chunks": chunks}, 200

        except Exception as e:
            return {"an error occurred": str(e)}, 500
    
        

rag = RAG()