Final_Assignment_Project

Sleeping

App Files Files Community

wt002 committed on May 14

Commit

9ac015d

verified ·

1 Parent(s): dae11a5

Update agent.py

Browse files

Files changed (1) hide show

agent.py +72 -15

agent.py CHANGED Viewed

@@ -374,52 +374,107 @@ async def start_questions(request: Request):
 # -----------------------------
 # 1. Define Custom BERT Embedding Model
 # -----------------------------
 class BERTEmbeddings(Embeddings):
-    def __init__(self, model_name='bert-base-uncased'):
         self.tokenizer = BertTokenizer.from_pretrained(model_name)
         self.model = BertModel.from_pretrained(model_name)
         self.model.eval()  # Set model to eval mode
     def embed_documents(self, texts):
-        inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
         with torch.no_grad():
             outputs = self.model(**inputs)
         embeddings = outputs.last_hidden_state.mean(dim=1)
-        embeddings = F.normalize(embeddings, p=2, dim=1)  # Normalize for cosine similarity
         return embeddings.cpu().numpy()
     def embed_query(self, text):
         return self.embed_documents([text])[0]
 # -----------------------------
 # 2. Initialize Embedding Model
 # -----------------------------
-embedding_model = BERTEmbeddings()
 # -----------------------------
-# 3. Prepare Documents
 # -----------------------------
-docs = [
-    Document(page_content="Mercedes Sosa released many albums between 2000 and 2009.", metadata={"id": 1}),
-    Document(page_content="She was a prominent Argentine folk singer.", metadata={"id": 2}),
-    Document(page_content="Her album 'Al Despertar' was released in 1998.", metadata={"id": 3}),
-    Document(page_content="She continued releasing music well into the 2000s.", metadata={"id": 4}),
-]
 # -----------------------------
-# 4. Create FAISS Vector Store
 # -----------------------------
 vector_store = FAISS.from_documents(docs, embedding_model)
-vector_store.save_local("faiss_index")
 # -----------------------------
 # 6. Create LangChain Retriever Tool
 # -----------------------------
-retriever = vector_store.as_retriever()
 question_retriever_tool = create_retriever_tool(
     retriever=retriever,
@@ -1052,6 +1107,8 @@ def process_all_tasks(tasks: list):
 ## Langgraph
 # Build graph function
 provider = "huggingface"
 model_config = {

 # -----------------------------
 # 1. Define Custom BERT Embedding Model
 # -----------------------------
+import torch
+import torch.nn.functional as F
+from transformers import BertTokenizer, BertModel
+from langchain.embeddings import Embeddings
 class BERTEmbeddings(Embeddings):
+    """LangChain ``Embeddings`` implementation backed by a BERT encoder.
+
+    A text's embedding is the attention-mask-weighted mean of the final
+    hidden states, L2-normalized so that inner product equals cosine
+    similarity.
+    """
+
+    def __init__(self, model_name='bert-base-uncased', device='cpu'):
+        """Load the tokenizer and model and move the model to ``device``.
+
+        Args:
+            model_name: Hugging Face model id or local path.
+            device: Torch device string, e.g. ``'cpu'`` or ``'cuda'``.
+        """
         self.tokenizer = BertTokenizer.from_pretrained(model_name)
         self.model = BertModel.from_pretrained(model_name)
         self.model.eval()  # Set model to eval mode
+        self.device = device
+        self.model.to(self.device)  # Move model to the specified device (CPU or GPU)
+
     def embed_documents(self, texts):
+        """Embed a batch of texts.
+
+        Args:
+            texts: Iterable of strings to embed.
+
+        Returns:
+            A 2-D numpy array with one L2-normalized row per input text.
+        """
+        texts = list(texts)
+        if not texts:
+            # Nothing to tokenize; return an empty (0, hidden_size) array.
+            return torch.empty((0, self.model.config.hidden_size)).numpy()
+        # Tokenize the input texts (BERT accepts at most 512 tokens)
+        inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
+        inputs = {key: value.to(self.device) for key, value in inputs.items()}  # Move inputs to the specified device
         with torch.no_grad():
             outputs = self.model(**inputs)
+        # Average over real tokens only. A plain mean over dim=1 also averages
+        # the padding positions, so a text's vector would change with the
+        # length of the other texts in its batch and disagree with the vector
+        # embed_query() produces for the same text.
+        hidden = outputs.last_hidden_state
+        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
+        embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
+        # Normalize embeddings for cosine similarity
+        embeddings = F.normalize(embeddings, p=2, dim=1)
+        # Return the embeddings as numpy array
         return embeddings.cpu().numpy()
+
     def embed_query(self, text):
+        """Embed a single query string and return its 1-D vector."""
         return self.embed_documents([text])[0]
 # -----------------------------
 # 2. Initialize Embedding Model
 # -----------------------------
 # -----------------------------
+# Create FAISS Vector Store
 # -----------------------------
+class MyVectorStore:
+    """Thin wrapper around a raw FAISS index with save/load helpers."""
+
+    def __init__(self, index: faiss.Index):
+        """Wrap an existing FAISS index."""
+        self.index = index
+
+    def save_local(self, path: str):
+        """Write the wrapped FAISS index to the file at ``path``."""
+        # Honor the caller's path: the index was previously always written to
+        # a hard-coded directory, so the message below reported a file that
+        # was never created.
+        faiss.write_index(self.index, path)
+        print(f"Index saved to {path}")
+
+    @classmethod
+    def load_local(cls, path: str):
+        """Read a FAISS index from the file at ``path`` and wrap it."""
+        index = faiss.read_index(path)
+        return cls(index)
 # -----------------------------
+# 3. Prepare Documents
 # -----------------------------
+# Define the URL where the JSON file is hosted
+url = "https://agents-course-unit4-scoring.hf.space/questions"
+# Download the JSON file; the timeout keeps module import from hanging
+# indefinitely if the endpoint is unreachable
+response = requests.get(url, timeout=30)
+response.raise_for_status()  # Ensure that the request was successful
+# Parse the JSON data
+docs = json.loads(response.text)
+# NOTE(review): the payload schema is not visible in this file. Prefer a
+# 'text' field and fall back to 'question' -- confirm against the endpoint.
+texts = [doc.get('text') or doc.get('question', '') for doc in docs]
+# Initialize the embedding model
+embedding_model = BERTEmbeddings()
+# Generate embeddings for each document. BERTEmbeddings exposes
+# embed_documents(), not encode().
+embeddings = embedding_model.embed_documents(texts).tolist()
+# Create the FAISS index from the vectors computed above (avoids embedding
+# every text a second time); the raw JSON items are kept as metadata
+vector_store = FAISS.from_embeddings(list(zip(texts, embeddings)), embedding_model, metadatas=docs)
+# Save the raw FAISS index held by the LangChain store
+index = vector_store.index
+vector_store = MyVectorStore(index)
+vector_store.save_local("faiss_index.index")
+# Load the FAISS index later, from the same path it was saved to
+loaded_vector_store = MyVectorStore.load_local("faiss_index.index")
+# NOTE(review): this file is a raw FAISS index. LangChain's FAISS.load_local
+# expects a directory written by FAISS.save_local -- confirm the retriever
+# setup below loads a compatible artifact.
 # -----------------------------
 # 6. Create LangChain Retriever Tool
 # -----------------------------
+retriever = FAISS.load_local("faiss_index.index", embedding_model).as_retriever()
 question_retriever_tool = create_retriever_tool(
     retriever=retriever,
 ## Langgraph
 # Build graph function
+# save_local() returns None, so do not rebind vector_store to its result
+vector_store.save_local("faiss_index")
 provider = "huggingface"
 model_config = {