Update agent.py
agent.py
CHANGED
@@ -13,9 +13,11 @@ from langchain_community.document_loaders import ArxivLoader
 from langchain_core.messages import SystemMessage, HumanMessage
 from langchain_core.tools import tool
 from langchain.tools.retriever import create_retriever_tool
-from langchain_community.vectorstores import Chroma
-from langchain_core.documents import Document
-import shutil
+from langchain_community.vectorstores import Chroma
+from langchain_core.documents import Document
+import shutil
+import pandas as pd  # new import for pandas
+import json  # for parsing the metadata column

 load_dotenv()

@@ -122,13 +124,14 @@ sys_msg = SystemMessage(content=system_prompt)
 # --- Start ChromaDB Setup ---
 # Define the directory for ChromaDB persistence
 CHROMA_DB_DIR = "./chroma_db"
+CSV_FILE_PATH = "./supabase.docs.csv"  # Path to your CSV file

 # Build embeddings (this remains the same)
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")  # dim=768

 # Initialize ChromaDB
-# If the directory exists, load the existing vector store.
-# Otherwise, create a new one and add
+# If the directory exists and contains data, load the existing vector store.
+# Otherwise, create a new one and add documents from the CSV file.
 if os.path.exists(CHROMA_DB_DIR) and os.listdir(CHROMA_DB_DIR):
     print(f"Loading existing ChromaDB from {CHROMA_DB_DIR}")
     vector_store = Chroma(
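Reviewer note, not part of the diff: the loading code in the next hunk assumes the CSV exposes a "content" column (question text followed by a "Final answer :" suffix) and a "metadata" column holding a stringified dict. A quick sanity check along these lines can catch a malformed export before the vector store is built; the file name comes from the commit, the rest is an illustrative sketch.

# Illustrative sketch only: verify the CSV has the columns the loader expects.
import pandas as pd

df = pd.read_csv("./supabase.docs.csv")
missing = {"content", "metadata"} - set(df.columns)
if missing:
    raise ValueError(f"CSV is missing expected columns: {missing}")
print(df[["content", "metadata"]].head())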
@@ -136,40 +139,63 @@ if os.path.exists(CHROMA_DB_DIR) and os.listdir(CHROMA_DB_DIR):
         embedding_function=embeddings
     )
 else:
-    print(f"Creating new ChromaDB at {CHROMA_DB_DIR} and
+    print(f"Creating new ChromaDB at {CHROMA_DB_DIR} and loading documents from {CSV_FILE_PATH}.")
     # Ensure the directory is clean before creating new
     if os.path.exists(CHROMA_DB_DIR):
         shutil.rmtree(CHROMA_DB_DIR)
     os.makedirs(CHROMA_DB_DIR)

-    #
-        Document(page_content="What is the capital of France?", metadata={"source": "internal", "answer": "Paris"}),
-        Document(page_content="Who wrote Hamlet?", metadata={"source": "internal", "answer": "William Shakespeare"}),
-        Document(page_content="What is the highest mountain in the world?", metadata={"source": "internal", "answer": "Mount Everest"}),
-        Document(page_content="When was the internet invented?", metadata={"source": "internal", "answer": "The internet, as we know it, evolved from ARPANET in the late 1960s and early 1970s. The TCP/IP protocol, which forms the basis of the internet, was standardized in 1978."}),
-        Document(page_content="What is the square root of 64?", metadata={"source": "internal", "answer": "8"}),
-        Document(page_content="Who is the current president of the United States?", metadata={"source": "internal", "answer": "Joe Biden"}),
-        Document(page_content="What is the chemical symbol for water?", metadata={"source": "internal", "answer": "H2O"}),
-        Document(page_content="What is the largest ocean on Earth?", metadata={"source": "internal", "answer": "Pacific Ocean"}),
-        Document(page_content="What is the speed of light?", metadata={"source": "internal", "answer": "Approximately 299,792,458 meters per second in a vacuum."}),
-        Document(page_content="What is the capital of Sweden?", metadata={"source": "internal", "answer": "Stockholm"}),
-    ]
+    # Load data from the CSV file
+    if not os.path.exists(CSV_FILE_PATH):
+        raise FileNotFoundError(f"CSV file not found at {CSV_FILE_PATH}. Please ensure it's in the root directory.")

+    df = pd.read_csv(CSV_FILE_PATH)
+    documents = []
+    for index, row in df.iterrows():
+        content = row["content"]
+
+        # Extract the question part from the content
+        # Assuming the question is everything before "Final answer :"
+        question_part = content.split("Final answer :")[0].strip()
+
+        # Extract the final answer part from the content
+        final_answer_part = content.split("Final answer :")[-1].strip() if "Final answer :" in content else ""
+
+        # Parse the metadata string into a dictionary
+        # The metadata column might be stored as a string representation of a dictionary
+        try:
+            metadata = json.loads(row["metadata"].replace("'", "\""))  # Replace single quotes for valid JSON
+        except json.JSONDecodeError:
+            metadata = {}  # Fallback if parsing fails
+
+        # Add the extracted final answer to the metadata for easy retrieval
+        metadata["final_answer"] = final_answer_part
+
+        # Create a Document object. The page_content should be the question for similarity search.
+        # The answer will be in metadata.
+        documents.append(Document(page_content=question_part, metadata=metadata))
+
+    if not documents:
+        print("No documents loaded from CSV. ChromaDB will be empty.")
+        # Create an empty ChromaDB if no documents are found
+        vector_store = Chroma(
+            persist_directory=CHROMA_DB_DIR,
+            embedding_function=embeddings
+        )
+    else:
+        vector_store = Chroma.from_documents(
+            documents=documents,
+            embedding=embeddings,
+            persist_directory=CHROMA_DB_DIR
+        )
+    vector_store.persist()  # Save the new vector store to disk
+    print(f"ChromaDB initialized and persisted with {len(documents)} documents from CSV.")

 # Create retriever tool using the Chroma vector store
-retriever_tool = create_retriever_tool(
+retriever_tool = create_retriever_tool(
     retriever=vector_store.as_retriever(),
-    name="Question_Search",
-    description="A tool to retrieve similar questions from a vector store and their answers.",
+    name="Question_Search",
+    description="A tool to retrieve similar questions from a vector store. The retrieved document's metadata contains the 'final_answer' to the question.",
 )

 # Add the new retriever tool to your list of tools
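Two reviewer notes on this hunk, not part of the diff. First, row["metadata"].replace("'", "\"") breaks as soon as a value itself contains an apostrophe; if the column holds Python-repr dicts, ast.literal_eval is a safer first attempt, with strict JSON as a fallback. A minimal sketch (the helper name is hypothetical):

import ast
import json

def parse_metadata_cell(raw: str) -> dict:
    # Try Python-literal syntax first (handles single quotes), then strict JSON.
    for parser in (ast.literal_eval, json.loads):
        try:
            value = parser(raw)
            if isinstance(value, dict):
                return value
        except (ValueError, SyntaxError):
            continue
    return {}  # fall back to empty metadata if nothing parses

Second, depending on the installed Chroma and LangChain versions, the explicit vector_store.persist() call may be redundant or emit a deprecation warning, since newer Chroma clients persist automatically when persist_directory is set.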
@@ -182,21 +208,17 @@ tools = [
     wiki_search,
     web_search,
     arvix_search,
-    retriever_tool,
+    retriever_tool,
 ]

 # Build graph function
 def build_graph(provider: str = "google"):
     """Build the graph"""
-    # Load environment variables from .env file
     if provider == "google":
-        # Google Gemini
         llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0)
     elif provider == "groq":
-
-        llm = ChatGroq(model="qwen-qwq-32b", temperature=0)  # optional : qwen-qwq-32b gemma2-9b-it
+        llm = ChatGroq(model="qwen-qwq-32b", temperature=0)
     elif provider == "huggingface":
-        # TODO: Add huggingface endpoint
         llm = ChatHuggingFace(
             llm=HuggingFaceEndpoint(
                 url="https://api-inference.huggingface.co/models/Meta-DeepLearning/llama-2-7b-chat-hf",
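Orientation note, not part of the diff: with retriever_tool in tools, the LLM can call Question_Search itself. As typically configured, a tool built by create_retriever_tool returns the retrieved documents' page_content joined into a single string, so here it would surface the stored question text rather than the final_answer kept in metadata. A hedged sketch of the difference, assuming the store was built as above:

# Illustrative only; assumes vector_store and retriever_tool from the code above.
question = "some question that exists in the CSV"

tool_output = retriever_tool.invoke({"query": question})
print(tool_output)  # expected: page_content of the closest match, i.e. the question text

doc = vector_store.similarity_search(question, k=1)[0]
print(doc.metadata.get("final_answer", "<no final_answer in metadata>"))  # the stored answer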
@@ -205,10 +227,9 @@ def build_graph(provider: str = "google"):
         )
     else:
         raise ValueError("Invalid provider. Choose 'google', 'groq' or 'huggingface'.")
-
+
     llm_with_tools = llm.bind_tools(tools)

-    # Node
     def assistant(state: MessagesState):
         """Assistant node"""
         return {"messages": [llm_with_tools.invoke(state["messages"])]}
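Usage note, not from the diff: each provider branch expects credentials in the environment, which load_dotenv() reads from a .env file. The variable names below are the ones the respective LangChain integrations conventionally read; treat them as assumptions to verify against the installed versions.

# Hypothetical .env entries (names assumed from the LangChain integrations):
#   GOOGLE_API_KEY=...            used by ChatGoogleGenerativeAI (provider="google")
#   GROQ_API_KEY=...              used by ChatGroq (provider="groq")
#   HUGGINGFACEHUB_API_TOKEN=...  used by HuggingFaceEndpoint (provider="huggingface")

graph = build_graph(provider="groq")  # any of "google", "groq", "huggingface"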
@@ -217,86 +238,29 @@ def build_graph(provider: str = "google"):

     def retriever(state: MessagesState):
         query = state["messages"][-1].content
-        # Use the
-        similar_docs =
+        # Use the vector_store directly for similarity search to get the full Document object
+        similar_docs = vector_store.similarity_search(query, k=1)

-        # The tool returns a list of Documents, so we need to process it
-        # Assuming the tool returns a list of documents, we take the first one
         if similar_docs:
-            #
-            # Given the original `retriever` node, it expected `similar_question[0].page_content`.
-            # If `retriever_tool.invoke(query)` returns a list of Document objects,
-            # then `similar_docs[0].page_content` is correct.
-            # If it returns a string, we need to adapt.
-            # For now, let's assume it returns a list of Documents or a string that contains the answer.
-
-            # If retriever_tool returns a string directly (as per your tool definition):
-            # content = similar_docs # This would be the string output from the tool
-
-            # If retriever_tool returns a list of Document objects from its internal retriever:
-            # Let's assume the `retriever_tool` internally uses `vector_store.as_retriever().invoke(query)`
-            # which returns a list of `Document` objects.
-            # The `create_retriever_tool` wraps this, so `retriever_tool.invoke` will return a string
-            # that is the `page_content` of the retrieved documents.
-
-            # The original `retriever` node was using `vector_store.similarity_search` directly.
-            # Now `retriever_tool` is a LangChain tool.
-            # When `retriever_tool.invoke(query)` is called, it will return the formatted string
-            # from the `create_retriever_tool` definition.
-            # So, `similar_docs` will be a string.
-
-            # We need to parse the `similar_docs` string to extract the answer.
-            # The `Question_Search` tool description is "A tool to retrieve similar questions from a vector store and their answers."
-            # The `create_retriever_tool` automatically formats the output of the retriever.
-            # Let's assume the output string from `retriever_tool.invoke(query)` will look something like:
-            # "content='What is the capital of Sweden?' metadata={'source': 'internal', 'answer': 'Stockholm'}"
-            # We need to extract the 'answer' part.
-
-            # A more robust way would be to make the retriever node *call* the tool,
-            # and then the LLM decides if it wants to use the tool.
-            # However, your current graph structure has a dedicated "retriever" node
-            # that directly fetches and returns an AIMessage.
-
-            # Let's refine the retriever node to parse the output of the tool more robustly.
-            # The `create_retriever_tool` returns a string where documents are joined.
-            # We need to extract the content that would be the "answer".
-
-            # The dummy documents have `metadata={"source": "internal", "answer": "..."}`.
-            # The `create_retriever_tool` will return `doc.page_content` by default.
-            # So, `similar_docs` will contain the question itself.
-            # We need to ensure the retriever provides the *answer* not just the question.
-
-            # Let's adjust the `retriever` node to directly access the `vector_store`
-            # for `similarity_search` and then extract the answer from metadata,
-            # similar to your original implementation. This bypasses the tool wrapper
-            # for this specific node, ensuring we get the full Document object.
-
-            similar_doc = vector_store.similarity_search(query, k=1)[0]
-
-            # Check if an 'answer' is directly available in metadata
-            if "answer" in similar_doc.metadata:
-                answer = similar_doc.metadata["answer"]
+            similar_doc = similar_docs[0]
+            # Prioritize 'final_answer' from metadata, then check page_content
+            if "final_answer" in similar_doc.metadata and similar_doc.metadata["final_answer"]:
+                answer = similar_doc.metadata["final_answer"]
             elif "Final answer :" in similar_doc.page_content:
                 answer = similar_doc.page_content.split("Final answer :")[-1].strip()
             else:
                 answer = similar_doc.page_content.strip()  # Fallback to page_content if no explicit answer

+            # The system prompt expects "FINAL ANSWER: [ANSWER]".
+            # We should return the extracted answer directly, as the prompt handles the formatting.
             return {"messages": [AIMessage(content=answer)]}
         else:
-            # If no similar documents found, return an empty AIMessage or a message indicating no answer
             return {"messages": [AIMessage(content="No similar questions found in the knowledge base.")]}

-
     builder = StateGraph(MessagesState)
     builder.add_node("retriever", retriever)
-
-    # The retriever is both the entry point and the finish point
     builder.set_entry_point("retriever")
     builder.set_finish_point("retriever")

-    # Compile graph
     return builder.compile()

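Closing note, not part of the diff: because both set_entry_point and set_finish_point target "retriever", the compiled graph runs only the retriever node; the assistant node and the bound tools are defined but never reached on this path. A minimal sketch of exercising the graph, assuming the module has been imported and the ChromaDB setup above has already run:

# Illustrative usage sketch.
from langchain_core.messages import HumanMessage

graph = build_graph(provider="google")
result = graph.invoke({"messages": [HumanMessage(content="A question from the dataset")]})
print(result["messages"][-1].content)  # answer returned by the retriever node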