Spaces:

mamogasr
/

llm_engineering

Sleeping

File size: 11,185 Bytes

5fdb69e

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c25c6e94-f3de-4367-b2bf-269ba7160977",
   "metadata": {},
   "source": [
    "## An Expert Knowledge Worker Question-Answering Agent using RAG"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "15169580-cf11-4dee-8ec7-3a4ef59b19ee",
   "metadata": {},
   "source": [
    "Aims\n",
    "- Reads all Markdown (`.md`) files in the knowledge-base folder and loads them using DirectoryLoader with TextLoader\n",
    "- Splits into chunks using CharacterTextSplitter\n",
    "- Converts chunks into vector embeddings and creates a datastore\n",
    "- Visualises the embeddings in 2D and 3D using t-SNE and Plotly\n",
    "- Uses LangChain to set up a conversational retrieval chain, served through a Gradio chat interface"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "051cf881-357d-406b-8eae-1610651e40f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import os\n",
    "import glob\n",
    "from dotenv import load_dotenv\n",
    "import gradio as gr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ccfd403a-5bdb-4a8c-b3fd-d47ae79e43f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports for langchain, plotly and Chroma\n",
    "\n",
    "from langchain.document_loaders import DirectoryLoader, TextLoader\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.schema import Document\n",
    "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
    "from langchain.embeddings import HuggingFaceEmbeddings\n",
    "from langchain_chroma import Chroma\n",
    "from langchain.memory import ConversationBufferMemory\n",
    "from langchain.chains import ConversationalRetrievalChain\n",
    "import numpy as np\n",
    "from sklearn.manifold import TSNE\n",
    "import plotly.graph_objects as go\n",
    "import plotly.express as px\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d853868-d2f6-43e1-b27c-b8e91d06b724",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: name of the chat model and the directory used for the Chroma vector store\n",
    "MODEL = 'gpt-4o-mini'\n",
    "db_name = 'vector_db'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f152fc3b-0bf4-4d51-948f-95da1ebc030a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load environment variables from a file called .env\n",
    "\n",
    "load_dotenv(override=True)\n",
    "\n",
    "# os.environ only accepts strings, so assigning the result of os.getenv() directly\n",
    "# raises an opaque TypeError when the key is missing. Fail early with a clear message.\n",
    "openai_api_key = os.getenv('OPENAI_API_KEY')\n",
    "if not openai_api_key:\n",
    "    raise RuntimeError('OPENAI_API_KEY is not set - add it to your .env file or environment')\n",
    "os.environ['OPENAI_API_KEY'] = openai_api_key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24e621ac-df06-4af6-a60d-a9ed7adb884a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load every Markdown file under the knowledge base with LangChain's loaders,\n",
    "# then split the documents into overlapping chunks\n",
    "\n",
    "folder = \"my-knowledge-base/\"\n",
    "text_loader_kwargs = {'autodetect_encoding': True}\n",
    "\n",
    "loader = DirectoryLoader(\n",
    "    folder,\n",
    "    glob=\"**/*.md\",\n",
    "    loader_cls=TextLoader,\n",
    "    loader_kwargs=text_loader_kwargs,\n",
    ")\n",
    "folder_docs = loader.load()\n",
    "\n",
    "# Tag each document with its file name (extension stripped); the plots later group by this key\n",
    "for doc in folder_docs:\n",
    "    doc.metadata[\"filename\"] = os.path.splitext(os.path.basename(doc.metadata[\"source\"]))[0]\n",
    "\n",
    "documents = folder_docs\n",
    "\n",
    "text_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=200)\n",
    "chunks = text_splitter.split_documents(documents)\n",
    "\n",
    "found_files = {d.metadata['filename'] for d in documents}\n",
    "print(f\"Total number of chunks: {len(chunks)}\")\n",
    "print(f\"Files found: {found_files}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f02f08ee-5ade-4f79-a500-045a8f1a532f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Embed every chunk and store the vectors in a Chroma collection persisted under db_name\n",
    "\n",
    "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
    "\n",
    "# If the persistence directory already exists, drop its collection before rebuilding\n",
    "if os.path.exists(db_name):\n",
    "    existing_store = Chroma(persist_directory=db_name, embedding_function=embeddings)\n",
    "    existing_store.delete_collection()\n",
    "\n",
    "# Create the vector store from the chunks\n",
    "vectorstore = Chroma.from_documents(\n",
    "    documents=chunks,\n",
    "    embedding=embeddings,\n",
    "    persist_directory=db_name,\n",
    ")\n",
    "stored_count = vectorstore._collection.count()\n",
    "print(f\"Vectorstore created with {stored_count} documents\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f665f4d-ccb1-43fb-b901-040117925732",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the store: how many vectors it holds and how many dimensions each one has\n",
    "\n",
    "collection = vectorstore._collection\n",
    "count = collection.count()\n",
    "\n",
    "# Fetch a single record and measure its embedding length\n",
    "first_record = collection.get(limit=1, include=[\"embeddings\"])\n",
    "sample_embedding = first_record[\"embeddings\"][0]\n",
    "dimensions = len(sample_embedding)\n",
    "print(f\"There are {count:,} vectors with {dimensions:,} dimensions in the vector store\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6208a971-e8b7-48bc-be7a-6dcb82967fd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pre-work: pull embeddings, texts and metadata back out of the collection.\n",
    "# Note: `documents` is rebound here to the collection's 'documents' values and no longer\n",
    "# refers to the list loaded from disk earlier in the notebook.\n",
    "\n",
    "result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n",
    "metadatas = result['metadatas']\n",
    "documents = result['documents']\n",
    "vectors = np.array(result['embeddings'])\n",
    "filenames = [entry['filename'] for entry in metadatas]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb27bc8a-453b-4b19-84b4-dc495bb0e544",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "\n",
    "def random_color():\n",
    "    \"\"\"Return an ``rgb(r,g,b)`` string with each channel drawn from random.randint(0, 255).\"\"\"\n",
    "    red, green, blue = (random.randint(0, 255) for _ in range(3))\n",
    "    return f\"rgb({red},{green},{blue})\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78db67e5-ef10-4581-b8ac-3e0281ceba45",
   "metadata": {},
   "outputs": [],
   "source": [
    "def show_embeddings_2d(result, perplexity=4):\n",
    "    \"\"\"Plot the embeddings in 2D using t-SNE, with one random colour per source file.\n",
    "\n",
    "    Args:\n",
    "        result: mapping with the keys 'embeddings', 'documents' and 'metadatas'\n",
    "            (as returned by ``collection.get``); every metadata entry must have a\n",
    "            'filename' key.\n",
    "        perplexity: t-SNE perplexity. scikit-learn requires it to be smaller than the\n",
    "            number of samples, so it is capped at ``n_samples - 1``.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if fewer than two embeddings are supplied.\n",
    "    \"\"\"\n",
    "    vectors = np.array(result['embeddings'])\n",
    "    documents = result['documents']\n",
    "    filenames = [metadata['filename'] for metadata in result['metadatas']]\n",
    "\n",
    "    if len(vectors) < 2:\n",
    "        raise ValueError('Need at least two embeddings to run t-SNE')\n",
    "\n",
    "    # colour assignment: one colour per distinct file name\n",
    "    color_map = {name: random_color() for name in sorted(set(filenames))}\n",
    "    colors = [color_map[name] for name in filenames]\n",
    "\n",
    "    # Cap the perplexity so that small knowledge bases do not make TSNE raise\n",
    "    tsne = TSNE(n_components=2, random_state=42, perplexity=min(perplexity, len(vectors) - 1))\n",
    "    reduced_vectors = tsne.fit_transform(vectors)\n",
    "\n",
    "    # Create the 2D scatter plot\n",
    "    fig = go.Figure(data=[go.Scatter(\n",
    "        x=reduced_vectors[:, 0],\n",
    "        y=reduced_vectors[:, 1],\n",
    "        mode='markers',\n",
    "        marker=dict(size=5, color=colors, opacity=0.8),\n",
    "        text=[f\"Type: {t}<br>Text: {d[:100]}...\" for t, d in zip(filenames, documents)],\n",
    "        hoverinfo='text'\n",
    "    )])\n",
    "\n",
    "    # For a 2D figure the axis titles are set on the layout itself;\n",
    "    # the `scene` property only applies to 3D plots.\n",
    "    fig.update_layout(\n",
    "        title='2D Chroma Vector Store Visualization',\n",
    "        xaxis_title='x',\n",
    "        yaxis_title='y',\n",
    "        width=800,\n",
    "        height=600,\n",
    "        margin=dict(r=20, b=10, l=10, t=40)\n",
    "    )\n",
    "\n",
    "    fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c250166-cb5b-4a75-8981-fae2d6dfe509",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Render the 2D view of everything fetched from the collection\n",
    "show_embeddings_2d(result=result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b290e38-0800-4453-b664-7a7622ff5ed2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def show_embeddings_3d(result, perplexity=30):\n",
    "    \"\"\"Plot the embeddings in 3D using t-SNE, with one random colour per source file.\n",
    "\n",
    "    Args:\n",
    "        result: mapping with the keys 'embeddings', 'documents' and 'metadatas'\n",
    "            (as returned by ``collection.get``); every metadata entry must have a\n",
    "            'filename' key.\n",
    "        perplexity: t-SNE perplexity (30 is scikit-learn's default). scikit-learn\n",
    "            requires it to be smaller than the number of samples, so it is capped\n",
    "            at ``n_samples - 1``.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if fewer than two embeddings are supplied.\n",
    "    \"\"\"\n",
    "    vectors = np.array(result['embeddings'])\n",
    "    documents = result['documents']\n",
    "    filenames = [metadata['filename'] for metadata in result['metadatas']]\n",
    "\n",
    "    if len(vectors) < 2:\n",
    "        raise ValueError('Need at least two embeddings to run t-SNE')\n",
    "\n",
    "    # colour assignment: one colour per distinct file name\n",
    "    color_map = {name: random_color() for name in sorted(set(filenames))}\n",
    "    colors = [color_map[name] for name in filenames]\n",
    "\n",
    "    # Without the cap, a store with 30 or fewer chunks makes TSNE raise\n",
    "    # because the perplexity must be less than the number of samples.\n",
    "    tsne = TSNE(n_components=3, random_state=42, perplexity=min(perplexity, len(vectors) - 1))\n",
    "    reduced_vectors = tsne.fit_transform(vectors)\n",
    "\n",
    "    # Create the 3D scatter plot\n",
    "    fig = go.Figure(data=[go.Scatter3d(\n",
    "        x=reduced_vectors[:, 0],\n",
    "        y=reduced_vectors[:, 1],\n",
    "        z=reduced_vectors[:, 2],\n",
    "        mode='markers',\n",
    "        marker=dict(size=5, color=colors, opacity=0.8),\n",
    "        text=[f\"Type: {t}<br>Text: {d[:100]}...\" for t, d in zip(filenames, documents)],\n",
    "        hoverinfo='text'\n",
    "    )])\n",
    "\n",
    "    fig.update_layout(\n",
    "        title='3D Chroma Vector Store Visualization',\n",
    "        scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),\n",
    "        width=900,\n",
    "        height=700,\n",
    "        margin=dict(r=20, b=10, l=10, t=40)\n",
    "    )\n",
    "\n",
    "    fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45d1d034-2503-4176-b1e4-f248e31c4770",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Render the 3D view of everything fetched from the collection\n",
    "show_embeddings_3d(result=result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e79946a1-f93a-4b3a-8d19-deef40dec223",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the chat model (MODEL is defined earlier in the notebook)\n",
    "llm = ChatOpenAI(temperature=0.7, model_name=MODEL)\n",
    "\n",
    "# Set up the conversation memory for the chat\n",
    "memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
    "\n",
    "# The retriever is an abstraction over the VectorStore that will be used during RAG;\n",
    "# RETRIEVER_K is the number of chunks requested for each question\n",
    "RETRIEVER_K = 50\n",
    "retriever = vectorstore.as_retriever(search_kwargs={'k': RETRIEVER_K})\n",
    "\n",
    "# Putting it together: set up the conversation chain with the LLM, the retriever and the memory\n",
    "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59f90c85-c113-4482-8574-8a728ef25459",
   "metadata": {},
   "outputs": [],
   "source": [
    "def chat(question, history):\n",
    "    \"\"\"Answer a question by invoking the conversation chain.\n",
    "\n",
    "    Args:\n",
    "        question: the user's message.\n",
    "        history: accepted as the second argument but not used in this function;\n",
    "            the chain was built with its own memory object.\n",
    "\n",
    "    Returns:\n",
    "        The 'answer' entry of the chain's response.\n",
    "    \"\"\"\n",
    "    response = conversation_chain.invoke({'question': question})\n",
    "    return response['answer']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0520a8ff-01a4-4fa6-9dc8-57da87272edc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the chat UI around `chat` and open it in the browser;\n",
    "# `view` holds whatever launch() returns\n",
    "chat_ui = gr.ChatInterface(fn=chat, type='messages')\n",
    "view = chat_ui.launch(inbrowser=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4949b17-cd9c-4bff-bd5b-0f80df72e7dc",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}