Spaces:

mamogasr
/

llm_engineering

Sleeping

File size: 8,949 Bytes

5fdb69e

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import documents exported from Evernote to a vectorstore\n",
    "### Use OpenAI file search with responses API\n",
    "#### Prerequisite steps\n",
    "* exported notes from your Evernote notebook as html \n",
    "* converted the notes further to md-files and remove broken image links (use python/AI)\n",
    "* the files are named with note titles\n",
    "\n",
    "Files are in one folder.\n",
    "\n",
    "\n",
    "##### Query ChromaDB vectorstore\n",
    "I tried to accomplish this task with RAG like the example by https://github.com/ed-donner/llm_engineering/commits?author=dinorrusso.\n",
    "\n",
    "I thought this to be a trivial task, but it was not 😃 That example uses Ollama running locally.\n",
    "Even though the retriever had the information required, it was dropped from the answer.\n",
    "\n",
    "I tried then to use Chroma + OpenAI. After several attemps succeeded to create a vectorstore and query it. That's it for this time.\n",
    "\n",
    "##### Openai vectorstore, see bottom of the notebook\n",
    "One attempt was to use OpenAI's fileSearch-tool which seemed pretty straightforward.\n",
    "The con: loading files was not working always. Code is left though as reference."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Imports\n",
    "from dotenv import load_dotenv\n",
    "import gradio as gr\n",
    "import openai\n",
    "import chromadb\n",
    "from chromadb.config import Settings\n",
    "import os"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Load files to vectorstore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "load_dotenv(override=True)\n",
    "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
    "openai.api_key = os.environ['OPENAI_API_KEY']\n",
    "\n",
    "def chunk_text(text, max_tokens=2000):\n",
    "    words = text.split()\n",
    "    chunks = []\n",
    "    current_chunk = []\n",
    "    current_length = 0\n",
    "\n",
    "    for word in words:\n",
    "        current_length += len(word) + 1  # +1 for the space\n",
    "        if current_length > max_tokens:\n",
    "            chunks.append(\" \".join(current_chunk))\n",
    "            current_chunk = [word]\n",
    "            current_length = len(word) + 1\n",
    "        else:\n",
    "            current_chunk.append(word)\n",
    "\n",
    "    if current_chunk:\n",
    "        chunks.append(\" \".join(current_chunk))\n",
    "\n",
    "    return chunks\n",
    "\n",
    "\n",
    "# # Set up OpenAI API key\n",
    "# openai.api_key = \"your_openai_api_key\"  # Replace with your API key\n",
    "chroma_client = chromadb.Client()\n",
    "\n",
    "# Create or get the existing collection\n",
    "collection_name = \"EverNotes\"\n",
    "\n",
    "try:\n",
    "    existing_collection = chroma_client.get_collection(name=collection_name)\n",
    "    if existing_collection.count() > 0:\n",
    "        chroma_client.delete_collection(name=collection_name)\n",
    "except:\n",
    "    print(f\"Collection {collection_name} does not exist. Creating a new one.\")\n",
    "\n",
    "# Create a collection in ChromaDB\n",
    "collection = chroma_client.get_or_create_collection(name=collection_name)\n",
    "\n",
    "# Define your data\n",
    "# it should be like this\n",
    "# documents = [\"OpenAI is revolutionizing AI.\", \"ChromaDB makes embedding storage easy.\"]\n",
    "# metadata = [{\"id\": 1}, {\"id\": 2}]\n",
    "\n",
    "folder_path = os.getenv('EVERNOTE_EXPORT')\n",
    "documents = []\n",
    "\n",
    "for root, dirs, files in os.walk(folder_path):\n",
    "    for file in files:\n",
    "        if file.endswith('.md'):  # Change this to the file extension you need\n",
    "            with open(os.path.join(root, file), 'r') as f:\n",
    "                documents.append(f.read())\n",
    "\n",
    "metadata = [{\"id\": i + 1} for i in range(len(documents))]\n",
    "\n",
    "# Generate embeddings using OpenAI\n",
    "def get_embedding(text, model=\"text-embedding-ada-002\"):\n",
    "    response = openai.embeddings.create(input=text, model=model)\n",
    "    return response.data[0].embedding\n",
    "\n",
    "# Add documents and embeddings to ChromaDB in chunks\n",
    "for doc, meta in zip(documents, metadata):\n",
    "    chunks = chunk_text(doc)\n",
    "    for chunk in chunks:\n",
    "        embedding = get_embedding(chunk)\n",
    "        collection.add(\n",
    "            documents=[chunk],\n",
    "            embeddings=[embedding],\n",
    "            metadatas=[meta],\n",
    "            ids=[str(meta[\"id\"])]\n",
    "        )\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Query ChromaDB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# \n",
    "query_text = \"Is there a video for Fitting the Shimano speed hub 7\"\n",
    "query_embedding = get_embedding(query_text)\n",
    "\n",
    "results = collection.query(\n",
    "    query_embeddings=[query_embedding],\n",
    "    n_results=2\n",
    ")\n",
    "\n",
    "print(\"Query Results:\", results)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Gradio interface"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to query ChromaDB\n",
    "def query_chromadb(query_text):\n",
    "    query_embedding = get_embedding(query_text)\n",
    "    results = collection.query(\n",
    "        query_embeddings=[query_embedding],\n",
    "        n_results=2\n",
    "    )\n",
    "    return results\n",
    "\n",
    "# Gradio interface\n",
    "def gradio_interface(query_text):\n",
    "    results = query_chromadb(query_text)\n",
    "    return results\n",
    "\n",
    "# Create Gradio app\n",
    "iface = gr.Interface(\n",
    "    fn=gradio_interface,\n",
    "    inputs=\"text\",\n",
    "    outputs=\"text\",\n",
    "    title=\"ChromaDB Query Interface\",\n",
    "    description=\"Enter your query to search the ChromaDB collection.\"\n",
    ")\n",
    "\n",
    "iface.launch()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Below OpenAI filesearch variant which had some failures in file uploads."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "folder_path = os.environ['EVERNOTE_EXPORT'] \n",
    "# Filter out other except .md-files\n",
    "md_files = glob.glob(os.path.join(folder_path, '*.md'))\n",
    "file_paths = [os.path.join(folder_path, file) for file in md_files]\n",
    "file_streams = [open(path, 'rb') for path in file_paths]\n",
    "\n",
    "# Create vector store\n",
    "vector_store = openai.vector_stores.create(\n",
    "    name=\"Evernote notes\",\n",
    ")\n",
    "\n",
    "# Batch Upload Limit: You can upload up to 100 files in a single batch\n",
    "# https://community.openai.com/t/max-100-files-in-vector-store/729876/4\n",
    "batch_size = 90\n",
    "for i in range(0, len(file_streams), batch_size):\n",
    "    batch = file_streams[i:i + batch_size]\n",
    "    file_batch = openai.vector_stores.file_batches.upload_and_poll(\n",
    "        vector_store_id=vector_store.id,\n",
    "        files=batch\n",
    "    )\n",
    "    print(file_batch.status)\n",
    "    print(file_batch.file_counts)\n",
    "\n",
    "# There can be some fails in file counts:\n",
    "# \"FileCounts(cancelled=0, completed=89, failed=1, in_progress=0, total=90)\"\"\n",
    "# Usually 1 % fails. Did not find solution for improving that yet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "response = openai.responses.create(\n",
    "    model=\"gpt-4o-mini\",\n",
    "    input=\"Is there a video for Fitting the Shimano speed hub 7?\",\n",
    "    tools=[{\n",
    "        \"type\": \"file_search\",\n",
    "        \"vector_store_ids\": [vector_store.id]\n",
    "    }],\n",
    "    include=None\n",
    ")\n",
    "print(response)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}