Spaces:

mamogasr
/

llm_engineering

Sleeping

File size: 13,066 Bytes

5fdb69e

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "05432987-80bc-4aa5-8c05-277861e19307",
   "metadata": {},
   "source": [
    "## Adds docstrings/comments to code and generates code summary"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e706f175-1e83-4d2c-8613-056b2e532624",
   "metadata": {},
   "source": [
    "### Model Usage  \n",
    "\n",
    "- **Open Source Models:**\n",
    "\n",
    "  - Deployed via Endpoint: Hosted on a server and accessed remotely (CodeQwen1.5-7B-Chat)\n",
    "  - Run Locally on Machine: Executed directly on a local device (Ollama running Llama 3.2-1B)\n",
    "\n",
    "- **Closed Source Models:**  \n",
    "  - Accessed through API key authentication: (OpenAI, Anthropic).  \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ed667df-6660-4ba3-80c5-4c1c8f7e63f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import os\n",
    "import io\n",
    "import sys \n",
    "import json\n",
    "import requests\n",
    "from dotenv import load_dotenv\n",
    "from openai import OpenAI\n",
    "import google.generativeai\n",
    "import anthropic\n",
    "import ollama\n",
    "from IPython.display import Markdown, display, update_display\n",
    "import gradio as gr\n",
    "from huggingface_hub import login, InferenceClient\n",
    "from transformers import AutoTokenizer, pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9dd4bf1-48cf-44dc-9d04-0ec6e8189a3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# environment\n",
    "\n",
    "load_dotenv()\n",
    "\n",
    "# The OpenAI and Anthropic clients read their keys from the process environment,\n",
    "# so there is nothing to copy. Just confirm each key is present: re-assigning\n",
    "# os.getenv(...) into os.environ raised an opaque TypeError when a key was missing.\n",
    "for required_key in ('OPENAI_API_KEY', 'ANTHROPIC_API_KEY'):\n",
    "    if not os.getenv(required_key):\n",
    "        raise KeyError(f\"{required_key} is not set; add it to your .env file or environment\")\n",
    "\n",
    "# Endpoint URLs and the Hugging Face token (each lookup raises KeyError if missing)\n",
    "CODE_QWEN_URL = os.environ['CODE_QWEN_URL']\n",
    "BIGBIRD_PEGASUS_URL = os.environ['BIGBIRD_PEGASUS_URL']\n",
    "HF_TOKEN = os.environ['HF_TOKEN']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "71f671d6-50a7-43cf-9e04-52a159d67dab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the llama3.2:1b model with the Ollama CLI\n",
    "!ollama pull llama3.2:1b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e6f8f35-477d-4014-8fe9-874b5aee0061",
   "metadata": {},
   "outputs": [],
   "source": [
    "# API clients used by call_gpt and call_claude\n",
    "openai = OpenAI()\n",
    "claude = anthropic.Anthropic()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae34b79c-425a-4f04-821a-8f1d9868b146",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model identifiers passed to the OpenAI, Anthropic and Ollama calls\n",
    "OPENAI_MODEL = \"gpt-4o-mini\"\n",
    "CLAUDE_MODEL = \"claude-3-haiku-20240307\"\n",
    "LLAMA_MODEL = \"llama3.2:1b\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80e6d920-3c94-48c4-afd8-518f415ab777",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hugging Face model ids (code_qwen is used to load the tokenizer in call_codeqwen)\n",
    "code_qwen = \"Qwen/CodeQwen1.5-7B-Chat\"\n",
    "bigbird_pegasus = \"google/bigbird-pegasus-large-arxiv\"\n",
    "# Log in to the Hugging Face Hub; add_to_git_credential=True also asks for the\n",
    "# token to be saved as a git credential\n",
    "login(HF_TOKEN, add_to_git_credential=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "314cd8e3-2c10-4149-9818-4e6b0c05b871",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uses Llama to Check Which Language the Code is Written In\n",
    "system_message_comments = \"You are an assistant designed to add docstrings and helpful comments to code for documentation purposes.\"\n",
    "system_message_comments += \"Respond back with properly formatted code, including docstrings and comments. Keep comments concise. \"\n",
    "system_message_comments += \"Do not respond with greetings, or any such extra output\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66fa09e4-1b79-4f53-9bb7-904d515b2f26",
   "metadata": {},
   "outputs": [],
   "source": [
    "system_message_summary = \"You are an assistant designed to summarise code for documentation purposes. You are not to display code again.\"\n",
    "system_message_summary += \"Respond back with a properly crafted summary, mentioning key details regarding to the code, such as workflow, code language.\"\n",
    "system_message_summary += \"Do not respond with greetings, or any such extra output. Do not respond in Markdown. Be thorough, keep explanation level at undergraduate level.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea405820-f9d1-4cf1-b465-9ae5cd9016f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def user_prompt_for(code):\n",
    "    user_prompt = \"Rewrite this code to include helpful comments and docstrings. \"\n",
    "    user_prompt += \"Respond only with code.\\n\"\n",
    "    user_prompt += code\n",
    "    return user_prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26c9be56-1d4f-43e5-9bc4-eb5b76da8071",
   "metadata": {},
   "outputs": [],
   "source": [
    "def user_prompt_for_summary(code):\n",
    "    user_prompt = \"Return the summary of the code.\\n\"\n",
    "    user_prompt += code\n",
    "    return user_prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0ac22cb-dc96-4ae1-b00d-2747572f6945",
   "metadata": {},
   "outputs": [],
   "source": [
    "def messages_for(code):\n",
    "    \"\"\"Return the system + user chat messages for the commenting task.\"\"\"\n",
    "    system_turn = {\"role\": \"system\", \"content\": system_message_comments}\n",
    "    user_turn = {\"role\": \"user\", \"content\": user_prompt_for(code)}\n",
    "    return [system_turn, user_turn]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eae1a8b4-68a8-4cd5-849e-0ecabd166a0c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def messages_for_summary(code):\n",
    "    \"\"\"Return the system + user chat messages for the summary task.\"\"\"\n",
    "    system_turn = {\"role\": \"system\", \"content\": system_message_summary}\n",
    "    user_turn = {\"role\": \"user\", \"content\": user_prompt_for_summary(code)}\n",
    "    return [system_turn, user_turn]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5eb726dd-e09e-4011-8eb6-4d20f2830ff5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample Python program: the default value of the UI's input box and the\n",
    "# argument passed to call_llama in a later cell\n",
    "func = \"\"\"\n",
    "import time\n",
    "\n",
    "def calculate(iterations, param1, param2):\n",
    "    result = 1.0\n",
    "    for i in range(1, iterations+1):\n",
    "        j = i * param1 - param2\n",
    "        result -= (1/j)\n",
    "        j = i * param1 + param2\n",
    "        result += (1/j)\n",
    "    return result\n",
    "\n",
    "start_time = time.time()\n",
    "result = calculate(100_000_000, 4, 1) * 4\n",
    "end_time = time.time()\n",
    "\n",
    "print(f\"Result: {result:.12f}\")\n",
    "print(f\"Execution Time: {(end_time - start_time):.6f} seconds\")\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f61943b2-c939-4910-a670-58abaf464bb6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def call_llama(code):\n",
    "    \"\"\"Ask the Llama model (through Ollama) for commented code and a summary.\n",
    "\n",
    "    Returns a (commented_code, summary) tuple holding the content of the two replies.\n",
    "    \"\"\"\n",
    "    # First request: code rewritten with comments and docstrings\n",
    "    documented = ollama.chat(model=LLAMA_MODEL, messages=messages_for(code))\n",
    "    # Second request: summary of the same code\n",
    "    summarised = ollama.chat(model=LLAMA_MODEL, messages=messages_for_summary(code))\n",
    "    return documented['message']['content'], summarised['message']['content']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "696fb97e-807e-40ed-b0e1-beb82d1108a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def call_claude(code, max_tokens=1000):\n",
    "    \"\"\"Ask Claude for commented code and for a summary of `code`.\n",
    "\n",
    "    Args:\n",
    "        code: Source code to document, as a string.\n",
    "        max_tokens: Output-token cap applied to each of the two requests.\n",
    "            Defaults to 1000, the same budget call_codeqwen uses; the earlier\n",
    "            hard-coded 500 could cut the rewritten code off part-way through.\n",
    "\n",
    "    Returns:\n",
    "        A (commented_code, summary) tuple, each taken from the first content\n",
    "        block of the corresponding reply.\n",
    "    \"\"\"\n",
    "    def ask(system_prompt, user_prompt):\n",
    "        # One single-turn request; the reply text is read from the first content block\n",
    "        reply = claude.messages.create(\n",
    "            model=CLAUDE_MODEL,\n",
    "            system=system_prompt,\n",
    "            messages=[{\"role\": \"user\", \"content\": user_prompt}],\n",
    "            max_tokens=max_tokens,\n",
    "        )\n",
    "        return reply.content[0].text\n",
    "\n",
    "    commented_code = ask(system_message_comments, user_prompt_for(code))\n",
    "    summary = ask(system_message_summary, user_prompt_for_summary(code))\n",
    "    return commented_code, summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4bf1db64-86fa-42a1-98dd-3df74607f8db",
   "metadata": {},
   "outputs": [],
   "source": [
    "def call_gpt(code):\n",
    "    \"\"\"Ask the OpenAI model for commented code and for a summary of `code`.\n",
    "\n",
    "    Returns a (commented_code, summary) tuple read from the first choice of each completion.\n",
    "    \"\"\"\n",
    "    # Same request twice, differing only in which message builder is used\n",
    "    replies = [\n",
    "        openai.chat.completions.create(model=OPENAI_MODEL, messages=build_messages(code))\n",
    "        for build_messages in (messages_for, messages_for_summary)\n",
    "    ]\n",
    "    commented_code, summary = (reply.choices[0].message.content for reply in replies)\n",
    "    return commented_code, summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6863dc42-cbcd-4a95-8b0a-cfbcbfed0764",
   "metadata": {},
   "outputs": [],
   "source": [
    "def call_codeqwen(code):\n",
    "    \"\"\"Ask the CodeQwen endpoint for commented code and for a summary of `code`.\n",
    "\n",
    "    The tokenizer and inference client are created once and reused for both\n",
    "    requests instead of being rebuilt for the second one.\n",
    "\n",
    "    Returns:\n",
    "        A (commented_code, summary) tuple of the generated texts.\n",
    "    \"\"\"\n",
    "    tokenizer = AutoTokenizer.from_pretrained(code_qwen)\n",
    "    client = InferenceClient(CODE_QWEN_URL, token=HF_TOKEN)\n",
    "\n",
    "    def generate(messages):\n",
    "        # Render the chat messages with the tokenizer's chat template, then generate\n",
    "        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
    "        return client.text_generation(prompt, details=True, max_new_tokens=1000).generated_text\n",
    "\n",
    "    commented_code = generate(messages_for(code))\n",
    "    summary = generate(messages_for_summary(code))\n",
    "    return commented_code, summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06d05c02-45e4-47da-b70b-cf433dfaca4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_docs(code,model):\n",
    "    if model == \"Llama\":\n",
    "        comments,summary = call_llama(code)\n",
    "    elif model == \"Claude\":\n",
    "        comments,summary = call_claude(code)\n",
    "    elif model == \"GPT\":\n",
    "        comments,summary = call_gpt(code)\n",
    "    elif model == \"CodeQwen\":\n",
    "        comments,summary = call_codeqwen(code)\n",
    "    else:\n",
    "        raise ValueError(\"Unknown Model\")\n",
    "    return comments,summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b4ea289-5da9-4b0e-b4d4-f8f01e466839",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Background colours for elements tagged with the \"comments\" and \"summary\" classes\n",
    "css = \"\"\"\n",
    ".comments {background-color: #00599C;}\n",
    ".summary {background-color: #008B8B;}\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89ad7c7b-b881-45d3-aadc-d7206af578fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "with gr.Blocks(css=css) as ui:\n",
    "    gr.Markdown(\"### Code Documentation and Formatting\")\n",
    "    # Input code, pre-filled with the sample program held in `func`\n",
    "    with gr.Row():\n",
    "        code = gr.Textbox(label=\"Input Code: \", value=func, lines=10)\n",
    "    with gr.Row():\n",
    "        model = gr.Dropdown([\"GPT\",\"Claude\",\"Llama\",\"CodeQwen\"],label=\"Select model\",value=\"GPT\")\n",
    "    with gr.Row():\n",
    "        docs = gr.Button(\"Add Comments and Summarise Code\")\n",
    "    # Two output boxes in one row; elem_classes ties each to a rule in `css`\n",
    "    with gr.Row():\n",
    "        commented_code = gr.Textbox(label= \"Formatted Code\", lines=10,elem_classes=[\"comments\"])\n",
    "        code_summary = gr.Textbox(label = \"Code Summary\", lines=10,elem_classes=[\"summary\"])\n",
    "    # On click, run create_docs and route its two return values to the output boxes\n",
    "    docs.click(create_docs,inputs=[code,model],outputs=[commented_code,code_summary])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a9e3b1c-bfe6-4b71-aac8-fa36a491c157",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Start the Gradio app; inbrowser=True asks Gradio to open it in a browser tab\n",
    "ui.launch(inbrowser=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac895aa9-e044-4598-b715-d96d1c158656",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a96877c-22b7-4ad5-b235-1cf8f8b200a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick check of the Llama path without the UI. Print each part on its own so\n",
    "# newlines render, instead of printing the tuple's escaped repr.\n",
    "llama_comments, llama_summary = call_llama(func)\n",
    "print(llama_comments)\n",
    "print(llama_summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f11de1a2-52c0-41c7-ad88-01ef5f8bc628",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}