{
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"from youtube_transcript_api import YouTubeTranscriptApi\n",
"from nltk.tokenize import TextTilingTokenizer \n",
"from youtubesearchpython import VideosSearch\n",
"from semantic_search import SemanticSearch \n",
"import pandas as pd\n",
"import gradio as gr\n",
"import numpy as np\n",
"import requests\n",
"import tiktoken\n",
"import openai\n",
"import json\n",
"import nltk\n",
"import re\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def set_openai_key(key):\n",
" if key == \"env\":\n",
" key = os.environ.get(\"OPENAI_API_KEY\")\n",
" openai.api_key = key\n",
"\n",
"def get_youtube_data(url):\n",
"\n",
" video_id = url.split(\"=\")[1]\n",
"\n",
" try:\n",
" raw = YouTubeTranscriptApi.get_transcript(video_id)\n",
" except:\n",
" try:\n",
" transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)\n",
" for transcript in transcript_list:\n",
" raw = transcript.translate('en').fetch()\n",
" break\n",
" except:\n",
" print(f\"No transcript found for {url}\") # Usually because the video itself disabled captions\n",
" return False\n",
"\n",
" response = requests.get(f\"https://noembed.com/embed?dataType=json&url={url}\")\n",
" data = json.loads(response.content)\n",
"\n",
" title, author = data[\"title\"], data[\"author_name\"]\n",
"\n",
" # Remove any \"'\" from title\n",
" title = title.replace(\"'\", \"\")\n",
" author = author.replace(\"'\", \"\")\n",
"\n",
" df = pd.DataFrame(raw)\n",
"\n",
" df['end'] = df['start'] + df['duration']\n",
" df['total_words'] = df['text'].apply(lambda x: len(x.split())).cumsum()\n",
" df[\"text\"] = df[\"text\"] + \"\\n\\n\"\n",
"\n",
" return df, title, author\n",
"\n",
"def to_timestamp(seconds):\n",
" seconds = int(seconds)\n",
"\n",
" hours = seconds // 3600\n",
" minutes = (seconds % 3600) // 60\n",
" seconds_remaining = seconds % 60\n",
" \n",
" if seconds >= 3600:\n",
" return f\"{hours:02d}:{minutes:02d}:{seconds_remaining:02d}\"\n",
" else:\n",
" return f\"{minutes:02d}:{seconds_remaining:02d}\"\n",
"\n",
"def to_seconds(timestamp):\n",
" time_list = timestamp.split(':')\n",
" total_seconds = 0\n",
" if len(time_list) == 2: # Minutes:Seconds format\n",
" total_seconds = int(time_list[0]) * 60 + int(time_list[1])\n",
" elif len(time_list) == 3: # Hours:Minutes:Seconds format\n",
" total_seconds = int(time_list[0]) * 3600 + int(time_list[1]) * 60 + int(time_list[2])\n",
" else:\n",
" raise ValueError(\"Invalid timestamp format\")\n",
" return total_seconds\n",
"\n",
"def get_segments(df, title, author, split_by_topic, segment_length = 200):\n",
"\n",
" transcript = df['text'].str.cat(sep=' ')\n",
"\n",
" if not split_by_topic:\n",
" words = transcript.split()\n",
" segments = [' '.join(words[i:i+segment_length]) for i in range(0, len(words), segment_length)]\n",
" else:\n",
" try:\n",
" segments = tt.tokenize(transcript)\n",
" except:\n",
" return \"\"\n",
"\n",
" segments = [segment.replace('\\n','').strip() for segment in segments]\n",
"\n",
" segments_wc = [len(segment.split()) for segment in segments]\n",
" segments_wc = np.cumsum(segments_wc)\n",
"\n",
" idx = [np.argmin(np.abs(df['total_words'] - total_words)) for total_words in segments_wc]\n",
"\n",
" segments_end_times = df['end'].iloc[idx].values\n",
" segments_end_times = np.insert(segments_end_times, 0, 0.0)\n",
"\n",
" segments_times = [f\"({to_timestamp(segments_end_times[i-1])}, {to_timestamp(segments_end_times[i])})\" for i in range(1,len(segments_end_times))]\n",
"\n",
" segments_text = [f\"Segment from '{title}' by {author}\\nTimestamp: {segment_time}\\n\\n{segment}\\n\" for segment, segment_time in zip(segments, segments_times)]\n",
"\n",
" return segments_text\n",
"\n",
"def fit_searcher(segments, n_neighbours):\n",
" global searcher\n",
" searcher.fit(segments, n_neighbors=n_neighbours)\n",
" return True\n",
"\n",
"def num_tokens(text, model):\n",
" encoding = tiktoken.encoding_for_model(model)\n",
" return len(encoding.encode(text))\n",
"\n",
"def refencify(text):\n",
" title_pattern = r\"Segment from '(.+)'\"\n",
" timestamp_pattern = r\"Timestamp: \\((.+)\\)\"\n",
"\n",
" print(text)\n",
"\n",
" title = re.search(title_pattern, text).group(1)\n",
" timestamp = re.search(timestamp_pattern, text).group(1).split(\",\")\n",
" start_timestamp, end_timestamp = timestamp\n",
"\n",
" url = titles_to_urls[title]\n",
" start_seconds = to_seconds(start_timestamp)\n",
" end_seconds = to_seconds(end_timestamp)\n",
"\n",
" video_iframe = f'''<iframe\n",
" width=\"400\"\n",
" height=\"240\"\n",
" src=\"{url.replace(\"watch?v=\", \"embed/\")}?start={start_seconds}&end={end_seconds}&controls=0\"\n",
" frameborder=\"0\"\n",
" allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\n",
" allowfullscreen\n",
" >\n",
" </iframe>'''\n",
"\n",
" return start_timestamp, end_timestamp, f\"{video_iframe}\\n\\n\"\n",
"\n",
"def form_query(question, model, token_budget):\n",
"\n",
" results = searcher(question)\n",
"\n",
" introduction = 'Use the below segments from multiple youtube videos to answer the subsequent question. If the answer cannot be found in the articles, write \"I could not find an answer.\" Cite each sentence using the [title, author, timestamp] notation. Every sentence MUST have a citation!'\n",
"\n",
" message = introduction\n",
"\n",
" question = f\"\\n\\nQuestion: {question}\"\n",
"\n",
" references = \"\"\n",
"\n",
" for i, result in enumerate(results):\n",
" result = result + \"\\n\\n\"\n",
" if (\n",
" num_tokens(message + result + question, model=model)\n",
" > token_budget\n",
" ):\n",
" break\n",
" else:\n",
" message += result\n",
" start_timestamp, end_timestamp, iframe = refencify(result)\n",
" references += f\"### Segment {i+1} ({start_timestamp} - {end_timestamp}):\\n\" + iframe\n",
"\n",
" # Remove the last extra two newlines\n",
" message = message[:-2]\n",
"\n",
" references = \"Segments that might have been used to answer your question: (If you specified more segments than shown here, consider increasing your token budget)\\n\\n\" + references\n",
"\n",
" return message + question, references\n",
"\n",
"def generate_answer(question, model, token_budget, temperature):\n",
" \n",
" message, references = form_query(question, model, token_budget)\n",
"\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": \"You answer questions about YouTube videos.\"},\n",
" {\"role\": \"user\", \"content\": message},\n",
" ]\n",
"\n",
" response = openai.ChatCompletion.create(\n",
" model=model,\n",
" messages=messages,\n",
" temperature=temperature\n",
" )\n",
" \n",
" response_message = response[\"choices\"][0][\"message\"][\"content\"]\n",
"\n",
" return response_message, references\n",
"\n",
"def add_to_dict(title, url):\n",
" global title_counter\n",
"\n",
" if title not in titles_to_urls:\n",
" # This is the first occurrence of this title\n",
" titles_to_urls[title] = url\n",
" return title\n",
" else:\n",
" # This title has already been seen, so we need to add a number suffix to it\n",
" # First, check if we've already seen this title before\n",
" if title in title_counter:\n",
" # If we have, increment the counter\n",
" title_counter[title] += 1\n",
" else:\n",
" # If we haven't, start the counter at 1\n",
" title_counter[title] = 1\n",
" \n",
" # Add the suffix to the title\n",
" new_title = f\"{title} ({title_counter[title]})\"\n",
" \n",
" # Add the new title to the dictionary\n",
" titles_to_urls[new_title] = url\n",
" return new_title\n",
"\n",
"def search_youtube(question, n_videos):\n",
" videosSearch = VideosSearch(question, limit = n_videos)\n",
" urls = [\"https://www.youtube.com/watch?v=\" + video[\"id\"] for video in videosSearch.result()[\"result\"]]\n",
" print(urls)\n",
" return urls"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"def main(openAI_key, question, n_videos, urls_text, split_by_topic, segment_length, n_neighbours, model, token_budget, temperature):\n",
"\n",
" print(question)\n",
" print(urls_text)\n",
"\n",
" set_openai_key(openAI_key)\n",
"\n",
" if urls_text == \"\":\n",
" urls = search_youtube(question, n_videos)\n",
" else:\n",
" urls = list(set(urls_text.split(\"\\n\")))\n",
"\n",
" global titles_to_urls\n",
" titles_to_urls = {}\n",
"\n",
" segments = []\n",
"\n",
" for url in urls:\n",
"\n",
" if \"youtu.be\" in url:\n",
" url = url.replace(\"youtu.be/\", \"youtube.com/watch?v=\")\n",
"\n",
" res = get_youtube_data(url)\n",
"\n",
" if not res:\n",
" continue\n",
"\n",
" df, title, author = res\n",
" \n",
" title = add_to_dict(title, url)\n",
"\n",
" video_segments = get_segments(df, title, author, split_by_topic, segment_length)\n",
"\n",
" segments.extend(video_segments)\n",
" \n",
" if segments == []:\n",
" return \"Something wrong happened! Try specifying the YouTube videos or changing the query.\", \"\"\n",
"\n",
" print(\"Segments generated successfully!\")\n",
"\n",
" if fit_searcher(segments, n_neighbours):\n",
" print(\"Searcher fit successfully!\")\n",
" answer, references = generate_answer(question, model, token_budget, temperature)\n",
"\n",
" print(answer)\n",
"\n",
" return answer, references"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\andrew\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
}
],
"source": [
"nltk.download('stopwords')\n",
"tt = TextTilingTokenizer()\n",
"searcher = SemanticSearch()\n",
"\n",
"# Initialize a counter for duplicate titles\n",
"title_counter = {}\n",
"\n",
"# One to one mapping from titles to urls\n",
"titles_to_urls = {}"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"What it does after extracting only the code\n",
"https://youtu.be/l_r_b38VXmQ\n",
"Segments generated successfully!\n",
"Fitting with n=5...\n",
"Searcher fit successfully!\n",
"Segment from 'GPT-4 Auto Coder 2 reads docs from URL links and Auto Debugs' by echohive\n",
"Timestamp: (12:01, 13:12)\n",
"\n",
"exact same thing we tried at first right after this neural network demonstration so look so it's finished it and ran it there we go and test input I have successfully created synthetic data set and it's given its final answer yeah it's pretty crazy so look let's just grab our instructions without any links and give it to gpt4 and see how far it goes so you didn't even know what to do but I have added return of python script for this let's see what it does now as we see here we got some code loading of the text is fine but there's nothing about link chain it just simply doesn't know what Lang chain is it doesn't know what a QA chain is it's just trying to come up with something cool as far as it knows what's best but this is not what we were looking for by the way we're gonna review the code here in a moment but all the necessary code will be available to my patreon supporters the link will be in the description now let's try it with web browsing with gpt4 this is Cape this uses gpt4 and it\n",
"\n",
"\n",
"\n",
"Segment from 'GPT-4 Auto Coder 2 reads docs from URL links and Auto Debugs' by echohive\n",
"Timestamp: (07:31, 08:38)\n",
"\n",
"was mostly right it's 95 there all you have to do is just add Dot Page content over here and a good user question and you're good to go which would have taken somehow maybe longer uh to try to figure out from the documentation alone so as we see in the first interaction it actually received an error see set the sequence up so it actually did really well this time but it got the expected string because it had a problem with the how the character text splitter creates the document because it's a tuple so it actually decide to print it see as you see it actually prints it and it understands that it has to use page contents here let's say see says I can see that the text variable contains a list of document objects I need to extract the page content and here it's actually using that text first element of the text the page content versus up here when it was receiving the error it was only trying to let's see where is it it was only trying to use the first element which is the Tuple object right here right the list\n",
"\n",
"\n",
"\n",
"Segment from 'GPT-4 Auto Coder 2 reads docs from URL links and Auto Debugs' by echohive\n",
"Timestamp: (08:38, 09:46)\n",
"\n",
"the Tuple within a list does because at the end there's some metadata actually on the third try see it was able to create a code that actually worked perfectly it actually was able to get the answer right now it's actually giving us the final answer but we can just copy this for example you do have to run this in an environment which you have all these packages installed we're going to talk about the requirements here in a moment but if you were to bring it into this test and just run this let me see we are loading our QA chain text splitter we are loading Isaac newton.text and getting first chunk is the first element's page content and we are input inputting that is the document and question is what did Isaac Newton discover me right oh I have to stop that and whenever you run this is actually a working code line chain code question over question answering over documents is you have split documents into three but we are only using the first document and Isaac Michigan discovered the laws of motion and universal gravitation so I found that this works really well\n",
"\n",
"\n",
"\n",
"Segment from 'GPT-4 Auto Coder 2 reads docs from URL links and Auto Debugs' by echohive\n",
"Timestamp: (04:13, 05:20)\n",
"\n",
"lag format and if we take a look at the documentation as we see actually in the question answering stuff not in nowhere in the two pages stuff chain type is mentioned it's only using mapreduce and we're not going to be using summarize we're saying only question answering QA chain we're not even specifying the exact name and also with the characters text splitter we're saying chunk size to 500 and also the overlap of zero and this will of course split it into multiple pieces but we said that we only want to work with the first element of that first chunk or the first element of the list so as we see so we review it we have the summarize elements we have the QA elements and the text splitter once we know we have all our elements all we have to do is just put our instructions in here I have implemented multi-line input so you can actually paste code here as well because it's a multi-line input we do have to say done and click enter and now we're entering agent executor chain so this is the fun part as you see I have heavily\n",
"\n",
"\n",
"\n",
"Segment from 'GPT-4 Auto Coder 2 reads docs from URL links and Auto Debugs' by echohive\n",
"Timestamp: (00:00, 01:07)\n",
"\n",
"hey everybody today we're gonna look at different implementation of Auto GPT which writes code debugs it automatically all while using documentation as its guidance what we're going to do is we're going to load this page this URL question answering and summarize analyze document URL and character text Splitter from blank chain and then we're going to ask and we will be using a python agent with the python Ripple which is able to execute code and gets feedback from the output and takes in user instructions to write code whatever you like let's just run this the best way to understand it is to see it when we run this code we are first greeted that the script this script will be Auto executing AI generator code this may be dangerous make sure you're observing the process we have to say accept here once we type in accept and click enter we are asked do you want to load any URLs we're going to say yes we don't have to load URLs if you don't want to you can just give it instructions without the urls I've set it up so that you can load up to three\n",
"\n",
"\n",
"\n",
"I could not find an answer.\n"
]
},
{
"data": {
"text/plain": [
"('I could not find an answer.',\n",
" 'Segments that might have been used to answer your question: (If you specified more segments than shown here, consider increasing your token budget)\\n\\n### Segment 1 (12:01 - 13:12):\\n<iframe\\n width=\"400\"\\n height=\"240\"\\n src=\"https://youtube.com/embed/l_r_b38VXmQ?start=721&end=792&controls=0\"\\n frameborder=\"0\"\\n allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\\n allowfullscreen\\n >\\n </iframe>\\n\\n### Segment 2 (07:31 - 08:38):\\n<iframe\\n width=\"400\"\\n height=\"240\"\\n src=\"https://youtube.com/embed/l_r_b38VXmQ?start=451&end=518&controls=0\"\\n frameborder=\"0\"\\n allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\\n allowfullscreen\\n >\\n </iframe>\\n\\n### Segment 3 (08:38 - 09:46):\\n<iframe\\n width=\"400\"\\n height=\"240\"\\n src=\"https://youtube.com/embed/l_r_b38VXmQ?start=518&end=586&controls=0\"\\n frameborder=\"0\"\\n allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\\n allowfullscreen\\n >\\n </iframe>\\n\\n### Segment 4 (04:13 - 05:20):\\n<iframe\\n width=\"400\"\\n height=\"240\"\\n src=\"https://youtube.com/embed/l_r_b38VXmQ?start=253&end=320&controls=0\"\\n frameborder=\"0\"\\n allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\\n allowfullscreen\\n >\\n </iframe>\\n\\n### Segment 5 (00:00 - 01:07):\\n<iframe\\n width=\"400\"\\n height=\"240\"\\n src=\"https://youtube.com/embed/l_r_b38VXmQ?start=0&end=67&controls=0\"\\n frameborder=\"0\"\\n allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\\n allowfullscreen\\n >\\n </iframe>\\n\\n')"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"main(openAI_key=\"env\", \n",
" question=\"What it does after extracting only the code\", n_videos=5, \n",
" urls_text=\"https://youtu.be/l_r_b38VXmQ\", \n",
" split_by_topic=False, \n",
" segment_length=200, \n",
" n_neighbours=5, \n",
" model=\"gpt-3.5-turbo\", \n",
" token_budget=2000, \n",
" temperature=0)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}