{
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"from youtube_transcript_api import YouTubeTranscriptApi\n",
"from nltk.tokenize import TextTilingTokenizer \n",
"from youtubesearchpython import VideosSearch\n",
"from semantic_search import SemanticSearch \n",
"import pandas as pd\n",
"import gradio as gr\n",
"import numpy as np\n",
"import requests\n",
"import tiktoken\n",
"import openai\n",
"import json\n",
"import nltk\n",
"import re\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def set_openai_key(key):\n",
" if key == \"env\":\n",
" key = os.environ.get(\"OPENAI_API_KEY\")\n",
" openai.api_key = key\n",
"\n",
"def get_youtube_data(url):\n",
"\n",
" video_id = url.split(\"=\")[1]\n",
"\n",
" try:\n",
" raw = YouTubeTranscriptApi.get_transcript(video_id)\n",
" except:\n",
" try:\n",
" transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)\n",
" for transcript in transcript_list:\n",
" raw = transcript.translate('en').fetch()\n",
" break\n",
" except:\n",
" print(f\"No transcript found for {url}\") # Usually because the video itself disabled captions\n",
" return False\n",
"\n",
" response = requests.get(f\"https://noembed.com/embed?dataType=json&url={url}\")\n",
" data = json.loads(response.content)\n",
"\n",
" title, author = data[\"title\"], data[\"author_name\"]\n",
"\n",
" # Remove any \"'\" from title\n",
" title = title.replace(\"'\", \"\")\n",
" author = author.replace(\"'\", \"\")\n",
"\n",
" df = pd.DataFrame(raw)\n",
"\n",
" df['end'] = df['start'] + df['duration']\n",
" df['total_words'] = df['text'].apply(lambda x: len(x.split())).cumsum()\n",
" df[\"text\"] = df[\"text\"] + \"\\n\\n\"\n",
"\n",
" return df, title, author\n",
"\n",
"def to_timestamp(seconds):\n",
" seconds = int(seconds)\n",
"\n",
" hours = seconds // 3600\n",
" minutes = (seconds % 3600) // 60\n",
" seconds_remaining = seconds % 60\n",
" \n",
" if seconds >= 3600:\n",
" return f\"{hours:02d}:{minutes:02d}:{seconds_remaining:02d}\"\n",
" else:\n",
" return f\"{minutes:02d}:{seconds_remaining:02d}\"\n",
"\n",
"def to_seconds(timestamp):\n",
" time_list = timestamp.split(':')\n",
" total_seconds = 0\n",
" if len(time_list) == 2: # Minutes:Seconds format\n",
" total_seconds = int(time_list[0]) * 60 + int(time_list[1])\n",
" elif len(time_list) == 3: # Hours:Minutes:Seconds format\n",
" total_seconds = int(time_list[0]) * 3600 + int(time_list[1]) * 60 + int(time_list[2])\n",
" else:\n",
" raise ValueError(\"Invalid timestamp format\")\n",
" return total_seconds\n",
"\n",
"def get_segments(df, title, author, split_by_topic, segment_length = 200):\n",
"\n",
" transcript = df['text'].str.cat(sep=' ')\n",
"\n",
" if not split_by_topic:\n",
" words = transcript.split()\n",
" segments = [' '.join(words[i:i+segment_length]) for i in range(0, len(words), segment_length)]\n",
" else:\n",
" try:\n",
" segments = tt.tokenize(transcript)\n",
" except:\n",
" return \"\"\n",
"\n",
" segments = [segment.replace('\\n','').strip() for segment in segments]\n",
"\n",
" segments_wc = [len(segment.split()) for segment in segments]\n",
" segments_wc = np.cumsum(segments_wc)\n",
"\n",
" idx = [np.argmin(np.abs(df['total_words'] - total_words)) for total_words in segments_wc]\n",
"\n",
" segments_end_times = df['end'].iloc[idx].values\n",
" segments_end_times = np.insert(segments_end_times, 0, 0.0)\n",
"\n",
" segments_times = [f\"({to_timestamp(segments_end_times[i-1])}, {to_timestamp(segments_end_times[i])})\" for i in range(1,len(segments_end_times))]\n",
"\n",
" segments_text = [f\"Segment from '{title}' by {author}\\nTimestamp: {segment_time}\\n\\n{segment}\\n\" for segment, segment_time in zip(segments, segments_times)]\n",
"\n",
" return segments_text\n",
"\n",
"def fit_searcher(segments, n_neighbours):\n",
" global searcher\n",
" searcher.fit(segments, n_neighbors=n_neighbours)\n",
" return True\n",
"\n",
"def num_tokens(text, model):\n",
" encoding = tiktoken.encoding_for_model(model)\n",
" return len(encoding.encode(text))\n",
"\n",
"def refencify(text):\n",
" title_pattern = r\"Segment from '(.+)'\"\n",
" timestamp_pattern = r\"Timestamp: \\((.+)\\)\"\n",
"\n",
" print(text)\n",
"\n",
" title = re.search(title_pattern, text).group(1)\n",
" timestamp = re.search(timestamp_pattern, text).group(1).split(\",\")\n",
" start_timestamp, end_timestamp = timestamp\n",
"\n",
" url = titles_to_urls[title]\n",
" start_seconds = to_seconds(start_timestamp)\n",
" end_seconds = to_seconds(end_timestamp)\n",
"\n",
" video_iframe = f'''<iframe\n",
" width=\"400\"\n",
" height=\"240\"\n",
" src=\"{url.replace(\"watch?v=\", \"embed/\")}?start={start_seconds}&end={end_seconds}&controls=0\"\n",
" frameborder=\"0\"\n",
" allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\n",
" allowfullscreen\n",
" >\n",
" </iframe>'''\n",
"\n",
" return start_timestamp, end_timestamp, f\"{video_iframe}\\n\\n\"\n",
"\n",
"def form_query(question, model, token_budget):\n",
"\n",
" results = searcher(question)\n",
"\n",
" introduction = 'Use the below segments from multiple youtube videos to answer the subsequent question. If the answer cannot be found in the articles, write \"I could not find an answer.\" Cite each sentence using the [title, author, timestamp] notation. Every sentence MUST have a citation!'\n",
"\n",
" message = introduction\n",
"\n",
" question = f\"\\n\\nQuestion: {question}\"\n",
"\n",
" references = \"\"\n",
"\n",
" for i, result in enumerate(results):\n",
" result = result + \"\\n\\n\"\n",
" if (\n",
" num_tokens(message + result + question, model=model)\n",
" > token_budget\n",
" ):\n",
" break\n",
" else:\n",
" message += result\n",
" start_timestamp, end_timestamp, iframe = refencify(result)\n",
" references += f\"### Segment {i+1} ({start_timestamp} - {end_timestamp}):\\n\" + iframe\n",
"\n",
" # Remove the last extra two newlines\n",
" message = message[:-2]\n",
"\n",
" references = \"Segments that might have been used to answer your question: (If you specified more segments than shown here, consider increasing your token budget)\\n\\n\" + references\n",
"\n",
" return message + question, references\n",
"\n",
"def generate_answer(question, model, token_budget, temperature):\n",
" \n",
" message, references = form_query(question, model, token_budget)\n",
"\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": \"You answer questions about YouTube videos.\"},\n",
" {\"role\": \"user\", \"content\": message},\n",
" ]\n",
"\n",
" response = openai.ChatCompletion.create(\n",
" model=model,\n",
" messages=messages,\n",
" temperature=temperature\n",
" )\n",
" \n",
" response_message = response[\"choices\"][0][\"message\"][\"content\"]\n",
"\n",
" return response_message, references\n",
"\n",
"def add_to_dict(title, url):\n",
" global title_counter\n",
"\n",
" if title not in titles_to_urls:\n",
" # This is the first occurrence of this title\n",
" titles_to_urls[title] = url\n",
" return title\n",
" else:\n",
" # This title has already been seen, so we need to add a number suffix to it\n",
" # First, check if we've already seen this title before\n",
" if title in title_counter:\n",
" # If we have, increment the counter\n",
" title_counter[title] += 1\n",
" else:\n",
" # If we haven't, start the counter at 1\n",
" title_counter[title] = 1\n",
" \n",
" # Add the suffix to the title\n",
" new_title = f\"{title} ({title_counter[title]})\"\n",
" \n",
" # Add the new title to the dictionary\n",
" titles_to_urls[new_title] = url\n",
" return new_title\n",
"\n",
"def search_youtube(question, n_videos):\n",
" videosSearch = VideosSearch(question, limit = n_videos)\n",
" urls = [\"https://www.youtube.com/watch?v=\" + video[\"id\"] for video in videosSearch.result()[\"result\"]]\n",
" print(urls)\n",
" return urls"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"def main(openAI_key, question, n_videos, urls_text, split_by_topic, segment_length, n_neighbours, model, token_budget, temperature):\n",
"\n",
" print(question)\n",
" print(urls_text)\n",
"\n",
" set_openai_key(openAI_key)\n",
"\n",
" if urls_text == \"\":\n",
" urls = search_youtube(question, n_videos)\n",
" else:\n",
" urls = list(set(urls_text.split(\"\\n\")))\n",
"\n",
" global titles_to_urls\n",
" titles_to_urls = {}\n",
"\n",
" segments = []\n",
"\n",
" for url in urls:\n",
"\n",
" if \"youtu.be\" in url:\n",
" url = url.replace(\"youtu.be/\", \"youtube.com/watch?v=\")\n",
"\n",
" res = get_youtube_data(url)\n",
"\n",
" if not res:\n",
" continue\n",
"\n",
" df, title, author = res\n",
" \n",
" title = add_to_dict(title, url)\n",
"\n",
" video_segments = get_segments(df, title, author, split_by_topic, segment_length)\n",
"\n",
" segments.extend(video_segments)\n",
" \n",
" if segments == []:\n",
" return \"Something wrong happened! Try specifying the YouTube videos or changing the query.\", \"\"\n",
"\n",
" print(\"Segments generated successfully!\")\n",
"\n",
" if fit_searcher(segments, n_neighbours):\n",
" print(\"Searcher fit successfully!\")\n",
" answer, references = generate_answer(question, model, token_budget, temperature)\n",
"\n",
" print(answer)\n",
"\n",
" return answer, references"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\andrew\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
}
],
"source": [
"nltk.download('stopwords')\n",
"tt = TextTilingTokenizer()\n",
"searcher = SemanticSearch()\n",
"\n",
"# Initialize a counter for duplicate titles\n",
"title_counter = {}\n",
"\n",
"# One to one mapping from titles to urls\n",
"titles_to_urls = {}"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"What it does after extracting only the code\n",
"https://youtu.be/l_r_b38VXmQ\n",
"Segments generated successfully!\n",
"Fitting with n=5...\n",
"Searcher fit successfully!\n",
"Segment from 'GPT-4 Auto Coder 2 reads docs from URL links and Auto Debugs' by echohive\n",
"Timestamp: (12:01, 13:12)\n",
"\n",
"exact same thing we tried at first right after this neural network demonstration so look so it's finished it and ran it there we go and test input I have successfully created synthetic data set and it's given its final answer yeah it's pretty crazy so look let's just grab our instructions without any links and give it to gpt4 and see how far it goes so you didn't even know what to do but I have added return of python script for this let's see what it does now as we see here we got some code loading of the text is fine but there's nothing about link chain it just simply doesn't know what Lang chain is it doesn't know what a QA chain is it's just trying to come up with something cool as far as it knows what's best but this is not what we were looking for by the way we're gonna review the code here in a moment but all the necessary code will be available to my patreon supporters the link will be in the description now let's try it with web browsing with gpt4 this is Cape this uses gpt4 and it\n",
"\n",
"\n",
"\n",
"Segment from 'GPT-4 Auto Coder 2 reads docs from URL links and Auto Debugs' by echohive\n",
"Timestamp: (07:31, 08:38)\n",
"\n",
"was mostly right it's 95 there all you have to do is just add Dot Page content over here and a good user question and you're good to go which would have taken somehow maybe longer uh to try to figure out from the documentation alone so as we see in the first interaction it actually received an error see set the sequence up so it actually did really well this time but it got the expected string because it had a problem with the how the character text splitter creates the document because it's a tuple so it actually decide to print it see as you see it actually prints it and it understands that it has to use page contents here let's say see says I can see that the text variable contains a list of document objects I need to extract the page content and here it's actually using that text first element of the text the page content versus up here when it was receiving the error it was only trying to let's see where is it it was only trying to use the first element which is the Tuple object right here right the list\n",
"\n",
"\n",
"\n",
"Segment from 'GPT-4 Auto Coder 2 reads docs from URL links and Auto Debugs' by echohive\n",
"Timestamp: (08:38, 09:46)\n",
"\n",
"the Tuple within a list does because at the end there's some metadata actually on the third try see it was able to create a code that actually worked perfectly it actually was able to get the answer right now it's actually giving us the final answer but we can just copy this for example you do have to run this in an environment which you have all these packages installed we're going to talk about the requirements here in a moment but if you were to bring it into this test and just run this let me see we are loading our QA chain text splitter we are loading Isaac newton.text and getting first chunk is the first element's page content and we are input inputting that is the document and question is what did Isaac Newton discover me right oh I have to stop that and whenever you run this is actually a working code line chain code question over question answering over documents is you have split documents into three but we are only using the first document and Isaac Michigan discovered the laws of motion and universal gravitation so I found that this works really well\n",
"\n",
"\n",
"\n",
"Segment from 'GPT-4 Auto Coder 2 reads docs from URL links and Auto Debugs' by echohive\n",
"Timestamp: (04:13, 05:20)\n",
"\n",
"lag format and if we take a look at the documentation as we see actually in the question answering stuff not in nowhere in the two pages stuff chain type is mentioned it's only using mapreduce and we're not going to be using summarize we're saying only question answering QA chain we're not even specifying the exact name and also with the characters text splitter we're saying chunk size to 500 and also the overlap of zero and this will of course split it into multiple pieces but we said that we only want to work with the first element of that first chunk or the first element of the list so as we see so we review it we have the summarize elements we have the QA elements and the text splitter once we know we have all our elements all we have to do is just put our instructions in here I have implemented multi-line input so you can actually paste code here as well because it's a multi-line input we do have to say done and click enter and now we're entering agent executor chain so this is the fun part as you see I have heavily\n",
"\n",
"\n",
"\n",
"Segment from 'GPT-4 Auto Coder 2 reads docs from URL links and Auto Debugs' by echohive\n",
"Timestamp: (00:00, 01:07)\n",
"\n",
"hey everybody today we're gonna look at different implementation of Auto GPT which writes code debugs it automatically all while using documentation as its guidance what we're going to do is we're going to load this page this URL question answering and summarize analyze document URL and character text Splitter from blank chain and then we're going to ask and we will be using a python agent with the python Ripple which is able to execute code and gets feedback from the output and takes in user instructions to write code whatever you like let's just run this the best way to understand it is to see it when we run this code we are first greeted that the script this script will be Auto executing AI generator code this may be dangerous make sure you're observing the process we have to say accept here once we type in accept and click enter we are asked do you want to load any URLs we're going to say yes we don't have to load URLs if you don't want to you can just give it instructions without the urls I've set it up so that you can load up to three\n",
"\n",
"\n",
"\n",
"I could not find an answer.\n"
]
},
{
"data": {
"text/plain": [
"('I could not find an answer.',\n",
" 'Segments that might have been used to answer your question: (If you specified more segments than shown here, consider increasing your token budget)\\n\\n### Segment 1 (12:01 - 13:12):\\n<iframe\\n width=\"400\"\\n height=\"240\"\\n src=\"https://youtube.com/embed/l_r_b38VXmQ?start=721&end=792&controls=0\"\\n frameborder=\"0\"\\n allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\\n allowfullscreen\\n >\\n </iframe>\\n\\n### Segment 2 (07:31 - 08:38):\\n<iframe\\n width=\"400\"\\n height=\"240\"\\n src=\"https://youtube.com/embed/l_r_b38VXmQ?start=451&end=518&controls=0\"\\n frameborder=\"0\"\\n allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\\n allowfullscreen\\n >\\n </iframe>\\n\\n### Segment 3 (08:38 - 09:46):\\n<iframe\\n width=\"400\"\\n height=\"240\"\\n src=\"https://youtube.com/embed/l_r_b38VXmQ?start=518&end=586&controls=0\"\\n frameborder=\"0\"\\n allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\\n allowfullscreen\\n >\\n </iframe>\\n\\n### Segment 4 (04:13 - 05:20):\\n<iframe\\n width=\"400\"\\n height=\"240\"\\n src=\"https://youtube.com/embed/l_r_b38VXmQ?start=253&end=320&controls=0\"\\n frameborder=\"0\"\\n allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\\n allowfullscreen\\n >\\n </iframe>\\n\\n### Segment 5 (00:00 - 01:07):\\n<iframe\\n width=\"400\"\\n height=\"240\"\\n src=\"https://youtube.com/embed/l_r_b38VXmQ?start=0&end=67&controls=0\"\\n frameborder=\"0\"\\n allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\\n allowfullscreen\\n >\\n </iframe>\\n\\n')"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"main(openAI_key=\"env\", \n",
" question=\"What it does after extracting only the code\", n_videos=5, \n",
" urls_text=\"https://youtu.be/l_r_b38VXmQ\", \n",
" split_by_topic=False, \n",
" segment_length=200, \n",
" n_neighbours=5, \n",
" model=\"gpt-3.5-turbo\", \n",
" token_budget=2000, \n",
" temperature=0)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}