wendru18 committed on
Commit
59db44a
1 Parent(s): f3ae348

yt links input now optional

Files changed (3)
  1. README.md +1 -1
  2. app.py +51 -14
  3. notebook.ipynb +27 -21
README.md CHANGED
@@ -15,7 +15,7 @@ Ask YouTube GPT allows you to ask questions about a set of YouTube Videos using
 
 ## Ideas / Future Improvements
 
-- [ ] Omit the need for a set of videos, and instead use a search query to find videos on the fly.
+- [x] Omit the need for a set of videos, and instead use a search query to find videos on the fly.
 - [ ] Add "Suggest a question" feature given the videos (maybe through clustering?)
 - [ ] Add explainable segment retrieval (i.e. why did that specific segment get chosen to answer the question?)
 - [ ] Add OpenAI embeddings
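The item checked off above is what this commit implements in app.py below: when no links are given, the question itself is used as a YouTube search query. As a minimal standalone sketch of the idea (assuming `youtube-search-python` is installed; the `find_videos` name and the query are illustrative, not part of the commit):

```python
# Sketch of on-the-fly video retrieval, mirroring the search_youtube helper
# this commit adds to app.py. Requires: pip install youtube-search-python
from youtubesearchpython import VideosSearch

def find_videos(query, n_videos=5):
    # result() returns a dict whose "result" key holds one dict per hit,
    # each carrying the video "id" a watch URL can be built from.
    search = VideosSearch(query, limit=n_videos)
    return ["https://www.youtube.com/watch?v=" + video["id"]
            for video in search.result()["result"]]

print(find_videos("topic segmentation"))  # illustrative query
```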
app.py CHANGED
@@ -1,5 +1,6 @@
 from youtube_transcript_api import YouTubeTranscriptApi
 from nltk.tokenize import TextTilingTokenizer
+from youtubesearchpython import VideosSearch
 from semantic_search import SemanticSearch
 import pandas as pd
 import gradio as gr
@@ -34,10 +35,14 @@ def get_youtube_data(url):
     try:
         raw = YouTubeTranscriptApi.get_transcript(video_id)
     except:
-        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
-        for transcript in transcript_list:
-            raw = transcript.translate('en').fetch()
-            break
+        try:
+            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
+            for transcript in transcript_list:
+                raw = transcript.translate('en').fetch()
+                break
+        except:
+            print(f"No transcript found for {url}")  # usually because the uploader disabled captions
+            return False
 
     response = requests.get(f"https://noembed.com/embed?dataType=json&url={url}")
     data = json.loads(response.content)
@@ -83,7 +88,10 @@ def get_segments(df, title, author, split_by_topic, segment_length = 200):
         words = transcript.split()
         segments = [' '.join(words[i:i+segment_length]) for i in range(0, len(words), segment_length)]
     else:
-        segments = tt.tokenize(transcript)
+        try:
+            segments = tt.tokenize(transcript)
+        except:
+            return ""
 
     segments = [segment.replace('\n','').strip() for segment in segments]
 
@@ -138,7 +146,7 @@ def form_query(question, model, token_budget):
 
     results = searcher(question)
 
-    introduction = 'Use the below segments from multiple youtube videos to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer." Cite each sentence using the [title, author, timestamp] notation. Every sentence must have a citation!'
+    introduction = 'Use the below segments from multiple youtube videos to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer." Cite each sentence using the [title, author, timestamp] notation. Every sentence MUST have a citation!'
 
     message = introduction
 
@@ -208,17 +216,28 @@ def add_to_dict(title, url):
     titles_to_urls[new_title] = url
     return new_title
 
-def main(openAI_key, urls_text, question, split_by_topic, segment_length, n_neighbours, model, token_budget, temperature):
+def search_youtube(question, n_videos):
+    videosSearch = VideosSearch(question, limit = n_videos)
+    urls = ["https://www.youtube.com/watch?v=" + video["id"] for video in videosSearch.result()["result"]]
+    print(urls)
+    return urls
+
+
+def main(openAI_key, question, n_videos, urls_text, split_by_topic, segment_length, n_neighbours, model, token_budget, temperature):
 
     print(question)
     print(urls_text)
 
     set_openai_key(openAI_key)
 
+    if urls_text == "":
+        urls = search_youtube(question, n_videos)
+    else:
+        urls = list(set(urls_text.split("\n")))
+
     global titles_to_urls
     titles_to_urls = {}
 
-    urls = list(set(urls_text.split("\n")))
     segments = []
 
     for url in urls:
@@ -226,13 +245,21 @@ def main(openAI_key, urls_text, question, split_by_topic, segment_length, n_neighbours, model, token_budget, temperature):
 
         if "youtu.be" in url:
             url = url.replace("youtu.be/", "youtube.com/watch?v=")
 
-        df, title, author = get_youtube_data(url)
+        res = get_youtube_data(url)
+
+        if not res:
+            continue
+
+        df, title, author = res
 
         title = add_to_dict(title, url)
 
        video_segments = get_segments(df, title, author, split_by_topic, segment_length)
 
         segments.extend(video_segments)
+
+    if segments == []:
+        return "Something went wrong! Try specifying the YouTube videos or changing the query.", ""
 
     print("Segments generated successfully!")
 
@@ -249,7 +276,7 @@ title = "Ask YouTube GPT 📺"
 with gr.Blocks() as demo:
 
     gr.Markdown(f'<center><h1>{title}</h1></center>')
-    gr.Markdown(f'Ask YouTube GPT allows you to ask questions about a set of YouTube Videos using Topic Segmentation, Universal Sentence Encoding, and OpenAI. It does not use the videos themselves, but rather their transcripts. The returned response cites the video title, author and timestamp in square brackets where the information is located, adding credibility to the responses and helping you locate incorrect information. If you need one, get your OpenAI API key <a href="https://platform.openai.com/account/api-keys">here</a>.')
+    gr.Markdown(f'Ask YouTube GPT allows you to ask questions about a set of YouTube Videos using Topic Segmentation, Universal Sentence Encoding, and OpenAI. It does not use the videos themselves, but rather their transcripts. The returned response cites the video title, author and timestamp in square brackets where the information is located, adding credibility to the responses and helping you locate incorrect information. If you need one, get your OpenAI API key <a href="https://platform.openai.com/account/api-keys">here</a>.\n\n### Latest Update (01/05/23)\n> Specifying the set of YouTube videos is now optional: you can simply enter a question and the number of videos to retrieve from YouTube.')
 
     with gr.Row():
 
@@ -257,12 +284,22 @@ with gr.Blocks() as demo:
 
         openAI_key = gr.Textbox(label='Enter your OpenAI API key here:')
 
-        # Allow the user to input multiple links, adding a textbox for each
-        urls_text = gr.Textbox(lines=5, label="Enter the links to the YouTube videos you want to search (one per line):", placeholder="https://www.youtube.com/watch?v=...")
-
         question = gr.Textbox(label='Enter your question here:')
 
         with gr.Accordion("Advanced Settings", open=False):
+            # Allow the user to input multiple links, one per line
+            urls_text = gr.Textbox(lines=5, label="Enter the links to the YouTube videos you want to search (one per line).", info="If left blank, the question will be used to search and retrieve videos from YouTube.", placeholder="https://www.youtube.com/watch?v=...")
+
+            n_videos = gr.Slider(label="Number of videos to retrieve", minimum=1, maximum=10, step=1, value=5, info="The number of videos to retrieve and feed to the GPT model for answering the question.")
+
+            def fn2(urls_text):
+                if urls_text != "":
+                    return gr.Slider.update(visible=False)
+                else:
+                    return gr.Slider.update(visible=True)
+
+            urls_text.change(fn2, urls_text, n_videos)
+
             split_by_topic = gr.Checkbox(label="Split segments by topic", value=True, info="Whether the video transcripts are to be segmented by topic or by word count. Topically-coherent segments may be more useful for question answering, but result in a slower response time, especially for lengthy videos.")
             segment_length = gr.Slider(label="Segment word count", minimum=50, maximum=500, step=50, value=200, visible=False)
 
@@ -288,7 +325,7 @@ with gr.Blocks() as demo:
         with gr.TabItem("References"):
             references = gr.Markdown()
 
-    btn.click(main, inputs=[openAI_key, urls_text, question, split_by_topic, segment_length, n_neighbours, model, token_budget, temperature], outputs=[answer, references])
+    btn.click(main, inputs=[openAI_key, question, n_videos, urls_text, split_by_topic, segment_length, n_neighbours, model, token_budget, temperature], outputs=[answer, references])
 
     #openai.api_key = os.getenv('Your_Key_Here')
     demo.launch()
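One caveat on the hunks above: both new fallbacks rely on bare `except:` clauses, which also swallow unrelated failures (network errors, rate limits). A tighter sketch of the same transcript fallback, assuming the exception classes `TranscriptsDisabled` and `NoTranscriptFound` are importable from `youtube_transcript_api` as in recent releases (the `fetch_transcript` name is illustrative, not part of this commit):

```python
# Hypothetical narrower version of the get_youtube_data fallback.
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
)

def fetch_transcript(video_id):
    try:
        # First choice: a transcript available directly in English (the default).
        return YouTubeTranscriptApi.get_transcript(video_id)
    except (TranscriptsDisabled, NoTranscriptFound):
        pass  # fall through to the translate-to-English path below
    try:
        # Same behaviour as the loop-with-break in the diff: take the first
        # listed transcript and translate it to English.
        for transcript in YouTubeTranscriptApi.list_transcripts(video_id):
            return transcript.translate('en').fetch()
    except (TranscriptsDisabled, NoTranscriptFound):
        print(f"No transcript found for {video_id}")  # captions disabled
    return None  # caller can skip the video, as main() does with `continue`
```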
notebook.ipynb CHANGED
@@ -2,9 +2,17 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 16,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "No transcript found\n"
+     ]
+    }
+   ],
    "source": [
     "from youtube_transcript_api import YouTubeTranscriptApi\n",
     "from nltk.tokenize import TextTilingTokenizer \n",
@@ -13,16 +21,20 @@
     "import requests\n",
     "import json\n",
     "\n",
-    "url = \"https://www.youtube.com/watch?v=77zvIYDFSok\"\n",
+    "url = \"https://www.youtube.com/watch?v=z7-K1zmBu-8\"\n",
     "video_id = url.split(\"=\")[1]\n",
     "\n",
     "try:\n",
     "    raw = YouTubeTranscriptApi.get_transcript(video_id)\n",
     "except:\n",
-    "    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)\n",
-    "    for transcript in transcript_list:\n",
-    "        raw = transcript.translate('en').fetch()\n",
-    "        break\n",
+    "    try:\n",
+    "        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)\n",
+    "        for transcript in transcript_list:\n",
+    "            raw = transcript.translate('en').fetch()\n",
+    "            break\n",
+    "    except:\n",
+    "        print(\"No transcript found\")\n",
+    "        raw = []\n",
     "\n",
     "response = requests.get(f\"https://noembed.com/embed?dataType=json&url={url}\")\n",
     "data = json.loads(response.content)\n",
@@ -505,22 +517,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 73,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "\"\\nSegment from 'Feminism Is 'Dividing This'' Country' by VICE News\\nTimestamp: (10:51, 12:24)\\nURL: https://www.youtube.com/watch?v=77zvIYDFSok&t=10:51 \\n---\\npersonally take while leading this group, let's create a world where feminists don't have to choose feminism. I choose feminism because I think you're watching. As a person, I live to protect the woman I love. I think I'm about the level of a director who creates a hero. Well, one day, [Music] We were humiliated like this. I think there are so many messages in this very short video. First of all, I think there's\\n---\\n\""
-      ]
-     },
-     "execution_count": 73,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "text"
+    "from youtubesearchpython import VideosSearch\n",
+    "\n",
+    "videosSearch = VideosSearch('NoCopyrightSounds', limit = 2)\n",
+    "\n",
+    "# Get URLs\n",
+    "urls = [video[\"id\"] for video in videosSearch.result()[\"result\"]]"
   ]
  }
 ],
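Note one difference from app.py: despite its "# Get URLs" comment, the new notebook cell collects bare video ids, while `search_youtube` in app.py prefixes each id into a full watch URL. If the notebook list is meant to feed `get_youtube_data`, the same prefixing applies; a quick sketch (the id below is a stand-in for the cell's actual output):

```python
# Turn bare ids, as produced by the notebook cell, into the watch URLs
# that app.py's pipeline expects. The example id is illustrative.
video_ids = ["z7-K1zmBu-8"]
urls = ["https://www.youtube.com/watch?v=" + vid for vid in video_ids]
print(urls)  # ['https://www.youtube.com/watch?v=z7-K1zmBu-8']
```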