wendru18 committed · commit 59db44a · parent: f3ae348

yt links input now optional

Files changed:
- README.md (+1, -1)
- app.py (+51, -14)
- notebook.ipynb (+27, -21)
README.md

```diff
@@ -15,7 +15,7 @@ Ask YouTube GPT allows you to ask questions about a set of YouTube Videos using
 
 ## Ideas / Future Improvements
 
-- [ ] Omit the need for a set of videos, and instead use a search query to find videos on the fly.
+- [x] Omit the need for a set of videos, and instead use a search query to find videos on the fly.
 - [ ] Add "Suggest a question" feature given the videos (maybe through clustering?)
 - [ ] Add explainable segment retrieval (i.e. why did that specific segment get chosen to answer the question?)
 - [ ] Add OpenAI embeddings
```
app.py

```diff
@@ -1,5 +1,6 @@
 from youtube_transcript_api import YouTubeTranscriptApi
 from nltk.tokenize import TextTilingTokenizer
+from youtubesearchpython import VideosSearch
 from semantic_search import SemanticSearch
 import pandas as pd
 import gradio as gr
```
```diff
@@ -34,10 +35,14 @@ def get_youtube_data(url):
     try:
         raw = YouTubeTranscriptApi.get_transcript(video_id)
     except:
-        [… four removed lines truncated in the page extraction …]
+        try:
+            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
+            for transcript in transcript_list:
+                raw = transcript.translate('en').fetch()
+                break
+        except:
+            print(f"No transcript found for {url}") # Usually because the video itself disabled captions
+            return False
 
     response = requests.get(f"https://noembed.com/embed?dataType=json&url={url}")
     data = json.loads(response.content)
```
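The new fallback follows the standard `youtube_transcript_api` pattern (the pre-1.0 interface this commit targets): if no directly fetchable transcript exists, list whatever transcripts the video does offer and translate the first one to English. A minimal standalone sketch; the video ID is an illustrative placeholder:

```python
from youtube_transcript_api import YouTubeTranscriptApi

video_id = "dQw4w9WgXcQ"  # illustrative placeholder, not from the commit

try:
    # Fast path: a transcript that can be fetched directly.
    raw = YouTubeTranscriptApi.get_transcript(video_id)
except Exception:
    try:
        # Fallback: take the first transcript the video offers and translate it to English.
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        raw = next(iter(transcript_list)).translate('en').fetch()
    except Exception:
        raw = []  # no transcript at all, e.g. the uploader disabled captions
```

Returning `False` from `get_youtube_data` (rather than raising) is what lets the reworked `main` further down skip unusable videos with `continue`.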
```diff
@@ -83,7 +88,10 @@ def get_segments(df, title, author, split_by_topic, segment_length = 200):
         words = transcript.split()
         segments = [' '.join(words[i:i+segment_length]) for i in range(0, len(words), segment_length)]
     else:
-        segments = tt.tokenize(transcript)
+        try:
+            segments = tt.tokenize(transcript)
+        except:
+            return ""
 
     segments = [segment.replace('\n','').strip() for segment in segments]
 
```
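`tt` here is NLTK's `TextTilingTokenizer`, which splits running text into topically coherent blocks; it raises `ValueError` when the input is too short or lacks paragraph breaks, which is presumably what the new guard is for. A sketch of the underlying call, with an illustrative transcript string:

```python
import nltk
from nltk.tokenize import TextTilingTokenizer

nltk.download("stopwords")  # TextTiling depends on NLTK's stopword list

tt = TextTilingTokenizer()
transcript = "First topic...\n\nMore text...\n\nSecond topic..."  # needs paragraph breaks

try:
    # Returns a list of multi-paragraph, topically coherent chunks.
    segments = tt.tokenize(transcript)
except ValueError:
    # Raised when the text is too short or has no paragraph structure to tile.
    segments = []
```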
```diff
@@ -138,7 +146,7 @@ def form_query(question, model, token_budget):
 
     results = searcher(question)
 
-    introduction = 'Use the below segments from multiple youtube videos to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer." Cite each sentence using the [title, author, timestamp] notation. Every sentence […]
+    introduction = 'Use the below segments from multiple youtube videos to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer." Cite each sentence using the [title, author, timestamp] notation. Every sentence MUST have a citation!'
 
     message = introduction
 
```
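The rest of `form_query` is not shown in this diff. Given its `token_budget` parameter, a typical shape for such a function is to append retrieved segments to the introduction until a token budget is exhausted. A hypothetical sketch only; the `tiktoken` usage and the helper name are assumptions, not the commit's code:

```python
import tiktoken

def form_query_sketch(question, segments, model="gpt-3.5-turbo", token_budget=3000):
    # Hypothetical reconstruction: not the commit's actual form_query body.
    enc = tiktoken.encoding_for_model(model)
    introduction = ('Use the below segments from multiple youtube videos to answer '
                    'the subsequent question. ... Every sentence MUST have a citation!')
    message = introduction
    for seg in segments:
        block = f'\n\nSegment:\n"""\n{seg}\n"""'
        # Stop adding segments once the prompt would exceed the budget.
        if len(enc.encode(message + block + question)) > token_budget:
            break
        message += block
    return message + f"\n\nQuestion: {question}"
```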
```diff
@@ -208,17 +216,28 @@ def add_to_dict(title, url):
     titles_to_urls[new_title] = url
     return new_title
 
-def main(openAI_key, urls_text, question, split_by_topic, segment_length, n_neighbours, model, token_budget, temperature):
+def search_youtube(question, n_videos):
+    videosSearch = VideosSearch(question, limit = n_videos)
+    urls = ["https://www.youtube.com/watch?v=" + video["id"] for video in videosSearch.result()["result"]]
+    print(urls)
+    return urls
+
+
+def main(openAI_key, question, n_videos, urls_text, split_by_topic, segment_length, n_neighbours, model, token_budget, temperature):
 
     print(question)
     print(urls_text)
 
     set_openai_key(openAI_key)
 
+    if urls_text == "":
+        urls = search_youtube(question, n_videos)
+    else:
+        urls = list(set(urls_text.split("\n")))
+
     global titles_to_urls
     titles_to_urls = {}
 
-    urls = list(set(urls_text.split("\n")))
     segments = []
 
     for url in urls:
```
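`search_youtube` relies on the `youtube-search-python` package imported at the top of the file. A minimal sketch of the call it wraps; the query string is illustrative:

```python
from youtubesearchpython import VideosSearch

# Keyless YouTube search; `limit` caps the number of results returned.
videosSearch = VideosSearch("topic segmentation tutorial", limit=5)  # illustrative query

for video in videosSearch.result()["result"]:
    # Each result dict includes the video id plus title, channel, duration, etc.
    print("https://www.youtube.com/watch?v=" + video["id"])
```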
```diff
@@ -226,13 +245,21 @@ def main(openAI_key, urls_text, question, split_by_topic, segment_length, n_neighbours, model, token_budget, temperature):
         if "youtu.be" in url:
             url = url.replace("youtu.be/", "youtube.com/watch?v=")
 
-        df, title, author = get_youtube_data(url)
+        res = get_youtube_data(url)
+
+        if not res:
+            continue
+
+        df, title, author = res
 
         title = add_to_dict(title, url)
 
         video_segments = get_segments(df, title, author, split_by_topic, segment_length)
 
         segments.extend(video_segments)
+
+    if segments == []:
+        return "Something wrong happened! Try specifying the YouTube videos or changing the query.", ""
 
     print("Segments generated successfully!")
 
```
```diff
@@ -249,7 +276,7 @@ title = "Ask YouTube GPT 📺"
 with gr.Blocks() as demo:
 
     gr.Markdown(f'<center><h1>{title}</h1></center>')
-    gr.Markdown(f'Ask YouTube GPT allows you to ask questions about a set of YouTube Videos using Topic Segmentation, Universal Sentence Encoding, and Open AI. It does not use the video/s itself, but rather the transcript/s of such video/s. The returned response cites the video title, author and timestamp in square brackets where the information is located, adding credibility to the responses and helping you locate incorrect information. If you need one, get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a>.</p>')
+    gr.Markdown(f'Ask YouTube GPT allows you to ask questions about a set of YouTube Videos using Topic Segmentation, Universal Sentence Encoding, and Open AI. It does not use the video/s itself, but rather the transcript/s of such video/s. The returned response cites the video title, author and timestamp in square brackets where the information is located, adding credibility to the responses and helping you locate incorrect information. If you need one, get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a>.</p>\n\n### Latest Update (01/05/23)\n> Specifying the set of YouTube videos has now been made optional. Instead you can simply specify a question and the number of videos to retrieve from YouTube.')
 
     with gr.Row():
```
```diff
@@ -257,12 +284,22 @@ with gr.Blocks() as demo:
 
         openAI_key=gr.Textbox(label='Enter your OpenAI API key here:')
 
-        # Allow the user to input multiple links, adding a textbox for each
-        urls_text = gr.Textbox(lines=5, label="Enter the links to the YouTube videos you want to search (one per line):", placeholder="https://www.youtube.com/watch?v=...")
-
         question = gr.Textbox(label='Enter your question here:')
 
         with gr.Accordion("Advanced Settings", open=False):
+            # Allow the user to input multiple links, adding a textbox for each
+            urls_text = gr.Textbox(lines=5, label="Enter the links to the YouTube videos you want to search (one per line).", info="If left blank, the question will be used to search and retrieve videos from YouTube.", placeholder="https://www.youtube.com/watch?v=...")
+
+            n_videos = gr.Slider(label="Number of videos to retrieve", minimum=1, maximum=10, step=1, value=5, info="The number of videos to retrieve and feed to the GPT model for answering the question.")
+
+            def fn2(urls_text):
+                if urls_text != "":
+                    return gr.Slider.update(visible=False)
+                else:
+                    return gr.Slider.update(visible=True)
+
+            urls_text.change(fn2, urls_text, n_videos)
+
             split_by_topic = gr.Checkbox(label="Split segments by topic", value=True, info="Whether the video transcripts are to be segmented by topic or by word count. Topically-coherent segments may be more useful for question answering, but results in a slower response time, especially for lengthy videos.")
             segment_length = gr.Slider(label="Segment word count", minimum=50, maximum=500, step=50, value=200, visible=False)
 
```
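`fn2` is the stock Gradio 3.x pattern for reactive visibility: listen for `change` events on one component and return an update for another. A condensed sketch (note that `gr.Slider.update` is the Gradio 3.x API this commit targets; Gradio 4 replaced it with `gr.update`):

```python
import gradio as gr

with gr.Blocks() as demo:
    urls_text = gr.Textbox(lines=5, label="Video links (one per line)")
    n_videos = gr.Slider(minimum=1, maximum=10, step=1, value=5,
                         label="Number of videos to retrieve")

    def toggle_slider(text):
        # Hide the slider when explicit links are provided; show it otherwise.
        return gr.Slider.update(visible=(text == ""))

    urls_text.change(toggle_slider, urls_text, n_videos)

demo.launch()
```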
```diff
@@ -288,7 +325,7 @@ with gr.Blocks() as demo:
         with gr.TabItem("References"):
             references = gr.Markdown()
 
-    btn.click(main, inputs=[openAI_key, urls_text, question, split_by_topic, segment_length, n_neighbours, model, token_budget, temperature], outputs=[answer, references])
+    btn.click(main, inputs=[openAI_key, question, n_videos, urls_text, split_by_topic, segment_length, n_neighbours, model, token_budget, temperature], outputs=[answer, references])
 
 #openai.api_key = os.getenv('Your_Key_Here')
 demo.launch()
```
notebook.ipynb

```diff
@@ -2,9 +2,17 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": […],
+   "execution_count": 16,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "No transcript found\n"
+     ]
+    }
+   ],
    "source": [
     "from youtube_transcript_api import YouTubeTranscriptApi\n",
     "from nltk.tokenize import TextTilingTokenizer \n",
```
```diff
@@ -13,16 +21,20 @@
    "import requests\n",
    "import json\n",
    "\n",
-   "url = \"https://www.youtube.com/watch?v=[…]\"\n",
+   "url = \"https://www.youtube.com/watch?v=z7-K1zmBu-8\"\n",
    "video_id = url.split(\"=\")[1]\n",
    "\n",
    "try:\n",
    "    raw = YouTubeTranscriptApi.get_transcript(video_id)\n",
    "except:\n",
-   [… four removed source lines truncated in the page extraction …]
+   "    try:\n",
+   "        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)\n",
+   "        for transcript in transcript_list:\n",
+   "            raw = transcript.translate('en').fetch()\n",
+   "            break\n",
+   "    except:\n",
+   "        print(\"No transcript found\")\n",
+   "        raw = []\n",
    "\n",
    "response = requests.get(f\"https://noembed.com/embed?dataType=json&url={url}\")\n",
    "data = json.loads(response.content)\n",
```
```diff
@@ -505,22 +517,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 73,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "\"\\nSegment from 'Feminism Is 'Dividing This'' Country' by VICE News\\nTimestamp: (10:51, 12:24)\\nURL: https://www.youtube.com/watch?v=77zvIYDFSok&t=10:51 \\n---\\npersonally take while leading this group, let's create a world where feminists don't have to choose feminism. I choose feminism because I think you're watching. As a person, I live to protect the woman I love. I think I'm about the level of a director who creates a hero. Well, one day, [Music] We were humiliated like this. I think there are so many messages in this very short video. First of all, I think there's\\n---\\n\""
-      ]
-     },
-     "execution_count": 73,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-   [… one removed source line truncated in the page extraction …]
+    "from youtubesearchpython import VideosSearch\n",
+    "\n",
+    "videosSearch = VideosSearch('NoCopyrightSounds', limit = 2)\n",
+    "\n",
+    "# Get URLs\n",
+    "urls = [video[\"id\"] for video in videosSearch.result()[\"result\"]]"
    ]
   }
  ],
```
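One caveat about the new notebook cell: despite the `# Get URLs` comment, the comprehension collects bare video ids, whereas the app.py version prepends the watch URL. `youtube-search-python` results also expose a ready-made `link` field, so an equivalent sketch would be:

```python
from youtubesearchpython import VideosSearch

videosSearch = VideosSearch('NoCopyrightSounds', limit=2)

# Each result already carries a full watch URL in its "link" field.
urls = [video["link"] for video in videosSearch.result()["result"]]
print(urls)
```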