wendru18 committed · Commit 22d9367 · 0 Parent(s)
first commit
Browse files
- __pycache__/semantic_search.cpython-38.pyc +0 -0
- app.py +177 -0
- semantic_search.py +38 -0
- youtube.ipynb +164 -0
__pycache__/semantic_search.cpython-38.pyc
ADDED
Binary file (1.71 kB)
app.py
ADDED
@@ -0,0 +1,177 @@
from youtube_transcript_api import YouTubeTranscriptApi
from nltk.tokenize import TextTilingTokenizer
from semantic_search import SemanticSearch
import pandas as pd
import gradio as gr
import numpy as np
import requests
import tiktoken
import openai
import json

tt = TextTilingTokenizer()
searcher = SemanticSearch()

def get_youtube_data(url):

    video_id = url.split("=")[1]

    raw = YouTubeTranscriptApi.get_transcript(video_id)

    # Fetch the video title and author from the noembed oEmbed service
    response = requests.get(f"https://noembed.com/embed?dataType=json&url={url}")
    data = json.loads(response.content)

    title, author = data["title"], data["author_name"]

    df = pd.DataFrame(raw)

    # End time of each caption row, plus a running word count used later to map
    # text segments back to timestamps
    df['end'] = df['start'] + df['duration']
    df['total_words'] = df['text'].apply(lambda x: len(x.split())).cumsum()
    df["text"] = df["text"] + "\n\n"

    return df, title, author

def to_timestamp(seconds):
    seconds = int(seconds)

    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    seconds_remaining = seconds % 60

    if seconds >= 3600:
        return f"{hours:02d}:{minutes:02d}:{seconds_remaining:02d}"
    else:
        return f"{minutes:02d}:{seconds_remaining:02d}"

def get_segments(df, title, author, split_by_topic, segment_length=200):

    transcript = df['text'].str.cat(sep=' ')

    # Either split into fixed-length word chunks or let TextTiling find topic boundaries
    if not split_by_topic:
        words = transcript.split()
        segments = [' '.join(words[i:i+segment_length]) for i in range(0, len(words), segment_length)]
    else:
        segments = tt.tokenize(transcript)

    segments = [segment.replace('\n\n', '').strip() for segment in segments]

    # Cumulative word count at the end of each segment
    segments_wc = [len(segment.split()) for segment in segments]
    segments_wc = np.cumsum(segments_wc)

    # Index of the transcript row closest to the end of each segment
    idx = [np.argmin(np.abs(df['total_words'] - total_words)) for total_words in segments_wc]

    segments_end_times = df['end'].iloc[idx].values
    segments_end_times = np.insert(segments_end_times, 0, 0.0)

    segments_times = [(to_timestamp(segments_end_times[i-1]), to_timestamp(segments_end_times[i])) for i in range(1, len(segments_end_times))]

    # Prepend the video title, author and timestamp range to each segment
    segments_text = [f"Segment from '{title}' by {author}\nSegment timestamp: {segment_time}\n\n{segment}" for segment, segment_time in zip(segments, segments_times)]

    return segments_text

def fit_searcher(segments, n_neighbors):
    global searcher
    searcher.fit(segments, n_neighbors=n_neighbors)
    return True

def num_tokens(text, model):
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

def form_query(question, model, token_budget):

    results = searcher(question)

    introduction = 'Use the below segments from multiple youtube videos to answer the subsequent question. If the answer cannot be found in the segments, write "I could not find an answer." Cite each reference using the [title, author, timestamp] notation. Every sentence should have a citation at the end.'

    message = introduction

    question = f"\n\nQuestion: {question}"

    reference = []

    # Add retrieved segments until the token budget is exhausted
    for result in results:
        result = "\n\n" + result
        if (
            num_tokens(message + result + question, model=model)
            > token_budget
        ):
            break
        else:
            reference.append(result)
            message += result

    return message + question, reference

def generate_answer(question, model, token_budget):

    message, reference = form_query(question, model, token_budget)

    messages = [
        {"role": "system", "content": "You answer questions about YouTube videos."},
        {"role": "user", "content": message},
    ]

    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )

    response_message = response["choices"][0]["message"]["content"]
    return response_message, reference


if False:
    data = {}

    question = "Why do some men have trouble with feminism?"
    n_neighbors = 5

    urls = ["https://www.youtube.com/watch?v=4xWJf8cERoM", "https://www.youtube.com/watch?v=vx-Si9gbijA"]
    segments = []

    for url in urls:
        df, title, author = get_youtube_data(url)

        video_segments = get_segments(df, title, author, split_by_topic=True)

        segments.extend(video_segments)

    print("Segments generated successfully!")

    if fit_searcher(segments, n_neighbors):
        print("Searcher fit successfully!")
        answer, reference = generate_answer(question, model="gpt-3.5-turbo", token_budget=1000)
        print(answer)
        print(reference)

title = "Ask Youtube GPT"

description = """ """

with gr.Blocks() as demo:

    gr.Markdown(f'<center><h1>{title}</h1></center>')
    gr.Markdown('Ask YouTube GPT allows you to ask questions about a set of YouTube videos using the Universal Sentence Encoder and OpenAI. The returned response cites the video title, author and timestamp in square brackets where the information is located, adding credibility to the responses and helping you locate any incorrect information. If you need one, get your OpenAI API key <a href="https://platform.openai.com/account/api-keys">here</a>.')

    with gr.Row():

        with gr.Group():

            openAI_key = gr.Textbox(label='Enter your OpenAI API key here')

            # Allow the user to input multiple links, adding a textbox for each
            links = gr.Textbox(lines=5, label="Enter the links to the YouTube videos you want to search (one per line):", placeholder="https://www.youtube.com/watch?v=4xWJf8cERoM\nhttps://www.youtube.com/watch?v=vx-Si9gbijA")

            question = gr.Textbox(label='Enter your question here')
            btn = gr.Button(value='Submit')
            btn.style(full_width=True)

        with gr.Group():
            answer = gr.Textbox(label='The answer to your question is:')

    # btn.click(question_answer, inputs=[url, file, question,openAI_key], outputs=[answer])

#openai.api_key = os.getenv('Your_Key_Here')
demo.launch()
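In this commit the Submit button is not yet wired to the pipeline (the btn.click call is left commented out). The sketch below shows one way the functions above could be connected; ask_youtube is a hypothetical helper not present in the commit, and the n_neighbors, model and token_budget values are only illustrative defaults taken from the demo block.

# Hypothetical wiring for the Submit button (not part of this commit).
# Assumes one YouTube URL per line in the links textbox and a valid key in openAI_key.
def ask_youtube(openai_key, links_text, question_text):
    openai.api_key = openai_key
    segments = []
    for url in links_text.splitlines():
        url = url.strip()
        if not url:
            continue
        df, title, author = get_youtube_data(url)
        segments.extend(get_segments(df, title, author, split_by_topic=True))
    fit_searcher(segments, n_neighbors=5)
    answer_text, _ = generate_answer(question_text, model="gpt-3.5-turbo", token_budget=1000)
    return answer_text

# Inside the gr.Blocks() context, this could replace the commented-out line:
# btn.click(ask_youtube, inputs=[openAI_key, links, question], outputs=[answer])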
semantic_search.py
ADDED
@@ -0,0 +1,38 @@
from sklearn.neighbors import NearestNeighbors
import tensorflow_hub as hub
import numpy as np

class SemanticSearch:

    def __init__(self):
        # Universal Sentence Encoder from TF Hub, used to embed queries and segments
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False


    def fit(self, data, batch=1000, n_neighbors=5):
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True


    def __call__(self, text, return_data=True):
        # Embed the query and return its nearest stored segments (or their indices)
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

        if return_data:
            return [self.data[i] for i in neighbors]
        else:
            return neighbors


    def get_text_embedding(self, texts, batch=1000):
        # Embed texts in batches to keep memory usage bounded
        embeddings = []
        for i in range(0, len(texts), batch):
            text_batch = texts[i:(i+batch)]
            emb_batch = self.use(text_batch)
            embeddings.append(emb_batch)
        embeddings = np.vstack(embeddings)
        return embeddings
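A minimal usage sketch of the class above; the document strings are made up for illustration, and the first call downloads the Universal Sentence Encoder from TF Hub, which can take a while.

from semantic_search import SemanticSearch

docs = [
    "Segment about training neural networks.",
    "Segment about cooking pasta.",
    "Segment about gradient descent and learning rates.",
]

searcher = SemanticSearch()
searcher.fit(docs, n_neighbors=2)                    # embeds docs and builds the NearestNeighbors index
print(searcher("How do I tune a learning rate?"))    # returns the 2 closest segments as strings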
youtube.ipynb
ADDED
@@ -0,0 +1,164 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "from youtube_transcript_api import YouTubeTranscriptApi\n",
    "from nltk.tokenize import TextTilingTokenizer \n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import requests\n",
    "import json\n",
    "\n",
    "url = \"https://www.youtube.com/watch?v=VcVfceTsD0A&t=163s\"\n",
    "video_id = url.split(\"=\")[1]\n",
    "\n",
    "raw = YouTubeTranscriptApi.get_transcript(video_id)\n",
    "\n",
    "response = requests.get(f\"https://noembed.com/embed?dataType=json&url={url}\")\n",
    "data = json.loads(response.content)\n",
    "\n",
    "title, author = data[\"title\"], data[\"author_name\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the list of dictionaries to a pandas dataframe\n",
    "df = pd.DataFrame(raw)\n",
    "\n",
    "# Add end column\n",
    "df['end'] = df['start'] + df['duration']\n",
    "\n",
    "# Add a new column to the dataframe called 'total_words' that contains the total number of words so far in the transcript\n",
    "df['total_words'] = df['text'].apply(lambda x: len(x.split())).cumsum()\n",
    "\n",
    "# Add \"\\n\\n\" at the end of df[\"text\"]\n",
    "df[\"text\"] = df[\"text\"] + \"\\n\\n\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the text column into a single string and save to a transcript variable\n",
    "\n",
    "transcript = df['text'].str.cat(sep=' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "tt = TextTilingTokenizer()\n",
    "\n",
    "# Tokenize the transcript into segments using the TextTilingTokenizer\n",
    "segments = tt.tokenize(transcript)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove \\n\\n from each segment\n",
    "segments = [segment.replace('\\n\\n','').strip() for segment in segments]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate a list of word counts for each segment\n",
    "segments_wc = [len(segment.split()) for segment in segments]\n",
    "\n",
    "# Make it cumulative\n",
    "segments_wc = np.cumsum(segments_wc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "def to_timestamp(seconds):\n",
    "\n",
    "    seconds = int(seconds)\n",
    "\n",
    "    minutes = seconds // 60\n",
    "    seconds_remaining = f\"{seconds % 60}\"\n",
    "\n",
    "    if len(seconds_remaining) == 1:\n",
    "        seconds_remaining = \"0\" + seconds_remaining\n",
    "\n",
    "    return f\"{minutes}:{seconds_remaining}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For each value in segments_wc, get the index of the closest value in df['total_words']\n",
    "# This will be the index of the row in df that is closest to the end of each segment\n",
    "idx = [np.argmin(np.abs(df['total_words'] - total_words)) for total_words in segments_wc]\n",
    "\n",
    "# Get segment end times from idx\n",
    "segment_end_times = df['end'].iloc[idx].values\n",
    "\n",
    "# Add 0.0 to the beginning of segment_end_times\n",
    "segment_end_times = np.insert(segment_end_times, 0, 0.0)\n",
    "\n",
    "# segment_times is a list of tuples containing the start and end times of each segment\n",
    "segment_times = [(to_timestamp(segment_end_times[i-1]), to_timestamp(segment_end_times[i])) for i in range(1,len(segment_end_times))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# At the beginning of each segment, add the title, author, and segment times\n",
    "segment_text = [f\"'{title}' by {author}\\nTimestamp: {segment_time}\\n\\n{segment}\" for segment, segment_time in zip(segments, segment_times)]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
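The key trick in this notebook (and in get_segments in app.py) is mapping each text segment back to a timestamp through cumulative word counts: a segment's end time is taken from the transcript row whose running word count is closest to the segment's own running word count. A small self-contained illustration with made-up numbers:

# Illustration of the timestamp-mapping trick above, using invented values.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "end":         [4.0, 9.5, 15.0, 21.0],   # caption end times in seconds
    "total_words": [12, 25, 40, 55],          # cumulative words up to each caption
})

segments_wc = np.cumsum([24, 31])             # cumulative words at the end of each segment

# Row whose cumulative word count is closest to each segment boundary
idx = [np.argmin(np.abs(df["total_words"] - wc)) for wc in segments_wc]
print(df["end"].iloc[idx].values)             # -> [ 9.5 21. ], approximate segment end times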