wendru18 committed on
Commit 22d9367 · 0 Parent(s)

first commit

__pycache__/semantic_search.cpython-38.pyc ADDED
Binary file (1.71 kB).
 
app.py ADDED
@@ -0,0 +1,177 @@
+ from youtube_transcript_api import YouTubeTranscriptApi
+ from nltk.tokenize import TextTilingTokenizer
+ from semantic_search import SemanticSearch
+ import pandas as pd
+ import gradio as gr
+ import numpy as np
+ import requests
+ import tiktoken
+ import openai
+ import json
+
+ tt = TextTilingTokenizer()
+ searcher = SemanticSearch()
+
+ def get_youtube_data(url):
+
+     video_id = url.split("=")[1].split("&")[0]  # keep only the video id, dropping extra params such as &t=
+
+     raw = YouTubeTranscriptApi.get_transcript(video_id)
+
+     response = requests.get(f"https://noembed.com/embed?dataType=json&url={url}")
+     data = json.loads(response.content)
+
+     title, author = data["title"], data["author_name"]
+
+     df = pd.DataFrame(raw)
+
+     df['end'] = df['start'] + df['duration']
+     df['total_words'] = df['text'].apply(lambda x: len(x.split())).cumsum()
+     df["text"] = df["text"] + "\n\n"
+
+     return df, title, author
+
+ def to_timestamp(seconds):
+     seconds = int(seconds)
+
+     hours = seconds // 3600
+     minutes = (seconds % 3600) // 60
+     seconds_remaining = seconds % 60
+
+     if seconds >= 3600:
+         return f"{hours:02d}:{minutes:02d}:{seconds_remaining:02d}"
+     else:
+         return f"{minutes:02d}:{seconds_remaining:02d}"
+
+ def get_segments(df, title, author, split_by_topic, segment_length = 200):
+
+     transcript = df['text'].str.cat(sep=' ')
+
+     if not split_by_topic:
+         words = transcript.split()
+         segments = [' '.join(words[i:i+segment_length]) for i in range(0, len(words), segment_length)]
+     else:
+         segments = tt.tokenize(transcript)
+
+     segments = [segment.replace('\n\n','').strip() for segment in segments]
+
+     segments_wc = [len(segment.split()) for segment in segments]
+     segments_wc = np.cumsum(segments_wc)
+
+     idx = [np.argmin(np.abs(df['total_words'] - total_words)) for total_words in segments_wc]
+
+     segments_end_times = df['end'].iloc[idx].values
+     segments_end_times = np.insert(segments_end_times, 0, 0.0)
+
+     segments_times = [(to_timestamp(segments_end_times[i-1]), to_timestamp(segments_end_times[i])) for i in range(1,len(segments_end_times))]
+
+     segments_text = [f"Segment from '{title}' by {author}\nSegment timestamp: {segment_time}\n\n{segment}" for segment, segment_time in zip(segments, segments_times)]
+
+     return segments_text
+
+ def fit_searcher(segments, n_neighbors):
+     global searcher
+     searcher.fit(segments, n_neighbors=n_neighbors)  # pass as a keyword so it is not consumed by the batch parameter
+     return True
+
+ def num_tokens(text, model):
+     encoding = tiktoken.encoding_for_model(model)
+     return len(encoding.encode(text))
+
+ def form_query(question, model, token_budget):
+
+     results = searcher(question)
+
+     introduction = 'Use the below segments from multiple youtube videos to answer the subsequent question. If the answer cannot be found in the segments, write "I could not find an answer." Cite each reference using the [title, author, timestamp] notation. Every sentence should have a citation at the end.'
+
+     message = introduction
+
+     question = f"\n\nQuestion: {question}"
+
+     reference = []
+
+     for result in results:
+         result = "\n\n" + result
+         if (
+             num_tokens(message + result + question, model=model)
+             > token_budget
+         ):
+             break
+         else:
+             reference.append(result)
+             message += result
+
+     return message + question, reference
+
+ def generate_answer(question, model, token_budget):
+
+     message, reference = form_query(question, model, token_budget)
+
+     messages = [
+         {"role": "system", "content": "You answer questions about YouTube videos."},
+         {"role": "user", "content": message},
+     ]
+
+     response = openai.ChatCompletion.create(
+         model=model,
+         messages=messages,
+         temperature=0
+     )
+
+     response_message = response["choices"][0]["message"]["content"]
+     return response_message, reference
+
+
+ if False:
+     data = {}
+
+     question = "Why do some men have trouble with feminism?"
+     n_neighbors = 5
+
+     urls = ["https://www.youtube.com/watch?v=4xWJf8cERoM", "https://www.youtube.com/watch?v=vx-Si9gbijA"]
+     segments = []
+
+     for url in urls:
+         df, title, author = get_youtube_data(url)
+
+         video_segments = get_segments(df, title, author, split_by_topic = True)
+
+         segments.extend(video_segments)
+
+     print("Segments generated successfully!")
+
+     if fit_searcher(segments, n_neighbors):
+         print("Searcher fit successfully!")
+         answer, reference = generate_answer(question, model = "gpt-3.5-turbo", token_budget = 1000)
+         print(answer)
+         print(reference)
+
+ title = "Ask Youtube GPT"
+
+ description = """ """
+
+ with gr.Blocks() as demo:
+
+     gr.Markdown(f'<center><h1>{title}</h1></center>')
+     gr.Markdown(f'Ask YouTube GPT allows you to ask questions about a set of YouTube videos using the Universal Sentence Encoder and OpenAI. The returned response cites the video title, author and timestamp in square brackets where the information is located, adding credibility to the response and helping you locate incorrect information. If you need one, get your OpenAI API key <a href="https://platform.openai.com/account/api-keys">here</a>.')
+
+     with gr.Row():
+
+         with gr.Group():
+
+             openAI_key = gr.Textbox(label='Enter your OpenAI API key here')
+
+             # Allow the user to input multiple links, one per line, in a single textbox
+             links = gr.Textbox(lines=5, label="Enter the links to the YouTube videos you want to search (one per line):", placeholder="https://www.youtube.com/watch?v=4xWJf8cERoM\nhttps://www.youtube.com/watch?v=vx-Si9gbijA")
+
+             question = gr.Textbox(label='Enter your question here')
+             btn = gr.Button(value='Submit')
+             btn.style(full_width=True)
+
+         with gr.Group():
+             answer = gr.Textbox(label='The answer to your question is:')
+
+     # btn.click(question_answer, inputs=[url, file, question, openAI_key], outputs=[answer])
+
+ # openai.api_key = os.getenv('Your_Key_Here')
+ demo.launch()
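
Note: the Submit button is not yet wired to a handler in this commit (the btn.click call above is commented out and points at a question_answer function that does not exist here). As a rough sketch only, a handler built from the functions already defined in app.py could look like the following; the name ask_youtube, the gpt-3.5-turbo model and the 1000-token budget are assumptions, not part of the commit.

def ask_youtube(openai_api_key, links_text, question_text, n_neighbors=5):
    # Hypothetical handler: connects the existing pieces of app.py end to end.
    openai.api_key = openai_api_key

    # One URL per line in the links textbox.
    urls = [url.strip() for url in links_text.splitlines() if url.strip()]

    segments = []
    for url in urls:
        df, title, author = get_youtube_data(url)
        segments.extend(get_segments(df, title, author, split_by_topic=True))

    if not fit_searcher(segments, n_neighbors):
        return "Could not index the given videos."

    answer_text, _ = generate_answer(question_text, model="gpt-3.5-turbo", token_budget=1000)
    return answer_text

# Inside the gr.Blocks() context the button could then be wired as:
# btn.click(ask_youtube, inputs=[openAI_key, links, question], outputs=[answer])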
semantic_search.py ADDED
@@ -0,0 +1,38 @@
+ from sklearn.neighbors import NearestNeighbors
+ import tensorflow_hub as hub
+ import numpy as np
+
+ class SemanticSearch:
+
+     def __init__(self):
+         self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
+         self.fitted = False
+
+
+     def fit(self, data, batch=1000, n_neighbors=5):
+         self.data = data
+         self.embeddings = self.get_text_embedding(data, batch=batch)
+         n_neighbors = min(n_neighbors, len(self.embeddings))
+         self.nn = NearestNeighbors(n_neighbors=n_neighbors)
+         self.nn.fit(self.embeddings)
+         self.fitted = True
+
+
+     def __call__(self, text, return_data=True):
+         inp_emb = self.use([text])
+         neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
+
+         if return_data:
+             return [self.data[i] for i in neighbors]
+         else:
+             return neighbors
+
+
+     def get_text_embedding(self, texts, batch=1000):
+         embeddings = []
+         for i in range(0, len(texts), batch):
+             text_batch = texts[i:(i+batch)]
+             emb_batch = self.use(text_batch)
+             embeddings.append(emb_batch)
+         embeddings = np.vstack(embeddings)
+         return embeddings
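
A quick usage sketch (not part of the commit) for the class above: fit takes the number of neighbours as a keyword argument, since the second positional parameter is the embedding batch size, and calling the instance returns the nearest stored segments for a query. The example strings are made up.

searcher = SemanticSearch()  # first use downloads the Universal Sentence Encoder from TF Hub
searcher.fit(
    ["segment about training cats", "segment about training dogs", "segment about tax law"],
    n_neighbors=2,  # keyword, so it is not mistaken for the batch size
)
print(searcher("how do I teach my dog to sit?"))  # -> the two closest segments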
youtube.ipynb ADDED
@@ -0,0 +1,164 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 47,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from youtube_transcript_api import YouTubeTranscriptApi\n",
+     "from nltk.tokenize import TextTilingTokenizer \n",
+     "import pandas as pd\n",
+     "import numpy as np\n",
+     "import requests\n",
+     "import json\n",
+     "\n",
+     "url = \"https://www.youtube.com/watch?v=VcVfceTsD0A&t=163s\"\n",
+     "video_id = url.split(\"=\")[1]\n",
+     "\n",
+     "raw = YouTubeTranscriptApi.get_transcript(video_id)\n",
+     "\n",
+     "response = requests.get(f\"https://noembed.com/embed?dataType=json&url={url}\")\n",
+     "data = json.loads(response.content)\n",
+     "\n",
+     "title, author = data[\"title\"], data[\"author_name\"]"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 48,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Convert the list of dictionaries to a pandas dataframe\n",
+     "df = pd.DataFrame(raw)\n",
+     "\n",
+     "# Add end column\n",
+     "df['end'] = df['start'] + df['duration']\n",
+     "\n",
+     "# Add a new column to the dataframe called 'total_words' that contains the total number of words so far in the transcript\n",
+     "df['total_words'] = df['text'].apply(lambda x: len(x.split())).cumsum()\n",
+     "\n",
+     "# Add \"\\n\\n\" at the end of df[\"text\"]\n",
+     "df[\"text\"] = df[\"text\"] + \"\\n\\n\""
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 50,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Merge the text column into a single string and save to a transcript variable\n",
+     "\n",
+     "transcript = df['text'].str.cat(sep=' ')"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 51,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "tt = TextTilingTokenizer()\n",
+     "\n",
+     "# Tokenize the transcript into segments using the TextTilingTokenizer\n",
+     "segments = tt.tokenize(transcript)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 52,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# # Remove \\n\\n from each segment\n",
+     "segments = [segment.replace('\\n\\n','').strip() for segment in segments]"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 53,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Calculate a list of word count for each segment\n",
+     "segments_wc = [len(segment.split()) for segment in segments]\n",
+     "\n",
+     "# Make it cumulative\n",
+     "segments_wc = np.cumsum(segments_wc)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 54,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def to_timestamp(seconds):\n",
+     "\n",
+     "    seconds = int(seconds)\n",
+     "\n",
+     "    minutes = seconds // 60\n",
+     "    seconds_remaining = f\"{seconds % 60}\"\n",
+     "    \n",
+     "    if len(seconds_remaining) == 1:\n",
+     "        seconds_remaining = \"0\" + seconds_remaining\n",
+     "\n",
+     "    return f\"{minutes}:{seconds_remaining}\""
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 55,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# For each value in segments_wc, get the index of the closest value in df['total_words']\n",
+     "# This will be the index of the row in df that is closest to the end of each segment\n",
+     "idx = [np.argmin(np.abs(df['total_words'] - total_words)) for total_words in segments_wc]\n",
+     "\n",
+     "# Get segment end times from idx\n",
+     "segment_end_times = df['end'].iloc[idx].values\n",
+     "\n",
+     "# Add 0.0 to the beginning of segment_end_times\n",
+     "segment_end_times = np.insert(segment_end_times, 0, 0.0)\n",
+     "\n",
+     "# segment_times is a list of tuples containing the start and end times of each segment\n",
+     "segment_times = [(to_timestamp(segment_end_times[i-1]), to_timestamp(segment_end_times[i])) for i in range(1,len(segment_end_times))]"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 56,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# At the beginning of each segment, add the title, author, and segment times\n",
+     "segment_text = [f\"'{title}' by {author}\\nTimestamp: {segment_time}\\n\\n{segment}\" for segment, segment_time in zip(segments, segment_times)]"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.8.0"
+   },
+   "orig_nbformat": 4
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }