wendru18 commited on
Commit
e7c64ec
·
1 Parent(s): 58cd21f

working version

Browse files
Files changed (3) hide show
  1. app.py +117 -33
  2. notebook.ipynb +404 -19
  3. semantic_search.py +1 -0
app.py CHANGED
@@ -8,15 +8,28 @@ import requests
8
  import tiktoken
9
  import openai
10
  import json
 
11
 
12
  tt = TextTilingTokenizer()
13
  searcher = SemanticSearch()
14
 
 
 
 
 
 
 
15
  def get_youtube_data(url):
16
 
17
  video_id = url.split("=")[1]
18
 
19
- raw = YouTubeTranscriptApi.get_transcript(video_id)
 
 
 
 
 
 
20
 
21
  response = requests.get(f"https://noembed.com/embed?dataType=json&url={url}")
22
  data = json.loads(response.content)
@@ -43,6 +56,17 @@ def to_timestamp(seconds):
43
  else:
44
  return f"{minutes:02d}:{seconds_remaining:02d}"
45
 
 
 
 
 
 
 
 
 
 
 
 
46
  def get_segments(df, title, author, split_by_topic, segment_length = 200):
47
 
48
  transcript = df['text'].str.cat(sep=' ')
@@ -53,7 +77,7 @@ def get_segments(df, title, author, split_by_topic, segment_length = 200):
53
  else:
54
  segments = tt.tokenize(transcript)
55
 
56
- segments = [segment.replace('\n\n','').strip() for segment in segments]
57
 
58
  segments_wc = [len(segment.split()) for segment in segments]
59
  segments_wc = np.cumsum(segments_wc)
@@ -63,97 +87,136 @@ def get_segments(df, title, author, split_by_topic, segment_length = 200):
63
  segments_end_times = df['end'].iloc[idx].values
64
  segments_end_times = np.insert(segments_end_times, 0, 0.0)
65
 
66
- segments_times = [(to_timestamp(segments_end_times[i-1]), to_timestamp(segments_end_times[i])) for i in range(1,len(segments_end_times))]
67
 
68
- segments_text = [f"Segment from '{title}' by {author}\nSegment timestamp: {segment_time}\n\n{segment}" for segment, segment_time in zip(segments, segments_times)]
69
 
70
  return segments_text
71
 
72
- def fit_searcher(segments, n_neighbors):
73
  global searcher
74
- searcher.fit(segments, n_neighbors)
75
  return True
76
 
77
  def num_tokens(text, model):
78
  encoding = tiktoken.encoding_for_model(model)
79
  return len(encoding.encode(text))
80
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  def form_query(question, model, token_budget):
82
 
83
  results = searcher(question)
84
 
85
- introduction = 'Use the below segments from multiple youtube videos to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer." Cite each reference using the [title, author, timestamp] notation. Every sentence should have a citation at the end.'
86
 
87
  message = introduction
88
 
89
  question = f"\n\nQuestion: {question}"
90
 
91
- reference = []
92
 
93
- for result in results:
94
- result = "\n\n" + result
95
  if (
96
  num_tokens(message + result + question, model=model)
97
  > token_budget
98
  ):
99
  break
100
  else:
101
- reference.append(result)
102
  message += result
 
 
 
 
 
 
103
 
104
- return message + question, reference
105
 
106
- def generate_answer(question, model, token_budget):
107
 
108
- message, reference = form_query(question, model, token_budget)
109
 
110
  messages = [
111
- {"role": "system", "content": "You answer questions about legal contracts."},
112
  {"role": "user", "content": message},
113
  ]
114
 
115
  response = openai.ChatCompletion.create(
116
  model=model,
117
  messages=messages,
118
- temperature=0
119
  )
120
 
121
  response_message = response["choices"][0]["message"]["content"]
122
- return response_message, reference
123
 
 
 
124
 
125
- if False:
126
- data = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
- question = "Why do some men have trouble with feminism?"
129
- n_neighbors = 5
130
 
131
- urls = ["https://www.youtube.com/watch?v=4xWJf8cERoM", "https://www.youtube.com/watch?v=vx-Si9gbijA"]
132
  segments = []
133
 
134
  for url in urls:
135
  df, title, author = get_youtube_data(url)
 
 
136
 
137
- video_segments = get_segments(df, title, author, split_by_topic = True)
138
 
139
  segments.extend(video_segments)
140
 
141
  print("Segments generated successfully!")
142
 
143
- if fit_searcher(segments, n_neighbors):
144
  print("Searcher fit successfully!")
145
- answer, reference = generate_answer(question, model = "gpt-3.5-turbo", token_budget = 1000)
146
- print(answer)
147
- print(reference)
148
 
149
- title = "Ask Youtube GPT"
150
 
151
- description = """ """
152
 
153
  with gr.Blocks() as demo:
154
 
155
  gr.Markdown(f'<center><h1>{title}</h1></center>')
156
- gr.Markdown(f'Ask YouTube GPT allows you to ask questions about a set of Youtube Videos using Universal Sentence Encoder and Open AI. The returned response cites the video title, author and timestamp in square brackets where the information is located, adding credibility to the responses and helping you to locate incorrect information. If you need one, get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a>.</p>')
157
 
158
  with gr.Row():
159
 
@@ -162,16 +225,37 @@ with gr.Blocks() as demo:
162
  openAI_key=gr.Textbox(label='Enter your OpenAI API key here')
163
 
164
  # Allow the user to input multiple links, adding a textbox for each
165
- links = gr.Textbox(lines=5, label="Enter the links to the YouTube videos you want to search (one per line):", placeholder="https://www.youtube.com/watch?v=4xWJf8cERoM\nhttps://www.youtube.com/watch?v=vx-Si9gbijA")
166
 
167
  question = gr.Textbox(label='Enter your question here')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  btn = gr.Button(value='Submit')
169
  btn.style(full_width=True)
170
 
171
  with gr.Group():
172
- answer = gr.Textbox(label='The answer to your question is :')
 
 
 
 
 
173
 
174
- # btn.click(question_answer, inputs=[url, file, question,openAI_key], outputs=[answer])
175
 
176
  #openai.api_key = os.getenv('Your_Key_Here')
177
  demo.launch()
 
8
  import tiktoken
9
  import openai
10
  import json
11
+ import re
12
 
13
  tt = TextTilingTokenizer()
14
  searcher = SemanticSearch()
15
 
16
+ # Initialize a counter for duplicate titles
17
+ title_counter = {}
18
+
19
+ # One to one mapping from titles to urls
20
+ titles_to_urls = {}
21
+
22
  def get_youtube_data(url):
23
 
24
  video_id = url.split("=")[1]
25
 
26
+ try:
27
+ raw = YouTubeTranscriptApi.get_transcript(video_id)
28
+ except:
29
+ transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
30
+ for transcript in transcript_list:
31
+ raw = transcript.translate('en').fetch()
32
+ break
33
 
34
  response = requests.get(f"https://noembed.com/embed?dataType=json&url={url}")
35
  data = json.loads(response.content)
 
56
  else:
57
  return f"{minutes:02d}:{seconds_remaining:02d}"
58
 
59
def to_seconds(timestamp):
    """Convert a "MM:SS" or "HH:MM:SS" timestamp string to whole seconds.

    Raises:
        ValueError: if the timestamp does not have exactly two or three
            colon-separated parts, or a part is not an integer.
    """
    parts = timestamp.split(':')
    if len(parts) not in (2, 3):
        raise ValueError("Invalid timestamp format")

    # Each step shifts the running total up by one base-60 position,
    # so the same fold handles both MM:SS and HH:MM:SS.
    total = 0
    for part in parts:
        total = total * 60 + int(part)
    return total
69
+
70
  def get_segments(df, title, author, split_by_topic, segment_length = 200):
71
 
72
  transcript = df['text'].str.cat(sep=' ')
 
77
  else:
78
  segments = tt.tokenize(transcript)
79
 
80
+ segments = [segment.replace('\n','').strip() for segment in segments]
81
 
82
  segments_wc = [len(segment.split()) for segment in segments]
83
  segments_wc = np.cumsum(segments_wc)
 
87
  segments_end_times = df['end'].iloc[idx].values
88
  segments_end_times = np.insert(segments_end_times, 0, 0.0)
89
 
90
+ segments_times = [f"({to_timestamp(segments_end_times[i-1])}, {to_timestamp(segments_end_times[i])})" for i in range(1,len(segments_end_times))]
91
 
92
+ segments_text = [f"Segment from '{title}' by {author}\nTimestamp: {segment_time}\n\n{segment}\n" for segment, segment_time in zip(segments, segments_times)]
93
 
94
  return segments_text
95
 
96
def fit_searcher(segments, n_neighbours):
    """Fit the module-level `searcher` on `segments` and return True.

    `n_neighbours` is forwarded to `searcher.fit` as its `n_neighbors`
    keyword argument.
    """
    # `searcher` is only used here, never rebound, so no `global` is needed.
    searcher.fit(segments, n_neighbors=n_neighbours)
    return True


def num_tokens(text, model):
    """Return how many tokens `text` encodes to under `model`'s tiktoken encoding."""
    tokens = tiktoken.encoding_for_model(model).encode(text)
    return len(tokens)
104
 
105
def refencify(text):
    """Prefix a segment with a link to the moment in the video where it starts.

    `text` must begin with the header written by `get_segments`:
    "Segment from '<title>' by <author>" on one line and
    "Timestamp: (<start>, <end>)" on the next.

    Returns:
        "Segment URL: <url>&t=<start in seconds>\n" followed by `text`
        unchanged.

    Raises:
        ValueError: if the title or timestamp header cannot be found, or the
            start timestamp is malformed.
        KeyError: if the title is not registered in `titles_to_urls`.
    """
    # Anchor on the "' by " separator instead of a bare closing quote: a
    # greedy match up to the last apostrophe on the line would swallow part
    # of the author (e.g. "O'Brien") and produce a title that is not a key
    # of `titles_to_urls`.
    title_match = re.search(r"Segment from '(.+)' by ", text)
    timestamp_match = re.search(r"Timestamp: \((.+)\)", text)

    if title_match is None or timestamp_match is None:
        raise ValueError("Segment is missing its title or timestamp header")

    title = title_match.group(1)
    start_timestamp = timestamp_match.group(1).split(",")[0].strip()

    url = titles_to_urls[title]
    start_seconds = to_seconds(start_timestamp)

    return f"Segment URL: {url}&t={start_seconds}\n" + text


def form_query(question, model, token_budget):
    """Build the prompt for the chat model and a references listing.

    The segments returned by `searcher(question)` are appended to the
    introduction, in order, until adding the next one would push the prompt
    (introduction + segments + question) over `token_budget` tokens as
    counted by `num_tokens` for `model`.

    Returns:
        A `(prompt, references)` tuple. `references` is a Markdown string
        with one "### Segment N:" entry per segment included in the prompt.
    """
    results = searcher(question)

    introduction = 'Use the below segments from multiple youtube videos to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer." Cite each references using the [title, author, timestamp] notation. Every sentence should have a citation at the end.'

    message = introduction

    question = f"\n\nQuestion: {question}"

    references = ""

    for i, result in enumerate(results):
        # Separate every segment from what precedes it. Putting the blank
        # line in front (rather than after and trimming it later) keeps the
        # introduction intact when no segment fits and stops the first
        # segment from being glued onto the introduction's last sentence.
        candidate = message + "\n\n" + result
        if num_tokens(candidate + question, model=model) > token_budget:
            break

        message = candidate
        references += f"### Segment {i+1}:\n" + refencify(result) + "\n\n"

    references = "Segments that might have been used to answer your question: (If you specified more segments than shown here, consider increasing your token budget)\n\n" + references

    return message + question, references
146
 
147
def generate_answer(question, model, token_budget, temperature):
    """Ask the chat model `question` and return `(answer, references)`.

    The prompt and the references listing come from `form_query`; the prompt
    is sent as the user turn of an `openai.ChatCompletion.create` call made
    with the given `model` and `temperature`.
    """
    prompt, references = form_query(question, model, token_budget)

    system_turn = {"role": "system", "content": "You answer questions about YouTube videos."}
    user_turn = {"role": "user", "content": prompt}

    completion = openai.ChatCompletion.create(
        model=model,
        messages=[system_turn, user_turn],
        temperature=temperature,
    )

    # Only the text of the first choice is used.
    answer_text = completion["choices"][0]["message"]["content"]
    return answer_text, references
164
 
165
def add_to_dict(title, url):
    """Register `url` in `titles_to_urls` under a unique title and return that title.

    The first video with a given title keeps it unchanged. Later videos with
    the same title get a numeric suffix: "<title> (1)", "<title> (2)", ...
    `title_counter` remembers the last suffix handed out for each title.
    """
    if title not in titles_to_urls:
        # This is the first occurrence of this title
        titles_to_urls[title] = url
        return title

    # Resume from the last suffix used for this title and skip any candidate
    # that is already a key (for example a video literally titled
    # "<title> (1)"); otherwise that entry's URL would be silently
    # overwritten and the title -> URL mapping would stop being one to one.
    suffix = title_counter.get(title, 0)
    while True:
        suffix += 1
        new_title = f"{title} ({suffix})"
        if new_title not in titles_to_urls:
            break

    title_counter[title] = suffix
    titles_to_urls[new_title] = url
    return new_title
188
+
189
def main(urls_text, question, split_by_topic, segment_length, n_neighbours, model, token_budget, temperature):
    """Answer `question` from the transcripts of the given YouTube videos.

    Args:
        urls_text: video URLs, one per line. Blank lines, surrounding
            whitespace and repeated URLs are ignored.
        question: the question to answer.
        split_by_topic, segment_length: forwarded to `get_segments`.
        n_neighbours: forwarded to `fit_searcher`.
        model, token_budget, temperature: forwarded to `generate_answer`.

    Returns:
        The `(answer, references)` tuple produced by `generate_answer`.

    Raises:
        ValueError: if `urls_text` contains no URL.
        RuntimeError: if `fit_searcher` reports failure.
    """
    global title_counter

    # Start every run from a clean slate. Leaving `titles_to_urls` populated
    # from a previous run would make every title look like a duplicate and
    # pick up a "(1)" suffix.
    title_counter = {}
    titles_to_urls.clear()

    # Drop blank lines (an empty URL cannot be parsed for a video id) and
    # de-duplicate while keeping the order the user typed, so duplicate-title
    # suffixes are assigned deterministically.
    stripped_lines = (line.strip() for line in urls_text.splitlines())
    urls = list(dict.fromkeys(line for line in stripped_lines if line))

    if not urls:
        raise ValueError("Please provide at least one YouTube URL")

    segments = []

    for url in urls:
        df, title, author = get_youtube_data(url)

        title = add_to_dict(title, url)

        video_segments = get_segments(df, title, author, split_by_topic, segment_length)

        segments.extend(video_segments)

    print("Segments generated successfully!")

    if not fit_searcher(segments, n_neighbours):
        # Fail explicitly rather than fall through to an unbound `answer`.
        raise RuntimeError("Could not fit the semantic searcher")

    print("Searcher fit successfully!")
    answer, references = generate_answer(question, model, token_budget, temperature)

    return answer, references
213
 
214
+ title = "Ask YouTube GPT 📺"
215
 
216
  with gr.Blocks() as demo:
217
 
218
  gr.Markdown(f'<center><h1>{title}</h1></center>')
219
+ gr.Markdown(f'Ask YouTube GPT allows you to ask questions about a set of Youtube Videos using Universal Sentence Encoder and Open AI. The returned response cites the video title, author and timestamp in square brackets where the information is located, adding credibility to the responses and helping you locate incorrect information. If you need one, get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a>.</p>')
220
 
221
  with gr.Row():
222
 
 
225
  openAI_key=gr.Textbox(label='Enter your OpenAI API key here')
226
 
227
  # Allow the user to input multiple links in a single textbox, one per line
228
+ urls_text = gr.Textbox(lines=5, label="Enter the links to the YouTube videos you want to search (one per line):", placeholder="https://www.youtube.com/watch?v=4xWJf8cERoM\nhttps://www.youtube.com/watch?v=vx-Si9gbijA")
229
 
230
  question = gr.Textbox(label='Enter your question here')
231
+
232
+ with gr.Accordion("Advanced Settings", open=False):
233
+ split_by_topic = gr.Checkbox(label="Split segments by topic", value=True, info="Whether the video transcripts are to be segmented by topic or by word count. Splitting by topic may result in a more coherent response, but results in a slower response time, especially for lengthy videos.")
234
+ segment_length = gr.Slider(label="Segment word count", minimum=50, maximum=500, step=50, value=200, visible=False)
235
+
236
+ def fn(split_by_topic):
237
+ return gr.Slider.update(visible=not split_by_topic)
238
+
239
+ # If the user chooses not to split by topic, let them set the segment word count. (Make segment_length visible)
240
+ split_by_topic.change(fn, split_by_topic, segment_length)
241
+
242
+ n_neighbours = gr.Slider(label="Number of segments to retrieve", minimum=1, maximum=20, step=1, value=5, info="The number of segments to retrieve from each video and feed to the GPT model for answering.")
243
+ model = gr.Dropdown(label="Model", value="gpt-3.5-turbo", choices=["gpt-3.5-turbo", "gpt-4"])
244
+ token_budget = gr.Slider(label="Prompt token budget", minimum=100, maximum=4000, step=100, value=1000, info="The maximum number of tokens the prompt can take.")
245
+ temperature = gr.Slider(label="Temperature", minimum=0, maximum=1, step=0.1, value=0, info="The GPT model's temperature. Recommended to use a low temperature to decrease the likelihood of hallucinations.")
246
+
247
  btn = gr.Button(value='Submit')
248
  btn.style(full_width=True)
249
 
250
  with gr.Group():
251
+
252
+ with gr.Tabs():
253
+ with gr.TabItem("Answer"):
254
+ answer = gr.Markdown()
255
+ with gr.TabItem("References"):
256
+ references = gr.Markdown()
257
 
258
+ btn.click(main, inputs=[urls_text, question, split_by_topic, segment_length, n_neighbours, model, token_budget, temperature], outputs=[answer, references])
259
 
260
  #openai.api_key = os.getenv('Your_Key_Here')
261
  demo.launch()
notebook.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 47,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
@@ -13,10 +13,16 @@
13
  "import requests\n",
14
  "import json\n",
15
  "\n",
16
- "url = \"https://www.youtube.com/watch?v=VcVfceTsD0A&t=163s\"\n",
17
  "video_id = url.split(\"=\")[1]\n",
18
  "\n",
19
- "raw = YouTubeTranscriptApi.get_transcript(video_id)\n",
 
 
 
 
 
 
20
  "\n",
21
  "response = requests.get(f\"https://noembed.com/embed?dataType=json&url={url}\")\n",
22
  "data = json.loads(response.content)\n",
@@ -26,7 +32,222 @@
26
  },
27
  {
28
  "cell_type": "code",
29
- "execution_count": 48,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  "metadata": {},
31
  "outputs": [],
32
  "source": [
@@ -45,7 +266,7 @@
45
  },
46
  {
47
  "cell_type": "code",
48
- "execution_count": 50,
49
  "metadata": {},
50
  "outputs": [],
51
  "source": [
@@ -56,7 +277,27 @@
56
  },
57
  {
58
  "cell_type": "code",
59
- "execution_count": 51,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  "metadata": {},
61
  "outputs": [],
62
  "source": [
@@ -68,7 +309,7 @@
68
  },
69
  {
70
  "cell_type": "code",
71
- "execution_count": 52,
72
  "metadata": {},
73
  "outputs": [],
74
  "source": [
@@ -78,7 +319,7 @@
78
  },
79
  {
80
  "cell_type": "code",
81
- "execution_count": 53,
82
  "metadata": {},
83
  "outputs": [],
84
  "source": [
@@ -91,26 +332,46 @@
91
  },
92
  {
93
  "cell_type": "code",
94
- "execution_count": 54,
95
  "metadata": {},
96
  "outputs": [],
97
  "source": [
98
  "def to_timestamp(seconds):\n",
99
- "\n",
100
  " seconds = int(seconds)\n",
101
  "\n",
102
- " minutes = seconds // 60\n",
103
- " seconds_remaining = f\"{seconds % 60}\"\n",
 
104
  " \n",
105
- " if len(seconds_remaining) == 1:\n",
106
- " seconds_remaining = \"0\" + seconds_remaining\n",
107
- "\n",
108
- " return f\"{minutes}:{seconds_remaining}\""
109
  ]
110
  },
111
  {
112
  "cell_type": "code",
113
- "execution_count": 55,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  "metadata": {},
115
  "outputs": [],
116
  "source": [
@@ -130,12 +391,136 @@
130
  },
131
  {
132
  "cell_type": "code",
133
- "execution_count": 56,
134
  "metadata": {},
135
  "outputs": [],
136
  "source": [
137
  "# At the beginning of each segment, add the title, author, and segment times\n",
138
- "segment_text = [f\"'{title}' by {author}\\nTimestamp: {segment_time}\\n\\n{segment}\" for segment, segment_time in zip(segments, segment_times)]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  ]
140
  }
141
  ],
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 41,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
 
13
  "import requests\n",
14
  "import json\n",
15
  "\n",
16
+ "url = \"https://www.youtube.com/watch?v=77zvIYDFSok\"\n",
17
  "video_id = url.split(\"=\")[1]\n",
18
  "\n",
19
+ "try:\n",
20
+ " raw = YouTubeTranscriptApi.get_transcript(video_id)\n",
21
+ "except:\n",
22
+ " transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)\n",
23
+ " for transcript in transcript_list:\n",
24
+ " raw = transcript.translate('en').fetch()\n",
25
+ " break\n",
26
  "\n",
27
  "response = requests.get(f\"https://noembed.com/embed?dataType=json&url={url}\")\n",
28
  "data = json.loads(response.content)\n",
 
32
  },
33
  {
34
  "cell_type": "code",
35
+ "execution_count": 42,
36
+ "metadata": {},
37
+ "outputs": [
38
+ {
39
+ "data": {
40
+ "text/plain": [
41
+ "[{'text': '[Music]', 'start': 2.19, 'duration': 3.5},\n",
42
+ " {'text': '[Music]', 'start': 18.73, 'duration': 3.07},\n",
43
+ " {'text': '[Applause]', 'start': 27.71, 'duration': 3.289},\n",
44
+ " {'text': '[Music]', 'start': 33.68, 'duration': 7.01},\n",
45
+ " {'text': '[Laughter] [Music] [Music] [', 'start': 36.05, 'duration': 4.64},\n",
46
+ " {'text': 'Applause]', 'start': 59.97, 'duration': 3.2},\n",
47
+ " {'text': '[Music]', 'start': 68.78, 'duration': 3.12},\n",
48
+ " {'text': 'Recently, the', 'start': 72.18, 'duration': 3.0},\n",
49
+ " {'text': \"issue of sexual assault by celebrities has not stopped. It's\",\n",
50
+ " 'start': 79.26,\n",
51
+ " 'duration': 3.24},\n",
52
+ " {'text': \"true that it happened to me. It's\",\n",
53
+ " 'start': 87.74,\n",
54
+ " 'duration': 3.96},\n",
55
+ " {'text': 'reallyembarrassing', 'start': 96.299, 'duration': 3.721},\n",
56
+ " {'text': \"[Music] It's disastrous\", 'start': 98.16, 'duration': 4.099},\n",
57
+ " {'text': '[Music]', 'start': 102.36, 'duration': 3.18},\n",
58
+ " {'text': \"There's also a part where feminists are obsessed with men's genitals. I think their germinating\",\n",
59
+ " 'start': 111.6,\n",
60
+ " 'duration': 4.5},\n",
61
+ " {'text': 'power is a really scary part. I',\n",
62
+ " 'start': 113.82,\n",
63
+ " 'duration': 4.259},\n",
64
+ " {'text': 'think this castration will happen more often as they get castrated.',\n",
65
+ " 'start': 121.259,\n",
66
+ " 'duration': 3.441},\n",
67
+ " {'text': 'In fact, feminism was popular at the time,',\n",
68
+ " 'start': 133.58,\n",
69
+ " 'duration': 7.659},\n",
70
+ " {'text': 'but thanks to its popularity,', 'start': 137.58, 'duration': 8.04},\n",
71
+ " {'text': 'a lot of', 'start': 141.239, 'duration': 8.041},\n",
72
+ " {'text': 'things happened, such as scolding, ridicule, insults, and',\n",
73
+ " 'start': 145.62,\n",
74
+ " 'duration': 5.04},\n",
75
+ " {'text': 'insults against', 'start': 149.28, 'duration': 2.28},\n",
76
+ " {'text': 'men. I', 'start': 150.66, 'duration': 2.7},\n",
77
+ " {'text': \"just couldn't stay there. Well, the\",\n",
78
+ " 'start': 153.36,\n",
79
+ " 'duration': 5.459},\n",
80
+ " {'text': 'pepper is 3 cm.', 'start': 160.2, 'duration': 5.58},\n",
81
+ " {'text': 'Besides, all men are potential rape',\n",
82
+ " 'start': 162.36,\n",
83
+ " 'duration': 7.019},\n",
84
+ " {'text': 'criminals. Men are useless. Men stopped',\n",
85
+ " 'start': 165.78,\n",
86
+ " 'duration': 6.599},\n",
87
+ " {'text': 'trusting women. If you', 'start': 176.34, 'duration': 4.92},\n",
88
+ " {'text': 'reach there, you may be hit by the #MeToo movement, so I',\n",
89
+ " 'start': 184.92,\n",
90
+ " 'duration': 3.86},\n",
91
+ " {'text': 'think there are a lot of them right now. I think',\n",
92
+ " 'start': 200.28,\n",
93
+ " 'duration': 2.539},\n",
94
+ " {'text': 'there may be a little more than in other countries.',\n",
95
+ " 'start': 221.76,\n",
96
+ " 'duration': 4.8},\n",
97
+ " {'text': '[Applause]', 'start': 238.29, 'duration': 3.23},\n",
98
+ " {'text': '[Music]', 'start': 243.27, 'duration': 7.169},\n",
99
+ " {'text': 'Personally, I', 'start': 245.78, 'duration': 8.019},\n",
100
+ " {'text': \"would say it's content that only has these emotions. As an\",\n",
101
+ " 'start': 250.439,\n",
102
+ " 'duration': 6.061},\n",
103
+ " {'text': 'example, I said that we need to strongly pass the anti-discrimination law, but',\n",
104
+ " 'start': 253.799,\n",
105
+ " 'duration': 4.801},\n",
106
+ " {'text': 'this is actually an expression', 'start': 258.6, 'duration': 3.89},\n",
107
+ " {'text': 'dictatorship class. Guys', 'start': 259.919, 'duration': 5.111},\n",
108
+ " {'text': '[Applause]', 'start': 262.49, 'duration': 4.149},\n",
109
+ " {'text': '[Music] I think you', 'start': 265.03, 'duration': 3.949},\n",
110
+ " {'text': \"'re talking a lot, but I want to hear it. Oh, you're not going to listen. You're so mean.\",\n",
111
+ " 'start': 268.979,\n",
112
+ " 'duration': 5.041},\n",
113
+ " {'text': 'For example, I went to the bathroom, and I',\n",
114
+ " 'start': 295.02,\n",
115
+ " 'duration': 2.7},\n",
116
+ " {'text': 'saw that it was unisex, but I want it to be safe with the door open. I',\n",
117
+ " 'start': 297.72,\n",
118
+ " 'duration': 4.8},\n",
119
+ " {'text': 'think I was able to see well how men and women cut off harmony at the source. My',\n",
120
+ " 'start': 354.18,\n",
121
+ " 'duration': 2.6},\n",
122
+ " {'text': 'girlfriend is', 'start': 357.96, 'duration': 4.5},\n",
123
+ " {'text': 'not satisfied with something like that.',\n",
124
+ " 'start': 359.759,\n",
125
+ " 'duration': 4.081},\n",
126
+ " {'text': 'Could anyone who really thought about gender equality talk about peeing and taking a shower? I think',\n",
127
+ " 'start': 371.539,\n",
128
+ " 'duration': 3.541},\n",
129
+ " {'text': '[Music]', 'start': 378.61, 'duration': 3.159},\n",
130
+ " {'text': 'In fact, many companies are', 'start': 384.479, 'duration': 3.321},\n",
131
+ " {'text': 'paying for it,', 'start': 388.979, 'duration': 4.44},\n",
132
+ " {'text': 'or in the labor market, there', 'start': 390.12, 'duration': 6.74},\n",
133
+ " {'text': 'is employment discrimination without any reason for gender inequality.',\n",
134
+ " 'start': 393.419,\n",
135
+ " 'duration': 3.441},\n",
136
+ " {'text': 'Evil men must disappear', 'start': 450.539, 'duration': 7.141},\n",
137
+ " {'text': 'Because of these millitons, there are many such people',\n",
138
+ " 'start': 454.56,\n",
139
+ " 'duration': 7.139},\n",
140
+ " {'text': 'in Korean society, and Ha Tae-kyung is a',\n",
141
+ " 'start': 457.68,\n",
142
+ " 'duration': 4.019},\n",
143
+ " {'text': 'representative abolitionist. It',\n",
144
+ " 'start': 472.979,\n",
145
+ " 'duration': 7.521},\n",
146
+ " {'text': 'means that I oppose giving privileges to Han Gender.',\n",
147
+ " 'start': 503.099,\n",
148
+ " 'duration': 5.641},\n",
149
+ " {'text': 'Korean feminists are now', 'start': 504.3, 'duration': 8.58},\n",
150
+ " {'text': 'understood as', 'start': 508.74, 'duration': 6.5},\n",
151
+ " {'text': 'discriminatory against men. In such a society,',\n",
152
+ " 'start': 528.42,\n",
153
+ " 'duration': 4.56},\n",
154
+ " {'text': 'I', 'start': 534.8, 'duration': 6.479},\n",
155
+ " {'text': 'think we should pay more attention to the serious discrimination against women.',\n",
156
+ " 'start': 537.18,\n",
157
+ " 'duration': 4.099},\n",
158
+ " {'text': 'Recently,', 'start': 544.08, 'duration': 3.0},\n",
159
+ " {'text': 'when I look at the political situation in Korea, I feel that it is now retreating.',\n",
160
+ " 'start': 551.459,\n",
161
+ " 'duration': 4.741},\n",
162
+ " {'text': '19% of female lawmakers in the National Assembly',\n",
163
+ " 'start': 559.62,\n",
164
+ " 'duration': 4.44},\n",
165
+ " {'text': 'are now 19%.', 'start': 561.06, 'duration': 3.0},\n",
166
+ " {'text': 'Why do people who send me messages like this send me messages like this',\n",
167
+ " 'start': 592.14,\n",
168
+ " 'duration': 3.84},\n",
169
+ " {'text': 'when they come every day? It', 'start': 593.76, 'duration': 4.92},\n",
170
+ " {'text': \"'s necessary, but it seems that there\",\n",
171
+ " 'start': 615.3,\n",
172
+ " 'duration': 3.479},\n",
173
+ " {'text': 'are many cases where the target is directed at women,',\n",
174
+ " 'start': 618.779,\n",
175
+ " 'duration': 3.74},\n",
176
+ " {'text': \"but if there's a motto that I\", 'start': 645.42, 'duration': 5.82},\n",
177
+ " {'text': \"personally take while leading this group, let's\",\n",
178
+ " 'start': 647.12,\n",
179
+ " 'duration': 5.32},\n",
180
+ " {'text': \"create a world where feminists don't have to choose feminism. I\",\n",
181
+ " 'start': 657.899,\n",
182
+ " 'duration': 3.721},\n",
183
+ " {'text': 'choose', 'start': 665.339, 'duration': 5.161},\n",
184
+ " {'text': \"feminism because I think you're watching.\",\n",
185
+ " 'start': 676.019,\n",
186
+ " 'duration': 4.081},\n",
187
+ " {'text': 'As a person, I live to protect the woman I love.',\n",
188
+ " 'start': 686.959,\n",
189
+ " 'duration': 5.701},\n",
190
+ " {'text': \"I think I'm about the level of a director who creates a hero.\",\n",
191
+ " 'start': 697.82,\n",
192
+ " 'duration': 6.94},\n",
193
+ " {'text': 'Well,', 'start': 701.04, 'duration': 3.72},\n",
194
+ " {'text': 'one day,', 'start': 707.16, 'duration': 10.41},\n",
195
+ " {'text': '[Music] We were', 'start': 727.75, 'duration': 4.46},\n",
196
+ " {'text': 'humiliated like', 'start': 730.2, 'duration': 5.139},\n",
197
+ " {'text': 'this. I', 'start': 736.7, 'duration': 4.84},\n",
198
+ " {'text': \"think there are so many messages in this very short video. First of all, I think there's\",\n",
199
+ " 'start': 741.54,\n",
200
+ " 'duration': 2.64},\n",
201
+ " {'text': 'enough room for it to be interpreted as',\n",
202
+ " 'start': 744.18,\n",
203
+ " 'duration': 2.54},\n",
204
+ " {'text': 'sexual', 'start': 756.36, 'duration': 4.979},\n",
205
+ " {'text': 'harassment.', 'start': 762.66, 'duration': 5.1},\n",
206
+ " {'text': 'But I really', 'start': 777.42, 'duration': 4.38},\n",
207
+ " {'text': 'had no intention of interfering with the event.',\n",
208
+ " 'start': 780.3,\n",
209
+ " 'duration': 3.42},\n",
210
+ " {'text': 'It was a chance to share, but I',\n",
211
+ " 'start': 791.04,\n",
212
+ " 'duration': 7.28},\n",
213
+ " {'text': \"think I couldn't give you a good answer after hearing that conversation.\",\n",
214
+ " 'start': 795.12,\n",
215
+ " 'duration': 3.2},\n",
216
+ " {'text': '[Music] I was', 'start': 802.19, 'duration': 2.649},\n",
217
+ " {'text': 'very surprised. I', 'start': 803.76, 'duration': 2.04},\n",
218
+ " {'text': 'went with my faith,', 'start': 805.8, 'duration': 4.26},\n",
219
+ " {'text': 'but', 'start': 819.06, 'duration': 2.6},\n",
220
+ " {'text': \"I felt completely betrayed. It's\",\n",
221
+ " 'start': 822.019,\n",
222
+ " 'duration': 3.991},\n",
223
+ " {'text': 'just', 'start': 824.16, 'duration': 4.91},\n",
224
+ " {'text': '[Music] [', 'start': 826.01, 'duration': 3.06},\n",
225
+ " {'text': 'Music] If', 'start': 831.75, 'duration': 3.09},\n",
226
+ " {'text': 'you', 'start': 839.88, 'duration': 5.04},\n",
227
+ " {'text': \"feel sexually shameful, that's\", 'start': 842.04, 'duration': 5.7},\n",
228
+ " {'text': \"sexual harassment. In Korea, that's the\",\n",
229
+ " 'start': 847.74,\n",
230
+ " 'duration': 5.06},\n",
231
+ " {'text': \"case. It's\", 'start': 853.399, 'duration': 6.641},\n",
232
+ " {'text': 'scary to lose everything and become something really different only from the genitals',\n",
233
+ " 'start': 864.36,\n",
234
+ " 'duration': 5.59},\n",
235
+ " {'text': '[Music]', 'start': 864.95, 'duration': 5.0},\n",
236
+ " {'text': '[Music]', 'start': 888.91, 'duration': 3.09}]"
237
+ ]
238
+ },
239
+ "execution_count": 42,
240
+ "metadata": {},
241
+ "output_type": "execute_result"
242
+ }
243
+ ],
244
+ "source": [
245
+ "raw"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": 43,
251
  "metadata": {},
252
  "outputs": [],
253
  "source": [
 
266
  },
267
  {
268
  "cell_type": "code",
269
+ "execution_count": 44,
270
  "metadata": {},
271
  "outputs": [],
272
  "source": [
 
277
  },
278
  {
279
  "cell_type": "code",
280
+ "execution_count": 45,
281
+ "metadata": {},
282
+ "outputs": [
283
+ {
284
+ "data": {
285
+ "text/plain": [
286
+ "\"[Music]\\n\\n [Music]\\n\\n [Applause]\\n\\n [Music]\\n\\n [Laughter] [Music] [Music] [\\n\\n Applause]\\n\\n [Music]\\n\\n Recently, the\\n\\n issue of sexual assault by celebrities has not stopped. It's\\n\\n true that it happened to me. It's\\n\\n reallyembarrassing\\n\\n [Music] It's disastrous\\n\\n [Music]\\n\\n There's also a part where feminists are obsessed with men's genitals. I think their germinating\\n\\n power is a really scary part. I\\n\\n think this castration will happen more often as they get castrated.\\n\\n In fact, feminism was popular at the time,\\n\\n but thanks to its popularity,\\n\\n a lot of\\n\\n things happened, such as scolding, ridicule, insults, and\\n\\n insults against\\n\\n men. I\\n\\n just couldn't stay there. Well, the\\n\\n pepper is 3 cm.\\n\\n Besides, all men are potential rape\\n\\n criminals. Men are useless. Men stopped\\n\\n trusting women. If you\\n\\n reach there, you may be hit by the #MeToo movement, so I\\n\\n think there are a lot of them right now. I think\\n\\n there may be a little more than in other countries.\\n\\n [Applause]\\n\\n [Music]\\n\\n Personally, I\\n\\n would say it's content that only has these emotions. As an\\n\\n example, I said that we need to strongly pass the anti-discrimination law, but\\n\\n this is actually an expression\\n\\n dictatorship class. Guys\\n\\n [Applause]\\n\\n [Music] I think you\\n\\n 're talking a lot, but I want to hear it. Oh, you're not going to listen. You're so mean.\\n\\n For example, I went to the bathroom, and I\\n\\n saw that it was unisex, but I want it to be safe with the door open. I\\n\\n think I was able to see well how men and women cut off harmony at the source. My\\n\\n girlfriend is\\n\\n not satisfied with something like that.\\n\\n Could anyone who really thought about gender equality talk about peeing and taking a shower? 
I think\\n\\n [Music]\\n\\n In fact, many companies are\\n\\n paying for it,\\n\\n or in the labor market, there\\n\\n is employment discrimination without any reason for gender inequality.\\n\\n Evil men must disappear\\n\\n Because of these millitons, there are many such people\\n\\n in Korean society, and Ha Tae-kyung is a\\n\\n representative abolitionist. It\\n\\n means that I oppose giving privileges to Han Gender.\\n\\n Korean feminists are now\\n\\n understood as\\n\\n discriminatory against men. In such a society,\\n\\n I\\n\\n think we should pay more attention to the serious discrimination against women.\\n\\n Recently,\\n\\n when I look at the political situation in Korea, I feel that it is now retreating.\\n\\n 19% of female lawmakers in the National Assembly\\n\\n are now 19%.\\n\\n Why do people who send me messages like this send me messages like this\\n\\n when they come every day? It\\n\\n 's necessary, but it seems that there\\n\\n are many cases where the target is directed at women,\\n\\n but if there's a motto that I\\n\\n personally take while leading this group, let's\\n\\n create a world where feminists don't have to choose feminism. I\\n\\n choose\\n\\n feminism because I think you're watching.\\n\\n As a person, I live to protect the woman I love.\\n\\n I think I'm about the level of a director who creates a hero.\\n\\n Well,\\n\\n one day,\\n\\n [Music] We were\\n\\n humiliated like\\n\\n this. I\\n\\n think there are so many messages in this very short video. First of all, I think there's\\n\\n enough room for it to be interpreted as\\n\\n sexual\\n\\n harassment.\\n\\n But I really\\n\\n had no intention of interfering with the event.\\n\\n It was a chance to share, but I\\n\\n think I couldn't give you a good answer after hearing that conversation.\\n\\n [Music] I was\\n\\n very surprised. I\\n\\n went with my faith,\\n\\n but\\n\\n I felt completely betrayed. 
It's\\n\\n just\\n\\n [Music] [\\n\\n Music] If\\n\\n you\\n\\n feel sexually shameful, that's\\n\\n sexual harassment. In Korea, that's the\\n\\n case. It's\\n\\n scary to lose everything and become something really different only from the genitals\\n\\n [Music]\\n\\n [Music]\\n\\n\""
287
+ ]
288
+ },
289
+ "execution_count": 45,
290
+ "metadata": {},
291
+ "output_type": "execute_result"
292
+ }
293
+ ],
294
+ "source": [
295
+ "transcript"
296
+ ]
297
+ },
298
+ {
299
+ "cell_type": "code",
300
+ "execution_count": 11,
301
  "metadata": {},
302
  "outputs": [],
303
  "source": [
 
309
  },
310
  {
311
  "cell_type": "code",
312
+ "execution_count": 12,
313
  "metadata": {},
314
  "outputs": [],
315
  "source": [
 
319
  },
320
  {
321
  "cell_type": "code",
322
+ "execution_count": 13,
323
  "metadata": {},
324
  "outputs": [],
325
  "source": [
 
332
  },
333
  {
334
  "cell_type": "code",
335
+ "execution_count": 14,
336
  "metadata": {},
337
  "outputs": [],
338
  "source": [
339
  "def to_timestamp(seconds):\n",
 
340
  " seconds = int(seconds)\n",
341
  "\n",
342
+ " hours = seconds // 3600\n",
343
+ " minutes = (seconds % 3600) // 60\n",
344
+ " seconds_remaining = seconds % 60\n",
345
  " \n",
346
+ " if seconds >= 3600:\n",
347
+ " return f\"{hours:02d}:{minutes:02d}:{seconds_remaining:02d}\"\n",
348
+ " else:\n",
349
+ " return f\"{minutes:02d}:{seconds_remaining:02d}\""
350
  ]
351
  },
352
  {
353
  "cell_type": "code",
354
+ "execution_count": 15,
355
+ "metadata": {},
356
+ "outputs": [
357
+ {
358
+ "data": {
359
+ "text/plain": [
360
+ "'01:40'"
361
+ ]
362
+ },
363
+ "execution_count": 15,
364
+ "metadata": {},
365
+ "output_type": "execute_result"
366
+ }
367
+ ],
368
+ "source": [
369
+ "to_timestamp(100)"
370
+ ]
371
+ },
372
+ {
373
+ "cell_type": "code",
374
+ "execution_count": 16,
375
  "metadata": {},
376
  "outputs": [],
377
  "source": [
 
391
  },
392
  {
393
  "cell_type": "code",
394
+ "execution_count": 22,
395
  "metadata": {},
396
  "outputs": [],
397
  "source": [
398
  "# Format each segment's (start, end) time range; this label goes at the beginning of the segment with the title and author\n",
399
+ "segments_times = [f\"({to_timestamp(segment_end_times[i-1])}, {to_timestamp(segment_end_times[i])})\" for i in range(1,len(segment_end_times))]"
400
+ ]
401
+ },
402
+ {
403
+ "cell_type": "code",
404
+ "execution_count": 23,
405
+ "metadata": {},
406
+ "outputs": [
407
+ {
408
+ "data": {
409
+ "text/plain": [
410
+ "['(00:00, 00:48)',\n",
411
+ " '(00:48, 01:10)',\n",
412
+ " '(01:10, 01:46)',\n",
413
+ " '(01:46, 02:26)',\n",
414
+ " '(02:26, 02:57)',\n",
415
+ " '(02:57, 03:25)',\n",
416
+ " '(03:25, 04:11)',\n",
417
+ " '(04:11, 04:41)',\n",
418
+ " '(04:41, 05:26)',\n",
419
+ " '(05:26, 05:45)',\n",
420
+ " '(05:45, 06:13)',\n",
421
+ " '(06:13, 06:40)',\n",
422
+ " '(06:40, 07:02)',\n",
423
+ " '(07:02, 07:54)',\n",
424
+ " '(07:54, 08:17)',\n",
425
+ " '(08:17, 09:24)',\n",
426
+ " '(09:24, 10:10)',\n",
427
+ " '(10:10, 11:02)',\n",
428
+ " '(11:02, 11:47)',\n",
429
+ " '(11:47, 12:09)',\n",
430
+ " '(12:09, 12:52)',\n",
431
+ " '(12:52, 13:50)',\n",
432
+ " '(13:50, 14:15)',\n",
433
+ " '(14:15, 14:38)',\n",
434
+ " '(14:38, 16:14)',\n",
435
+ " '(16:14, 17:16)',\n",
436
+ " '(17:16, 17:47)',\n",
437
+ " '(17:47, 18:17)',\n",
438
+ " '(18:17, 18:56)',\n",
439
+ " '(18:56, 19:31)',\n",
440
+ " '(19:31, 19:52)',\n",
441
+ " '(19:52, 21:03)',\n",
442
+ " '(21:03, 21:39)',\n",
443
+ " '(21:39, 22:08)',\n",
444
+ " '(22:08, 22:42)',\n",
445
+ " '(22:42, 23:35)',\n",
446
+ " '(23:35, 24:51)',\n",
447
+ " '(24:51, 26:01)',\n",
448
+ " '(26:01, 26:28)',\n",
449
+ " '(26:28, 26:57)',\n",
450
+ " '(26:57, 28:37)',\n",
451
+ " '(28:37, 29:00)',\n",
452
+ " '(29:00, 29:50)',\n",
453
+ " '(29:50, 30:12)',\n",
454
+ " '(30:12, 30:55)',\n",
455
+ " '(30:55, 31:47)',\n",
456
+ " '(31:47, 32:54)',\n",
457
+ " '(32:54, 33:33)',\n",
458
+ " '(33:33, 33:50)',\n",
459
+ " '(33:50, 34:20)',\n",
460
+ " '(34:20, 34:48)',\n",
461
+ " '(34:48, 35:22)',\n",
462
+ " '(35:22, 36:14)',\n",
463
+ " '(36:14, 37:15)']"
464
+ ]
465
+ },
466
+ "execution_count": 23,
467
+ "metadata": {},
468
+ "output_type": "execute_result"
469
+ }
470
+ ],
471
+ "source": [
472
+ "segments_times"
473
+ ]
474
+ },
475
+ {
476
+ "cell_type": "code",
477
+ "execution_count": 72,
478
+ "metadata": {},
479
+ "outputs": [],
480
+ "source": [
481
+ "text = '''\n",
482
+ "Segment from 'Feminism Is 'Dividing This'' Country' by VICE News\n",
483
+ "Timestamp: (10:51, 12:24)\n",
484
+ "---\n",
485
+ "personally take while leading this group, let's create a world where feminists don't have to choose feminism. I choose feminism because I think you're watching. As a person, I live to protect the woman I love. I think I'm about the level of a director who creates a hero. Well, one day, [Music] We were humiliated like this. I think there are so many messages in this very short video. First of all, I think there's\n",
486
+ "---\n",
487
+ "'''\n",
488
+ "\n",
489
+ "# Get the title and timestamp from the text\n",
490
+ "import re\n",
491
+ "\n",
492
+ "# define regular expression patterns\n",
493
+ "title_pattern = r\"Segment from '(.+)'\"\n",
494
+ "timestamp_pattern = r\"Timestamp: \\((.+)\\)\"\n",
495
+ "\n",
496
+ "# search for the title and the start timestamp using regular expressions\n",
497
+ "title = re.search(title_pattern, text).group(1)\n",
498
+ "start_timestamp = re.search(timestamp_pattern, text).group(1).split(\",\")[0]\n",
499
+ "\n",
500
+ "url = f\"URL: https://www.youtube.com/watch?v={video_id}&t={start_timestamp}\"\n",
501
+ "\n",
502
+ "# Add url in text before first \"---\"\n",
503
+ "text = re.sub(r\"---\", f\"{url}\\n---\", text, count=1)\n"
504
+ ]
505
+ },
506
+ {
507
+ "cell_type": "code",
508
+ "execution_count": 73,
509
+ "metadata": {},
510
+ "outputs": [
511
+ {
512
+ "data": {
513
+ "text/plain": [
514
+ "\"\\nSegment from 'Feminism Is 'Dividing This'' Country' by VICE News\\nTimestamp: (10:51, 12:24)\\nURL: https://www.youtube.com/watch?v=77zvIYDFSok&t=10:51 \\n---\\npersonally take while leading this group, let's create a world where feminists don't have to choose feminism. I choose feminism because I think you're watching. As a person, I live to protect the woman I love. I think I'm about the level of a director who creates a hero. Well, one day, [Music] We were humiliated like this. I think there are so many messages in this very short video. First of all, I think there's\\n---\\n\""
515
+ ]
516
+ },
517
+ "execution_count": 73,
518
+ "metadata": {},
519
+ "output_type": "execute_result"
520
+ }
521
+ ],
522
+ "source": [
523
+ "text"
524
  ]
525
  }
526
  ],
semantic_search.py CHANGED
@@ -10,6 +10,7 @@ class SemanticSearch:
10
 
11
 
12
  def fit(self, data, batch=1000, n_neighbors=5):
 
13
  self.data = data
14
  self.embeddings = self.get_text_embedding(data, batch=batch)
15
  n_neighbors = min(n_neighbors, len(self.embeddings))
 
10
 
11
 
12
  def fit(self, data, batch=1000, n_neighbors=5):
13
+ print(f"Fitting with n={n_neighbors}...")
14
  self.data = data
15
  self.embeddings = self.get_text_embedding(data, batch=batch)
16
  n_neighbors = min(n_neighbors, len(self.embeddings))