Spaces:

andaqu
/

ask-youtube-gpt

Runtime error

App Files Community

wendru18 committed on Apr 29, 2023

Commit

e7c64ec

1 Parent(s): 58cd21f

working version

Browse files

Files changed (3) hide show

app.py +117 -33
notebook.ipynb +404 -19
semantic_search.py +1 -0

app.py CHANGED Viewed

@@ -8,15 +8,28 @@ import requests
 import tiktoken
 import openai
 import json
 tt = TextTilingTokenizer()
 searcher = SemanticSearch()
 def get_youtube_data(url):
     video_id = url.split("=")[1]
-    raw = YouTubeTranscriptApi.get_transcript(video_id)
     response = requests.get(f"https://noembed.com/embed?dataType=json&url={url}")
     data = json.loads(response.content)
@@ -43,6 +56,17 @@ def to_timestamp(seconds):
     else:
         return f"{minutes:02d}:{seconds_remaining:02d}"
 def get_segments(df, title, author, split_by_topic, segment_length = 200):
     transcript = df['text'].str.cat(sep=' ')
@@ -53,7 +77,7 @@ def get_segments(df, title, author, split_by_topic, segment_length = 200):
     else:
         segments = tt.tokenize(transcript)
-    segments = [segment.replace('\n\n','').strip() for segment in segments]
     segments_wc = [len(segment.split()) for segment in segments]
     segments_wc = np.cumsum(segments_wc)
@@ -63,97 +87,136 @@ def get_segments(df, title, author, split_by_topic, segment_length = 200):
     segments_end_times = df['end'].iloc[idx].values
     segments_end_times = np.insert(segments_end_times, 0, 0.0)
-    segments_times = [(to_timestamp(segments_end_times[i-1]), to_timestamp(segments_end_times[i])) for i in range(1,len(segments_end_times))]
-    segments_text = [f"Segment from '{title}' by {author}\nSegment timestamp: {segment_time}\n\n{segment}" for segment, segment_time in zip(segments, segments_times)]
     return segments_text
-def fit_searcher(segments, n_neighbors):
     global searcher
-    searcher.fit(segments, n_neighbors)
     return True
 def num_tokens(text, model):
     encoding = tiktoken.encoding_for_model(model)
     return len(encoding.encode(text))
 def form_query(question, model, token_budget):
     results = searcher(question)
-    introduction = 'Use the below segments from multiple youtube videos to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer." Cite each reference using the [title, author, timestamp] notation. Every sentence should have a citation at the end.'
     message = introduction
     question = f"\n\nQuestion: {question}"
-    reference = []
-    for result in results:
-        result = "\n\n" + result
         if (
             num_tokens(message + result + question, model=model)
             > token_budget
         ):
             break
         else:
-            reference.append(result)
             message += result
-    return message + question, reference
-def generate_answer(question, model, token_budget):
-    message, reference = form_query(question, model, token_budget)
     messages = [
-        {"role": "system", "content": "You answer questions about legal contracts."},
         {"role": "user", "content": message},
     ]
     response = openai.ChatCompletion.create(
         model=model,
         messages=messages,
-        temperature=0
     )
     response_message = response["choices"][0]["message"]["content"]
-    return response_message, reference
-if False:
-    data = {}
-    question = "Why do some men have trouble with feminism?"
-    n_neighbors = 5
-    urls = ["https://www.youtube.com/watch?v=4xWJf8cERoM", "https://www.youtube.com/watch?v=vx-Si9gbijA"]
     segments = []
     for url in urls:
         df, title, author = get_youtube_data(url)
-        video_segments = get_segments(df, title, author, split_by_topic = True)
         segments.extend(video_segments)
     print("Segments generated successfully!")
-    if fit_searcher(segments, n_neighbors):
         print("Searcher fit successfully!")
-        answer, reference = generate_answer(question, model = "gpt-3.5-turbo", token_budget = 1000)
-        print(answer)
-        print(reference)
-title = "Ask Youtube GPT"
-description = """  """
 with gr.Blocks() as demo:
     gr.Markdown(f'<center><h1>{title}</h1></center>')
-    gr.Markdown(f'Ask YouTube GPT allows you to ask questions about a set of Youtube Videos using Universal Sentence Encoder and Open AI. The returned response cites the video title, author and timestamp in square brackets where the information is located, adding credibility to the responses and helping you to locate incorrect information. If you need one, get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a>.</p>')
     with gr.Row():
@@ -162,16 +225,37 @@ with gr.Blocks() as demo:
             openAI_key=gr.Textbox(label='Enter your OpenAI API key here')
             # Allow the user to input multiple links, adding a textbox for each
-            links = gr.Textbox(lines=5, label="Enter the links to the YouTube videos you want to search (one per line):", placeholder="https://www.youtube.com/watch?v=4xWJf8cERoM\nhttps://www.youtube.com/watch?v=vx-Si9gbijA")
             question = gr.Textbox(label='Enter your question here')
             btn = gr.Button(value='Submit')
             btn.style(full_width=True)
         with gr.Group():
-            answer = gr.Textbox(label='The answer to your question is :')
-        # btn.click(question_answer, inputs=[url, file, question,openAI_key], outputs=[answer])
 #openai.api_key = os.getenv('Your_Key_Here')
 demo.launch()

 import tiktoken
 import openai
 import json
+import re
 tt = TextTilingTokenizer()
 searcher = SemanticSearch()
+# Initialize a counter for duplicate titles
+title_counter = {}
+# One to one mapping from titles to urls
+titles_to_urls = {}
 def get_youtube_data(url):
     video_id = url.split("=")[1]
+    try:
+        raw = YouTubeTranscriptApi.get_transcript(video_id)
+    except:
+        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
+        for transcript in transcript_list:
+            raw = transcript.translate('en').fetch()
+            break
     response = requests.get(f"https://noembed.com/embed?dataType=json&url={url}")
     data = json.loads(response.content)
     else:
         return f"{minutes:02d}:{seconds_remaining:02d}"
+def to_seconds(timestamp):  # Convert a "MM:SS" or "HH:MM:SS" string into a whole number of seconds.
+    time_list = timestamp.split(':')  # colon-separated fields; each one is parsed with int() below
+    total_seconds = 0
+    if len(time_list) == 2:  # Minutes:Seconds format
+        total_seconds = int(time_list[0]) * 60 + int(time_list[1])
+    elif len(time_list) == 3:  # Hours:Minutes:Seconds format
+        total_seconds = int(time_list[0]) * 3600 + int(time_list[1]) * 60 + int(time_list[2])
+    else:
+        raise ValueError("Invalid timestamp format")  # any field count other than 2 or 3 is rejected
+    return total_seconds
 def get_segments(df, title, author, split_by_topic, segment_length = 200):
     transcript = df['text'].str.cat(sep=' ')
     else:
         segments = tt.tokenize(transcript)
+    segments = [segment.replace('\n','').strip() for segment in segments]
     segments_wc = [len(segment.split()) for segment in segments]
     segments_wc = np.cumsum(segments_wc)
     segments_end_times = df['end'].iloc[idx].values
     segments_end_times = np.insert(segments_end_times, 0, 0.0)
+    segments_times = [f"({to_timestamp(segments_end_times[i-1])}, {to_timestamp(segments_end_times[i])})" for i in range(1,len(segments_end_times))]
+    segments_text = [f"Segment from '{title}' by {author}\nTimestamp: {segment_time}\n\n{segment}\n" for segment, segment_time in zip(segments, segments_times)]
     return segments_text
+def fit_searcher(segments, n_neighbours):  # Fit the module-level `searcher` on the given segments.
     global searcher  # `searcher` is only used here, never rebound
+    searcher.fit(segments, n_neighbors=n_neighbours)  # spelling differs on purpose: parameter is `n_neighbours`, fit() keyword is `n_neighbors`
     return True  # unconditional: callers always see True unless fit() raises
 def num_tokens(text, model):  # Return the number of tokens `text` encodes to under the tiktoken encoding for `model`.
     encoding = tiktoken.encoding_for_model(model)  # looked up on every call
     return len(encoding.encode(text))
+def refencify(text):  # Prefix a segment's text with a "Segment URL: ..." line built from its title and start timestamp.
+    title_pattern = r"Segment from '(.+)'"  # NOTE(review): greedy; the capture runs to the LAST single quote on that line
+    timestamp_pattern = r"Timestamp: \((.+)\)"
+    title = re.search(title_pattern, text).group(1)  # re.search returns None when nothing matches, so .group would raise AttributeError
+    start_timestamp = re.search(timestamp_pattern, text).group(1).split(",")[0]  # the part before the first comma inside the parentheses
+    url = titles_to_urls[title]  # module-level dict lookup; raises KeyError for a title that is not a key
+    start_seconds = to_seconds(start_timestamp)
+    return f"Segment URL: {url}&t={start_seconds}\n" + text  # joins with "&t=", so presumably url already carries a query string -- TODO confirm
 def form_query(question, model, token_budget):  # Build the prompt (introduction + retrieved segments + question) and a references string; returns both.
     results = searcher(question)  # the module-level searcher is called directly with the question
+    introduction = 'Use the below segments from multiple youtube videos to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer." Cite each references using the [title, author, timestamp] notation. Every sentence should have a citation at the end.'
     message = introduction  # NOTE(review): no separator is added, so the first segment is appended directly after the introduction text
     question = f"\n\nQuestion: {question}"
+    references = ""
+    for i, result in enumerate(results):
+        result = result + "\n\n"
         if (
             num_tokens(message + result + question, model=model)
             > token_budget
         ):
             break  # stop at the first segment that would push the prompt over token_budget
         else:
             message += result
+            references += f"### Segment {i+1}:\n" + refencify(result)
+    # Remove the last extra two newlines
+    message = message[:-2]  # NOTE(review): if no segment was appended, this trims the last two characters of the introduction instead
+    references = "Segments that might have been used to answer your question: (If you specified more segments than shown here, consider increasing your token budget)\n\n" + references
+    return message + question, references
+def generate_answer(question, model, token_budget, temperature):  # Send the prompt from form_query to the chat model; returns (answer text, references).
+    message, references = form_query(question, model, token_budget)
     messages = [
+        {"role": "system", "content": "You answer questions about YouTube videos."},
         {"role": "user", "content": message},
     ]
     response = openai.ChatCompletion.create(
         model=model,
         messages=messages,
+        temperature=temperature
     )
     response_message = response["choices"][0]["message"]["content"]  # only the first choice is read
+    return response_message, references
+def add_to_dict(title, url):  # Register url under title in titles_to_urls, suffixing the title with " (n)" when it is already a key; returns the title actually used.
+    global title_counter  # the counter dict is only mutated in this function, never rebound
+    if title not in titles_to_urls:
+        # This is the first occurrence of this title
+        titles_to_urls[title] = url
+        return title
+    else:
+        # This title has already been seen, so we need to add a number suffix to it
+        # First, check if we've already seen this title before
+        if title in title_counter:
+            # If we have, increment the counter
+            title_counter[title] += 1
+        else:
+            # If we haven't, start the counter at 1
+            title_counter[title] = 1
+        # Add the suffix to the title
+        new_title = f"{title} ({title_counter[title]})"
+        # Add the new title to the dictionary
+        titles_to_urls[new_title] = url  # NOTE(review): overwrites any existing entry whose key already equals the suffixed title
+        return new_title
+def main(urls_text, question, split_by_topic, segment_length, n_neighbours, model, token_budget, temperature):  # End-to-end handler: fetch each URL, segment it, fit the searcher, then return (answer, references). NOTE(review): no API key parameter; nothing in this function sets openai.api_key.
+    global title_counter
+    title_counter = {}  # reset on every call; NOTE(review): titles_to_urls is not reset here, so earlier titles make repeats get a suffix -- confirm intended
+    urls = list(set(urls_text.split("\n")))  # drops duplicate lines; set order is arbitrary and an empty line stays in as ""
     segments = []
     for url in urls:
         df, title, author = get_youtube_data(url)
+        title = add_to_dict(title, url)  # may come back suffixed; the suffixed title is what gets embedded in the segment text
+        video_segments = get_segments(df, title, author, split_by_topic, segment_length)
         segments.extend(video_segments)
     print("Segments generated successfully!")
+    if fit_searcher(segments, n_neighbours):
         print("Searcher fit successfully!")
+        answer, references = generate_answer(question, model, token_budget, temperature)
+    return answer, references  # both names are bound only inside the if above
+title = "Ask YouTube GPT 📺"
 with gr.Blocks() as demo:
     gr.Markdown(f'<center><h1>{title}</h1></center>')
+    gr.Markdown(f'Ask YouTube GPT allows you to ask questions about a set of Youtube Videos using Universal Sentence Encoder and Open AI. The returned response cites the video title, author and timestamp in square brackets where the information is located, adding credibility to the responses and helping you locate incorrect information. If you need one, get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a>.</p>')
     with gr.Row():
             openAI_key=gr.Textbox(label='Enter your OpenAI API key here')
             # Allow the user to input multiple links, adding a textbox for each
+            urls_text = gr.Textbox(lines=5, label="Enter the links to the YouTube videos you want to search (one per line):", placeholder="https://www.youtube.com/watch?v=4xWJf8cERoM\nhttps://www.youtube.com/watch?v=vx-Si9gbijA")
             question = gr.Textbox(label='Enter your question here')
+            with gr.Accordion("Advanced Settings", open=False):
+                split_by_topic = gr.Checkbox(label="Split segments by topic", value=True, info="Whether the video transcripts are to be segmented by topic or by word count. Splitting by topic may result in a more coherent response, but results in a slower response time, especially for lengthy videos.")
+                segment_length = gr.Slider(label="Segment word count", minimum=50, maximum=500, step=50, value=200, visible=False)
+                def fn(split_by_topic):
+                    return gr.Slider.update(visible=not split_by_topic)
+                # Show the segment word-count slider only when the user is NOT splitting by topic (i.e. when segmenting by word count).
+                split_by_topic.change(fn, split_by_topic, segment_length)
+                n_neighbours = gr.Slider(label="Number of segments to retrieve", minimum=1, maximum=20, step=1, value=5, info="The number of segments to retrieve from each video and feed to the GPT model for answering.")
+                model = gr.Dropdown(label="Model", value="gpt-3.5-turbo", choices=["gpt-3.5-turbo", "gpt-4"])
+                token_budget = gr.Slider(label="Prompt token budget", minimum=100, maximum=4000, step=100, value=1000, info="The maximum number of tokens the prompt can take.")
+                temperature = gr.Slider(label="Temperature", minimum=0, maximum=1, step=0.1, value=0, info="The GPT model's temperature. Recommended to use a low temperature to decrease the likelihood of hallucinations.")
             btn = gr.Button(value='Submit')
             btn.style(full_width=True)
         with gr.Group():
+            with gr.Tabs():
+                with gr.TabItem("Answer"):
+                    answer = gr.Markdown()
+                with gr.TabItem("References"):
+                    references = gr.Markdown()
+        btn.click(main, inputs=[urls_text, question, split_by_topic, segment_length, n_neighbours, model, token_budget, temperature], outputs=[answer, references])
 #openai.api_key = os.getenv('Your_Key_Here')
 demo.launch()

notebook.ipynb CHANGED Viewed

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 47,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -13,10 +13,16 @@
     "import requests\n",
     "import json\n",
     "\n",
-    "url = \"https://www.youtube.com/watch?v=VcVfceTsD0A&t=163s\"\n",
     "video_id = url.split(\"=\")[1]\n",
     "\n",
-    "raw = YouTubeTranscriptApi.get_transcript(video_id)\n",
     "\n",
     "response = requests.get(f\"https://noembed.com/embed?dataType=json&url={url}\")\n",
     "data = json.loads(response.content)\n",
@@ -26,7 +32,222 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 48,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -45,7 +266,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -56,7 +277,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 51,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -68,7 +309,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 52,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -78,7 +319,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 53,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -91,26 +332,46 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 54,
    "metadata": {},
    "outputs": [],
    "source": [
     "def to_timestamp(seconds):\n",
-    "\n",
     "    seconds = int(seconds)\n",
     "\n",
-    "    minutes = seconds // 60\n",
-    "    seconds_remaining = f\"{seconds % 60}\"\n",
     "    \n",
-    "    if len(seconds_remaining) == 1:\n",
-    "        seconds_remaining = \"0\" + seconds_remaining\n",
-    "\n",
-    "    return f\"{minutes}:{seconds_remaining}\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 55,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -130,12 +391,136 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
    "metadata": {},
    "outputs": [],
    "source": [
     "# At the beginning of each segment, add the title, author, and segment times\n",
-    "segment_text = [f\"'{title}' by {author}\\nTimestamp: {segment_time}\\n\\n{segment}\" for segment, segment_time in zip(segments, segment_times)]"
    ]
   }
  ],

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 41,
    "metadata": {},
    "outputs": [],
    "source": [
     "import requests\n",
     "import json\n",
     "\n",
+    "url = \"https://www.youtube.com/watch?v=77zvIYDFSok\"\n",
     "video_id = url.split(\"=\")[1]\n",
     "\n",
+    "try:\n",
+    "    raw = YouTubeTranscriptApi.get_transcript(video_id)\n",
+    "except:\n",
+    "    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)\n",
+    "    for transcript in transcript_list:\n",
+    "        raw = transcript.translate('en').fetch()\n",
+    "        break\n",
     "\n",
     "response = requests.get(f\"https://noembed.com/embed?dataType=json&url={url}\")\n",
     "data = json.loads(response.content)\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'text': '[Music]', 'start': 2.19, 'duration': 3.5},\n",
+       " {'text': '[Music]', 'start': 18.73, 'duration': 3.07},\n",
+       " {'text': '[Applause]', 'start': 27.71, 'duration': 3.289},\n",
+       " {'text': '[Music]', 'start': 33.68, 'duration': 7.01},\n",
+       " {'text': '[Laughter] [Music] [Music] [', 'start': 36.05, 'duration': 4.64},\n",
+       " {'text': 'Applause]', 'start': 59.97, 'duration': 3.2},\n",
+       " {'text': '[Music]', 'start': 68.78, 'duration': 3.12},\n",
+       " {'text': 'Recently, the', 'start': 72.18, 'duration': 3.0},\n",
+       " {'text': \"issue of sexual assault by celebrities has not stopped.  It's\",\n",
+       "  'start': 79.26,\n",
+       "  'duration': 3.24},\n",
+       " {'text': \"true that it happened to me. It's\",\n",
+       "  'start': 87.74,\n",
+       "  'duration': 3.96},\n",
+       " {'text': 'reallyembarrassing', 'start': 96.299, 'duration': 3.721},\n",
+       " {'text': \"[Music] It's disastrous\", 'start': 98.16, 'duration': 4.099},\n",
+       " {'text': '[Music]', 'start': 102.36, 'duration': 3.18},\n",
+       " {'text': \"There's also a part where feminists are obsessed with men's genitals. I think their germinating\",\n",
+       "  'start': 111.6,\n",
+       "  'duration': 4.5},\n",
+       " {'text': 'power is a really scary part.  I',\n",
+       "  'start': 113.82,\n",
+       "  'duration': 4.259},\n",
+       " {'text': 'think this castration will happen more often as they get castrated.',\n",
+       "  'start': 121.259,\n",
+       "  'duration': 3.441},\n",
+       " {'text': 'In fact, feminism was popular at the time,',\n",
+       "  'start': 133.58,\n",
+       "  'duration': 7.659},\n",
+       " {'text': 'but thanks to its popularity,', 'start': 137.58, 'duration': 8.04},\n",
+       " {'text': 'a lot of', 'start': 141.239, 'duration': 8.041},\n",
+       " {'text': 'things happened, such as scolding, ridicule, insults, and',\n",
+       "  'start': 145.62,\n",
+       "  'duration': 5.04},\n",
+       " {'text': 'insults against', 'start': 149.28, 'duration': 2.28},\n",
+       " {'text': 'men.  I', 'start': 150.66, 'duration': 2.7},\n",
+       " {'text': \"just couldn't stay there. Well, the\",\n",
+       "  'start': 153.36,\n",
+       "  'duration': 5.459},\n",
+       " {'text': 'pepper is 3 cm.', 'start': 160.2, 'duration': 5.58},\n",
+       " {'text': 'Besides, all men are potential rape',\n",
+       "  'start': 162.36,\n",
+       "  'duration': 7.019},\n",
+       " {'text': 'criminals. Men are useless. Men stopped',\n",
+       "  'start': 165.78,\n",
+       "  'duration': 6.599},\n",
+       " {'text': 'trusting women.  If you', 'start': 176.34, 'duration': 4.92},\n",
+       " {'text': 'reach there, you may be hit by the #MeToo movement, so I',\n",
+       "  'start': 184.92,\n",
+       "  'duration': 3.86},\n",
+       " {'text': 'think there are a lot of them right now. I think',\n",
+       "  'start': 200.28,\n",
+       "  'duration': 2.539},\n",
+       " {'text': 'there may be a little more than in other countries.',\n",
+       "  'start': 221.76,\n",
+       "  'duration': 4.8},\n",
+       " {'text': '[Applause]', 'start': 238.29, 'duration': 3.23},\n",
+       " {'text': '[Music]', 'start': 243.27, 'duration': 7.169},\n",
+       " {'text': 'Personally, I', 'start': 245.78, 'duration': 8.019},\n",
+       " {'text': \"would say it's content that only has these emotions. As an\",\n",
+       "  'start': 250.439,\n",
+       "  'duration': 6.061},\n",
+       " {'text': 'example, I said that we need to strongly pass the anti-discrimination law, but',\n",
+       "  'start': 253.799,\n",
+       "  'duration': 4.801},\n",
+       " {'text': 'this is actually an expression', 'start': 258.6, 'duration': 3.89},\n",
+       " {'text': 'dictatorship class.  Guys', 'start': 259.919, 'duration': 5.111},\n",
+       " {'text': '[Applause]', 'start': 262.49, 'duration': 4.149},\n",
+       " {'text': '[Music] I think you', 'start': 265.03, 'duration': 3.949},\n",
+       " {'text': \"'re talking a lot, but I want to hear it. Oh, you're not going to listen. You're so mean.\",\n",
+       "  'start': 268.979,\n",
+       "  'duration': 5.041},\n",
+       " {'text': 'For example, I went to the bathroom, and I',\n",
+       "  'start': 295.02,\n",
+       "  'duration': 2.7},\n",
+       " {'text': 'saw that it was unisex, but I want it to be safe with the door open.  I',\n",
+       "  'start': 297.72,\n",
+       "  'duration': 4.8},\n",
+       " {'text': 'think I was able to see well how men and women cut off harmony at the source. My',\n",
+       "  'start': 354.18,\n",
+       "  'duration': 2.6},\n",
+       " {'text': 'girlfriend is', 'start': 357.96, 'duration': 4.5},\n",
+       " {'text': 'not satisfied with something like that.',\n",
+       "  'start': 359.759,\n",
+       "  'duration': 4.081},\n",
+       " {'text': 'Could anyone who really thought about gender equality talk about peeing and taking a shower?  I think',\n",
+       "  'start': 371.539,\n",
+       "  'duration': 3.541},\n",
+       " {'text': '[Music]', 'start': 378.61, 'duration': 3.159},\n",
+       " {'text': 'In fact, many companies are', 'start': 384.479, 'duration': 3.321},\n",
+       " {'text': 'paying for it,', 'start': 388.979, 'duration': 4.44},\n",
+       " {'text': 'or in the labor market, there', 'start': 390.12, 'duration': 6.74},\n",
+       " {'text': 'is employment discrimination without any reason for gender inequality.',\n",
+       "  'start': 393.419,\n",
+       "  'duration': 3.441},\n",
+       " {'text': 'Evil men must disappear', 'start': 450.539, 'duration': 7.141},\n",
+       " {'text': 'Because of these millitons, there are many such people',\n",
+       "  'start': 454.56,\n",
+       "  'duration': 7.139},\n",
+       " {'text': 'in Korean society, and Ha Tae-kyung is a',\n",
+       "  'start': 457.68,\n",
+       "  'duration': 4.019},\n",
+       " {'text': 'representative abolitionist.  It',\n",
+       "  'start': 472.979,\n",
+       "  'duration': 7.521},\n",
+       " {'text': 'means that I oppose giving privileges to Han Gender.',\n",
+       "  'start': 503.099,\n",
+       "  'duration': 5.641},\n",
+       " {'text': 'Korean feminists are now', 'start': 504.3, 'duration': 8.58},\n",
+       " {'text': 'understood as', 'start': 508.74, 'duration': 6.5},\n",
+       " {'text': 'discriminatory against men.  In such a society,',\n",
+       "  'start': 528.42,\n",
+       "  'duration': 4.56},\n",
+       " {'text': 'I', 'start': 534.8, 'duration': 6.479},\n",
+       " {'text': 'think we should pay more attention to the serious discrimination against women.',\n",
+       "  'start': 537.18,\n",
+       "  'duration': 4.099},\n",
+       " {'text': 'Recently,', 'start': 544.08, 'duration': 3.0},\n",
+       " {'text': 'when I look at the political situation in Korea, I feel that it is now retreating.',\n",
+       "  'start': 551.459,\n",
+       "  'duration': 4.741},\n",
+       " {'text': '19% of female lawmakers in the National Assembly',\n",
+       "  'start': 559.62,\n",
+       "  'duration': 4.44},\n",
+       " {'text': 'are now 19%.', 'start': 561.06, 'duration': 3.0},\n",
+       " {'text': 'Why do people who send me messages like this send me messages like this',\n",
+       "  'start': 592.14,\n",
+       "  'duration': 3.84},\n",
+       " {'text': 'when they come every day?  It', 'start': 593.76, 'duration': 4.92},\n",
+       " {'text': \"'s necessary, but it seems that there\",\n",
+       "  'start': 615.3,\n",
+       "  'duration': 3.479},\n",
+       " {'text': 'are many cases where the target is directed at women,',\n",
+       "  'start': 618.779,\n",
+       "  'duration': 3.74},\n",
+       " {'text': \"but if there's a motto that I\", 'start': 645.42, 'duration': 5.82},\n",
+       " {'text': \"personally take while leading this group, let's\",\n",
+       "  'start': 647.12,\n",
+       "  'duration': 5.32},\n",
+       " {'text': \"create a world where feminists don't have to choose feminism.  I\",\n",
+       "  'start': 657.899,\n",
+       "  'duration': 3.721},\n",
+       " {'text': 'choose', 'start': 665.339, 'duration': 5.161},\n",
+       " {'text': \"feminism because I think you're watching.\",\n",
+       "  'start': 676.019,\n",
+       "  'duration': 4.081},\n",
+       " {'text': 'As a person, I live to protect the woman I love.',\n",
+       "  'start': 686.959,\n",
+       "  'duration': 5.701},\n",
+       " {'text': \"I think I'm about the level of a director who creates a hero.\",\n",
+       "  'start': 697.82,\n",
+       "  'duration': 6.94},\n",
+       " {'text': 'Well,', 'start': 701.04, 'duration': 3.72},\n",
+       " {'text': 'one day,', 'start': 707.16, 'duration': 10.41},\n",
+       " {'text': '[Music] We were', 'start': 727.75, 'duration': 4.46},\n",
+       " {'text': 'humiliated like', 'start': 730.2, 'duration': 5.139},\n",
+       " {'text': 'this.  I', 'start': 736.7, 'duration': 4.84},\n",
+       " {'text': \"think there are so many messages in this very short video. First of all, I think there's\",\n",
+       "  'start': 741.54,\n",
+       "  'duration': 2.64},\n",
+       " {'text': 'enough room for it to be interpreted as',\n",
+       "  'start': 744.18,\n",
+       "  'duration': 2.54},\n",
+       " {'text': 'sexual', 'start': 756.36, 'duration': 4.979},\n",
+       " {'text': 'harassment.', 'start': 762.66, 'duration': 5.1},\n",
+       " {'text': 'But I really', 'start': 777.42, 'duration': 4.38},\n",
+       " {'text': 'had no intention of interfering with the event.',\n",
+       "  'start': 780.3,\n",
+       "  'duration': 3.42},\n",
+       " {'text': 'It was a chance to share, but I',\n",
+       "  'start': 791.04,\n",
+       "  'duration': 7.28},\n",
+       " {'text': \"think I couldn't give you a good answer after hearing that conversation.\",\n",
+       "  'start': 795.12,\n",
+       "  'duration': 3.2},\n",
+       " {'text': '[Music] I was', 'start': 802.19, 'duration': 2.649},\n",
+       " {'text': 'very surprised. I', 'start': 803.76, 'duration': 2.04},\n",
+       " {'text': 'went with my faith,', 'start': 805.8, 'duration': 4.26},\n",
+       " {'text': 'but', 'start': 819.06, 'duration': 2.6},\n",
+       " {'text': \"I felt completely betrayed.  It's\",\n",
+       "  'start': 822.019,\n",
+       "  'duration': 3.991},\n",
+       " {'text': 'just', 'start': 824.16, 'duration': 4.91},\n",
+       " {'text': '[Music] [', 'start': 826.01, 'duration': 3.06},\n",
+       " {'text': 'Music] If', 'start': 831.75, 'duration': 3.09},\n",
+       " {'text': 'you', 'start': 839.88, 'duration': 5.04},\n",
+       " {'text': \"feel sexually shameful, that's\", 'start': 842.04, 'duration': 5.7},\n",
+       " {'text': \"sexual harassment. In Korea, that's the\",\n",
+       "  'start': 847.74,\n",
+       "  'duration': 5.06},\n",
+       " {'text': \"case.  It's\", 'start': 853.399, 'duration': 6.641},\n",
+       " {'text': 'scary to lose everything and become something really different only from the genitals',\n",
+       "  'start': 864.36,\n",
+       "  'duration': 5.59},\n",
+       " {'text': '[Music]', 'start': 864.95, 'duration': 5.0},\n",
+       " {'text': '[Music]', 'start': 888.91, 'duration': 3.09}]"
+      ]
+     },
+     "execution_count": 42,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "raw"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 44,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"[Music]\\n\\n [Music]\\n\\n [Applause]\\n\\n [Music]\\n\\n [Laughter] [Music] [Music] [\\n\\n Applause]\\n\\n [Music]\\n\\n Recently, the\\n\\n issue of sexual assault by celebrities has not stopped.  It's\\n\\n true that it happened to me. It's\\n\\n reallyembarrassing\\n\\n [Music] It's disastrous\\n\\n [Music]\\n\\n There's also a part where feminists are obsessed with men's genitals. I think their germinating\\n\\n power is a really scary part.  I\\n\\n think this castration will happen more often as they get castrated.\\n\\n In fact, feminism was popular at the time,\\n\\n but thanks to its popularity,\\n\\n a lot of\\n\\n things happened, such as scolding, ridicule, insults, and\\n\\n insults against\\n\\n men.  I\\n\\n just couldn't stay there. Well, the\\n\\n pepper is 3 cm.\\n\\n Besides, all men are potential rape\\n\\n criminals. Men are useless. Men stopped\\n\\n trusting women.  If you\\n\\n reach there, you may be hit by the #MeToo movement, so I\\n\\n think there are a lot of them right now. I think\\n\\n there may be a little more than in other countries.\\n\\n [Applause]\\n\\n [Music]\\n\\n Personally, I\\n\\n would say it's content that only has these emotions. As an\\n\\n example, I said that we need to strongly pass the anti-discrimination law, but\\n\\n this is actually an expression\\n\\n dictatorship class.  Guys\\n\\n [Applause]\\n\\n [Music] I think you\\n\\n 're talking a lot, but I want to hear it. Oh, you're not going to listen. You're so mean.\\n\\n For example, I went to the bathroom, and I\\n\\n saw that it was unisex, but I want it to be safe with the door open.  I\\n\\n think I was able to see well how men and women cut off harmony at the source. My\\n\\n girlfriend is\\n\\n not satisfied with something like that.\\n\\n Could anyone who really thought about gender equality talk about peeing and taking a shower?  
I think\\n\\n [Music]\\n\\n In fact, many companies are\\n\\n paying for it,\\n\\n or in the labor market, there\\n\\n is employment discrimination without any reason for gender inequality.\\n\\n Evil men must disappear\\n\\n Because of these millitons, there are many such people\\n\\n in Korean society, and Ha Tae-kyung is a\\n\\n representative abolitionist.  It\\n\\n means that I oppose giving privileges to Han Gender.\\n\\n Korean feminists are now\\n\\n understood as\\n\\n discriminatory against men.  In such a society,\\n\\n I\\n\\n think we should pay more attention to the serious discrimination against women.\\n\\n Recently,\\n\\n when I look at the political situation in Korea, I feel that it is now retreating.\\n\\n 19% of female lawmakers in the National Assembly\\n\\n are now 19%.\\n\\n Why do people who send me messages like this send me messages like this\\n\\n when they come every day?  It\\n\\n 's necessary, but it seems that there\\n\\n are many cases where the target is directed at women,\\n\\n but if there's a motto that I\\n\\n personally take while leading this group, let's\\n\\n create a world where feminists don't have to choose feminism.  I\\n\\n choose\\n\\n feminism because I think you're watching.\\n\\n As a person, I live to protect the woman I love.\\n\\n I think I'm about the level of a director who creates a hero.\\n\\n Well,\\n\\n one day,\\n\\n [Music] We were\\n\\n humiliated like\\n\\n this.  I\\n\\n think there are so many messages in this very short video. First of all, I think there's\\n\\n enough room for it to be interpreted as\\n\\n sexual\\n\\n harassment.\\n\\n But I really\\n\\n had no intention of interfering with the event.\\n\\n It was a chance to share, but I\\n\\n think I couldn't give you a good answer after hearing that conversation.\\n\\n [Music] I was\\n\\n very surprised. I\\n\\n went with my faith,\\n\\n but\\n\\n I felt completely betrayed.  
It's\\n\\n just\\n\\n [Music] [\\n\\n Music] If\\n\\n you\\n\\n feel sexually shameful, that's\\n\\n sexual harassment. In Korea, that's the\\n\\n case.  It's\\n\\n scary to lose everything and become something really different only from the genitals\\n\\n [Music]\\n\\n [Music]\\n\\n\""
+      ]
+     },
+     "execution_count": 45,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "transcript"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
     "def to_timestamp(seconds):\n",
     "    seconds = int(seconds)\n",
     "\n",
+    "    hours = seconds // 3600\n",
+    "    minutes = (seconds % 3600) // 60\n",
+    "    seconds_remaining = seconds % 60\n",
     "    \n",
+    "    if seconds >= 3600:\n",
+    "        return f\"{hours:02d}:{minutes:02d}:{seconds_remaining:02d}\"\n",
+    "    else:\n",
+    "        return f\"{minutes:02d}:{seconds_remaining:02d}\""
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'01:40'"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "to_timestamp(100)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 22,
    "metadata": {},
    "outputs": [],
    "source": [
     "# At the beginning of each segment, add the title, author, and segment times\n",
+    "segments_times = [f\"({to_timestamp(segment_end_times[i-1])}, {to_timestamp(segment_end_times[i])})\" for i in range(1,len(segment_end_times))]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['(00:00, 00:48)',\n",
+       " '(00:48, 01:10)',\n",
+       " '(01:10, 01:46)',\n",
+       " '(01:46, 02:26)',\n",
+       " '(02:26, 02:57)',\n",
+       " '(02:57, 03:25)',\n",
+       " '(03:25, 04:11)',\n",
+       " '(04:11, 04:41)',\n",
+       " '(04:41, 05:26)',\n",
+       " '(05:26, 05:45)',\n",
+       " '(05:45, 06:13)',\n",
+       " '(06:13, 06:40)',\n",
+       " '(06:40, 07:02)',\n",
+       " '(07:02, 07:54)',\n",
+       " '(07:54, 08:17)',\n",
+       " '(08:17, 09:24)',\n",
+       " '(09:24, 10:10)',\n",
+       " '(10:10, 11:02)',\n",
+       " '(11:02, 11:47)',\n",
+       " '(11:47, 12:09)',\n",
+       " '(12:09, 12:52)',\n",
+       " '(12:52, 13:50)',\n",
+       " '(13:50, 14:15)',\n",
+       " '(14:15, 14:38)',\n",
+       " '(14:38, 16:14)',\n",
+       " '(16:14, 17:16)',\n",
+       " '(17:16, 17:47)',\n",
+       " '(17:47, 18:17)',\n",
+       " '(18:17, 18:56)',\n",
+       " '(18:56, 19:31)',\n",
+       " '(19:31, 19:52)',\n",
+       " '(19:52, 21:03)',\n",
+       " '(21:03, 21:39)',\n",
+       " '(21:39, 22:08)',\n",
+       " '(22:08, 22:42)',\n",
+       " '(22:42, 23:35)',\n",
+       " '(23:35, 24:51)',\n",
+       " '(24:51, 26:01)',\n",
+       " '(26:01, 26:28)',\n",
+       " '(26:28, 26:57)',\n",
+       " '(26:57, 28:37)',\n",
+       " '(28:37, 29:00)',\n",
+       " '(29:00, 29:50)',\n",
+       " '(29:50, 30:12)',\n",
+       " '(30:12, 30:55)',\n",
+       " '(30:55, 31:47)',\n",
+       " '(31:47, 32:54)',\n",
+       " '(32:54, 33:33)',\n",
+       " '(33:33, 33:50)',\n",
+       " '(33:50, 34:20)',\n",
+       " '(34:20, 34:48)',\n",
+       " '(34:48, 35:22)',\n",
+       " '(35:22, 36:14)',\n",
+       " '(36:14, 37:15)']"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "segments_times"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = '''\n",
+    "Segment from 'Feminism Is 'Dividing This'' Country' by VICE News\n",
+    "Timestamp: (10:51, 12:24)\n",
+    "---\n",
+    "personally take while leading this group, let's create a world where feminists don't have to choose feminism.  I choose feminism because I think you're watching. As a person, I live to protect the woman I love. I think I'm about the level of a director who creates a hero. Well, one day, [Music] We were humiliated like this.  I think there are so many messages in this very short video. First of all, I think there's\n",
+    "---\n",
+    "'''\n",
+    "\n",
+    "# Get the title and timestamp from the text\n",
+    "import re\n",
+    "\n",
+    "# define regular expression patterns\n",
+    "title_pattern = r\"Segment from '(.+)'\"\n",
+    "timestamp_pattern = r\"Timestamp: \\((.+)\\)\"\n",
+    "\n",
+    "# search for the title and the start timestamp using regular expressions\n",
+    "title = re.search(title_pattern, text).group(1)\n",
+    "start_timestamp = re.search(timestamp_pattern, text).group(1).split(\",\")[0]\n",
+    "\n",
+    "url = f\"URL: https://www.youtube.com/watch?v={video_id}&t={start_timestamp}\"\n",
+    "\n",
+    "# Add url in text before first \"---\"\n",
+    "text = re.sub(r\"---\", f\"{url}\\n---\", text, count=1)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"\\nSegment from 'Feminism Is 'Dividing This'' Country' by VICE News\\nTimestamp: (10:51, 12:24)\\nURL: https://www.youtube.com/watch?v=77zvIYDFSok&t=10:51 \\n---\\npersonally take while leading this group, let's create a world where feminists don't have to choose feminism.  I choose feminism because I think you're watching. As a person, I live to protect the woman I love. I think I'm about the level of a director who creates a hero. Well, one day, [Music] We were humiliated like this.  I think there are so many messages in this very short video. First of all, I think there's\\n---\\n\""
+      ]
+     },
+     "execution_count": 73,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "text"
    ]
   }
  ],

semantic_search.py CHANGED Viewed

@@ -10,6 +10,7 @@ class SemanticSearch:
     def fit(self, data, batch=1000, n_neighbors=5):
         self.data = data
         self.embeddings = self.get_text_embedding(data, batch=batch)
         n_neighbors = min(n_neighbors, len(self.embeddings))

     def fit(self, data, batch=1000, n_neighbors=5):
+        print(f"Fitting with n={n_neighbors}...")
         self.data = data
         self.embeddings = self.get_text_embedding(data, batch=batch)
         n_neighbors = min(n_neighbors, len(self.embeddings))