wendru18 committed · Commit 22d9367 · 0 Parent(s)
first commit
Browse files
- __pycache__/semantic_search.cpython-38.pyc +0 -0
- app.py +177 -0
- semantic_search.py +38 -0
- youtube.ipynb +164 -0
__pycache__/semantic_search.cpython-38.pyc
ADDED
Binary file (1.71 kB)
app.py
ADDED
@@ -0,0 +1,177 @@
from youtube_transcript_api import YouTubeTranscriptApi
from nltk.tokenize import TextTilingTokenizer
from semantic_search import SemanticSearch
import pandas as pd
import gradio as gr
import numpy as np
import requests
import tiktoken
import openai
import json

tt = TextTilingTokenizer()
searcher = SemanticSearch()

def get_youtube_data(url):

    video_id = url.split("=")[1]

    raw = YouTubeTranscriptApi.get_transcript(video_id)

    # Fetch the video title and author from the noembed oEmbed service
    response = requests.get(f"https://noembed.com/embed?dataType=json&url={url}")
    data = json.loads(response.content)

    title, author = data["title"], data["author_name"]

    df = pd.DataFrame(raw)

    # End time of each caption row, plus a running word count used later to map
    # text segments back to timestamps
    df['end'] = df['start'] + df['duration']
    df['total_words'] = df['text'].apply(lambda x: len(x.split())).cumsum()
    df["text"] = df["text"] + "\n\n"

    return df, title, author

def to_timestamp(seconds):
    seconds = int(seconds)

    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    seconds_remaining = seconds % 60

    if seconds >= 3600:
        return f"{hours:02d}:{minutes:02d}:{seconds_remaining:02d}"
    else:
        return f"{minutes:02d}:{seconds_remaining:02d}"

def get_segments(df, title, author, split_by_topic, segment_length=200):

    transcript = df['text'].str.cat(sep=' ')

    # Either split into fixed-length word chunks or let TextTiling find topic boundaries
    if not split_by_topic:
        words = transcript.split()
        segments = [' '.join(words[i:i+segment_length]) for i in range(0, len(words), segment_length)]
    else:
        segments = tt.tokenize(transcript)

    segments = [segment.replace('\n\n', '').strip() for segment in segments]

    # Cumulative word count at the end of each segment
    segments_wc = [len(segment.split()) for segment in segments]
    segments_wc = np.cumsum(segments_wc)

    # Index of the transcript row closest to the end of each segment
    idx = [np.argmin(np.abs(df['total_words'] - total_words)) for total_words in segments_wc]

    segments_end_times = df['end'].iloc[idx].values
    segments_end_times = np.insert(segments_end_times, 0, 0.0)

    segments_times = [(to_timestamp(segments_end_times[i-1]), to_timestamp(segments_end_times[i])) for i in range(1, len(segments_end_times))]

    # Prepend the video title, author and timestamp range to each segment
    segments_text = [f"Segment from '{title}' by {author}\nSegment timestamp: {segment_time}\n\n{segment}" for segment, segment_time in zip(segments, segments_times)]

    return segments_text

def fit_searcher(segments, n_neighbors):
    global searcher
    searcher.fit(segments, n_neighbors=n_neighbors)
    return True

def num_tokens(text, model):
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

def form_query(question, model, token_budget):

    results = searcher(question)

    introduction = 'Use the below segments from multiple youtube videos to answer the subsequent question. If the answer cannot be found in the segments, write "I could not find an answer." Cite each reference using the [title, author, timestamp] notation. Every sentence should have a citation at the end.'

    message = introduction

    question = f"\n\nQuestion: {question}"

    reference = []

    # Add retrieved segments until the token budget is exhausted
    for result in results:
        result = "\n\n" + result
        if (
            num_tokens(message + result + question, model=model)
            > token_budget
        ):
            break
        else:
            reference.append(result)
            message += result

    return message + question, reference

def generate_answer(question, model, token_budget):

    message, reference = form_query(question, model, token_budget)

    messages = [
        {"role": "system", "content": "You answer questions about YouTube videos."},
        {"role": "user", "content": message},
    ]

    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )

    response_message = response["choices"][0]["message"]["content"]
    return response_message, reference


if False:
    data = {}

    question = "Why do some men have trouble with feminism?"
    n_neighbors = 5

    urls = ["https://www.youtube.com/watch?v=4xWJf8cERoM", "https://www.youtube.com/watch?v=vx-Si9gbijA"]
    segments = []

    for url in urls:
        df, title, author = get_youtube_data(url)

        video_segments = get_segments(df, title, author, split_by_topic=True)

        segments.extend(video_segments)

    print("Segments generated successfully!")

    if fit_searcher(segments, n_neighbors):
        print("Searcher fit successfully!")
        answer, reference = generate_answer(question, model="gpt-3.5-turbo", token_budget=1000)
        print(answer)
        print(reference)

title = "Ask Youtube GPT"

description = """ """

with gr.Blocks() as demo:

    gr.Markdown(f'<center><h1>{title}</h1></center>')
    gr.Markdown('Ask YouTube GPT allows you to ask questions about a set of YouTube videos using the Universal Sentence Encoder and OpenAI. The returned response cites the video title, author and timestamp in square brackets where the information is located, adding credibility to the responses and helping you locate any incorrect information. If you need one, get your OpenAI API key <a href="https://platform.openai.com/account/api-keys">here</a>.')

    with gr.Row():

        with gr.Group():

            openAI_key = gr.Textbox(label='Enter your OpenAI API key here')

            # Allow the user to input multiple links, adding a textbox for each
            links = gr.Textbox(lines=5, label="Enter the links to the YouTube videos you want to search (one per line):", placeholder="https://www.youtube.com/watch?v=4xWJf8cERoM\nhttps://www.youtube.com/watch?v=vx-Si9gbijA")

            question = gr.Textbox(label='Enter your question here')
            btn = gr.Button(value='Submit')
            btn.style(full_width=True)

        with gr.Group():
            answer = gr.Textbox(label='The answer to your question is:')

    # btn.click(question_answer, inputs=[url, file, question,openAI_key], outputs=[answer])

#openai.api_key = os.getenv('Your_Key_Here')
demo.launch()
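In this commit the Submit button is not yet wired to the pipeline (the btn.click call is left commented out). The sketch below shows one way the functions above could be connected; ask_youtube is a hypothetical helper not present in the commit, and the n_neighbors, model and token_budget values are only illustrative defaults taken from the demo block.

# Hypothetical wiring for the Submit button (not part of this commit).
# Assumes one YouTube URL per line in the links textbox and a valid key in openAI_key.
def ask_youtube(openai_key, links_text, question_text):
    openai.api_key = openai_key
    segments = []
    for url in links_text.splitlines():
        url = url.strip()
        if not url:
            continue
        df, title, author = get_youtube_data(url)
        segments.extend(get_segments(df, title, author, split_by_topic=True))
    fit_searcher(segments, n_neighbors=5)
    answer_text, _ = generate_answer(question_text, model="gpt-3.5-turbo", token_budget=1000)
    return answer_text

# Inside the gr.Blocks() context, this could replace the commented-out line:
# btn.click(ask_youtube, inputs=[openAI_key, links, question], outputs=[answer])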
semantic_search.py
ADDED
@@ -0,0 +1,38 @@
from sklearn.neighbors import NearestNeighbors
import tensorflow_hub as hub
import numpy as np

class SemanticSearch:

    def __init__(self):
        # Universal Sentence Encoder from TF Hub, used to embed queries and segments
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False


    def fit(self, data, batch=1000, n_neighbors=5):
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True


    def __call__(self, text, return_data=True):
        # Embed the query and return its nearest stored segments (or their indices)
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

        if return_data:
            return [self.data[i] for i in neighbors]
        else:
            return neighbors


    def get_text_embedding(self, texts, batch=1000):
        # Embed texts in batches to keep memory usage bounded
        embeddings = []
        for i in range(0, len(texts), batch):
            text_batch = texts[i:(i+batch)]
            emb_batch = self.use(text_batch)
            embeddings.append(emb_batch)
        embeddings = np.vstack(embeddings)
        return embeddings
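A minimal usage sketch of the class above; the document strings are made up for illustration, and the first call downloads the Universal Sentence Encoder from TF Hub, which can take a while.

from semantic_search import SemanticSearch

docs = [
    "Segment about training neural networks.",
    "Segment about cooking pasta.",
    "Segment about gradient descent and learning rates.",
]

searcher = SemanticSearch()
searcher.fit(docs, n_neighbors=2)                    # embeds docs and builds the NearestNeighbors index
print(searcher("How do I tune a learning rate?"))    # returns the 2 closest segments as strings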
youtube.ipynb
ADDED
@@ -0,0 +1,164 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "from youtube_transcript_api import YouTubeTranscriptApi\n",
    "from nltk.tokenize import TextTilingTokenizer \n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import requests\n",
    "import json\n",
    "\n",
    "url = \"https://www.youtube.com/watch?v=VcVfceTsD0A&t=163s\"\n",
    "video_id = url.split(\"=\")[1]\n",
    "\n",
    "raw = YouTubeTranscriptApi.get_transcript(video_id)\n",
    "\n",
    "response = requests.get(f\"https://noembed.com/embed?dataType=json&url={url}\")\n",
    "data = json.loads(response.content)\n",
    "\n",
    "title, author = data[\"title\"], data[\"author_name\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the list of dictionaries to a pandas dataframe\n",
    "df = pd.DataFrame(raw)\n",
    "\n",
    "# Add end column\n",
    "df['end'] = df['start'] + df['duration']\n",
    "\n",
    "# Add a new column to the dataframe called 'total_words' that contains the total number of words so far in the transcript\n",
    "df['total_words'] = df['text'].apply(lambda x: len(x.split())).cumsum()\n",
    "\n",
    "# Add \"\\n\\n\" at the end of df[\"text\"]\n",
    "df[\"text\"] = df[\"text\"] + \"\\n\\n\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the text column into a single string and save to a transcript variable\n",
    "\n",
    "transcript = df['text'].str.cat(sep=' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "tt = TextTilingTokenizer()\n",
    "\n",
    "# Tokenize the transcript into segments using the TextTilingTokenizer\n",
    "segments = tt.tokenize(transcript)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove \\n\\n from each segment\n",
    "segments = [segment.replace('\\n\\n','').strip() for segment in segments]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate a list of word counts for each segment\n",
    "segments_wc = [len(segment.split()) for segment in segments]\n",
    "\n",
    "# Make it cumulative\n",
    "segments_wc = np.cumsum(segments_wc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "def to_timestamp(seconds):\n",
    "\n",
    "    seconds = int(seconds)\n",
    "\n",
    "    minutes = seconds // 60\n",
    "    seconds_remaining = f\"{seconds % 60}\"\n",
    "\n",
    "    if len(seconds_remaining) == 1:\n",
    "        seconds_remaining = \"0\" + seconds_remaining\n",
    "\n",
    "    return f\"{minutes}:{seconds_remaining}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For each value in segments_wc, get the index of the closest value in df['total_words']\n",
    "# This will be the index of the row in df that is closest to the end of each segment\n",
    "idx = [np.argmin(np.abs(df['total_words'] - total_words)) for total_words in segments_wc]\n",
    "\n",
    "# Get segment end times from idx\n",
    "segment_end_times = df['end'].iloc[idx].values\n",
    "\n",
    "# Add 0.0 to the beginning of segment_end_times\n",
    "segment_end_times = np.insert(segment_end_times, 0, 0.0)\n",
    "\n",
    "# segment_times is a list of tuples containing the start and end times of each segment\n",
    "segment_times = [(to_timestamp(segment_end_times[i-1]), to_timestamp(segment_end_times[i])) for i in range(1,len(segment_end_times))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# At the beginning of each segment, add the title, author, and segment times\n",
    "segment_text = [f\"'{title}' by {author}\\nTimestamp: {segment_time}\\n\\n{segment}\" for segment, segment_time in zip(segments, segment_times)]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
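The key trick in this notebook (and in get_segments in app.py) is mapping each text segment back to a timestamp through cumulative word counts: a segment's end time is taken from the transcript row whose running word count is closest to the segment's own running word count. A small self-contained illustration with made-up numbers:

# Illustration of the timestamp-mapping trick above, using invented values.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "end":         [4.0, 9.5, 15.0, 21.0],   # caption end times in seconds
    "total_words": [12, 25, 40, 55],          # cumulative words up to each caption
})

segments_wc = np.cumsum([24, 31])             # cumulative words at the end of each segment

# Row whose cumulative word count is closest to each segment boundary
idx = [np.argmin(np.abs(df["total_words"] - wc)) for wc in segments_wc]
print(df["end"].iloc[idx].values)             # -> [ 9.5 21. ], approximate segment end times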