Commit
·
4b16400
0
Parent(s):
Duplicate from somuch4subtlety/pogcastGPT
Browse filesCo-authored-by: SoMuch4Subtlety <[email protected]>
- .gitattributes +34 -0
- README.md +25 -0
- app.py +108 -0
- requirements.txt +3 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: PogcastGPT
|
3 |
+
emoji: 💻
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: indigo
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.10.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
duplicated_from: somuch4subtlety/pogcastGPT
|
11 |
+
---
|
12 |
+
|
13 |
+
|
14 |
+
This app uses semantic search to find and summarize relevant sections of the Pogcast to answer a user's question.
|
15 |
+
|
16 |
+
The process began by downloading and transcribing Pogcast episodes using [OpenAI’s Whisper](https://github.com/openai/whisper).
|
17 |
+
The transcriptions were then chunked into sections of ~500 words and each chunk was vectorized using [OpenAI’s embedding endpoint](https://beta.openai.com/docs/guides/embeddings).
|
18 |
+
The embeddings and text are then stored in a [vector database](https://www.pinecone.io).
|
19 |
+
|
20 |
+
When you ask a question, the text is run through the embedding endpoint and then is compared to all of the vectorized sections using cosine similarity.
|
21 |
+
The top results are used as context and passed to [OpenAI’s GPT-3 completion endpoint](https://beta.openai.com/docs/api-reference/completions) along with your question and an explanation of how GPT-3 should answer the question.
|
22 |
+
Lastly, the summary answer and top matching sections are displayed.
|
23 |
+
|
24 |
+
Note
|
25 |
+
The parameters and completion prompt are set loosely, and the bot is likely to hallucinate in its answers.
|
app.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pinecone
|
3 |
+
import openai
|
4 |
+
import uuid
|
5 |
+
|
6 |
+
@st.experimental_singleton
def init_pinecone():
    """Connect to Pinecone once per session and return the index handle.

    The singleton decorator caches the returned Index so repeated queries
    reuse one connection instead of re-initializing the client each rerun.
    Credentials and the index name come from Streamlit secrets.
    """
    api_key = st.secrets["PINECONE_KEY"]
    pinecone.init(api_key=api_key, environment="us-west1-gcp")
    index_name = st.secrets["PINECONE_INDEX"]
    return pinecone.Index(index_name)
|
10 |
+
|
11 |
+
# Authenticate the OpenAI client from Streamlit secrets.  These module-level
# assignments run once at app start-up, before any endpoint is called.
openai.organization = st.secrets["OPENAI_ORG"]
openai.api_key = st.secrets["OPENAI_KEY"]
|
13 |
+
|
14 |
+
|
15 |
+
def modCheck(query):
    """Return True if OpenAI's moderation endpoint flags *query* as disallowed.

    Used as a gate before spending tokens on embedding/completion calls.
    """
    result = openai.Moderation.create(input=query)["results"][0]
    return result["flagged"]
|
18 |
+
|
19 |
+
def promptMaker(query, matchtext, prompt_type=None):
    """Build the GPT-3 completion prompt for a user inquiry.

    Args:
        query: the user's question/inquiry text.
        matchtext: concatenated transcript snippets retrieved by semantic search.
        prompt_type: optional content type (e.g. "poem") taken from a leading
            "/word" command; when falsy, the default Q&A/summarization
            instructions are used instead.

    Returns:
        The full prompt string: a fixed podcast description followed by
        task-specific instructions, the snippets, and the inquiry.
    """
    intro = "The Pogcast is a weekly podcast co-hosted by Veritas and Jesse Kazam. They are both twitch streamers and on the podcast they discuss all the poggers things in life like the first-person shooter Escape from Tarkov, chess, speed-running, and everyday activities relevant to being a twitch streamer.\n"
    if prompt_type:
        # Command mode: write a specific kind of content about the query topic.
        task = f"Use the following snippets from the podcast to write a {prompt_type} about {query}\nSnippets: {matchtext}\nResult:"
    else:
        # Default mode: answer a question or summarize, grounded in the snippets.
        task = (
            "You will be given relevant snippets from the Pogcast that should help you answer or provide context to an inquiry. \n"
            "If the inquiry is in the form of a question, answer it in a verbose manner, provide a quote from the snippets to support your answer, and provide a deep summarization of the relevant portions of the snippets.\n"
            "If the inquiry is not in the form of a question, summarize the parts of the snippets most relevant to the inquiry.\n"
            f"Snippets:\n{matchtext} \nInquiry: {query}\nResult:"
        )
    return intro + task
|
29 |
+
|
30 |
+
def runInquiry(query):
    """Answer a user inquiry end-to-end: moderate, embed, retrieve, complete, render.

    A leading "/word " in the query (e.g. "/poem topic") selects a content
    type passed to promptMaker; the rest of the query becomes the inquiry.
    Returns (completion response, list of matched sections), or None when the
    query is rejected (too short or flagged by moderation).

    NOTE(review): the original indentation was lost in the diff rendering;
    the nesting of statements under the st.spinner blocks below is a
    reconstruction — confirm against the original file.
    """
    prompt_type = None
    if query.startswith("/"):
        # "/poem some topic" -> prompt_type="poem", query="some topic"
        prompt_type = query.split(" ")[0][1:]
        query = " ".join(query.split(" ")[1:]).strip()

    # Reject trivially short inquiries before spending any API calls.
    if len(query)< 6:
        st.error("Please ask a question with at least 6 characters")
        return
    with st.spinner('Checking query...'):
        # Moderation gate: refuse flagged content outright.
        flagged = modCheck(query)
        if flagged:
            st.error("You know what you did. I ain't answering that.")
            return

    with st.spinner('Embedding query...'):
        # Embed the query and fetch the 5 nearest transcript chunks from Pinecone.
        xq = openai.Embedding.create(input=query, engine="text-embedding-ada-002")['data'][0]['embedding']
        index = init_pinecone()
        res = index.query(xq, namespace=st.secrets["PINECONE_NAMESPACE"], top_k=5, include_metadata=True)
    with st.spinner('Thinking...'):
        # Only the top 3 matches are used as completion context; all 5 are shown below.
        matchtext = "\n".join(match['metadata']['content'] for match in res['matches'][:3])

        # Stable per-session id passed to OpenAI as the abuse-tracking `user` field.
        if 'uid' not in st.session_state:
            st.session_state.uid = str(uuid.uuid4())

        comp = openai.Completion.create(
            model="text-davinci-003",
            prompt=promptMaker(query, matchtext, prompt_type),
            max_tokens=2000,
            temperature=0.9,
            user = st.session_state.uid
        )
        st.markdown(f"""
        <div>
            <p class="lead">{comp['choices'][0]['text']}</p>
        </div>
        """, unsafe_allow_html=True)

    # Render every retrieved match as a card with an embedded video player.
    for context in res['matches']:
        card(
            context['metadata']['episode_num'],
            context['metadata']['episode_id'],
            context['metadata']['start_second'],
            context['metadata']['end_second'],
            context['metadata']['content']
        )
    return (comp, res['matches'])
|
77 |
+
|
78 |
+
def card(episode, episode_id, start_second, end_second, context):
    """Render one matched transcript section as a Bootstrap card.

    Shows an embedded YouTube player starting at the match's timestamp, a
    deep link to the episode, and the first ~200 characters of the snippet.
    `end_second` is accepted for interface compatibility but not displayed.
    Returns whatever st.markdown returns.
    """
    start = int(start_second)
    preview = context[:200].capitalize() + "...."
    html = f"""
    <div class="container-fluid mb-2">
        <div class="row align-items-start">
            <div class="col-md-4 col-sm-4">
                <div class="position-relative">
                    <iframe width="220" height="124" src="https://www.youtube.com/embed/{episode_id}?start={start}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
                </div>
            </div>
            <div class="col-md-8 col-sm-8">
                <a href=https://www.youtube.com/watch?v={episode_id}&t={start}s>Episode {int(episode)}</a>
                <br>
                <span style="color: #808080;">
                    <small>{preview}</small>
                </span>
            </div>
        </div>
    </div>
    """
    return st.markdown(html, unsafe_allow_html=True)
|
97 |
+
|
98 |
+
# --- Page layout: title, description, Bootstrap stylesheet, and the query box. ---
st.markdown("<h1 style='text-align: center;'>PogcastGPT</h1>", unsafe_allow_html=True)
st.write("""
This app uses semantic search to find and summarize relevant sections of the Pogcast to answer your question
""")
# Bootstrap CSS is required by the card() layout classes.
st.markdown("""
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
""", unsafe_allow_html=True)

query = st.text_input(label="Ask me a question about the Pogcast!", max_chars=200, value="", key="inquiryBox", type='default')
# Streamlit reruns the script on every keystroke commit; only act on non-empty input.
if query:
    runInquiry(query)
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
pinecone-client
|
2 |
+
openai
|
3 |
+
streamlit
|