Commit · f1fd33e
Parent(s): 9cbf1f7

Files changed:
- app.py +303 -0
- requirements.txt +7 -0
app.py
ADDED
@@ -0,0 +1,303 @@
import pandas as pd
import numpy as np
import faiss
import openai
import tempfile
import ast  # used to parse the model's list output safely (see generating_outline)
from sentence_transformers import SentenceTransformer
import streamlit as st
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import nltk
import time

openai.api_key = st.secrets["OPENAI_API_KEY"]

@st.cache_data
def download_nltk():
    nltk.download('punkt')
    nltk.download('wordnet')
    nltk.download('stopwords')

def chunkstring(string, length):
    # Yield fixed-length slices of the string (the last one may be shorter).
    return (string[0+i:length+i] for i in range(0, len(string), length))

def get_keywords(file_paths):
    # The goal here: a summary of each file that is as short as possible while
    # still covering it, using the most frequent content words as a cheap
    # stand-in for summarization.
    download_nltk()
    keywords_list = []
    for file_path in file_paths:
        with open(file_path, 'r') as file:
            data = file.read()
        # tokenize
        words = word_tokenize(data)
        # remove punctuation
        words = [word for word in words if word.isalnum()]
        # remove stopwords
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]
        # lemmatization
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
        # count word frequencies
        word_freq = Counter(words)
        # get the 20 most common words
        keywords = word_freq.most_common(20)
        str_keywords = ', '.join(word for word, _ in keywords)
        keywords_list.append(f"Top 20 frequency keywords for {file_path}: {str_keywords}")

    return keywords_list

def get_completion_from_messages(messages, model="gpt-4", temperature=0):
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature,  # the degree of randomness of the model's output
    )
    return response.choices[0].message["content"]

# Call the GPT API to generate the course outline plus a one-sentence description
# of each lesson; the outline is kept around for the rest of the course.
def generating_outline(keywords, num_lessons, language):

    system_message = 'You are a great AI teacher and linguist, skilled at creating course outlines based on summarized knowledge materials.'
    user_message = f"""You are a great AI teacher and linguist,
    skilled at generating course outlines based on keywords of the course.
    Based on the keywords provided, you should carefully design a course outline.
    Requirements: by taking this course, the learner should understand those key concepts.
    Key concepts: {keywords}
    You should output the course outline as a Python list. Do not include anything else except that Python list in your output.
    Example output format:
    [[name_lesson1, abstract_lesson1], [name_lesson2, abstract_lesson2]]
    In the example, each element of the list consists of two parts: "name_lesson" is the name of the lesson, and "abstract_lesson" is a one-sentence description of the lesson that introduces the knowledge it contains.
    For each lesson in this course, you should provide these two pieces of information and organize them as exemplified.
    For this course, you should design {num_lessons} lessons in total.
    The course outline should be written in {language}.
    Start the work now.
    """
    messages = [
        {'role': 'system', 'content': system_message},
        {'role': 'user', 'content': user_message},
    ]

    response = get_completion_from_messages(messages)

    list_response = ['nothing in the answers..']

    try:
        # ast.literal_eval replaces the original eval() call: it only accepts
        # Python literals, so arbitrary code in the model's reply cannot execute.
        list_response = ast.literal_eval(response)
    except (SyntaxError, ValueError):
        pass

    return list_response

def courseOutlineGenerating(file_paths, num_lessons, language):
    summarized_materials = get_keywords(file_paths)
    course_outline = generating_outline(summarized_materials, num_lessons, language)
    return course_outline

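# Per the example format in the prompt above, course_outline is expected to be
#   [["Lesson 1 name", "one-sentence abstract"], ["Lesson 2 name", "..."], ...]
# and app() below indexes outline[0] and outline[1] on that assumption.
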
def constructVDB(file_paths):
    # Split the knowledge materials into chunks
    chunks = []
    for filename in file_paths:
        with open(filename, 'r') as f:
            content = f.read()
            for chunk in chunkstring(content, 1024):
                chunks.append(chunk)
    chunk_df = pd.DataFrame(chunks, columns=['chunk'])

    # From text chunks to embeddings
    model = SentenceTransformer('paraphrase-mpnet-base-v2')
    embeddings = model.encode(chunk_df['chunk'].tolist())
    # Convert embeddings to a dataframe
    embedding_df = pd.DataFrame(embeddings.tolist())
    # Concatenate the original dataframe with the embeddings
    paraphrase_embeddings_df = pd.concat([chunk_df, embedding_df], axis=1)

    # From embeddings to the vector database
    data = paraphrase_embeddings_df
    embeddings = data.iloc[:, 1:].values  # all columns except the first (chunk text)

    # Ensure that the array is C-contiguous
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    # Preparation for Faiss
    dimension = embeddings.shape[1]  # the dimension of the vector space
    index = faiss.IndexFlatL2(dimension)
    # Normalize the vectors
    faiss.normalize_L2(embeddings)
    # Build the index
    index.add(embeddings)
    return paraphrase_embeddings_df, index

def searchVDB(search_sentence, paraphrase_embeddings_df, index):
    # Retrieve the relevant passages from the vector database
    data = paraphrase_embeddings_df

    model = SentenceTransformer('paraphrase-mpnet-base-v2')
    sentence_embedding = model.encode([search_sentence])

    # Ensure the sentence embedding is in the correct format
    sentence_embedding = np.ascontiguousarray(sentence_embedding, dtype=np.float32)
    # Search for the top 3 nearest neighbors in the FAISS index
    D, I = index.search(sentence_embedding, k=3)
    # Collect the top 3 most similar text chunks
    retrieved_chunks_list = []
    for idx in I[0]:
        retrieved_chunks_list.append(data.iloc[idx].chunk)

    return retrieved_chunks_list

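# A note on the metric (an observation about the code above, not a change):
# constructVDB L2-normalizes the stored vectors, so IndexFlatL2 ranks neighbors
# exactly as cosine similarity would: for unit vectors d,
# ||q - d||^2 = ||q||^2 + 1 - 2*(q . d), which decreases as q . d grows.
# searchVDB does not normalize the query embedding; that changes the reported
# distances D but not which top-k chunks are returned, since every stored
# vector has unit norm.
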
def generateCourse(topic, materials, language):
    # Call the GPT-4 API to generate the content of one lesson
    system_message = 'You are a great AI teacher and linguist, skilled at writing informative and easy-to-understand course scripts based on a given lesson topic and knowledge materials.'

    user_message = f"""You are a great AI teacher and linguist,
    skilled at writing informative and easy-to-understand course scripts based on a given lesson topic and knowledge materials.
    You should write a course for beginners; they need detailed and vivid explanations to understand the topic.
    Here are the general steps for creating a well-designed course. Please follow them step by step:
    Step 1. Write down the teaching purpose of the lesson at the start of the script.
    Step 2. Write down the outline of this lesson (the outline is aligned with the teaching purpose), then follow the outline to write the content.
    Step 3. Review the content and add examples (including code examples) to the core concepts of this lesson, making sure the examples are familiar to the learner. Each core concept should come with at least one example.
    Step 4. Review the content again and add analogies or metaphors for the concepts that come up frequently, to make their explanations easier to understand.
    Make sure all these steps are considered when writing the lesson script content.
    Your lesson topic and abstract are within the 「」 quotes, and the knowledge materials are within the 【】 brackets.
    Lesson topic and abstract: 「{topic}」,
    knowledge materials related to this lesson: 【{materials}】
    The script should be written in {language}.
    Start writing the script of this lesson now.
    """

    messages = [
        {'role': 'system', 'content': system_message},
        {'role': 'user', 'content': user_message},
    ]

    response = get_completion_from_messages(messages)
    return response

def decorate_user_question(user_question, retrieved_chunks_for_user):
    decorated_prompt = f'''You're a brilliant teaching assistant, skilled at answering a student's question based on given materials.
    Student's question: 「{user_question}」
    Related materials: 【{retrieved_chunks_for_user}】
    If the given materials are irrelevant to the student's question, please use your own knowledge to answer the question.
    You need to break down the student's question first, find out what they really want to ask, and then try to give a comprehensive answer.
    Start to answer the question now.
    '''
    return decorated_prompt

def app():
    st.title("OmniTutor v0.0.2")

    with st.sidebar:
        st.image("https://siyuan-harry.oss-cn-beijing.aliyuncs.com/oss://siyuan-harry/20231021212525.png")
        added_files = st.file_uploader('Upload .md file', type=['.md'], accept_multiple_files=True)
        num_lessons = st.slider('How many lessons do you want this course to have?', min_value=5, max_value=20, value=10, step=1)
        language = 'English'
        Chinese = st.checkbox('Output in Chinese')
        if Chinese:
            language = 'Chinese'
        btn_outline = st.button('submit')

    col1, col2 = st.columns([0.6, 0.4], gap='large')

    # prompt is read in col2 below, so give it a default for reruns where no
    # course has been generated or no question has been asked yet.
    prompt = None

    with col1:

        if btn_outline:
            temp_file_paths = []
            file_proc_state = st.text("Processing file...")
            for added_file in added_files:
                with tempfile.NamedTemporaryFile(delete=False, suffix=".md") as tmp:
                    tmp.write(added_file.getvalue())
                    tmp_path = tmp.name
                    temp_file_paths.append(tmp_path)
            file_proc_state.text("Processing file...Done")

            outline_generating_state = st.text("Generating Course Outline...")
            course_outline_list = courseOutlineGenerating(temp_file_paths, num_lessons, language)
            outline_generating_state.text("Generating Course Outline...Done")

            course_outline_string = ''
            lessons_count = 0
            for outline in course_outline_list:
                lessons_count += 1
                course_outline_string += f"{lessons_count}." + outline[0] + '\n'
                course_outline_string += outline[1] + '\n\n'
                #time.sleep(1)
            with st.expander("Check the course outline", expanded=False):
                st.write(course_outline_string)

            vdb_state = st.text("Constructing vector database from provided materials...")
            embeddings_df, faiss_index = constructVDB(temp_file_paths)
            vdb_state.text("Constructing vector database from provided materials...Done")

            count_generating_content = 0
            for lesson in course_outline_list:
                count_generating_content += 1
                content_generating_state = st.text(f"Writing content for lesson {count_generating_content}...")
                retrievedChunksList = searchVDB(lesson, embeddings_df, faiss_index)
                courseContent = generateCourse(lesson, retrievedChunksList, language)
                content_generating_state.text(f"Writing content for lesson {count_generating_content}...Done")
                #st.text_area("Course Content", value=courseContent)
                with st.expander(f"Learn the lesson {count_generating_content}", expanded=False):
                    st.markdown(courseContent)

            user_question = st.chat_input("Enter your questions when learning...")
            if user_question:  # chat_input returns None until the user submits something
                retrieved_chunks_for_user = searchVDB(user_question, embeddings_df, faiss_index)
                prompt = decorate_user_question(user_question, retrieved_chunks_for_user)

    with col2:
        st.caption(''':blue[AI Assistant]: Ask this TA any questions related to this course and get direct answers. :sunglasses:''')

        with st.chat_message("assistant"):
            st.write("Hello👋, how can I help you today? 😄")
        # Set a default model
        if "openai_model" not in st.session_state:
            st.session_state["openai_model"] = "gpt-3.5-turbo"

        # Initialize chat history
        if "messages" not in st.session_state:
            st.session_state.messages = []

        # Display chat messages from history on app rerun
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])
        # st.session_state keeps the basic info and settings of this chat session
        if prompt:
            st.session_state.messages.append({"role": "user", "content": prompt})
            with st.chat_message("user"):
                st.markdown(prompt)
            # Display assistant response in chat message container
            with st.chat_message("assistant"):
                message_placeholder = st.empty()
                full_response = ""
                for response in openai.ChatCompletion.create(
                    model=st.session_state["openai_model"],
                    messages=[{"role": m["role"], "content": m["content"]} for m in st.session_state.messages],
                    stream=True,
                ):
                    full_response += response.choices[0].delta.get("content", "")
                    message_placeholder.markdown(full_response + "▌")
                message_placeholder.markdown(full_response)
            st.session_state.messages.append({"role": "assistant", "content": full_response})


if __name__ == "__main__":
    app()

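To try this commit locally, a sketch (not part of the commit: it assumes both files sit in one directory and that the OPENAI_API_KEY secret, see the note after requirements.txt, is configured):

    pip install -r requirements.txt
    streamlit run app.py
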
requirements.txt
ADDED
@@ -0,0 +1,7 @@
faiss_cpu==1.7.3
nltk==3.8.1
numpy==1.25.0
openai==0.27.6
pandas==2.0.2
sentence_transformers==2.2.2
streamlit==1.24.0
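
app.py reads the API key via st.secrets["OPENAI_API_KEY"]. On the Space itself that value comes from the Space's secrets settings; for a local run, a minimal .streamlit/secrets.toml sketch (the placeholder value is hypothetical):

    OPENAI_API_KEY = "sk-..."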