Spaces:
Running
Running
Commit
·
0aaf0b7
1
Parent(s):
54d4413
加入Pdf parser
Browse files
app.py
CHANGED
@@ -3,6 +3,8 @@ import numpy as np
|
|
3 |
import faiss
|
4 |
import openai
|
5 |
import tempfile
|
|
|
|
|
6 |
from sentence_transformers import SentenceTransformer
|
7 |
import streamlit as st
|
8 |
from nltk.tokenize import word_tokenize
|
@@ -22,6 +24,22 @@ def download_nltk():
|
|
22 |
def chunkstring(string, length):
|
23 |
return (string[0+i:length+i] for i in range(0, len(string), length))
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
def get_keywords(file_paths): #这里的重点是,对每一个file做尽可能简短且覆盖全面的summarization
|
26 |
download_nltk()
|
27 |
keywords_list = []
|
@@ -123,8 +141,7 @@ def constructVDB(file_paths):
|
|
123 |
|
124 |
#从embeddings到向量数据库
|
125 |
# Load the embeddings
|
126 |
-
|
127 |
-
embeddings = data.iloc[:, 1:].values # All columns except the first (chunk text)
|
128 |
|
129 |
# Ensure that the array is C-contiguous
|
130 |
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
|
@@ -163,7 +180,6 @@ def searchVDB(search_sentence, paraphrase_embeddings_df, index):
|
|
163 |
return retrieved_chunks_list
|
164 |
|
165 |
def generateCourse(topic, materials, language):
|
166 |
-
|
167 |
#调用gpt4 API生成一节课的内容
|
168 |
system_message = 'You are a great AI teacher and linguist, skilled at writing informative and easy-to-understand course script based on given lesson topic and knowledge materials.'
|
169 |
|
@@ -209,6 +225,8 @@ def initialize_file(added_files):
|
|
209 |
temp_file_paths = []
|
210 |
with st.spinner('Processing file...'):
|
211 |
for added_file in added_files:
|
|
|
|
|
212 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".md") as tmp:
|
213 |
tmp.write(added_file.getvalue())
|
214 |
tmp_path = tmp.name
|
@@ -280,7 +298,7 @@ def app():
|
|
280 |
|
281 |
with st.sidebar:
|
282 |
st.image("https://siyuan-harry.oss-cn-beijing.aliyuncs.com/oss://siyuan-harry/20231021212525.png")
|
283 |
-
added_files = st.file_uploader('Upload .md
|
284 |
num_lessons = st.slider('How many lessons do you want this course to have?', min_value=2, max_value=15, value=5, step=1)
|
285 |
language = 'English'
|
286 |
Chinese = st.checkbox('Output in Chinese')
|
|
|
3 |
import faiss
|
4 |
import openai
|
5 |
import tempfile
|
6 |
+
from PyPDF2 import PdfReader
|
7 |
+
import io
|
8 |
from sentence_transformers import SentenceTransformer
|
9 |
import streamlit as st
|
10 |
from nltk.tokenize import word_tokenize
|
|
|
24 |
def chunkstring(string, length):
    """Lazily split *string* into consecutive pieces of at most *length* chars.

    Returns a generator; the final piece may be shorter than *length*.
    """
    return (string[i:i + length] for i in range(0, len(string), length))
|
26 |
|
27 |
+
|
28 |
+
def pdf_parser(input_pdf):
    """Extract all text from a PDF file object into an in-memory text stream.

    Parameters
    ----------
    input_pdf : a binary file-like object (or path) accepted by PyPDF2.PdfReader.

    Returns
    -------
    io.StringIO positioned at offset 0, containing the concatenated text of
    every page, so callers can treat it like an uploaded text file
    (e.g. call .getvalue() on it).
    """
    pdf = PdfReader(input_pdf)
    # extract_text() may return None (or "") for pages with no text layer,
    # e.g. scanned/image-only pages — guard with `or ""` so join never sees None.
    content = "".join((page.extract_text() or "") for page in pdf.pages)
    # StringIO(content) starts at position 0, replacing the original
    # write()+seek(0) dance with identical behavior.
    return io.StringIO(content)
|
37 |
+
|
38 |
+
# Manual smoke test for pdf_parser. Guarded so that importing this module
# (as Streamlit does when running the app) does not execute it — the
# unguarded version raised FileNotFoundError on import because the
# hard-coded Colab path only exists in the author's notebook environment.
if __name__ == "__main__":
    with open('/content/01 TikTok前景.pdf', 'rb') as input_file:
        output_file = pdf_parser(input_file)
    with open('output.md', 'w', encoding='utf-8') as output_md_file:
        output_md_file.write(output_file.getvalue())
|
42 |
+
|
43 |
def get_keywords(file_paths): #这里的重点是,对每一个file做尽可能简短且覆盖全面的summarization
|
44 |
download_nltk()
|
45 |
keywords_list = []
|
|
|
141 |
|
142 |
#从embeddings到向量数据库
|
143 |
# Load the embeddings
|
144 |
+
embeddings = paraphrase_embeddings_df.iloc[:, 1:].values # All columns except the first (chunk text)
|
|
|
145 |
|
146 |
# Ensure that the array is C-contiguous
|
147 |
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
|
|
|
180 |
return retrieved_chunks_list
|
181 |
|
182 |
def generateCourse(topic, materials, language):
|
|
|
183 |
#调用gpt4 API生成一节课的内容
|
184 |
system_message = 'You are a great AI teacher and linguist, skilled at writing informative and easy-to-understand course script based on given lesson topic and knowledge materials.'
|
185 |
|
|
|
225 |
temp_file_paths = []
|
226 |
with st.spinner('Processing file...'):
|
227 |
for added_file in added_files:
|
228 |
+
if added_file.name.endswith(".pdf"):
|
229 |
+
added_file = pdf_parser(added_file)
|
230 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".md") as tmp:
|
231 |
tmp.write(added_file.getvalue())
|
232 |
tmp_path = tmp.name
|
|
|
298 |
|
299 |
with st.sidebar:
|
300 |
st.image("https://siyuan-harry.oss-cn-beijing.aliyuncs.com/oss://siyuan-harry/20231021212525.png")
|
301 |
+
added_files = st.file_uploader('Upload .md and .pdf files, simultaneous mixed upload these types is supported.', type=['.md','.pdf'], accept_multiple_files=True)
|
302 |
num_lessons = st.slider('How many lessons do you want this course to have?', min_value=2, max_value=15, value=5, step=1)
|
303 |
language = 'English'
|
304 |
Chinese = st.checkbox('Output in Chinese')
|