Siyuan0730 commited on
Commit
0aaf0b7
·
1 Parent(s): 54d4413

加入Pdf parser

Browse files
Files changed (1) hide show
  1. app.py +22 -4
app.py CHANGED
@@ -3,6 +3,8 @@ import numpy as np
3
  import faiss
4
  import openai
5
  import tempfile
 
 
6
  from sentence_transformers import SentenceTransformer
7
  import streamlit as st
8
  from nltk.tokenize import word_tokenize
@@ -22,6 +24,22 @@ def download_nltk():
22
def chunkstring(string, length):
    """Yield consecutive slices of *string*, each at most *length* chars.

    The final chunk may be shorter when len(string) is not a multiple of
    length. Returns a generator; an empty string yields nothing.
    """
    # string[i:i+length] never raises past the end — Python slicing clamps.
    return (string[i:i + length] for i in range(0, len(string), length))
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def get_keywords(file_paths): #这里的重点是,对每一个file做尽可能简短且覆盖全面的summarization
26
  download_nltk()
27
  keywords_list = []
@@ -123,8 +141,7 @@ def constructVDB(file_paths):
123
 
124
  #从embeddings到向量数据库
125
  # Load the embeddings
126
- data = paraphrase_embeddings_df
127
- embeddings = data.iloc[:, 1:].values # All columns except the first (chunk text)
128
 
129
  # Ensure that the array is C-contiguous
130
  embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
@@ -163,7 +180,6 @@ def searchVDB(search_sentence, paraphrase_embeddings_df, index):
163
  return retrieved_chunks_list
164
 
165
  def generateCourse(topic, materials, language):
166
-
167
  #调用gpt4 API生成一节课的内容
168
  system_message = 'You are a great AI teacher and linguist, skilled at writing informative and easy-to-understand course script based on given lesson topic and knowledge materials.'
169
 
@@ -209,6 +225,8 @@ def initialize_file(added_files):
209
  temp_file_paths = []
210
  with st.spinner('Processing file...'):
211
  for added_file in added_files:
 
 
212
  with tempfile.NamedTemporaryFile(delete=False, suffix=".md") as tmp:
213
  tmp.write(added_file.getvalue())
214
  tmp_path = tmp.name
@@ -280,7 +298,7 @@ def app():
280
 
281
  with st.sidebar:
282
  st.image("https://siyuan-harry.oss-cn-beijing.aliyuncs.com/oss://siyuan-harry/20231021212525.png")
283
- added_files = st.file_uploader('Upload .md file', type=['.md'], accept_multiple_files=True)
284
  num_lessons = st.slider('How many lessons do you want this course to have?', min_value=2, max_value=15, value=5, step=1)
285
  language = 'English'
286
  Chinese = st.checkbox('Output in Chinese')
 
3
  import faiss
4
  import openai
5
  import tempfile
6
+ from PyPDF2 import PdfReader
7
+ import io
8
  from sentence_transformers import SentenceTransformer
9
  import streamlit as st
10
  from nltk.tokenize import word_tokenize
 
24
def chunkstring(string, length):
    """Yield consecutive slices of *string*, each at most *length* chars.

    The final chunk may be shorter when len(string) is not a multiple of
    length. Returns a generator; an empty string yields nothing.
    """
    # string[i:i+length] never raises past the end — Python slicing clamps.
    return (string[i:i + length] for i in range(0, len(string), length))
26
 
27
def pdf_parser(input_pdf):
    """Extract all text from a PDF and return it as an in-memory text file.

    Args:
        input_pdf: a file path or binary file-like object accepted by
            PyPDF2's PdfReader (e.g. an uploaded file handle).

    Returns:
        io.StringIO positioned at offset 0, containing the concatenated
        extracted text of every page.
    """
    pdf = PdfReader(input_pdf)
    # extract_text() may return None for pages with no extractable text
    # (scanned images, empty pages); `or ""` prevents a TypeError.
    # "".join avoids the quadratic cost of string += in a loop.
    content = "".join(page.extract_text() or "" for page in pdf.pages)
    # StringIO(initial_value) is already readable from position 0, so no
    # explicit write()/seek(0) is needed.
    return io.StringIO(content)
37
# NOTE(review): this was leftover module-level debug code that executed on
# import with a hard-coded Colab path ('/content/...'), which would crash the
# app for any other environment. Guarded so it only runs as a manual smoke
# test when this file is executed directly.
if __name__ == "__main__":
    with open('/content/01 TikTok前景.pdf', 'rb') as input_file:
        output_file = pdf_parser(input_file)
    with open('output.md', 'w', encoding='utf-8') as output_md_file:
        output_md_file.write(output_file.getvalue())
43
  def get_keywords(file_paths): #这里的重点是,对每一个file做尽可能简短且覆盖全面的summarization
44
  download_nltk()
45
  keywords_list = []
 
141
 
142
  #从embeddings到向量数据库
143
  # Load the embeddings
144
+ embeddings = paraphrase_embeddings_df.iloc[:, 1:].values # All columns except the first (chunk text)
 
145
 
146
  # Ensure that the array is C-contiguous
147
  embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
 
180
  return retrieved_chunks_list
181
 
182
  def generateCourse(topic, materials, language):
 
183
  #调用gpt4 API生成一节课的内容
184
  system_message = 'You are a great AI teacher and linguist, skilled at writing informative and easy-to-understand course script based on given lesson topic and knowledge materials.'
185
 
 
225
  temp_file_paths = []
226
  with st.spinner('Processing file...'):
227
  for added_file in added_files:
228
+ if added_file.name.endswith(".pdf"):
229
+ added_file = pdf_parser(added_file)
230
  with tempfile.NamedTemporaryFile(delete=False, suffix=".md") as tmp:
231
  tmp.write(added_file.getvalue())
232
  tmp_path = tmp.name
 
298
 
299
  with st.sidebar:
300
  st.image("https://siyuan-harry.oss-cn-beijing.aliyuncs.com/oss://siyuan-harry/20231021212525.png")
301
+ added_files = st.file_uploader('Upload .md and .pdf files, simultaneous mixed upload these types is supported.', type=['.md','.pdf'], accept_multiple_files=True)
302
  num_lessons = st.slider('How many lessons do you want this course to have?', min_value=2, max_value=15, value=5, step=1)
303
  language = 'English'
304
  Chinese = st.checkbox('Output in Chinese')