Siyuan0730 commited on
Commit
0aaf0b7
·
1 Parent(s): 54d4413

加入Pdf parser

Browse files
Files changed (1) hide show
  1. app.py +22 -4
app.py CHANGED
@@ -3,6 +3,8 @@ import numpy as np
3
  import faiss
4
  import openai
5
  import tempfile
 
 
6
  from sentence_transformers import SentenceTransformer
7
  import streamlit as st
8
  from nltk.tokenize import word_tokenize
@@ -22,6 +24,22 @@ def download_nltk():
22
def chunkstring(string, length):
    """Yield consecutive slices of *string*, each at most *length* chars.

    The final chunk may be shorter when len(string) is not a multiple of
    length. Returns a generator; an empty string yields nothing.
    """
    # string[i:i+length] never raises past the end — Python slicing clamps.
    return (string[i:i + length] for i in range(0, len(string), length))
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def get_keywords(file_paths): #这里的重点是,对每一个file做尽可能简短且覆盖全面的summarization
26
  download_nltk()
27
  keywords_list = []
@@ -123,8 +141,7 @@ def constructVDB(file_paths):
123
 
124
  #从embeddings到向量数据库
125
  # Load the embeddings
126
- data = paraphrase_embeddings_df
127
- embeddings = data.iloc[:, 1:].values # All columns except the first (chunk text)
128
 
129
  # Ensure that the array is C-contiguous
130
  embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
@@ -163,7 +180,6 @@ def searchVDB(search_sentence, paraphrase_embeddings_df, index):
163
  return retrieved_chunks_list
164
 
165
  def generateCourse(topic, materials, language):
166
-
167
  #调用gpt4 API生成一节课的内容
168
  system_message = 'You are a great AI teacher and linguist, skilled at writing informative and easy-to-understand course script based on given lesson topic and knowledge materials.'
169
 
@@ -209,6 +225,8 @@ def initialize_file(added_files):
209
  temp_file_paths = []
210
  with st.spinner('Processing file...'):
211
  for added_file in added_files:
 
 
212
  with tempfile.NamedTemporaryFile(delete=False, suffix=".md") as tmp:
213
  tmp.write(added_file.getvalue())
214
  tmp_path = tmp.name
@@ -280,7 +298,7 @@ def app():
280
 
281
  with st.sidebar:
282
  st.image("https://siyuan-harry.oss-cn-beijing.aliyuncs.com/oss://siyuan-harry/20231021212525.png")
283
- added_files = st.file_uploader('Upload .md file', type=['.md'], accept_multiple_files=True)
284
  num_lessons = st.slider('How many lessons do you want this course to have?', min_value=2, max_value=15, value=5, step=1)
285
  language = 'English'
286
  Chinese = st.checkbox('Output in Chinese')
 
3
  import faiss
4
  import openai
5
  import tempfile
6
+ from PyPDF2 import PdfReader
7
+ import io
8
  from sentence_transformers import SentenceTransformer
9
  import streamlit as st
10
  from nltk.tokenize import word_tokenize
 
24
def chunkstring(string, length):
    """Yield consecutive slices of *string*, each at most *length* chars.

    The final chunk may be shorter when len(string) is not a multiple of
    length. Returns a generator; an empty string yields nothing.
    """
    # string[i:i+length] never raises past the end — Python slicing clamps.
    return (string[i:i + length] for i in range(0, len(string), length))
26
 
27
def pdf_parser(input_pdf):
    """Extract all text from a PDF and return it as an in-memory text file.

    Args:
        input_pdf: a file path or binary file-like object accepted by
            PyPDF2's PdfReader (e.g. an uploaded file handle).

    Returns:
        io.StringIO positioned at offset 0, containing the concatenated
        extracted text of every page.
    """
    pdf = PdfReader(input_pdf)
    # extract_text() may return None for pages with no extractable text
    # (scanned images, empty pages); `or ""` prevents a TypeError.
    # "".join avoids the quadratic cost of string += in a loop.
    content = "".join(page.extract_text() or "" for page in pdf.pages)
    # StringIO(initial_value) is already readable from position 0, so no
    # explicit write()/seek(0) is needed.
    return io.StringIO(content)
37
# NOTE(review): this was leftover module-level debug code that executed on
# import with a hard-coded Colab path ('/content/...'), which would crash the
# app for any other environment. Guarded so it only runs as a manual smoke
# test when this file is executed directly.
if __name__ == "__main__":
    with open('/content/01 TikTok前景.pdf', 'rb') as input_file:
        output_file = pdf_parser(input_file)
    with open('output.md', 'w', encoding='utf-8') as output_md_file:
        output_md_file.write(output_file.getvalue())
43
  def get_keywords(file_paths): #这里的重点是,对每一个file做尽可能简短且覆盖全面的summarization
44
  download_nltk()
45
  keywords_list = []
 
141
 
142
  #从embeddings到向量数据库
143
  # Load the embeddings
144
+ embeddings = paraphrase_embeddings_df.iloc[:, 1:].values # All columns except the first (chunk text)
 
145
 
146
  # Ensure that the array is C-contiguous
147
  embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
 
180
  return retrieved_chunks_list
181
 
182
  def generateCourse(topic, materials, language):
 
183
  #调用gpt4 API生成一节课的内容
184
  system_message = 'You are a great AI teacher and linguist, skilled at writing informative and easy-to-understand course script based on given lesson topic and knowledge materials.'
185
 
 
225
  temp_file_paths = []
226
  with st.spinner('Processing file...'):
227
  for added_file in added_files:
228
+ if added_file.name.endswith(".pdf"):
229
+ added_file = pdf_parser(added_file)
230
  with tempfile.NamedTemporaryFile(delete=False, suffix=".md") as tmp:
231
  tmp.write(added_file.getvalue())
232
  tmp_path = tmp.name
 
298
 
299
  with st.sidebar:
300
  st.image("https://siyuan-harry.oss-cn-beijing.aliyuncs.com/oss://siyuan-harry/20231021212525.png")
301
+ added_files = st.file_uploader('Upload .md and .pdf files, simultaneous mixed upload these types is supported.', type=['.md','.pdf'], accept_multiple_files=True)
302
  num_lessons = st.slider('How many lessons do you want this course to have?', min_value=2, max_value=15, value=5, step=1)
303
  language = 'English'
304
  Chinese = st.checkbox('Output in Chinese')