Spaces:

OuroborosM
/

STLA-BABY

Runtime error

App Files Files Community

OuroborosM committed on Aug 1, 2023

Commit

591e68e

1 Parent(s): e85fc80

Add file upload func

Browse files

Files changed (1) hide show

app.py +144 -4

app.py CHANGED Viewed

@@ -23,6 +23,139 @@ from pinecone.core.client.configuration import Configuration as OpenApiConfigura
 import gradio as gr
 import time
 class DB_Search(BaseTool):
     name = "Vector Database Search"
     description = "This is the internal database to search information firstly. If information is found, it is trustful."
@@ -183,11 +316,18 @@ def chathmi2(message, history):
 # chatbot = gr.Chatbot().style(color_map =("blue", "pink"))
 # chatbot = gr.Chatbot(color_map =("blue", "pink"))
-demo = gr.ChatInterface(
-    chathmi2,
-    title="STLA BABY - YOUR FRIENDLY GUIDE ",
-    description= "v0.2: Powered by MECH Core Team",
     )
 # demo = gr.Interface(
 #     chathmi,

 import gradio as gr
 import time
+import glob
+from typing import List
+from multiprocessing import Pool
+from tqdm import tqdm
+from langchain.document_loaders import (
+    CSVLoader,
+    EverNoteLoader,
+    PyMuPDFLoader,
+    TextLoader,
+    UnstructuredEmailLoader,
+    UnstructuredEPubLoader,
+    UnstructuredHTMLLoader,
+    UnstructuredMarkdownLoader,
+    UnstructuredODTLoader,
+    UnstructuredPowerPointLoader,
+    UnstructuredWordDocumentLoader,
+)
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.docstore.document import Document
+# Custom document loaders
+class MyElmLoader(UnstructuredEmailLoader):
+    """Wrapper to fallback to text/plain when default does not work"""
+    def load(self) -> List[Document]:
+        """Wrapper adding fallback for elm without html"""
+        try:
+            try:
+                doc = UnstructuredEmailLoader.load(self)
+            except ValueError as e:
+                if 'text/html content not found in email' in str(e):
+                    # Try plain text
+                    self.unstructured_kwargs["content_source"]="text/plain"
+                    doc = UnstructuredEmailLoader.load(self)
+                else:
+                    raise
+        except Exception as e:
+            # Add file_path to exception message
+            raise type(e)(f"{self.file_path}: {e}") from e
+        return doc
+LOADER_MAPPING = {  # file extension -> (loader class, keyword arguments passed to its constructor)
+    ".csv": (CSVLoader, {}),
+    # ".docx": (Docx2txtLoader, {}),
+    ".doc": (UnstructuredWordDocumentLoader, {}),
+    ".docx": (UnstructuredWordDocumentLoader, {}),
+    ".enex": (EverNoteLoader, {}),
+    ".eml": (MyElmLoader, {}),  # custom wrapper defined in this file
+    ".epub": (UnstructuredEPubLoader, {}),
+    ".html": (UnstructuredHTMLLoader, {}),
+    ".md": (UnstructuredMarkdownLoader, {}),
+    ".odt": (UnstructuredODTLoader, {}),
+    ".pdf": (PyMuPDFLoader, {}),
+    ".ppt": (UnstructuredPowerPointLoader, {}),
+    ".pptx": (UnstructuredPowerPointLoader, {}),
+    ".txt": (TextLoader, {"encoding": "utf8"}),
+    # Add more mappings for other file extensions and loaders as needed
+}
+source_directory = 'Upload Files'  # directory that process_documents passes to load_documents
+file_path = ''  # NOTE(review): not referenced by the visible code - confirm whether it is still needed
+chunk_size = 500  # chunk_size handed to RecursiveCharacterTextSplitter
+chunk_overlap = 300  # chunk_overlap handed to RecursiveCharacterTextSplitter
+def load_single_document(file_path: str) -> List[Document]:
+    ext = "." + file_path.rsplit(".", 1)[-1]
+    if ext in LOADER_MAPPING:
+        loader_class, loader_args = LOADER_MAPPING[ext]
+        loader = loader_class(file_path, **loader_args)
+        return loader.load()
+    raise ValueError(f"Unsupported file extension '{ext}'")
+def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
+    """
+    Loads all documents from the source documents directory, ignoring specified files
+    """
+    all_files = []
+    for ext in LOADER_MAPPING:
+        all_files.extend(
+            glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
+        )
+    filtered_files = [file_path for file_path in all_files if file_path not in ignored_files]
+    with Pool(processes=os.cpu_count()) as pool:
+        results = []
+        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
+            for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
+                results.extend(docs)
+                pbar.update()
+    return results
+def process_documents(ignored_files: List[str] = []) -> List[Document]:
+    """
+    Load documents and split in chunks
+    """
+    print(f"Loading documents from {source_directory}")
+    documents = load_documents(source_directory, ignored_files)
+    if not documents:
+        print("No new documents to load")
+        exit(0)
+    print(f"Loaded {len(documents)} new documents from {source_directory}")
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    texts = text_splitter.split_documents(documents)
+    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
+    return texts
+def UpdateDb(file_path: str):
+    global vectordb_p
+    # pinecone.Index(index_name).delete(delete_all=True, namespace='')
+    # collection = vectordb_p.get()
+    # split_docs = process_documents([metadata['source'] for metadata in collection['metadatas']])
+    # split_docs = process_documents()
+    documents = load_single_document(file_path)
+    if not documents:
+        print("No new documents to load")
+        exit(0)
+    print(f"Loaded {len(documents)} new documents from {source_directory}")
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    split_docs = text_splitter.split_documents(documents)
+    print(f"Split into {len(split_docs)} chunks of text (max. {chunk_size} tokens each)")
+    tt = len(split_docs)
+    print(split_docs[tt-1])
+    print(f"Creating embeddings. May take some minutes...")
+    vectordb_p = Pinecone.from_documents(split_docs, embeddings, index_name = "stla-baby")
+    print("Pinecone Updated Done")
+    print(index.describe_index_stats())
 class DB_Search(BaseTool):
     name = "Vector Database Search"
     description = "This is the internal database to search information firstly. If information is found, it is trustful."
 # chatbot = gr.Chatbot().style(color_map =("blue", "pink"))
 # chatbot = gr.Chatbot(color_map =("blue", "pink"))
+def upload_file(files):
+    print(files)
+    pass
+with gr.Blocks() as demo:  # container holding the chat UI and the upload button, bound to `demo`
+    gr.ChatInterface(  # chat UI whose handler is chathmi2
+        chathmi2,
+        title="STLA BABY - YOUR FRIENDLY GUIDE ",
+        description= "v0.2: Powered by MECH Core Team",
     )
+    upload_button = gr.UploadButton("Upload File", file_count="multiple")  # button configured to accept multiple files
+    upload_button.upload(upload_file, upload_button)  # register upload_file for the upload event, with the button as its input
 # demo = gr.Interface(
 #     chathmi,