Aiswarya Sankar committed
Commit be672a0
1 Parent(s): 5c16594

Update app

Files changed (1): app.py (+52, -52)
app.py CHANGED
@@ -116,59 +116,51 @@ def index_repo(textbox: str, dropdown: str) -> Response:
     dataset_path = f"hub://{activeloop_username}/" + pathName + str(random.randint(1,100))
     invalid_dataset_path = True
 
+    # try:
+    #     try:
+    #         db = DeepLake(dataset_path=dataset_path,
+    #             embedding_function=embeddings,
+    #             token=os.environ['ACTIVELOOP_TOKEN'],
+    #             read_only=True,
+    #             num_workers=12,
+    #             runtime = {"tensor_db": True}
+    #         )
+    #     except Exception as e:
+    #         print("Failed to read: " + str(e))
+    #         if "scheduled for deletion" in str(e):
+    #             dataset_path = f"hub://{activeloop_username}/" + pathName + str(random.randint(1,100))
+    #             invalid_dataset_path = True
+
+    # if invalid_dataset_path or db is None or len(db.vectorstore.dataset) == 0:
+    #     print("Dataset doesn't exist, fetching data")
     try:
-        try:
-            db = DeepLake(dataset_path=dataset_path,
-                embedding_function=embeddings,
-                token=os.environ['ACTIVELOOP_TOKEN'],
-                read_only=True,
-                num_workers=12,
-                runtime = {"tensor_db": True}
-            )
-        except Exception as e:
-            print("Failed to read: " + str(e))
-            if "scheduled for deletion" in str(e):
-                dataset_path = f"hub://{activeloop_username}/" + pathName + str(random.randint(1,100))
-                invalid_dataset_path = True
-
-        if invalid_dataset_path or db is None or len(db.vectorstore.dataset) == 0:
-            print("Dataset doesn't exist, fetching data")
-            try:
-                docs = []
-                for dirpath, dirnames, filenames in os.walk(root_dir):
-                    for file in filenames:
-                        print(file)
-                        try:
-                            loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
-                            docs.extend(loader.load_and_split())
-                        except Exception as e:
-                            print("Exception: " + str(e) + "| File: " + os.path.join(dirpath, file))
-                            pass
-
-                activeloop_username = "aiswaryas"
-                text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-                texts = text_splitter.split_documents(docs)
-
-                db = DeepLake(dataset_path=dataset_path,
-                    embedding_function=embeddings,
-                    token=os.environ['ACTIVELOOP_TOKEN'],
-                    read_only=False,
-                    num_workers=12,
-                    runtime = {"tensor_db": True}
-                )
-                # Do this in chunks to avoid hitting the ratelimit immediately
-                for i in range(0, len(texts), 500):
-                    print("Adding documents " + str(i))
-                    db.add_documents(texts[i:i+500])
-                    time.sleep(.5)
-
-            except Exception as e:
-                return Response(
-                    result= "Failed to index github repo",
-                    repo="",
-                    error=str(e),
-                    stdout="",
-                )
+        docs = []
+        for dirpath, dirnames, filenames in os.walk(root_dir):
+            for file in filenames:
+                print(file)
+                try:
+                    loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
+                    docs.extend(loader.load_and_split())
+                except Exception as e:
+                    print("Exception: " + str(e) + "| File: " + os.path.join(dirpath, file))
+                    pass
+
+        activeloop_username = "aiswaryas"
+        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+        texts = text_splitter.split_documents(docs)
+
+        db = DeepLake(dataset_path=dataset_path,
+            embedding_function=embeddings,
+            token=os.environ['ACTIVELOOP_TOKEN'],
+            read_only=False,
+            num_workers=12,
+            runtime = {"tensor_db": True}
+        )
+        # Do this in chunks to avoid hitting the ratelimit immediately
+        for i in range(0, len(texts), 500):
+            print("Adding documents " + str(i))
+            db.add_documents(texts[i:i+500])
+            time.sleep(.5)
 
     except Exception as e:
         return Response(
@@ -178,6 +170,14 @@ def index_repo(textbox: str, dropdown: str) -> Response:
             stdout="",
         )
 
+    # except Exception as e:
+    #     return Response(
+    #         result= "Failed to index github repo",
+    #         repo="",
+    #         error=str(e),
+    #         stdout="",
+    #     )
+
    vector_db_url.value = dataset_path

    return {
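
For reference, the batched upload loop that this commit keeps (adding 500 chunks at a time with a short pause, per the inline comment about the rate limit) can be written as a standalone helper. This is a minimal sketch, assuming a LangChain-style vector store such as DeepLake whose add_documents method embeds and stores a list of Document chunks; the name add_in_batches and its parameters are illustrative and not part of the commit.

    import time

    def add_in_batches(db, texts, batch_size=500, pause_s=0.5):
        # Add document chunks in fixed-size batches, sleeping between
        # batches so the embedding endpoint's rate limit is not hit
        # immediately (the same pattern as the loop kept by this commit).
        for i in range(0, len(texts), batch_size):
            print("Adding documents " + str(i))
            db.add_documents(texts[i:i + batch_size])
            time.sleep(pause_s)

With the values used in the commit, add_in_batches(db, texts) reproduces the kept loop; raising pause_s or lowering batch_size trades throughput for more headroom under the provider's rate limit.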