Update dataset link
* point to the dataset space to download data
* update sample questions
Files changed:
- cfg.py (+15, -12)
- embed_documents.py (+9, -11)
- gradio_app.py (+1, -5)
cfg.py

@@ -20,23 +20,26 @@ USERNAME = os.getenv("BUSTER_USERNAME")
 PASSWORD = os.getenv("BUSTER_PASSWORD")
 
 HUB_TOKEN = os.getenv("HUB_TOKEN")
-REPO_ID = "…"
+REPO_ID = os.getenv("HF_DATASET")
 HUB_DB_FILE = "deeplake_store.zip"
 
-…
-    token=HUB_TOKEN,
-    local_dir=".",
-)
+logger.info(f"Downloading {HUB_DB_FILE} from hub...")
+hf_hub_download(
+    repo_id=REPO_ID,
+    repo_type="dataset",
+    filename=HUB_DB_FILE,
+    token=HUB_TOKEN,
+    local_dir=".",
+)
 
 extract_zip(zip_file_path=HUB_DB_FILE, output_path="deeplake_store")
 
+example_questions = [
+    "What's the best way to get a job in AI?",
+    "What is prompt engineering?",
+    "What is generative AI?",
+]
+
 
 buster_cfg = BusterConfig(
     validator_cfg={
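For context, the rewritten cfg.py block pulls the zipped DeepLake store from a Hugging Face dataset repo and unpacks it next to the app. A minimal standalone sketch of that flow, assuming HF_DATASET and HUB_TOKEN are set in the environment and using zipfile as a stand-in for the repo's extract_zip helper:

import os
import zipfile

from huggingface_hub import hf_hub_download

# Assumed environment: HF_DATASET names an existing dataset repo and
# HUB_TOKEN grants read access to it.
REPO_ID = os.getenv("HF_DATASET")
HUB_TOKEN = os.getenv("HUB_TOKEN")
HUB_DB_FILE = "deeplake_store.zip"

# Fetch the zipped vector store; repo_type="dataset" is the substantive
# change here — without it, hf_hub_download looks in a model repo.
zip_path = hf_hub_download(
    repo_id=REPO_ID,
    repo_type="dataset",
    filename=HUB_DB_FILE,
    token=HUB_TOKEN,
    local_dir=".",
)

# Stand-in for utils.extract_zip: unpack the store into deeplake_store/.
with zipfile.ZipFile(zip_path) as zf:
    zf.extractall("deeplake_store")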
embed_documents.py

@@ -1,23 +1,21 @@
 import openai
 import pandas as pd
-from buster.…
+from buster.documents_manager import DeepLakeDocumentsManager
 
 from utils import zip_contents
 
 
-def read_csv(filename: str):
-    """Assumes a pre-chunked csv file is provided with expected columns."""
-    df = pd.read_csv(filename)
-    for col in ["url", "source", "title", "content"]:
-        assert col in df.columns
-    return df
-
-
 if __name__ == "__main__":
     vector_store_path = "deeplake_store"
-    chunk_file = "data/…"
+    chunk_file = "data/output.csv"
     overwrite = True
-
+
+    df = pd.read_csv(chunk_file)
+
+    # some pre-processing based on the latest file provided
+    df["url"] = df["source"]
+    df["source"] = "towardsai_blog"
+    df = df.dropna()
 
     dm = DeepLakeDocumentsManager(vector_store_path, overwrite=overwrite)
     dm.add(df)
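Note that the commit removes the old read_csv helper, which asserted that the chunk file had the expected url, source, title, and content columns, and nothing in the new script re-checks the schema before ingestion. A minimal sketch, reusing the column list from the removed helper, that restores the check after the new pre-processing (file path taken from the diff):

import pandas as pd

chunk_file = "data/output.csv"
df = pd.read_csv(chunk_file)

# Same pre-processing as the new script: the csv's source column holds the
# article URL, so copy it into url and tag every row with the blog name.
df["url"] = df["source"]
df["source"] = "towardsai_blog"
df = df.dropna()

# Schema check carried over from the removed read_csv helper.
for col in ["url", "source", "title", "content"]:
    assert col in df.columns, f"chunk file is missing expected column: {col}"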
gradio_app.py

@@ -90,11 +90,7 @@ with block:
     submit = gr.Button(value="Send", variant="secondary")
 
     examples = gr.Examples(
-        examples=[
-            "What's a genetic algorithm?",
-            "What's PCA? What is it used for?",
-            "How do I deal with noisy data?",
-        ],
+        examples=cfg.example_questions,
         inputs=question,
     )
 
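For reference, gr.Examples renders a list of clickable presets that populate the component passed as inputs. A minimal sketch of the new wiring, assuming a cfg module that exposes the example_questions list defined above:

import gradio as gr

import cfg  # assumed to expose example_questions, as added in cfg.py

with gr.Blocks() as block:
    question = gr.Textbox(label="Question")
    submit = gr.Button(value="Send", variant="secondary")

    # Clicking an example fills the question textbox with that string.
    examples = gr.Examples(
        examples=cfg.example_questions,
        inputs=question,
    )

if __name__ == "__main__":
    block.launch()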