Update app.py
app.py
CHANGED
@@ -1,113 +1,8 @@
 import gradio as gr
 import os
 from groq import Groq
-
-############ TESTING ############
 import pandas as pd
 from datasets import Dataset
-
-# Define the dataset schema
-test_dataset_df = pd.DataFrame(columns=['id', 'title', 'content', 'prechunk_id', 'postchunk_id', 'arxiv_id', 'references'])
-
-# Populate the dataset with examples
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '1',
-    'title': 'Best restaurants in queens',
-    'content': 'I personally like to go to the J-Pan Chicken, they have fried chicken and amazing bubble tea.',
-    'prechunk_id': '',
-    'postchunk_id': '2',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:9012.3456', 'arXiv:7890.1234']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '2',
-    'title': 'Best restaurants in queens',
-    'content': 'if you like asian food, flushing is second to none.',
-    'prechunk_id': '1',
-    'postchunk_id': '3',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:6543.2109', 'arXiv:3210.9876']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '3',
-    'title': 'Best restaurants in queens',
-    'content': 'you have to try the ziti from ECC',
-    'prechunk_id': '2',
-    'postchunk_id': '',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '6',
-    'title': 'Best restaurants in queens',
-    'content': 'theres a good halal cart on Wub Street, they give extra sticky creamy white sauce',
-    'prechunk_id': '',
-    'postchunk_id': '',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '4',
-    'title': 'Spending a saturday in queens; what to do?',
-    'content': 'theres a hidden gem called The Lounge, you can play poker and blackjack and darts',
-    'prechunk_id': '',
-    'postchunk_id': '5',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '5',
-    'title': 'Spending a saturday in queens; what to do?',
-    'content': 'if its a nice day, basketball at Non-non-Fiction Park is always fun',
-    'prechunk_id': '',
-    'postchunk_id': '6',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '7',
-    'title': 'visiting queens for the weekend, how to get around?',
-    'content': 'nothing beats the subway, even with delays its the fastest option. you can transfer between the bus and subway with one swipe',
-    'prechunk_id': '',
-    'postchunk_id': '8',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '8',
-    'title': 'visiting queens for the weekend, how to get around?',
-    'content': 'if youre going to the bar, its honestly worth ubering there. MTA while drunk isnt something id recommend.',
-    'prechunk_id': '7',
-    'postchunk_id': '',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-# Convert the DataFrame to a Hugging Face Dataset object
-test_dataset = Dataset.from_pandas(test_dataset_df)
-
-data = test_dataset
-
-data = data.map(lambda x: {
-    "id": x["id"],
-    "metadata": {
-        "title": x["title"],
-        "content": x["content"],
-    }
-})
-# drop uneeded columns
-data = data.remove_columns([
-    "title", "content", "prechunk_id",
-    "postchunk_id", "arxiv_id", "references"
-])
 from semantic_router.encoders import HuggingFaceEncoder
 
 encoder = HuggingFaceEncoder(name="dwzhu/e5-base-4k")
@@ -159,22 +54,6 @@ time.sleep(1)
 # view index stats
 index.describe_index_stats()
 
-from tqdm.auto import tqdm
-
-batch_size = 2  # how many embeddings we create and insert at once
-
-for i in tqdm(range(0, len(data), batch_size)):
-    # find end of batch
-    i_end = min(len(data), i+batch_size)
-    # create batch
-    batch = data[i:i_end]
-    # create embeddings
-    chunks = [f'{x["title"]}: {x["content"]}' for x in batch["metadata"]]
-    embeds = encoder(chunks)
-    assert len(embeds) == (i_end-i)
-    to_upsert = list(zip(batch["id"], embeds, batch["metadata"]))
-    # upsert to Pinecone
-    index.upsert(vectors=to_upsert)
 
 def get_docs(query: str, top_k: int) -> list[str]:
     # encode query
@@ -182,7 +61,7 @@ def get_docs(query: str, top_k: int) -> list[str]:
     # search pinecone index
     res = index.query(vector=xq, top_k=top_k, include_metadata=True)
     # get doc text
-    docs = [x["metadata"]['
+    docs = [x["metadata"]['content_snippet'] for x in res["matches"]]
     return docs
 
 from groq import Groq
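For reference, here is a minimal sketch of how the retained retrieval path reads after this commit, assembled from the hunks above. The query-encoding line (old line 181 / new line 60) is hidden by the diff context, so the xq = encoder([query])[0] step is an assumption modelled on how encoder was called in the removed upsert loop; index is assumed to be the Pinecone index created earlier in app.py, and content_snippet is the metadata key introduced by this change.

# Sketch only, not the exact file contents.
# Assumptions: `index` is the Pinecone index built earlier in app.py, and the query
# is embedded the same way the removed upsert loop embedded chunks (encoder(list) -> list).
from semantic_router.encoders import HuggingFaceEncoder

encoder = HuggingFaceEncoder(name="dwzhu/e5-base-4k")

def get_docs(query: str, top_k: int) -> list[str]:
    # encode query (assumed shape; this line is not shown in the diff)
    xq = encoder([query])[0]
    # search pinecone index
    res = index.query(vector=xq, top_k=top_k, include_metadata=True)
    # get doc text: this commit reads the 'content_snippet' metadata field
    docs = [x["metadata"]["content_snippet"] for x in res["matches"]]
    return docs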