Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -14,21 +14,6 @@ from scipy.spatial.distance import cosine
|
|
| 14 |
openai.api_key = os.environ["OPENAI_API_KEY"]
|
| 15 |
|
| 16 |
|
| 17 |
-
def merge_dataframes(dataframes: List[pd.DataFrame]) -> pd.DataFrame:
    """Concatenate several DataFrames and keep only the Q&A columns.

    Args:
        dataframes: DataFrames to combine; each should provide the
            columns "context", "questions" and "answers".

    Returns:
        A single DataFrame containing only the "context", "questions"
        and "answers" columns of every input row.
    """
    # Stack all rows into one frame, renumbering the index from zero.
    merged = pd.concat(dataframes, ignore_index=True)

    # Drop any extra columns so only the Q&A triplet remains.
    return merged[["context", "questions", "answers"]]
|
| 30 |
-
|
| 31 |
-
|
| 32 |
def call_chatgpt(prompt: str) -> str:
|
| 33 |
"""
|
| 34 |
Uses the OpenAI API to generate an AI response to a prompt.
|
|
@@ -59,67 +44,6 @@ def call_chatgpt(prompt: str) -> str:
|
|
| 59 |
return ans
|
| 60 |
|
| 61 |
|
| 62 |
-
def openai_text_embedding(prompt: str) -> List[float]:
    """Embed *prompt* with OpenAI's "text-embedding-ada-002" model.

    Args:
        prompt: The text to embed.

    Returns:
        The embedding vector for the single input, as returned in the
        first entry of the response's "data" list.
    """
    # Annotation fixed: the API returns a numeric vector, not a str.
    response = openai.Embedding.create(input=prompt, model="text-embedding-ada-002")
    return response["data"][0]["embedding"]
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
def calculate_sts_openai_score(sentence1: str, sentence2: str) -> float:
    """Semantic similarity between two sentences via OpenAI embeddings.

    Each sentence is embedded with ``openai_text_embedding``; the score is
    the cosine similarity of the two vectors (1.0 = same direction).
    """
    # Embed both sentences and coerce to numpy arrays for scipy.
    vec_a = np.asarray(openai_text_embedding(sentence1))
    vec_b = np.asarray(openai_text_embedding(sentence2))

    # scipy's `cosine` is a distance, so similarity is its complement.
    return 1 - cosine(vec_a, vec_b)
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
def add_dist_score_column(
    dataframe: pd.DataFrame,
    sentence: str,
) -> pd.DataFrame:
    """Score every question against *sentence* and return the top 5 rows.

    Computes the OpenAI-embedding cosine similarity between each entry of
    the "questions" column and *sentence*, stores it in a new "stsopenai"
    column, and returns the five highest-scoring rows.

    NOTE(review): mutates the input DataFrame by adding the score column.
    """
    similarity = dataframe["questions"].apply(
        lambda question: calculate_sts_openai_score(str(question), sentence)
    )
    dataframe["stsopenai"] = similarity

    # Highest similarity first; keep only the five best matches.
    ranked = dataframe.sort_values(by="stsopenai", ascending=False)
    return ranked.iloc[:5, :]
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
def convert_to_list_of_dict(df: pd.DataFrame) -> List[Dict[str, str]]:
    """
    Convert a Q&A DataFrame into an OpenAI-style chat message list.

    Each row produces two messages, in row order: a "user" message holding
    the question, then an "assistant" message holding the answer.  (The
    original docstring claimed 'question'/'answer' keys; the code actually
    emits chat messages with "role"/"content" keys — docs corrected.)

    Args:
        df: A pandas DataFrame with columns named 'questions' and 'answers'.

    Returns:
        A flat list of {"role": ..., "content": ...} dictionaries,
        alternating user/assistant, two entries per input row.
    """
    result: List[Dict[str, str]] = []

    # Each row becomes a user/assistant message pair.
    for _, row in df.iterrows():
        result.append({"role": "user", "content": row["questions"]})
        result.append({"role": "assistant", "content": row["answers"]})

    return result
|
| 121 |
-
|
| 122 |
-
|
| 123 |
## rag strategy 1
|
| 124 |
# file_names = [f"output_files/file_{i}.txt" for i in range(131)]
|
| 125 |
# # file_names = [f"output_files_large/file_{i}.txt" for i in range(1310)]
|
|
@@ -148,11 +72,15 @@ def convert_to_list_of_dict(df: pd.DataFrame) -> List[Dict[str, str]]:
|
|
| 148 |
|
| 149 |
## rag strategy 2
|
| 150 |
from datasets import load_dataset
|
| 151 |
-
dataset = load_dataset("eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted")
|
| 152 |
-
|
| 153 |
import chromadb
|
|
|
|
|
|
|
|
|
|
| 154 |
client = chromadb.Client()
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
# Embed and store the first N supports for this demo
|
| 158 |
L = len(dataset["train"]['questions'])
|
|
|
|
| 14 |
openai.api_key = os.environ["OPENAI_API_KEY"]
|
| 15 |
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
def call_chatgpt(prompt: str) -> str:
|
| 18 |
"""
|
| 19 |
Uses the OpenAI API to generate an AI response to a prompt.
|
|
|
|
| 44 |
return ans
|
| 45 |
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
## rag strategy 1
|
| 48 |
# file_names = [f"output_files/file_{i}.txt" for i in range(131)]
|
| 49 |
# # file_names = [f"output_files_large/file_{i}.txt" for i in range(1310)]
|
|
|
|
| 72 |
|
| 73 |
## rag strategy 2
from datasets import load_dataset
import chromadb
import string

dataset = load_dataset(
    "eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted"
)
client = chromadb.Client()

# Build a unique random collection name so repeated runs (e.g. app
# restarts) don't collide with an existing Chroma collection.
# dtype=np.int64 is required: numpy's default platform int is 32-bit on
# Windows, where high=10**10 would overflow and raise ValueError; integer
# bounds also avoid deprecated float arguments to randint.
random_number = np.random.randint(low=10**9, high=10**10, dtype=np.int64)
random_string = "".join(
    np.random.choice(list(string.ascii_uppercase + string.digits), size=10)
)
combined_string = f"{random_number}{random_string}"
collection = client.create_collection(combined_string)

# Embed and store the first N supports for this demo
L = len(dataset["train"]["questions"])