Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -14,21 +14,6 @@ from scipy.spatial.distance import cosine
|
|
| 14 |
openai.api_key = os.environ["OPENAI_API_KEY"]
|
| 15 |
|
| 16 |
|
| 17 |
-
def merge_dataframes(dataframes: List[pd.DataFrame]) -> pd.DataFrame:
    """Concatenate several DataFrames and keep only the Q&A columns.

    Args:
        dataframes: DataFrames to combine; each should provide the
            columns "context", "questions" and "answers".

    Returns:
        A single DataFrame containing only the "context", "questions"
        and "answers" columns of every input row.
    """
    # Stack all rows into one frame, renumbering the index from zero.
    merged = pd.concat(dataframes, ignore_index=True)

    # Drop any extra columns so only the Q&A triplet remains.
    return merged[["context", "questions", "answers"]]
|
| 30 |
-
|
| 31 |
-
|
| 32 |
def call_chatgpt(prompt: str) -> str:
|
| 33 |
"""
|
| 34 |
Uses the OpenAI API to generate an AI response to a prompt.
|
|
@@ -59,67 +44,6 @@ def call_chatgpt(prompt: str) -> str:
|
|
| 59 |
return ans
|
| 60 |
|
| 61 |
|
| 62 |
-
def openai_text_embedding(prompt: str) -> List[float]:
    """Embed *prompt* with OpenAI's "text-embedding-ada-002" model.

    Args:
        prompt: The text to embed.

    Returns:
        The embedding vector for the single input, as returned in the
        first entry of the response's "data" list.
    """
    # Annotation fixed: the API returns a numeric vector, not a str.
    response = openai.Embedding.create(input=prompt, model="text-embedding-ada-002")
    return response["data"][0]["embedding"]
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
def calculate_sts_openai_score(sentence1: str, sentence2: str) -> float:
    """Semantic similarity between two sentences via OpenAI embeddings.

    Each sentence is embedded with ``openai_text_embedding``; the score is
    the cosine similarity of the two vectors (1.0 = same direction).
    """
    # Embed both sentences and coerce to numpy arrays for scipy.
    vec_a = np.asarray(openai_text_embedding(sentence1))
    vec_b = np.asarray(openai_text_embedding(sentence2))

    # scipy's `cosine` is a distance, so similarity is its complement.
    return 1 - cosine(vec_a, vec_b)
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
def add_dist_score_column(
    dataframe: pd.DataFrame,
    sentence: str,
) -> pd.DataFrame:
    """Score every question against *sentence* and return the top 5 rows.

    Computes the OpenAI-embedding cosine similarity between each entry of
    the "questions" column and *sentence*, stores it in a new "stsopenai"
    column, and returns the five highest-scoring rows.

    NOTE(review): mutates the input DataFrame by adding the score column.
    """
    similarity = dataframe["questions"].apply(
        lambda question: calculate_sts_openai_score(str(question), sentence)
    )
    dataframe["stsopenai"] = similarity

    # Highest similarity first; keep only the five best matches.
    ranked = dataframe.sort_values(by="stsopenai", ascending=False)
    return ranked.iloc[:5, :]
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
def convert_to_list_of_dict(df: pd.DataFrame) -> List[Dict[str, str]]:
    """
    Convert a Q&A DataFrame into an OpenAI-style chat message list.

    Each row produces two messages, in row order: a "user" message holding
    the question, then an "assistant" message holding the answer.  (The
    original docstring claimed 'question'/'answer' keys; the code actually
    emits chat messages with "role"/"content" keys — docs corrected.)

    Args:
        df: A pandas DataFrame with columns named 'questions' and 'answers'.

    Returns:
        A flat list of {"role": ..., "content": ...} dictionaries,
        alternating user/assistant, two entries per input row.
    """
    result: List[Dict[str, str]] = []

    # Each row becomes a user/assistant message pair.
    for _, row in df.iterrows():
        result.append({"role": "user", "content": row["questions"]})
        result.append({"role": "assistant", "content": row["answers"]})

    return result
|
| 121 |
-
|
| 122 |
-
|
| 123 |
## rag strategy 1
|
| 124 |
# file_names = [f"output_files/file_{i}.txt" for i in range(131)]
|
| 125 |
# # file_names = [f"output_files_large/file_{i}.txt" for i in range(1310)]
|
|
@@ -148,11 +72,15 @@ def convert_to_list_of_dict(df: pd.DataFrame) -> List[Dict[str, str]]:
|
|
| 148 |
|
| 149 |
## rag strategy 2
|
| 150 |
from datasets import load_dataset
|
| 151 |
-
dataset = load_dataset("eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted")
|
| 152 |
-
|
| 153 |
import chromadb
|
|
|
|
|
|
|
|
|
|
| 154 |
client = chromadb.Client()
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
# Embed and store the first N supports for this demo
|
| 158 |
L = len(dataset["train"]['questions'])
|
|
|
|
| 14 |
openai.api_key = os.environ["OPENAI_API_KEY"]
|
| 15 |
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
def call_chatgpt(prompt: str) -> str:
|
| 18 |
"""
|
| 19 |
Uses the OpenAI API to generate an AI response to a prompt.
|
|
|
|
| 44 |
return ans
|
| 45 |
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
## rag strategy 1
|
| 48 |
# file_names = [f"output_files/file_{i}.txt" for i in range(131)]
|
| 49 |
# # file_names = [f"output_files_large/file_{i}.txt" for i in range(1310)]
|
|
|
|
| 72 |
|
| 73 |
## rag strategy 2
from datasets import load_dataset
import chromadb
import string

dataset = load_dataset(
    "eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted"
)
client = chromadb.Client()

# Build a unique random collection name so repeated runs (e.g. app
# restarts) don't collide with an existing Chroma collection.
# dtype=np.int64 is required: numpy's default platform int is 32-bit on
# Windows, where high=10**10 would overflow and raise ValueError; integer
# bounds also avoid deprecated float arguments to randint.
random_number = np.random.randint(low=10**9, high=10**10, dtype=np.int64)
random_string = "".join(
    np.random.choice(list(string.ascii_uppercase + string.digits), size=10)
)
combined_string = f"{random_number}{random_string}"
collection = client.create_collection(combined_string)

# Embed and store the first N supports for this demo
L = len(dataset["train"]["questions"])