Update app.py
app.py
CHANGED
@@ -14,21 +14,6 @@ from scipy.spatial.distance import cosine
 openai.api_key = os.environ["OPENAI_API_KEY"]
 
 
-def merge_dataframes(dataframes: List[pd.DataFrame]) -> pd.DataFrame:
-    """Merges a list of DataFrames, keeping only specific columns."""
-    # Concatenate the list of dataframes
-    combined_dataframe = pd.concat(
-        dataframes, ignore_index=True
-    )  # Combine all dataframes into one
-
-    # Ensure that the resulting dataframe only contains the columns "context", "questions", "answers"
-    combined_dataframe = combined_dataframe[
-        ["context", "questions", "answers"]
-    ]  # Filter for specific columns
-
-    return combined_dataframe  # Return the merged and filtered DataFrame
-
-
 def call_chatgpt(prompt: str) -> str:
     """
     Uses the OpenAI API to generate an AI response to a prompt.
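Note: merge_dataframes (removed above) was a small pandas helper that concatenated several QA frames and kept only the context/questions/answers columns. A minimal usage sketch, assuming two hypothetical input frames df_a and df_b with those columns:

import pandas as pd

# Hypothetical inputs; each frame carries "context", "questions", "answers"
df_a = pd.DataFrame({"context": ["c1"], "questions": ["q1"], "answers": ["a1"]})
df_b = pd.DataFrame({"context": ["c2"], "questions": ["q2"], "answers": ["a2"]})

combined = merge_dataframes([df_a, df_b])  # one frame, only the three QA columns kept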
@@ -59,67 +44,6 @@ def call_chatgpt(prompt: str) -> str:
     return ans
 
 
-def openai_text_embedding(prompt: str) -> str:
-    return openai.Embedding.create(input=prompt, model="text-embedding-ada-002")[
-        "data"
-    ][0]["embedding"]
-
-
-def calculate_sts_openai_score(sentence1: str, sentence2: str) -> float:
-    # Compute sentence embeddings
-    embedding1 = openai_text_embedding(sentence1)  # Flatten the embedding array
-    embedding2 = openai_text_embedding(sentence2)  # Flatten the embedding array
-
-    # Convert to array
-    embedding1 = np.asarray(embedding1)
-    embedding2 = np.asarray(embedding2)
-
-    # Calculate cosine similarity between the embeddings
-    similarity_score = 1 - cosine(embedding1, embedding2)
-
-    return similarity_score
-
-
-def add_dist_score_column(
-    dataframe: pd.DataFrame,
-    sentence: str,
-) -> pd.DataFrame:
-    dataframe["stsopenai"] = dataframe["questions"].apply(
-        lambda x: calculate_sts_openai_score(str(x), sentence)
-    )
-
-    sorted_dataframe = dataframe.sort_values(by="stsopenai", ascending=False)
-    return sorted_dataframe.iloc[:5, :]
-
-
-def convert_to_list_of_dict(df: pd.DataFrame) -> List[Dict[str, str]]:
-    """
-    Reads in a pandas DataFrame and produces a list of dictionaries with two keys each, 'question' and 'answer'.
-
-    Args:
-        df: A pandas DataFrame with columns named 'questions' and 'answers'.
-
-    Returns:
-        A list of dictionaries, with each dictionary containing a 'question' and 'answer' key-value pair.
-    """
-
-    # Initialize an empty list to store the dictionaries
-    result = []
-
-    # Loop through each row of the DataFrame
-    for index, row in df.iterrows():
-        # Create a dictionary with the current question and answer
-        qa_dict_quest = {"role": "user", "content": row["questions"]}
-        qa_dict_ans = {"role": "assistant", "content": row["answers"]}
-
-        # Add the dictionary to the result list
-        result.append(qa_dict_quest)
-        result.append(qa_dict_ans)
-
-    # Return the list of dictionaries
-    return result
-
-
 ## rag strategy 1
 # file_names = [f"output_files/file_{i}.txt" for i in range(131)]
 # # file_names = [f"output_files_large/file_{i}.txt" for i in range(1310)]
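The four helpers removed in this hunk made up the manual retrieval path of rag strategy 1: embed the user question with text-embedding-ada-002, score it against every stored question by cosine similarity, keep the five closest rows, and turn them into chat messages. A sketch of how they were presumably chained (the actual call sites are not part of this diff; qa_dataframe and the sample question are placeholders):

# Hypothetical call sites for the removed helpers
user_question = "What services does the shelter offer?"
top5 = add_dist_score_column(qa_dataframe, user_question)  # rank rows by STS score, keep top 5
messages = convert_to_list_of_dict(top5)  # alternating user/assistant message dicts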
@@ -148,11 +72,15 @@ def convert_to_list_of_dict(df: pd.DataFrame) -> List[Dict[str, str]]:
 
 ## rag strategy 2
 from datasets import load_dataset
-dataset = load_dataset("eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted")
-
 import chromadb
+import string
+
+dataset = load_dataset("eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted")
 client = chromadb.Client()
-
+random_number = np.random.randint(low=1e9, high=1e10)
+random_string = ''.join(np.random.choice(list(string.ascii_uppercase + string.digits), size=10))
+combined_string = f"{random_number}{random_string}"
+collection = client.create_collection(combined_string)
 
 # Embed and store the first N supports for this demo
 L = len(dataset["train"]['questions'])
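The added lines give the Chroma collection a randomly generated name (digits plus uppercase letters), which presumably avoids "collection already exists" errors when the Space re-runs the script and calls create_collection again. A rough sketch of how the collection would then be filled and queried with the dataset loaded above, using Chroma's default embedding function (the code after this hunk is not shown in the diff; the collection name and query are placeholders):

import chromadb

client = chromadb.Client()
collection = client.create_collection("demo_collection")  # stand-in for combined_string

questions = dataset["train"]["questions"]
answers = dataset["train"]["answers"]

# Store each question as a document and keep its answer as metadata
collection.add(
    documents=[str(q) for q in questions],
    metadatas=[{"answers": str(a)} for a in answers],
    ids=[str(i) for i in range(len(questions))],
)

# Retrieve the stored questions closest to a user query
results = collection.query(query_texts=["How do I sign up for the program?"], n_results=5)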