eagle0504 committed
Commit 07e3ec5 · verified · 1 Parent(s): 38a30d6

Update app.py

Files changed (1)
  1. app.py +7 -79
app.py CHANGED
@@ -14,21 +14,6 @@ from scipy.spatial.distance import cosine
 openai.api_key = os.environ["OPENAI_API_KEY"]
 
 
-def merge_dataframes(dataframes: List[pd.DataFrame]) -> pd.DataFrame:
-    """Merges a list of DataFrames, keeping only specific columns."""
-    # Concatenate the list of dataframes
-    combined_dataframe = pd.concat(
-        dataframes, ignore_index=True
-    )  # Combine all dataframes into one
-
-    # Ensure that the resulting dataframe only contains the columns "context", "questions", "answers"
-    combined_dataframe = combined_dataframe[
-        ["context", "questions", "answers"]
-    ]  # Filter for specific columns
-
-    return combined_dataframe  # Return the merged and filtered DataFrame
-
-
 def call_chatgpt(prompt: str) -> str:
     """
     Uses the OpenAI API to generate an AI response to a prompt.
@@ -59,67 +44,6 @@ def call_chatgpt(prompt: str) -> str:
     return ans
 
 
-def openai_text_embedding(prompt: str) -> str:
-    return openai.Embedding.create(input=prompt, model="text-embedding-ada-002")[
-        "data"
-    ][0]["embedding"]
-
-
-def calculate_sts_openai_score(sentence1: str, sentence2: str) -> float:
-    # Compute sentence embeddings
-    embedding1 = openai_text_embedding(sentence1)  # Flatten the embedding array
-    embedding2 = openai_text_embedding(sentence2)  # Flatten the embedding array
-
-    # Convert to array
-    embedding1 = np.asarray(embedding1)
-    embedding2 = np.asarray(embedding2)
-
-    # Calculate cosine similarity between the embeddings
-    similarity_score = 1 - cosine(embedding1, embedding2)
-
-    return similarity_score
-
-
-def add_dist_score_column(
-    dataframe: pd.DataFrame,
-    sentence: str,
-) -> pd.DataFrame:
-    dataframe["stsopenai"] = dataframe["questions"].apply(
-        lambda x: calculate_sts_openai_score(str(x), sentence)
-    )
-
-    sorted_dataframe = dataframe.sort_values(by="stsopenai", ascending=False)
-    return sorted_dataframe.iloc[:5, :]
-
-
-def convert_to_list_of_dict(df: pd.DataFrame) -> List[Dict[str, str]]:
-    """
-    Reads in a pandas DataFrame and produces a list of dictionaries with two keys each, 'question' and 'answer.'
-
-    Args:
-        df: A pandas DataFrame with columns named 'questions' and 'answers'.
-
-    Returns:
-        A list of dictionaries, with each dictionary containing a 'question' and 'answer' key-value pair.
-    """
-
-    # Initialize an empty list to store the dictionaries
-    result = []
-
-    # Loop through each row of the DataFrame
-    for index, row in df.iterrows():
-        # Create a dictionary with the current question and answer
-        qa_dict_quest = {"role": "user", "content": row["questions"]}
-        qa_dict_ans = {"role": "assistant", "content": row["answers"]}
-
-        # Add the dictionary to the result list
-        result.append(qa_dict_quest)
-        result.append(qa_dict_ans)
-
-    # Return the list of dictionaries
-    return result
-
-
 ## rag strategy 1
 # file_names = [f"output_files/file_{i}.txt" for i in range(131)]
 # # file_names = [f"output_files_large/file_{i}.txt" for i in range(1310)]
@@ -148,11 +72,15 @@ def convert_to_list_of_dict(df: pd.DataFrame) -> List[Dict[str, str]]:
 
 ## rag strategy 2
 from datasets import load_dataset
-dataset = load_dataset("eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted")
-
 import chromadb
+import string
+
+dataset = load_dataset("eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted")
 client = chromadb.Client()
-collection = client.create_collection("vector_database")
+random_number = np.random.randint(low=1e9, high=1e10)
+random_string = ''.join(np.random.choice(list(string.ascii_uppercase + string.digits), size=10))
+combined_string = f"{random_number}{random_string}"
+collection = client.create_collection(combined_string)
 
 # Embed and store the first N supports for this demo
 L = len(dataset["train"]['questions'])
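Apart from dropping the older rag-strategy-1 helper functions, the substantive change is swapping the fixed collection name "vector_database" for a randomized one. This is presumably to dodge chromadb's duplicate-name check: create_collection refuses to recreate a collection that already exists, and a Space re-runs app.py on every user interaction. A minimal sketch of that behavior, together with chromadb's idempotent alternative; the names below are illustrative, not taken from the repo:

import chromadb

client = chromadb.Client()
client.create_collection("vector_database")  # first run: succeeds

try:
    client.create_collection("vector_database")  # rerun: duplicate name is rejected
except Exception as err:  # exact exception type varies across chromadb versions
    print(f"create_collection failed on rerun: {err}")

# Idempotent alternative from chromadb's API: return the existing collection
# instead of raising, which removes the need for a randomized name.
collection = client.get_or_create_collection("vector_database")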
 
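The diff's trailing context stops right where app.py starts embedding and storing the question set, so the storage and query steps are not shown here. For orientation, a minimal sketch of how a chromadb collection is typically populated from this dataset and then queried; N, the collection name, the metadata layout, and the sample query are illustrative assumptions rather than code from the repo:

import chromadb
from datasets import load_dataset

dataset = load_dataset("eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted")
collection = chromadb.Client().create_collection("demo_collection")  # illustrative name

# Store the first N question/answer pairs; chroma embeds the documents with
# its default embedding function.
N = 50  # illustrative; app.py computes L = len(dataset["train"]["questions"])
questions = dataset["train"]["questions"][:N]
answers = dataset["train"]["answers"][:N]
collection.add(
    documents=questions,
    metadatas=[{"answers": a} for a in answers],
    ids=[str(i) for i in range(N)],
)

# Retrieve the five stored questions closest to a user query and read back
# their paired answers from the metadata.
results = collection.query(query_texts=["What services does the shelter offer?"], n_results=5)
retrieved_answers = [m["answers"] for m in results["metadatas"][0]]
print(retrieved_answers)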