Leilaaaah committed on
Commit
cc0ab1f
·
verified ·
1 Parent(s): 8eaafe5

getting rid of sem search

Browse files
Files changed (1) hide show
  1. app.py +0 -115
app.py CHANGED
@@ -1,121 +1,6 @@
1
  import gradio as gr
2
  import requests
3
 
4
-
5
- #------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
6
- # Step 1 - Semantic Search
7
- ""from sentence_transformers import SentenceTransformer
8
- import torch
9
-
10
-
11
-
12
- # Step 2 - Semantic Search
13
- # Open the water_cycle.txt file in read mode with UTF-8 encoding
14
- with open("water_cycle.txt", "r", encoding="utf-8") as file:
15
- # Read the entire contents of the file and store it in a variable
16
- water_cycle_text = file.read()
17
- # Print the text below
18
- print(water_cycle_text)
19
-
20
-
21
-
22
- # Step 3 - Semantic Search
23
- def preprocess_text(text):
24
- # Strip extra whitespace from the beginning and the end of the text
25
- cleaned_text = text.strip()
26
-
27
- # Split the cleaned_text by every newline character (\n)
28
- chunks = cleaned_text.split("\n")
29
-
30
- # Create an empty list to store cleaned chunks
31
- cleaned_chunks = []
32
-
33
- # Write your for-in loop below to clean each chunk and add it to the cleaned_chunks list
34
- for chunk in chunks:
35
- stripped_chunk = chunk.strip()
36
- if len(stripped_chunk) > 0:
37
- cleaned_chunks.append(stripped_chunk)
38
-
39
- # Print cleaned_chunks
40
- print(cleaned_chunks)
41
-
42
- # Print the length of cleaned_chunks
43
- print(len(cleaned_chunks))
44
-
45
- # Return the cleaned_chunks
46
- return cleaned_chunks
47
-
48
-
49
-
50
- # Step 4 - Semantic Search
51
- # Load the pre-trained embedding model that converts text to vectors
52
- model = SentenceTransformer('all-MiniLM-L6-v2')
53
-
54
- def create_embeddings(text_chunks):
55
- # Convert each text chunk into a vector embedding and store as a tensor
56
- chunk_embeddings = model.encode(text_chunks, convert_to_tensor=True) # Replace ... with the text_chunks list
57
-
58
- # Print the chunk embeddings
59
- print(chunk_embeddings)
60
-
61
- # Print the shape of chunk_embeddings
62
- print(chunk_embeddings.shape)
63
-
64
- # Return the chunk_embeddings
65
- return chunk_embeddings
66
-
67
- # Call the create_embeddings function and store the result in a new chunk_embeddings variable
68
- chunk_embeddings = create_embeddings(cleaned_chunks) # Complete this line
69
-
70
- # Call the preprocess_text function and store the result in a cleaned_chunks variable
71
- #cleaned_chunks = preprocess_text(water_cycle_text) # Complete this line
72
-
73
-
74
-
75
- # Step 5 - Semantic Search
76
- def get_top_chunks(query, chunk_embeddings, text_chunks):
77
- # Convert the query text into a vector embedding
78
- query_embedding = model.encode(query, convert_to_tensor = True) # Complete this line
79
-
80
- # Normalize the query embedding to unit length for accurate similarity comparison
81
- query_embedding_normalized = query_embedding / query_embedding.norm()
82
-
83
- # Normalize all chunk embeddings to unit length for consistent comparison
84
- chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm(dim=1, keepdim=True)
85
-
86
- # Calculate cosine similarity between query and all chunks using matrix multiplication
87
- similarities = torch.matmul(chunk_embeddings_normalized, query_embedding_normalized) # Complete this line
88
-
89
- # Print the similarities
90
- print(similarities)
91
-
92
- # Find the indices of the 3 chunks with highest similarity scores
93
- top_indices = torch.topk(similarities, k=3).indices
94
-
95
- # Print the top indices
96
- print(top_indices)
97
-
98
- # Create an empty list to store the most relevant chunks
99
- top_chunks = []
100
-
101
- # Loop through the top indices and retrieve the corresponding text chunks
102
- for i in top_indices:
103
- chunk = text_chunks[i]
104
- top_chunks.append(chunk)
105
-
106
- # Return the list of most relevant chunks
107
- return top_chunks
108
-
109
-
110
-
111
- # Step 6 - Semantic Search
112
- # Call the get_top_chunks function with the original query
113
- top_results = get_top_chunks("How does water get into the sky", chunk_embeddings, cleaned_chunks) # Complete this line
114
- # Print the top results
115
- print(top_results)""
116
- #------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
117
-
118
-
119
def response(message, history):
    """Chat callback: look up the chunks most relevant to *message* and show them.

    Args:
        message: The user's latest chat message (str).
        history: Prior chat turns supplied by the chat framework (unused here).
    """
    # NOTE(review): get_top_chunks, chunk_embeddings and cleaned_chunks are
    # removed by this very commit ("getting rid of sem search"), so this call
    # will raise NameError at runtime — the function body needs a replacement
    # retrieval path or removal. Left as-is pending that decision.
    bitebot_chunks = get_top_chunks(message, chunk_embeddings, cleaned_chunks)
    # Bug fix: original printed the undefined name `bitebot`; the variable
    # assigned above is `bitebot_chunks`.
    print(bitebot_chunks)
 
1
  import gradio as gr
2
  import requests
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def response(message, history):
    """Chat callback: look up the chunks most relevant to *message* and show them.

    Args:
        message: The user's latest chat message (str).
        history: Prior chat turns supplied by the chat framework (unused here).
    """
    # NOTE(review): get_top_chunks, chunk_embeddings and cleaned_chunks no
    # longer exist after this commit removed the semantic-search code, so this
    # call raises NameError — a replacement retrieval path is needed.
    bitebot_chunks = get_top_chunks(message, chunk_embeddings, cleaned_chunks)
    # Bug fix: original printed the undefined name `bitebot`; the variable
    # assigned above is `bitebot_chunks`.
    print(bitebot_chunks)