tonic
committed on
Commit
·
4643020
1
Parent(s):
553c8dd
Update app.py
Browse files- backend/app.py +54 -7
backend/app.py
CHANGED
@@ -69,14 +69,49 @@ Article = {
|
|
69 |
# "vectorizer": "text2vec-contextionary"
|
70 |
}
|
71 |
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
schema = {
|
74 |
"classes": [Article]
|
75 |
}
|
76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
# Initialize vectorstore
|
78 |
vectorstore = Weaviate(client, index_name="HereChat", text_key="text")
|
79 |
-
client.schema.create(schema)
|
80 |
vectorstore._query_attrs = ["text", "title", "url", "views", "lang", "_additional {distance}"]
|
81 |
vectorstore.embedding = CohereEmbeddings(model="embed-multilingual-v2.0", cohere_api_key=cohere_api_key)
|
82 |
|
@@ -87,8 +122,16 @@ def embed_pdf(file, collection_name):
|
|
87 |
# Save the uploaded file
|
88 |
filename = file.name
|
89 |
file_path = os.path.join('./', filename)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
with open(file_path, 'wb') as f:
|
91 |
-
f.write(
|
92 |
|
93 |
# Checking filetype for document parsing
|
94 |
mime_type = mimetypes.guess_type(file_path)[0]
|
@@ -121,13 +164,15 @@ def retrieve_info(query):
|
|
121 |
# Rerank the top results
|
122 |
reranked_results = co.rerank(query=query, documents=top_docs, top_n=3, model='rerank-english-v2.0')
|
123 |
|
124 |
-
# Format the reranked results
|
125 |
formatted_results = []
|
126 |
for idx, r in enumerate(reranked_results):
|
127 |
formatted_result = {
|
128 |
"Document Rank": idx + 1,
|
129 |
-
"
|
130 |
-
"
|
|
|
|
|
131 |
"Relevance Score": f"{r.relevance_score:.2f}"
|
132 |
}
|
133 |
formatted_results.append(formatted_result)
|
@@ -162,12 +207,14 @@ def retrieve_info(query):
|
|
162 |
|
163 |
def combined_interface(query, file, collection_name):
|
164 |
if query:
|
165 |
-
|
|
|
166 |
elif file is not None and collection_name:
|
167 |
return embed_pdf(file, collection_name)
|
168 |
else:
|
169 |
return "Please enter a query or upload a PDF file."
|
170 |
|
|
|
171 |
iface = gr.Interface(
|
172 |
fn=combined_interface,
|
173 |
inputs=[
|
|
|
69 |
# "vectorizer": "text2vec-contextionary"
|
70 |
}
|
71 |
|
72 |
+
# Function to check if a class exists in the schema
|
73 |
+
def class_exists(class_name):
    """Report whether `class_name` is listed in the schema returned by `client`.

    Fetches the schema via `client.schema.get()` and compares `class_name`
    against the "class" entry of each item under "classes".

    Returns:
        True when a matching class name is found, False otherwise. Any
        exception raised while fetching or reading the schema is printed
        and also reported as False.
    """
    try:
        current = client.schema.get()
        known_names = [entry["class"] for entry in current["classes"]]
    except Exception as e:
        # Best-effort lookup: a failed check is reported and treated as "absent".
        print(f"Error checking if class exists: {e}")
        return False
    return class_name in known_names
|
81 |
+
|
82 |
+
# Initialize the schema.
# It is assigned before the existence check so that the create call below
# always has a bound `schema` to send.
schema = {
    "classes": [Article]
}

# Create the 'Article' class only when it is missing; otherwise report that it
# exists and print the schema currently returned by the client. The check is
# done once so startup issues a single existence lookup.
if not class_exists("Article"):
    try:
        client.schema.create(schema)
    except Exception as e:
        # Startup continues on failure; the error is only reported.
        print(f"Error creating schema: {e}")
else:
    print("Class 'Article' already exists in the schema.")
    try:
        existing_schema = client.schema.get()
        print("Existing schema retrieved:", existing_schema)
    except Exception as e:
        print(f"Error retrieving existing schema: {e}")
|
111 |
+
|
112 |
+
|
113 |
# Initialize vectorstore
|
114 |
vectorstore = Weaviate(client, index_name="HereChat", text_key="text")
|
|
|
115 |
vectorstore._query_attrs = ["text", "title", "url", "views", "lang", "_additional {distance}"]
|
116 |
vectorstore.embedding = CohereEmbeddings(model="embed-multilingual-v2.0", cohere_api_key=cohere_api_key)
|
117 |
|
|
|
122 |
# Save the uploaded file
|
123 |
filename = file.name
|
124 |
file_path = os.path.join('./', filename)
|
125 |
+
|
126 |
+
# Check if the file object has 'read' method
|
127 |
+
if hasattr(file, 'read'):
|
128 |
+
file_content = file.read()
|
129 |
+
else:
|
130 |
+
# Handle the case where 'read' method is not available
|
131 |
+
file_content = file.getvalue() # Assuming it's a NamedString or similar object
|
132 |
+
|
133 |
with open(file_path, 'wb') as f:
|
134 |
+
f.write(file_content)
|
135 |
|
136 |
# Checking filetype for document parsing
|
137 |
mime_type = mimetypes.guess_type(file_path)[0]
|
|
|
164 |
# Rerank the top results
|
165 |
reranked_results = co.rerank(query=query, documents=top_docs, top_n=3, model='rerank-english-v2.0')
|
166 |
|
167 |
+
# Format the reranked results according to the Article schema
|
168 |
formatted_results = []
|
169 |
for idx, r in enumerate(reranked_results):
|
170 |
formatted_result = {
|
171 |
"Document Rank": idx + 1,
|
172 |
+
"Title": r.document['title'],
|
173 |
+
"Content": r.document['content'],
|
174 |
+
"Author": r.document['author'],
|
175 |
+
"Publish Date": r.document['publishDate'],
|
176 |
"Relevance Score": f"{r.relevance_score:.2f}"
|
177 |
}
|
178 |
formatted_results.append(formatted_result)
|
|
|
207 |
|
208 |
def combined_interface(query, file, collection_name):
    """Dispatch one interface request to retrieval or to file embedding.

    A truthy `query` takes priority and is passed to `retrieve_info`.
    Otherwise, when `file` is not None and `collection_name` is truthy,
    the pair is passed to `embed_pdf`. If neither applies, a prompt
    string asking for input is returned.
    """
    if not query:
        if file is not None and collection_name:
            return embed_pdf(file, collection_name)
        return "Please enter a query or upload a PDF file."
    return retrieve_info(query)
|
216 |
|
217 |
+
|
218 |
iface = gr.Interface(
|
219 |
fn=combined_interface,
|
220 |
inputs=[
|