Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -2,13 +2,15 @@ import weaviate
|
|
2 |
import langchain
|
3 |
import gradio as gr
|
4 |
from langchain.embeddings import CohereEmbeddings
|
5 |
-
from langchain.document_loaders import UnstructuredFileLoader
|
6 |
-
from langchain.vectorstores import
|
|
|
7 |
import os
|
8 |
import urllib.request
|
9 |
import ssl
|
10 |
import mimetypes
|
11 |
from dotenv import load_dotenv
|
|
|
12 |
|
13 |
# Load environment variables
|
14 |
load_dotenv()
|
@@ -17,19 +19,18 @@ cohere_api_key = os.getenv('COHERE')
|
|
17 |
weaviate_api_key = os.getenv('WEAVIATE')
|
18 |
weaviate_url = os.getenv('WEAVIATE_URL')
|
19 |
|
20 |
-
|
21 |
# Weaviate connection
|
22 |
auth_config = weaviate.auth.AuthApiKey(api_key=weaviate_api_key)
|
23 |
-
client = weaviate.
|
24 |
additional_headers={"X-Cohere-Api-Key": cohere_api_key})
|
|
|
25 |
# Initialize vectorstore
|
26 |
vectorstore = Weaviate(client, index_name="Articles", text_key="text")
|
27 |
vectorstore._query_attrs = ["text", "title", "url", "views", "lang", "_additional {distance}"]
|
28 |
vectorstore.embedding = CohereEmbeddings(model="embed-multilingual-v2.0", cohere_api_key=cohere_api_key)
|
29 |
|
30 |
-
# Initialize
|
31 |
-
|
32 |
-
qa = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever())
|
33 |
|
34 |
def embed_pdf(file, collection_name):
|
35 |
# Save the uploaded file
|
@@ -43,37 +44,8 @@ def embed_pdf(file, collection_name):
|
|
43 |
loader = UnstructuredFileLoader(file_path)
|
44 |
docs = loader.load()
|
45 |
|
46 |
-
# Generate embeddings
|
47 |
-
embeddings = CohereEmbeddings(model="embed-multilingual-v2.0", cohere_api_key=cohere_api_key)
|
48 |
-
|
49 |
-
# Store documents in vectorstore (Qdrant)
|
50 |
-
for doc in docs:
|
51 |
-
embedding = embeddings.embed([doc['text']])
|
52 |
-
vectorstore_document = {
|
53 |
-
"text": doc['text'],
|
54 |
-
"embedding": embedding
|
55 |
-
}
|
56 |
-
collection_name = request.json.get("collection_name")
|
57 |
-
file_url = request.json.get("file_url")
|
58 |
-
|
59 |
-
# Download the file
|
60 |
-
folder_path = f'./'
|
61 |
-
os.makedirs(folder_path, exist_ok=True)
|
62 |
-
filename = file_url.split('/')[-1]
|
63 |
-
file_path = os.path.join(folder_path, filename)
|
64 |
-
|
65 |
-
ssl._create_default_https_context = ssl._create_unverified_context
|
66 |
-
urllib.request.urlretrieve(file_url, file_path)
|
67 |
-
|
68 |
-
# Check filetype for document parsing
|
69 |
-
mime_type = mimetypes.guess_type(file_path)[0]
|
70 |
-
loader = UnstructuredFileLoader(file_path)
|
71 |
-
docs = loader.load()
|
72 |
-
|
73 |
-
# Generate embeddings
|
74 |
embeddings = CohereEmbeddings(model="embed-multilingual-v2.0", cohere_api_key=cohere_api_key)
|
75 |
-
|
76 |
-
# Store documents in Weaviate
|
77 |
for doc in docs:
|
78 |
embedding = embeddings.embed([doc['text']])
|
79 |
weaviate_document = {
|
@@ -85,22 +57,17 @@ def embed_pdf(file, collection_name):
|
|
85 |
os.remove(file_path)
|
86 |
return {"message": f"Documents embedded in Weaviate collection '{collection_name}'"}
|
87 |
|
88 |
-
|
89 |
-
co = cohere.Client(api_key=cohere_api_key)
|
90 |
-
|
91 |
-
def retrieve_info():
|
92 |
-
query = request.json.get("query")
|
93 |
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
|
94 |
qa = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever())
|
95 |
|
96 |
# Retrieve initial results
|
97 |
initial_results = qa({"query": query})
|
98 |
|
99 |
-
# Assuming initial_results are in the desired format, extract the top
|
100 |
-
# Adjust this part according to the actual format of your initial_results
|
101 |
top_docs = initial_results[:25] # Adjust this if your result format is different
|
102 |
|
103 |
-
# Rerank the top
|
104 |
reranked_results = co.rerank(query=query, documents=top_docs, top_n=3, model='rerank-english-v2.0')
|
105 |
|
106 |
# Format the reranked results
|
@@ -114,14 +81,13 @@ def retrieve_info():
|
|
114 |
}
|
115 |
formatted_results.append(formatted_result)
|
116 |
|
117 |
-
return {"results":
|
118 |
|
119 |
# Gradio interface
|
120 |
iface = gr.Interface(
|
121 |
fn=retrieve_info,
|
122 |
inputs=[
|
123 |
-
gr.inputs.Textbox(label="Query")
|
124 |
-
gr.inputs.File(label="PDF File", type="file", optional=True)
|
125 |
],
|
126 |
outputs="text",
|
127 |
allow_flagging="never"
|
|
|
2 |
import langchain
|
3 |
import gradio as gr
|
4 |
from langchain.embeddings import CohereEmbeddings
|
5 |
+
from langchain.document_loaders import UnstructuredFileLoader
|
6 |
+
from langchain.vectorstores import Weaviate
|
7 |
+
from langchain.chain_types import OpenAI, RetrievalQA
|
8 |
import os
|
9 |
import urllib.request
|
10 |
import ssl
|
11 |
import mimetypes
|
12 |
from dotenv import load_dotenv
|
13 |
+
import cohere
|
14 |
|
15 |
# Load environment variables
|
16 |
load_dotenv()
|
|
|
19 |
weaviate_api_key = os.getenv('WEAVIATE')
|
20 |
weaviate_url = os.getenv('WEAVIATE_URL')
|
21 |
|
|
|
22 |
# Weaviate connection
|
23 |
auth_config = weaviate.auth.AuthApiKey(api_key=weaviate_api_key)
|
24 |
+
client = weaviate.Client(url=weaviate_url, auth_client_secret=auth_config,
|
25 |
additional_headers={"X-Cohere-Api-Key": cohere_api_key})
|
26 |
+
|
27 |
# Initialize vectorstore
|
28 |
vectorstore = Weaviate(client, index_name="Articles", text_key="text")
|
29 |
vectorstore._query_attrs = ["text", "title", "url", "views", "lang", "_additional {distance}"]
|
30 |
vectorstore.embedding = CohereEmbeddings(model="embed-multilingual-v2.0", cohere_api_key=cohere_api_key)
|
31 |
|
32 |
+
# Initialize Cohere client
|
33 |
+
co = cohere.Client(api_key=cohere_api_key)
|
|
|
34 |
|
35 |
def embed_pdf(file, collection_name):
|
36 |
# Save the uploaded file
|
|
|
44 |
loader = UnstructuredFileLoader(file_path)
|
45 |
docs = loader.load()
|
46 |
|
47 |
+
# Generate embeddings and store documents in Weaviate
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
embeddings = CohereEmbeddings(model="embed-multilingual-v2.0", cohere_api_key=cohere_api_key)
|
|
|
|
|
49 |
for doc in docs:
|
50 |
embedding = embeddings.embed([doc['text']])
|
51 |
weaviate_document = {
|
|
|
57 |
os.remove(file_path)
|
58 |
return {"message": f"Documents embedded in Weaviate collection '{collection_name}'"}
|
59 |
|
60 |
+
def retrieve_info(query):
|
|
|
|
|
|
|
|
|
61 |
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
|
62 |
qa = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever())
|
63 |
|
64 |
# Retrieve initial results
|
65 |
initial_results = qa({"query": query})
|
66 |
|
67 |
+
# Assuming initial_results are in the desired format, extract the top documents
|
|
|
68 |
top_docs = initial_results[:25] # Adjust this if your result format is different
|
69 |
|
70 |
+
# Rerank the top results
|
71 |
reranked_results = co.rerank(query=query, documents=top_docs, top_n=3, model='rerank-english-v2.0')
|
72 |
|
73 |
# Format the reranked results
|
|
|
81 |
}
|
82 |
formatted_results.append(formatted_result)
|
83 |
|
84 |
+
return {"results": formatted_results}
|
85 |
|
86 |
# Gradio interface
|
87 |
iface = gr.Interface(
|
88 |
fn=retrieve_info,
|
89 |
inputs=[
|
90 |
+
gr.inputs.Textbox(label="Query")
|
|
|
91 |
],
|
92 |
outputs="text",
|
93 |
allow_flagging="never"
|