Tonic committed
Commit 79ad113 · 1 Parent(s): cd8cf8a

Update app.py

Files changed (1)
  1. app.py +14 -48
app.py CHANGED
@@ -2,13 +2,15 @@ import weaviate
 import langchain
 import gradio as gr
 from langchain.embeddings import CohereEmbeddings
-from langchain.document_loaders import UnstructuredFileLoader, PyPDFLoader
-from langchain.vectorstores import Qdrant
+from langchain.document_loaders import UnstructuredFileLoader
+from langchain.vectorstores import Weaviate
+from langchain.chain_types import OpenAI, RetrievalQA
 import os
 import urllib.request
 import ssl
 import mimetypes
 from dotenv import load_dotenv
+import cohere
 
 # Load environment variables
 load_dotenv()
@@ -17,19 +19,18 @@ cohere_api_key = os.getenv('COHERE')
 weaviate_api_key = os.getenv('WEAVIATE')
 weaviate_url = os.getenv('WEAVIATE_URL')
 
-
 # Weaviate connection
 auth_config = weaviate.auth.AuthApiKey(api_key=weaviate_api_key)
-client = weaviate.client(url=weaviate_url, auth_client_secret=auth_config,
+client = weaviate.Client(url=weaviate_url, auth_client_secret=auth_config,
                          additional_headers={"X-Cohere-Api-Key": cohere_api_key})
+
 # Initialize vectorstore
 vectorstore = Weaviate(client, index_name="Articles", text_key="text")
 vectorstore._query_attrs = ["text", "title", "url", "views", "lang", "_additional {distance}"]
 vectorstore.embedding = CohereEmbeddings(model="embed-multilingual-v2.0", cohere_api_key=cohere_api_key)
 
-# Initialize OpenAI and RetrievalQA
-llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
-qa = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever())
+# Initialize Cohere client
+co = cohere.Client(api_key=cohere_api_key)
 
 def embed_pdf(file, collection_name):
     # Save the uploaded file
@@ -43,37 +44,8 @@ def embed_pdf(file, collection_name):
     loader = UnstructuredFileLoader(file_path)
     docs = loader.load()
 
-    # Generate embeddings
-    embeddings = CohereEmbeddings(model="embed-multilingual-v2.0", cohere_api_key=cohere_api_key)
-
-    # Store documents in vectorstore (Qdrant)
-    for doc in docs:
-        embedding = embeddings.embed([doc['text']])
-        vectorstore_document = {
-            "text": doc['text'],
-            "embedding": embedding
-        }
-    collection_name = request.json.get("collection_name")
-    file_url = request.json.get("file_url")
-
-    # Download the file
-    folder_path = f'./'
-    os.makedirs(folder_path, exist_ok=True)
-    filename = file_url.split('/')[-1]
-    file_path = os.path.join(folder_path, filename)
-
-    ssl._create_default_https_context = ssl._create_unverified_context
-    urllib.request.urlretrieve(file_url, file_path)
-
-    # Check filetype for document parsing
-    mime_type = mimetypes.guess_type(file_path)[0]
-    loader = UnstructuredFileLoader(file_path)
-    docs = loader.load()
-
-    # Generate embeddings
+    # Generate embeddings and store documents in Weaviate
     embeddings = CohereEmbeddings(model="embed-multilingual-v2.0", cohere_api_key=cohere_api_key)
-
-    # Store documents in Weaviate
     for doc in docs:
         embedding = embeddings.embed([doc['text']])
         weaviate_document = {
@@ -85,22 +57,17 @@ def embed_pdf(file, collection_name):
     os.remove(file_path)
     return {"message": f"Documents embedded in Weaviate collection '{collection_name}'"}
 
-# Initialize Cohere client
-co = cohere.Client(api_key=cohere_api_key)
-
-def retrieve_info():
-    query = request.json.get("query")
+def retrieve_info(query):
     llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
     qa = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever())
 
     # Retrieve initial results
     initial_results = qa({"query": query})
 
-    # Assuming initial_results are in the desired format, extract the top 25 documents
-    # Adjust this part according to the actual format of your initial_results
+    # Assuming initial_results are in the desired format, extract the top documents
    top_docs = initial_results[:25]  # Adjust this if your result format is different
 
-    # Rerank the top 25 results
+    # Rerank the top results
     reranked_results = co.rerank(query=query, documents=top_docs, top_n=3, model='rerank-english-v2.0')
 
     # Format the reranked results
@@ -114,14 +81,13 @@ def retrieve_info():
         }
         formatted_results.append(formatted_result)
 
-    return {"results": result}
+    return {"results": formatted_results}
 
 # Gradio interface
 iface = gr.Interface(
     fn=retrieve_info,
     inputs=[
-        gr.inputs.Textbox(label="Query"),
-        gr.inputs.File(label="PDF File", type="file", optional=True)
+        gr.inputs.Textbox(label="Query")
     ],
     outputs="text",
     allow_flagging="never"
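The connection fix above (weaviate.client → weaviate.Client) and the switch from the Qdrant vectorstore to Weaviate are the core of this commit. A minimal sketch of that setup, assuming weaviate-client v3.x and the classic langchain package, with variable and env names mirroring app.py:

# Sketch of the post-commit connection pattern; assumes weaviate-client v3.x
# and classic LangChain. Env var names (COHERE, WEAVIATE, WEAVIATE_URL) are
# taken from app.py.
import os
import weaviate
from langchain.embeddings import CohereEmbeddings
from langchain.vectorstores import Weaviate

cohere_api_key = os.getenv("COHERE")
auth_config = weaviate.auth.AuthApiKey(api_key=os.getenv("WEAVIATE"))
client = weaviate.Client(
    url=os.getenv("WEAVIATE_URL"),
    auth_client_secret=auth_config,
    additional_headers={"X-Cohere-Api-Key": cohere_api_key},
)

vectorstore = Weaviate(client, index_name="Articles", text_key="text")
vectorstore.embedding = CohereEmbeddings(
    model="embed-multilingual-v2.0", cohere_api_key=cohere_api_key
)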
 
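One added import, from langchain.chain_types import OpenAI, RetrievalQA, does not correspond to a module in released LangChain; in the classic package those names live in langchain.llms and langchain.chains. A hedged sketch of the equivalent wiring (OPENAI_API_KEY is an assumed env var name, since app.py references openai_api_key without defining it):

# Sketch only: `langchain.chain_types` does not exist; these are the usual
# classic-LangChain import paths. OPENAI_API_KEY is an assumed env var name.
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

llm = OpenAI(temperature=0, openai_api_key=os.getenv("OPENAI_API_KEY"))
qa = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever())
answer = qa({"query": "What do the indexed articles say about this topic?"})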
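The rerank step keeps Cohere's rerank-english-v2.0 model. A small usage sketch, assuming a Cohere SDK version whose rerank call accepts plain-string documents and returns results carrying index and relevance_score; the top_docs produced by RetrievalQA would need to be flattened to strings before being passed in:

# Sketch of the rerank call, assuming plain-string documents.
import cohere

co = cohere.Client(api_key=os.getenv("COHERE"))
candidate_docs = [
    "First candidate passage.",
    "Second candidate passage.",
    "Third candidate passage.",
]
reranked = co.rerank(
    query="example query",
    documents=candidate_docs,
    top_n=3,
    model="rerank-english-v2.0",
)
for hit in reranked.results:
    print(hit.index, hit.relevance_score)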
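The interface now takes only a query textbox; gr.inputs.Textbox is Gradio's legacy namespace. On current Gradio the same interface would look roughly like the sketch below (not what this commit ships):

# Sketch using the current Gradio component API instead of the legacy
# gr.inputs namespace kept by this commit.
import gradio as gr

iface = gr.Interface(
    fn=retrieve_info,  # as defined in app.py
    inputs=gr.Textbox(label="Query"),
    outputs="text",
    allow_flagging="never",
)
iface.launch()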