Tonic committed on
Commit
bed05fc
·
1 Parent(s): ec10ab3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -0
app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import mimetypes
import os
import ssl
import urllib.request

import gradio as gr
import langchain
import weaviate
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredFileLoader, PyPDFLoader
from langchain.embeddings import CohereEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Qdrant
from langchain.vectorstores import Weaviate
12
+
13
# Load environment variables: API keys and the Weaviate endpoint are read from
# the process environment (optionally populated from a .env file).
load_dotenv()
openai_api_key = os.getenv('OPENAI')
cohere_api_key = os.getenv('COHERE')
weaviate_api_key = os.getenv('WEAVIATE')
weaviate_url = os.getenv('WEAVIATE_URL')

# Weaviate connection. The Cohere key is forwarded as a request header
# (presumably for Weaviate's server-side Cohere module -- confirm against the
# cluster configuration).
auth_config = weaviate.auth.AuthApiKey(api_key=weaviate_api_key)
client = weaviate.Client(
    url=weaviate_url,
    auth_client_secret=auth_config,
    additional_headers={"X-Cohere-Api-Key": cohere_api_key},
)

# The client above is a Weaviate client, so it must be wrapped by LangChain's
# Weaviate vector store; the Qdrant wrapper takes neither a Weaviate client
# nor the index_name/text_key arguments used here.
vectorstore = Weaviate(client, index_name="Articles", text_key="text")
# Properties requested for every hit, plus the vector distance of the match.
vectorstore._query_attrs = ["text", "title", "url", "views", "lang", "_additional {distance}"]
vectorstore.embedding = CohereEmbeddings(model="embed-multilingual-v2.0", cohere_api_key=cohere_api_key)
26
+
27
def embed_pdf(file, collection_name):
    """Embed an uploaded document and store it in a Weaviate class.

    The document is parsed with ``UnstructuredFileLoader``, every parsed
    document's text is embedded with Cohere's multilingual model, and each
    text is written to Weaviate together with its vector.

    Args:
        file: Either a filesystem path, or an object exposing the path as a
            ``name`` attribute (assumed to be what the Gradio file input
            provides -- confirm for the installed Gradio version).
        collection_name: Name of the Weaviate class that receives the objects.

    Returns:
        A dict with a single ``"message"`` key describing the outcome.

    Raises:
        ValueError: If no file or no collection name was supplied.
    """
    # Accept both a path string and an upload wrapper carrying ``.name``.
    file_path = getattr(file, "name", file)
    if not file_path:
        raise ValueError("A file is required")
    if not collection_name or not str(collection_name).strip():
        raise ValueError("A collection name is required")

    # Parse the document into LangChain Document objects.
    loader = UnstructuredFileLoader(file_path)
    docs = loader.load()

    # Documents expose their text as ``page_content``; they are not dicts.
    texts = [doc.page_content for doc in docs]

    if texts:
        # Embed all texts in one batch call instead of one request per text.
        embeddings = CohereEmbeddings(model="embed-multilingual-v2.0", cohere_api_key=cohere_api_key)
        vectors = embeddings.embed_documents(texts)

        # Store documents in Weaviate; the vector goes in ``vector=`` rather
        # than being saved as an ordinary property of the object.
        for text, vector in zip(texts, vectors):
            client.data_object.create(
                data_object={"text": text},
                class_name=collection_name,
                vector=vector,
            )

    # NOTE: the uploaded file is intentionally not deleted here; it is owned
    # by whoever supplied it.
    return {"message": f"Documents embedded in Weaviate collection '{collection_name}'"}
80
+
81
def retrieve_info(query=None, file=None):
    """Answer a question using documents retrieved from the vector store.

    Args:
        query: The question text.
        file: Accepted only so the signature matches the two inputs of the
            Gradio interface that calls this function; it is not used.

    Returns:
        A dict ``{"results": ...}`` wrapping whatever the RetrievalQA chain
        returns for the query.

    Raises:
        ValueError: If ``query`` is missing or blank.
    """
    if not query or not query.strip():
        raise ValueError("A non-empty query is required")

    # temperature=0 keeps the model's answers as deterministic as possible.
    llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
    qa = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever())
    result = qa({"query": query})
    return {"results": result}
87
+
88
# Gradio interface for querying. The file input is optional and is passed to
# retrieve_info as its second argument.
query_iface = gr.Interface(
    fn=retrieve_info,
    inputs=[
        gr.Textbox(label="Query"),
        gr.File(label="PDF File"),
    ],
    outputs="text",
    allow_flagging="never",
)

# Gradio interface for embedding an uploaded document into a collection.
embed_iface = gr.Interface(
    fn=embed_pdf,
    inputs=[
        gr.File(label="PDF File"),
        gr.Textbox(label="Collection Name"),
    ],
    outputs="text",
)

# gr.Interface has no add_endpoint(); a second function is exposed by giving
# it its own Interface and serving both as tabs of one app.
iface = gr.TabbedInterface(
    [query_iface, embed_iface],
    tab_names=["Query", "Embed PDF"],
)

iface.launch()