# HereChatBackend / app.py
# (Hugging Face Space file header preserved as a comment so the file parses:
#  "Tonic's picture — Create app.py", commit bed05fc, raw / history / blame,
#  3.62 kB)
import mimetypes
import os
import ssl
import urllib.request

import gradio as gr
import langchain
import weaviate
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredFileLoader, PyPDFLoader
from langchain.embeddings import CohereEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Qdrant, Weaviate
# Load environment variables from a local .env file (API keys and endpoint URL).
load_dotenv()
openai_api_key = os.getenv('OPENAI')
cohere_api_key = os.getenv('COHERE')
weaviate_api_key = os.getenv('WEAVIATE')
weaviate_url = os.getenv('WEAVIATE_URL')

# Weaviate connection: authenticate with an API key and forward the Cohere key
# so Weaviate's text2vec-cohere module can vectorize server-side if configured.
auth_config = weaviate.auth.AuthApiKey(api_key=weaviate_api_key)
client = weaviate.Client(
    url=weaviate_url,
    auth_client_secret=auth_config,
    additional_headers={"X-Cohere-Api-Key": cohere_api_key},
)

# BUG FIX: the original wrapped this *Weaviate* client in langchain's Qdrant
# vectorstore, which expects a qdrant-client instance and would fail at query
# time; it also set the private `_query_attrs`. Use the Weaviate vectorstore
# with the same index/text keys, the extra attributes the original requested,
# and the same Cohere embedding model.
vectorstore = Weaviate(
    client,
    index_name="Articles",
    text_key="text",
    embedding=CohereEmbeddings(model="embed-multilingual-v2.0", cohere_api_key=cohere_api_key),
    attributes=["title", "url", "views", "lang"],
)
def embed_pdf(file, collection_name):
    """Parse an uploaded file and store its chunks, with Cohere vectors, in Weaviate.

    Args:
        file: Uploaded file object exposing ``.name`` and ``.read()`` (gradio upload).
        collection_name: Weaviate class name the documents are stored under.

    Returns:
        dict with a human-readable status message.
    """
    # Persist the upload to disk so UnstructuredFileLoader can open it by path.
    # basename() guards against a path-traversal style `.name` from the client.
    filename = os.path.basename(file.name)
    file_path = os.path.join('./', filename)
    with open(file_path, 'wb') as f:
        f.write(file.read())

    # Informative only; UnstructuredFileLoader sniffs the file format itself.
    mime_type = mimetypes.guess_type(file_path)[0]
    loader = UnstructuredFileLoader(file_path)
    docs = loader.load()

    # Generate embeddings client-side with Cohere.
    embeddings = CohereEmbeddings(model="embed-multilingual-v2.0", cohere_api_key=cohere_api_key)

    # BUG FIX: langchain loaders return Document objects (use .page_content,
    # not doc['text']), and CohereEmbeddings exposes embed_documents(), not
    # embed(). The vector is passed to Weaviate via the `vector=` argument
    # instead of being stored as an ordinary data property. The original also
    # contained an unreachable Flask-style `request.json` branch that
    # re-downloaded a URL mid-function; that dead merge artifact is removed.
    for doc in docs:
        vector = embeddings.embed_documents([doc.page_content])[0]
        weaviate_document = {"text": doc.page_content}
        client.data_object.create(
            data_object=weaviate_document,
            class_name=collection_name,
            vector=vector,
        )

    # Remove the temporary on-disk copy once everything is stored.
    os.remove(file_path)
    return {"message": f"Documents embedded in Weaviate collection '{collection_name}'"}
def retrieve_info(query, file=None):
    """Answer *query* with a RetrievalQA chain over the shared vectorstore.

    Args:
        query: Natural-language question typed into the UI.
        file: Optional uploaded file from the second UI input; accepted for
            compatibility with the interface definition but not used here.

    Returns:
        dict with the chain's output under "results".
    """
    # BUG FIX: the original read `request.json` — a Flask leftover that is
    # undefined in this gradio app; gradio passes the textbox value as an
    # argument instead.
    llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
    qa = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever())
    result = qa({"query": query})
    return {"results": result}
# Gradio UI.
# BUG FIX: `gr.inputs.*` (and its `optional=` kwarg) was removed in gradio 4.x,
# and `Interface.add_endpoint` has never existed. Build one Interface per
# function and combine them with TabbedInterface instead.
query_iface = gr.Interface(
    fn=retrieve_info,
    inputs=[
        gr.Textbox(label="Query"),
        gr.File(label="PDF File"),
    ],
    outputs="text",
    allow_flagging="never",
)
embed_iface = gr.Interface(
    fn=embed_pdf,
    inputs=[
        gr.File(label="PDF File"),
        gr.Textbox(label="Collection Name"),
    ],
    outputs="text",
    allow_flagging="never",
)
# Keep the module-level name `iface` for the launched app.
iface = gr.TabbedInterface([query_iface, embed_iface], ["Ask", "Embed PDF"])
iface.launch()