File size: 4,719 Bytes
bed05fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd8cf8a
bed05fc
 
cd8cf8a
 
e8d06a3
cd8cf8a
bed05fc
 
 
cd8cf8a
 
 
 
bed05fc
 
 
 
 
 
 
 
 
d332383
 
bed05fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d332383
 
bed05fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17f29f2
 
 
bed05fc
 
 
 
17f29f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bed05fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17f29f2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import mimetypes
import os
import ssl
import urllib.request

import cohere
import gradio as gr
import langchain
import weaviate
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredFileLoader, PyPDFLoader
from langchain.embeddings import CohereEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Qdrant
from langchain.vectorstores import Weaviate

# Load environment variables
# API keys and the Weaviate endpoint are read from the process environment,
# which load_dotenv() may first populate from a local .env file.
load_dotenv()
openai_api_key = os.getenv('OPENAI')
cohere_api_key = os.getenv('COHERE')
weaviate_api_key = os.getenv('WEAVIATE')
weaviate_url = os.getenv('WEAVIATE_URL')


# Weaviate connection
auth_config = weaviate.auth.AuthApiKey(api_key=weaviate_api_key)
# `weaviate.Client` is the client class; the lowercase `weaviate.client` is a
# submodule and cannot be called. The Cohere key is forwarded as a request
# header on every call made through this client.
client = weaviate.Client(url=weaviate_url, auth_client_secret=auth_config,
                         additional_headers={"X-Cohere-Api-Key": cohere_api_key})
# Initialize vectorstore
vectorstore = Weaviate(client, index_name="Articles", text_key="text")
# Properties requested back from Weaviate for every hit.
vectorstore._query_attrs = ["text", "title", "url", "views", "lang", "_additional {distance}"]
# NOTE(review): the LangChain Weaviate wrapper also accepts an `embedding`
# constructor argument; confirm this attribute assignment is actually read.
vectorstore.embedding = CohereEmbeddings(model="embed-multilingual-v2.0", cohere_api_key=cohere_api_key)

# Initialize OpenAI and RetrievalQA
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
qa = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever())

def _download_to_temp(file_url):
    """Download ``file_url`` into a new temporary file and return its path.

    The extension found in the URL path is kept on the temporary file. The
    caller is responsible for deleting the returned file.
    """
    # Function-scope imports keep this helper self-contained.
    import tempfile
    from urllib.parse import urlparse

    suffix = os.path.splitext(urlparse(file_url).path)[1]
    fd, file_path = tempfile.mkstemp(suffix=suffix)
    os.close(fd)
    try:
        # No custom SSL context is installed, so urllib's default HTTPS
        # handling (with certificate verification) applies.
        urllib.request.urlretrieve(file_url, file_path)
    except Exception:
        # Do not leave a partial download behind.
        os.remove(file_path)
        raise
    return file_path


def embed_pdf(file, collection_name):
    """Parse a document, embed its text with Cohere, and store it in Weaviate.

    Args:
        file: The document to ingest. One of: an uploaded file object whose
            ``name`` attribute is its path on disk, a local filesystem path,
            or an ``http(s)`` URL (downloaded to a temporary file first).
        collection_name: Name of the Weaviate class the objects are written to.

    Returns:
        A dict with a single ``"message"`` key describing the outcome.

    Raises:
        ValueError: If ``file`` is missing, no path can be derived from it,
            or ``collection_name`` is blank.
    """
    if file is None:
        raise ValueError("No file was provided")
    if not collection_name or not collection_name.strip():
        raise ValueError("collection_name must be a non-empty string")
    collection_name = collection_name.strip()

    # The upload is read from its existing path. Re-opening that same path
    # for writing (as the previous code did) would truncate it before parsing.
    source = file if isinstance(file, str) else getattr(file, "name", None)
    if not source:
        raise ValueError("Could not determine a path for the provided file")

    is_remote = source.lower().startswith(("http://", "https://"))
    file_path = _download_to_temp(source) if is_remote else source
    try:
        docs = UnstructuredFileLoader(file_path).load()
    finally:
        # Only delete files this function created; uploads and caller-owned
        # paths are left alone.
        if is_remote:
            os.remove(file_path)

    # Loader results are Document objects: the text lives in `page_content`.
    texts = [
        doc.page_content
        for doc in docs
        if doc.page_content and doc.page_content.strip()
    ]
    if not texts:
        return {"message": f"No text found in the file; nothing embedded in Weaviate collection '{collection_name}'"}

    # Generate embeddings for all texts in a single call
    embeddings = CohereEmbeddings(model="embed-multilingual-v2.0", cohere_api_key=cohere_api_key)
    vectors = embeddings.embed_documents(texts)

    # Store documents in Weaviate; the embedding is passed as the object's
    # vector rather than as an ordinary data property.
    for text, vector in zip(texts, vectors):
        client.data_object.create(
            data_object={"text": text},
            class_name=collection_name,
            vector=vector,
        )

    return {"message": f"Documents embedded in Weaviate collection '{collection_name}'"}

# Initialize Cohere client
# Module-level client shared across requests; retrieve_info() calls
# co.rerank() on it to rerank search hits.
co = cohere.Client(api_key=cohere_api_key)

def retrieve_info(query=None, file=None):
    """Search the vector store for ``query`` and rerank the hits with Cohere.

    Up to 25 candidate documents are fetched from the module-level
    ``vectorstore``; the Cohere reranker then selects the 3 most relevant.
    Documents are fetched directly from the vector store, so no LLM call
    is made.

    Args:
        query: The user's search text. ``None`` or blank text yields no results.
        file: Accepted so the function can be bound to a UI that also offers
            an optional file input; it is not used by the search.

    Returns:
        ``{"results": [...]}`` where each entry is a dict with the keys
        ``"Document Rank"``, ``"Document Index"``, ``"Document"`` and
        ``"Relevance Score"``, best match first.
    """
    if not query or not query.strip():
        return {"results": []}

    # Retrieve initial results: the top 25 candidates from the vector store.
    candidates = vectorstore.similarity_search(query, k=25)
    texts = [doc.page_content for doc in candidates if doc.page_content]
    if not texts:
        # Nothing to rerank.
        return {"results": []}

    # Rerank the candidates and keep at most the best 3.
    reranked = co.rerank(
        query=query,
        documents=texts,
        top_n=min(3, len(texts)),
        model='rerank-english-v2.0',
    )
    # Some SDK versions expose the hits under `.results`, while others make
    # the response itself iterable; support both.
    hits = getattr(reranked, "results", reranked)

    # Format the reranked results. `index` points back into `texts`, so the
    # document text is taken from there rather than from the response.
    formatted_results = [
        {
            "Document Rank": rank,
            "Document Index": hit.index,
            "Document": texts[hit.index],
            "Relevance Score": f"{hit.relevance_score:.2f}",
        }
        for rank, hit in enumerate(hits, start=1)
    ]

    return {"results": formatted_results}

# Gradio interface
# gr.Interface has no `add_endpoint` method, so each function gets its own
# Interface and the two are shown as tabs of a single app.

# Query tab
# NOTE(review): confirm retrieve_info actually consumes the optional file;
# drop that input otherwise.
query_iface = gr.Interface(
    fn=retrieve_info,
    inputs=[
        gr.inputs.Textbox(label="Query"),
        gr.inputs.File(label="PDF File", type="file", optional=True)
    ],
    outputs="text",
    allow_flagging="never"
)

# Embed PDF function
embed_iface = gr.Interface(
    fn=embed_pdf,
    inputs=[
        gr.inputs.File(label="PDF File", type="file"),
        gr.inputs.Textbox(label="Collection Name")
    ],
    outputs="text"
)

iface = gr.TabbedInterface([query_iface, embed_iface], ["Query", "Embed PDF"])

iface.launch()