Commit
·
0799221
1
Parent(s):
2b729fc
Upload 2 files
Browse files- app.py +100 -0
- requirements.txt +4 -0
app.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pinecone
|
2 |
+
from langchain.vectorstores import Pinecone
|
3 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
4 |
+
from langchain.text_splitter import CharacterTextSplitter
|
5 |
+
from langchain.chat_models import ChatOpenAI
|
6 |
+
from langchain.chains import ConversationalRetrievalChain
|
7 |
+
import os
|
8 |
+
from langchain.docstore.document import Document
|
9 |
+
from langchain.document_loaders import PyPDFLoader
|
10 |
+
import gradio as gr
|
11 |
+
import openai
|
12 |
+
|
13 |
+
# --- Credentials -----------------------------------------------------------
# SECURITY FIX: the previous revision committed live OpenAI and Pinecone API
# keys to source control. Keys checked into a repo must be treated as leaked
# and rotated. Read both from the environment instead; the script fails fast
# with a clear KeyError if either is missing.
# NOTE(review): export OPENAI_API_KEY and PINECONE_API_KEY before launching.
os.environ["OPENAI_API_KEY"] = os.environ["OPENAI_API_KEY"]

# initialize pinecone
pinecone.init(api_key=os.environ["PINECONE_API_KEY"],
              environment='us-east1-gcp')

# delete the vectors in the index if there are any, so the index only ever
# holds the freshly summarized document built below
index = pinecone.Index('asesura')
delete_response = index.delete(delete_all=True)
21 |
+
|
22 |
+
# load the documents
loader = PyPDFLoader("SaludClasico2023.pdf")
pages = loader.load_and_split()

# Strip the header/footer boilerplate that the policy PDF repeats on every
# page, so it does not pollute the summaries or the embeddings.
n_pages = [page.page_content.replace("CÓDIGO CLAUSULADO - 01/01/2023 - 1411 - P - 35 - F-14-11-0090-214- D00I. CÓDIGO NOTA TÉCNICA – 01/01/2023 - 1411 - NT-P - 35 - N-14-11-0090-031", "") for page in pages]
print(len(n_pages))

# If a page has fewer than 100 characters, prepend it to the next one so tiny
# fragments travel with their surrounding context.
for i in range(len(n_pages) - 1):
    if len(n_pages[i]) < 100:
        n_pages[i + 1] = n_pages[i] + n_pages[i + 1]
        n_pages[i] = ""

# BUG FIX: the old filter (`len(page) > 100`) silently discarded two kinds of
# real content: a page of exactly 100 characters (too long to be merged, too
# short to pass the filter) and a short FINAL page (the merge loop stops at
# len-1, so it was never merged forward, then dropped). Keep every non-empty
# page instead — only the pages emptied by the merge above are removed.
n_pages = [page for page in n_pages if page]
|
35 |
+
|
36 |
+
# Summarize every cleaned page with the chat model so the vector index stores
# compact, retrieval-friendly text instead of raw policy pages.
pages_summarized = []
for i, chunk in enumerate(n_pages):

    chat_messages = [
        {"role": "system", "content": "Summarize this text in the same language as the user's input."},
        {"role": "user", "content": chunk},
    ]

    completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=chat_messages,
        temperature=.2,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    summary_text = completion["choices"][0]["message"]['content'].strip()
    pages_summarized.append(summary_text)

# Wrap each summary in a Document, recording its position as the page number.
pages = [Document(page_content=summary, metadata={"page": page_no})
         for page_no, summary in enumerate(pages_summarized)]
|
57 |
+
|
58 |
+
|
59 |
+
# split the summarized documents into chunks sized for embedding
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
documents = text_splitter.split_documents(pages)

embeddings = OpenAIEmbeddings()

# FIX: Pinecone is already initialized (and the index wiped) near the top of
# this script, so the second pinecone.init() call that used to live here was
# redundant — and it repeated a hard-coded API key, which must never be
# committed to source control. The duplicate call has been removed.

index_name = "asesura"

# create the new vectors in the (already-initialized) Pinecone index
docsearch = Pinecone.from_documents(
    documents, embeddings, index_name=index_name)

# create the conversational chain: retrieval over the Pinecone index, answers
# generated deterministically (temperature=0), with source documents returned
qa = ConversationalRetrievalChain.from_llm(
    ChatOpenAI(temperature=0),
    docsearch.as_retriever(),
    return_source_documents=True)
|
80 |
+
|
81 |
+
|
82 |
+
def predict(query):
    """Answer a user question about the Sura 'Seguro de Salud Clásico' policy.

    The raw question is first rewritten by GPT-4 to fix punctuation, grammar
    and spelling, then run through the retrieval chain with an empty chat
    history; only the chain's answer text is returned.
    """
    rewrite = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "Change the user's question so it is correctly made, with the correct punctuation, grammar, and spelling. The question is being made to an insurance policy called 'Sura Seguro de Salud Clásico' by a company named Sura. Return the question in the same language as the user."},
            {"role": "user", "content": query},
        ],
        temperature=0.1)
    refined_question = rewrite.choices[0].message.content
    print(refined_question)

    result = qa({"question": refined_question, "chat_history": []})
    return result['answer']
|
93 |
+
|
94 |
+
|
95 |
+
# Build and launch the Gradio UI: one question textbox in, one answer out.
pregunta = gr.Textbox(label="Pregunta", placeholder="Escribe tu pregunta")
respuesta = gr.Textbox(label="Respuesta")

demo = gr.Interface(
    predict,
    pregunta,
    respuesta,
    title="Asesura",
    description="Bienvenido a tu asesor personal de seguros Sura. Pregunta lo que necesites saber sobre el Seguro de Salud Clásico de Sura 2023")
demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
langchain
|
2 |
+
pinecone-client
|
3 |
+
openai
|
4 |
+
tiktoken
gradio
pypdf
|