ngcanh committed on
Commit
d657a45
·
verified ·
1 Parent(s): ff8c629

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +204 -145
app.py CHANGED
@@ -1,149 +1,208 @@
1
- import streamlit as st
2
- from langchain.llms import HuggingFacePipeline
3
- from langchain.memory import ConversationBufferMemory
4
- from langchain.chains import ConversationalRetrievalChain
5
- from langchain.prompts.prompt import PromptTemplate
6
- from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
7
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
8
- from langchain.schema import Document
9
- from langchain_community.llms import HuggingFaceEndpoint
10
- from langchain.vectorstores import Chroma
11
- from transformers import TextStreamer
12
- from langchain.llms import HuggingFacePipeline
13
- from langchain.prompts import ChatPromptTemplate
14
- from langchain.llms import HuggingFaceHub
15
  import os
16
- import pandas as pd
17
- # from langchain.vectorstores import FAISS
18
- import subprocess
19
- from langchain_community.llms import HuggingFaceHub
20
-
21
- import pandas as pd
22
-
23
- # Configuración del modelo
24
-
25
  TOKEN=os.getenv('HF_TOKEN')
26
  subprocess.run(["huggingface-cli", "login", "--token", TOKEN, "--add-to-git-credential"])
27
- ######
28
- # set this key as an environment variable
29
- os.environ["HF_TOKEN"] = st.secrets["HF_TOKEN"]
30
-
31
-
32
- # Initialize tokenizer
33
- @st.cache_resource
34
- def load_model():
35
- TOKEN=os.getenv('HF_TOKEN')
36
- subprocess.run(["huggingface-cli", "login", "--token", TOKEN, "--add-to-git-credential"])
37
- os.environ["HF_TOKEN"] = st.secrets["HF_TOKEN"]
38
- MODEL_NAME = "google/gemma-2b-it"
39
-
40
- model = AutoModelForCausalLM.from_pretrained(
41
- MODEL_NAME
42
- # quantization_config=nf4_config, # add config
43
- # torch_dtype=torch.bfloat16, # save memory using float16
44
- # low_cpu_mem_usage=True,
45
- # token= TOKEN
46
- ).to("cuda")
47
-
48
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
49
- model_pipeline = pipeline(
50
- 'text-generation',
51
- model=model,
52
- tokenizer=tokenizer,
53
- max_new_tokens=1024, # output token
54
- device_map="auto" # auto allocate GPU if available
55
- )
56
-
57
- return HuggingFacePipeline(pipeline=model_pipeline)
58
-
59
- # Initialize embeddings
60
- @st.cache_resource
61
- def load_embeddings():
62
- embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/bkai-foundation-models/vietnamese-bi-encoder')
63
- # embeddings = OpenAIEmbeddings()
64
- return embeddings
65
-
66
- # Chroma Vector store
67
- @st.cache_resource
68
- def setup_vector():
69
- chunks = []
70
- df = pd.read_excel(r"chunk_metadata_template.xlsx")
71
- for _, row in df.iterrows():
72
- chunk_with_metadata = Document(
73
- page_content=row['page_content'],
74
- metadata={
75
- 'chunk_id': row['chunk_id'],
76
- 'document_title': row['document_title'],
77
- }
78
- )
79
- chunks.append(chunk_with_metadata)
80
- embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/bkai-foundation-models/vietnamese-bi-encoder')
81
- return Chroma.from_documents(chunks, embedding=embeddings)
82
-
83
- # Set up chain
84
- def setup_conversation_chain():
85
- llm = load_model()
86
- vector = setup_vector()
87
- memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
88
-
89
- template = """Bạn một chuyên viên vấn cho khách hàng về sản phẩm bảo hiểm của công ty MB Ageas Life tại Việt Nam.
90
- Hãy trả lời chuyên nghiệp, chính xác, cung cấp thông tin trước rồi hỏi câu tiếp theo. Tất cả các thông tin cung cấp đều trong phạm vi MBAL. Khi có đủ thông tin khách hàng thì mới mời khách hàng đăng ký để nhận tư vấn trên https://www.mbageas.life/
91
- {context}
92
- Câu hỏi: {question}
93
- Trả lời:"""
94
-
95
-
96
- # PROMPT = ChatPromptTemplate.from_template(template=template)
97
- # chain = ConversationalRetrievalChain.from_llm(
98
- # llm=llm,
99
- # retriever=vector.as_retriever(search_kwargs={'k': 5}),
100
- # memory=memory,
101
- # combine_docs_chain_kwargs={"prompt": PROMPT}
102
- # # condense_question_prompt=CUSTOM_QUESTION_PROMPT
103
- # )
104
- chain = (
105
- {"context": vector.as_retriever(search_kwargs={'k': 5}) | format_docs, "question": RunnablePassthrough()}
106
- | prompt
107
- | llm
108
- | parser
109
- )
110
-
111
- return chain
112
-
113
- # Streamlit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  def main():
115
- st.title("🛡️ MBAL Chatbot 🛡️")
116
-
117
- # Inicializar la cadena de conversación
118
- if 'conversation_chain' not in st.session_state:
119
- st.session_state.conversation_chain = setup_conversation_chain()
120
-
121
- # Mostrar mensajes del chat
122
- if 'messages' not in st.session_state:
123
- st.session_state.messages = []
124
-
125
- for message in st.session_state.messages:
126
- with st.chat_message(message["role"]):
127
- st.markdown(message["content"])
128
-
129
- # Campo de entrada para el usuario
130
- if prompt := st.chat_input("Bạn cần tư vấn về điều gì? Hãy chia sẻ nhu cầu và thông tin của bạn nhé!"):
131
- st.session_state.messages.append({"role": "user", "content": prompt})
132
- with st.chat_message("user"):
133
- st.markdown(prompt)
134
-
135
- with st.chat_message("assistant"):
136
- message_placeholder = st.empty()
137
- full_response = ""
138
-
139
- # Generar respuesta
140
- response = st.session_state.conversation_chain({"question": prompt, "chat_history": []})
141
- full_response = response['answer']
142
- # full_response = response.get("answer", "No response generated.")
143
-
144
- message_placeholder.markdown(full_response)
145
-
146
- st.session_state.messages.append({"role": "assistant", "content": full_response})
147
-
148
- # if __name__ == "__main__":
149
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import re
import subprocess
from io import BytesIO
from typing import List, Dict

import openai
import PyPDF2
import streamlit as st
from dotenv import load_dotenv
from openai import AzureOpenAI
# Load variables from a local .env file (when one exists) before reading them;
# without this call the load_dotenv import has no effect.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API")
TOKEN = os.getenv("HF_TOKEN")

# Log in to the Hugging Face Hub only when a token is configured: a None entry
# in the argument list makes subprocess.run raise TypeError at import time.
if TOKEN:
    subprocess.run(
        ["huggingface-cli", "login", "--token", TOKEN, "--add-to-git-credential"]
    )

# NOTE(review): this Streamlit call executes before main() reaches
# st.set_page_config(); Streamlit releases that require set_page_config to be
# the first Streamlit command will raise here -- confirm against the deployed
# Streamlit version.
st.sidebar.title("Welcome to MBAL Chatbot")
class PDFChatbot:
    """Answer questions about an uploaded insurance PDF through the OpenAI chat API.

    The PDF text is split into word-aligned chunks, the chunks sharing the most
    words with the question are selected as context, and that context is sent
    to the chat model together with the recent conversation.
    """

    # Instructions sent as the system message with every request.
    SYSTEM_PROMPT = (
        "You are an experienced insurance agent assistant who helps customers understand their insurance policies and coverage details. Follow these guidelines:\n"
        "1. Only provide information based on the PDF content provided\n"
        "2. If the answer is not in the PDF, clearly state that the information is not available in the document\n"
        "3. Provide clear, concise, and helpful responses in a professional manner\n"
        "4. Always respond in English using proper grammar and formatting\n"
        "5. When possible, reference specific sections or clauses from the policy\n"
        "6. Use insurance terminology appropriately but explain complex terms when necessary\n"
        "7. Be empathetic and patient, as insurance can be confusing for customers\n"
        "8. If asked about claims, coverage limits, deductibles, or policy terms, provide accurate information from the document\n"
        "9. Always prioritize customer understanding and satisfaction\n"
        "10. If multiple interpretations are possible, explain the different scenarios clearly\n"
        "Remember: You are here to help customers understand their insurance coverage better."
    )

    # How many of the most recent history messages are replayed to the model.
    HISTORY_WINDOW = 6

    def __init__(self):
        """Create the API client and the empty conversation state."""
        # The attribute keeps its historical name for existing callers, but it
        # holds a plain OpenAI client. The key read from the OPENAI_API
        # variable is passed explicitly; when it is None the client is expected
        # to fall back to its own default environment lookup.
        self.azure_client = openai.OpenAI(api_key=OPENAI_API_KEY)
        # Must be a model-name string: it is handed unchanged to
        # chat.completions.create(model=...).
        self.model_name = "gpt-3.5-turbo-0125"
        # Alternating user/assistant messages of the current conversation.
        self.conversation_history = []
        # Full text of the most recently processed PDF.
        self.pdf_content = ""

    def extract_text_from_pdf(self, pdf_file):
        """Extract the text of every page of an uploaded PDF.

        Args:
            pdf_file: File-like object accepted by ``PyPDF2.PdfReader``.

        Returns:
            The page texts joined by newlines and stripped, or ``None`` when
            the PDF could not be read (the error is shown in the UI).
        """
        try:
            reader = PyPDF2.PdfReader(pdf_file)
            # ``or ""`` keeps one page without extractable text from failing
            # the whole document.
            page_texts = [(page.extract_text() or "") for page in reader.pages]
        except Exception as e:  # UI boundary: report any parser failure
            st.error(f"Error reading PDF: {str(e)}")
            return None
        return "\n".join(page_texts).strip()

    def chunk_text(self, text: str, chunk_size: int = 3000) -> List[str]:
        """Split text into whitespace-joined chunks of at most ``chunk_size`` characters.

        Words are never split, so a single word longer than ``chunk_size``
        becomes a chunk of its own.

        Args:
            text: Text to split; any run of whitespace separates words.
            chunk_size: Maximum chunk length in characters.

        Returns:
            The chunks in document order; empty for blank input.
        """
        chunks = []
        current_words = []
        current_length = 0
        for word in text.split():
            # A separating space is only needed once the chunk has a word;
            # charging it for the first word made chunks split one character
            # too early.
            added_length = len(word) + (1 if current_words else 0)
            if current_words and current_length + added_length > chunk_size:
                chunks.append(" ".join(current_words))
                current_words = [word]
                current_length = len(word)
            else:
                current_words.append(word)
                current_length += added_length
        if current_words:
            chunks.append(" ".join(current_words))
        return chunks

    @staticmethod
    def _tokenize(text: str) -> set:
        """Return the lower-cased word tokens of ``text`` without punctuation."""
        return set(re.findall(r"\w+", text.lower()))

    def get_relevant_context(self, query: str, chunks: List[str], max_chunks: int = 3) -> str:
        """Select the chunks that share the most distinct words with the query.

        Matching is case-insensitive and ignores punctuation, so a question
        ending in "deductible?" still matches "deductible" in the document.

        Args:
            query: The user's question.
            chunks: Candidate text chunks in document order.
            max_chunks: Maximum number of chunks to return.

        Returns:
            The selected chunks, best match first (ties keep document order),
            joined by blank lines. Chunks with no overlap can still be
            returned when fewer than ``max_chunks`` chunks match.
        """
        query_words = self._tokenize(query)
        scored = [(len(query_words & self._tokenize(chunk)), chunk) for chunk in chunks]
        # list.sort is stable, so equally scored chunks stay in document order.
        scored.sort(key=lambda item: item[0], reverse=True)
        return "\n\n".join(chunk for _, chunk in scored[:max_chunks])

    def _build_messages(self, user_question: str, relevant_context: str) -> List[Dict[str, str]]:
        """Assemble the chat payload: system prompt, recent history, current question."""
        current_turn = (
            "Insurance Document Content:\n"
            f"{relevant_context}\n"
            f"Customer Question: {user_question}\n"
            "Please provide a helpful response based on the insurance document content above."
        )
        return [
            {"role": "system", "content": self.SYSTEM_PROMPT},
            # History goes BEFORE the current question so the newest user
            # message is the last one the model reads.
            *self.conversation_history[-self.HISTORY_WINDOW:],
            {"role": "user", "content": current_turn},
        ]

    def chat_with_pdf(self, user_question: str, pdf_content: str) -> str:
        """Answer a question using the PDF content and the recent conversation.

        Args:
            user_question: The question typed by the user.
            pdf_content: Full text of the processed PDF.

        Returns:
            The model's answer, or a string starting with
            "Error generating response:" when the request fails. Only
            successful exchanges are added to the conversation history.
        """
        try:
            chunks = self.chunk_text(pdf_content)
            relevant_context = self.get_relevant_context(user_question, chunks)
            response = self.azure_client.chat.completions.create(
                model=self.model_name,
                messages=self._build_messages(user_question, relevant_context),
                max_tokens=1000,
                temperature=0.7,
            )
            bot_response = response.choices[0].message.content
        except Exception as e:  # UI boundary: surface the failure as the reply
            return f"Error generating response: {str(e)}"

        self.conversation_history.append({"role": "user", "content": user_question})
        self.conversation_history.append({"role": "assistant", "content": bot_response})
        return bot_response
def _reset_conversation():
    """Forget both the model-side history and the transcript shown on screen."""
    st.session_state.chatbot.conversation_history = []
    st.session_state.chat_history = []


def _render_sidebar():
    """Draw the sidebar: PDF upload/processing and the clear-conversation button."""
    with st.sidebar:
        st.header("📁 Upload Insurance Document")
        uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
        if uploaded_file is not None and st.button("Process PDF"):
            with st.spinner("Processing your insurance document..."):
                text_content = st.session_state.chatbot.extract_text_from_pdf(uploaded_file)
                if text_content:
                    st.session_state.chatbot.pdf_content = text_content
                    st.session_state.pdf_processed = True
                    # Answers about a previous document must not be replayed
                    # as context for the newly processed one.
                    _reset_conversation()
                    st.success("Insurance document processed successfully!")
                    st.subheader("Document Preview")
                    st.text_area(
                        "First 500 characters:",
                        text_content[:500] + "..." if len(text_content) > 500 else text_content,
                        height=100
                    )
                else:
                    st.error("Failed to process PDF")
        if st.button("Clear Conversation"):
            _reset_conversation()
            st.rerun()


def _render_chat():
    """Show the stored transcript and answer a newly submitted question."""
    st.header("💬 Ask About Your Insurance Policy")
    for question, answer in st.session_state.chat_history:
        with st.container():
            st.markdown(f"**You:** {question}")
            st.markdown(f"**Insurance Assistant:** {answer}")
            st.divider()

    user_question = st.chat_input("Ask about your insurance coverage, claims, deductibles, or any policy details...")
    if not user_question:
        return
    with st.spinner("Analyzing your policy..."):
        response = st.session_state.chatbot.chat_with_pdf(
            user_question,
            st.session_state.chatbot.pdf_content
        )
        st.session_state.chat_history.append((user_question, response))
        # Shown immediately; on the next rerun it is drawn from chat_history.
        st.markdown(f"**You:** {user_question}")
        st.markdown(f"**Insurance Assistant:** {response}")


def _render_welcome():
    """Show the upload hint, example questions and tips while no PDF is loaded."""
    st.info("👆 Please upload and process an insurance PDF document to start chatting!")
    st.subheader("Example questions you can ask:")
    st.markdown("\n".join([
        "",
        "- What is my coverage limit for property damage?",
        "- What is my deductible amount?",
        "- What types of incidents are covered under this policy?",
        "- What is excluded from my coverage?",
        "- How do I file a claim?",
        "- What is the process for claim settlement?",
        "- What are my premium payment options?",
        "- When does my policy expire?",
        "- Is flood damage covered?",
        "- What documentation do I need for a claim?",
        "",
    ]))
    st.subheader("💡 Insurance Tips")
    st.markdown("\n".join([
        "",
        "- Review your policy regularly to understand your coverage",
        "- Keep your policy documents in a safe place",
        "- Update your coverage when your circumstances change",
        "- Document any incidents immediately",
        "- Contact your insurance agent if you have questions",
        "",
    ]))


def main():
    """Run the Streamlit page: configure it, then draw the sidebar and chat area.

    Session state holds one ``PDFChatbot`` per browser session together with
    the ``pdf_processed`` flag and the on-screen ``chat_history`` (a list of
    ``(question, answer)`` tuples). Processing a new PDF clears the previous
    conversation.
    """
    st.set_page_config(page_title="Insurance PDF Chatbot", page_icon="🛡️", layout="wide")
    st.title("🛡️ Insurance Policy Assistant")
    st.markdown("Upload your insurance policy PDF and ask questions about your coverage, claims, deductibles, and more!")

    # First run of this session: create the chatbot and its UI state.
    if 'chatbot' not in st.session_state:
        st.session_state.chatbot = PDFChatbot()
        st.session_state.pdf_processed = False
        st.session_state.chat_history = []

    _render_sidebar()

    if st.session_state.pdf_processed:
        _render_chat()
    else:
        _render_welcome()


if __name__ == "__main__":
    main()