akash015 committed
Commit 663478a · verified · 1 Parent(s): b97943d

Update app.py

Files changed (1)
  1. app.py +205 -2
app.py CHANGED
@@ -1,4 +1,186 @@
- import re
+ # import re
+ # import PyPDF2
+ # from langchain_community.embeddings import OllamaEmbeddings
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
+ # from langchain_community.vectorstores import Chroma
+ # from langchain.chains import ConversationalRetrievalChain
+ # from langchain_community.chat_models import ChatOllama
+ # from langchain_groq import ChatGroq
+ # from langchain.memory import ChatMessageHistory, ConversationBufferMemory
+ # import chainlit as cl
+ # from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
+ # import logging
+ # import pypandoc
+ # import pdfkit
+ # from paddleocr import PaddleOCR
+ # import fitz
+ # import asyncio
+ # from langchain_nomic.embeddings import NomicEmbeddings
+
+ # llm_groq = ChatGroq(
+ #     model_name='llama3-70b-8192'
+ # )
+
+ # # Initialize anonymizer
+ # anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL'], faker_seed=18)
+
+ # def extract_text_from_pdf(file_path):
+ #     pdf = PyPDF2.PdfReader(file_path)
+ #     pdf_text = ""
+ #     for page in pdf.pages:
+ #         pdf_text += page.extract_text()
+ #     return pdf_text
+
+ # def has_sufficient_selectable_text(page, threshold=50):
+ #     text = page.extract_text()
+ #     if len(text.strip()) > threshold:
+ #         return True
+ #     return False
+
+ # async def get_text(file_path):
+ #     text = ""
+ #     try:
+ #         logging.info("Starting OCR process for file: %s", file_path)
+ #         extension = file_path.split(".")[-1].lower()
+ #         allowed_extension = ["jpg", "jpeg", "png", "pdf", "docx"]
+ #         if extension not in allowed_extension:
+ #             error = "Not a valid File. Allowed Format are jpg, jpeg, png, pdf, docx"
+ #             logging.error(error)
+ #             return {"error": error}
+
+ #         if extension == "docx":
+ #             file_path = convert_docx_to_pdf(file_path)
+
+ #         ocr = PaddleOCR(use_angle_cls=True, lang='en')
+ #         result = ocr.ocr(file_path, cls=True)
+ #         for idx in range(len(result)):
+ #             res = result[idx]
+ #             for line in res:
+ #                 text += line[1][0] + " "
+ #         logging.info("OCR process completed successfully for file: %s", file_path)
+ #     except Exception as e:
+ #         logging.error("Error occurred during OCR process for file %s: %s", file_path, e)
+ #         text = "Error occurred during OCR process."
+ #     logging.info("Extracted text: %s", text)
+ #     return text
+
+ # def convert_docx_to_pdf(input_path):
+ #     html_path = input_path.replace('.docx', '.html')
+ #     output_path = ".".join(input_path.split(".")[:-1]) + ".pdf"
+ #     pypandoc.convert_file(input_path, 'html', outputfile=html_path)
+ #     pdfkit.from_file(html_path, output_path)
+ #     logging.info("DOCX Format Handled")
+ #     return output_path
+
+ # async def extract_text_from_mixed_pdf(file_path):
+ #     pdf = PyPDF2.PdfReader(file_path)
+ #     ocr = PaddleOCR(use_angle_cls=True, lang='en')
+ #     pdf_text = ""
+ #     for i, page in enumerate(pdf.pages):
+ #         text = page.extract_text()
+ #         if not has_sufficient_selectable_text(page):
+ #             logging.info(f"Page {i+1} has insufficient selectable text, performing OCR.")
+ #             pdf_document = fitz.open(file_path)
+ #             pdf_page = pdf_document.load_page(i)
+ #             pix = pdf_page.get_pixmap()
+ #             image_path = f"page_{i+1}.png"
+ #             pix.save(image_path)
+ #             result = ocr.ocr(image_path, cls=True)
+ #             for idx in range(len(result)):
+ #                 res = result[idx]
+ #                 for line in res:
+ #                     text += line[1][0] + " "
+ #         pdf_text += text
+ #     return pdf_text
+
+ # @cl.on_chat_start
+ # async def on_chat_start():
+
+ #     files = None # Initialize variable to store uploaded files
+
+ #     # Wait for the user to upload a file
+ #     while files is None:
+ #         files = await cl.AskFileMessage(
+ #             content="Please upload a pdf file to begin!",
+ #             # accept=["application/pdf"],
+ #             accept=["application/pdf", "image/jpeg", "image/png", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
+ #             max_size_mb=100,
+ #             timeout=180,
+ #         ).send()
+
+ #     file = files[0] # Get the first uploaded file
+
+ #     # Inform the user that processing has started
+ #     msg = cl.Message(content=f"Processing `{file.name}`...")
+ #     await msg.send()
+
+ #     # Extract text from PDF, checking for selectable and handwritten text
+ #     if file.name.endswith('.pdf'):
+ #         pdf_text = await extract_text_from_mixed_pdf(file.path)
+ #     else:
+ #         pdf_text = await get_text(file.path)
+
+ #     # Anonymize the text
+ #     anonymized_text = anonymizer.anonymize(
+ #         pdf_text
+ #     )
+
+ #     embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
+
+ #     docsearch = await cl.make_async(Chroma.from_texts)(
+ #         [anonymized_text], embeddings, metadatas=[{"source": "0-pl"}]
+ #     )
+ #     # }
+
+ #     # Initialize message history for conversation
+ #     message_history = ChatMessageHistory()
+
+ #     # Memory for conversational context
+ #     memory = ConversationBufferMemory(
+ #         memory_key="chat_history",
+ #         output_key="answer",
+ #         chat_memory=message_history,
+ #         return_messages=True,
+ #     )
+
+ #     # Create a chain that uses the Chroma vector store
+ #     chain = ConversationalRetrievalChain.from_llm(
+ #         llm = llm_groq,
+ #         chain_type="stuff",
+ #         retriever=docsearch.as_retriever(),
+ #         memory=memory,
+ #         return_source_documents=True,
+ #     )
+
+ #     # Let the user know that the system is ready
+ #     msg.content = f"Processing `{file.name}` done. You can now ask questions!"
+ #     await msg.update()
+ #     # Store the chain in user session
+ #     cl.user_session.set("chain", chain)
+
+
+ # @cl.on_message
+ # async def main(message: cl.Message):
+
+ #     # Retrieve the chain from user session
+ #     chain = cl.user_session.get("chain")
+ #     # Callbacks happen asynchronously/parallel
+ #     cb = cl.AsyncLangchainCallbackHandler()
+
+ #     # Call the chain with user's message content
+ #     res = await chain.ainvoke(message.content, callbacks=[cb])
+ #     answer = anonymizer.deanonymize(
+ #         res["answer"]
+ #     )
+ #     text_elements = []
+
+ #     # Return results
+ #     await cl.Message(content=answer, elements=text_elements).send()
+
+
+
+ # v2
+ import re
  import PyPDF2
  from langchain_community.embeddings import OllamaEmbeddings
  from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -13,9 +195,10 @@ import logging
  import pypandoc
  import pdfkit
  from paddleocr import PaddleOCR
- import fitz
+ import fitz
  import asyncio
  from langchain_nomic.embeddings import NomicEmbeddings
+ import os

  llm_groq = ChatGroq(
      model_name='llama3-70b-8192'
@@ -157,6 +340,7 @@ async def on_chat_start():
      await msg.update()
      # Store the chain in user session
      cl.user_session.set("chain", chain)
+     cl.user_session.set("file_path", file.path) # Store the file path in session


  @cl.on_message
@@ -176,3 +360,22 @@ async def main(message: cl.Message):

      # Return results
      await cl.Message(content=answer, elements=text_elements).send()
+
+ @cl.on_chat_end
+ async def on_chat_end():
+     chain = cl.user_session.get("chain")
+     file_path = cl.user_session.get("file_path")
+
+     if chain:
+         # Clear the vector store
+         chain.retriever.store.clear()
+
+     if file_path and os.path.exists(file_path):
+         # Remove the uploaded file
+         os.remove(file_path)
+
+     # Clear the user session data
+     cl.user_session.clear()
+
+     logging.info("User session ended, data cleared.")
+
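The new on_chat_end handler tears the session down by clearing the retriever's backing store and deleting the uploaded file. A minimal sketch of that teardown is below; it assumes the retriever is the standard VectorStoreRetriever returned by docsearch.as_retriever(), which exposes the store as .vectorstore, and uses the delete_collection() method that langchain_community's Chroma actually provides (the store.clear() call in the diff is the author's own attribute path, not a documented Chroma API).

import os
import logging

import chainlit as cl


@cl.on_chat_end
async def on_chat_end():
    # Fetch what on_chat_start stored in the user session.
    chain = cl.user_session.get("chain")
    file_path = cl.user_session.get("file_path")

    if chain is not None:
        # Assumption: chain.retriever is a VectorStoreRetriever, so the Chroma
        # instance is reachable via .vectorstore; delete_collection() drops the
        # embeddings that were created for this chat.
        vectorstore = getattr(chain.retriever, "vectorstore", None)
        if vectorstore is not None:
            vectorstore.delete_collection()

    if file_path and os.path.exists(file_path):
        os.remove(file_path)  # clean up the uploaded file on disk

    logging.info("User session ended, data cleared.")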