# NOTE(review): hosting-platform ("Spaces") build log reported a runtime error
# for this app — see the Chroma.from_texts([], None) call in on_chat_start.
| # import re | |
| # import PyPDF2 | |
| # from langchain_community.embeddings import OllamaEmbeddings | |
| # from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| # from langchain_community.vectorstores import Chroma | |
| # from langchain.chains import ConversationalRetrievalChain | |
| # from langchain_community.chat_models import ChatOllama | |
| # from langchain_groq import ChatGroq | |
| # from langchain.memory import ChatMessageHistory, ConversationBufferMemory | |
| # import chainlit as cl | |
| # from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer | |
| # import logging | |
| # import pypandoc | |
| # import pdfkit | |
| # from paddleocr import PaddleOCR | |
| # import fitz | |
| # import asyncio | |
| # from langchain_nomic.embeddings import NomicEmbeddings | |
| # llm_groq = ChatGroq( | |
| # model_name='llama3-70b-8192' | |
| # ) | |
| # # Initialize anonymizer | |
| # anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL'], faker_seed=18) | |
| # def extract_text_from_pdf(file_path): | |
| # pdf = PyPDF2.PdfReader(file_path) | |
| # pdf_text = "" | |
| # for page in pdf.pages: | |
| # pdf_text += page.extract_text() | |
| # return pdf_text | |
| # def has_sufficient_selectable_text(page, threshold=50): | |
| # text = page.extract_text() | |
| # if len(text.strip()) > threshold: | |
| # return True | |
| # return False | |
| # async def get_text(file_path): | |
| # text = "" | |
| # try: | |
| # logging.info("Starting OCR process for file: %s", file_path) | |
| # extension = file_path.split(".")[-1].lower() | |
| # allowed_extension = ["jpg", "jpeg", "png", "pdf", "docx"] | |
| # if extension not in allowed_extension: | |
| # error = "Not a valid File. Allowed Format are jpg, jpeg, png, pdf, docx" | |
| # logging.error(error) | |
| # return {"error": error} | |
| # if extension == "docx": | |
| # file_path = convert_docx_to_pdf(file_path) | |
| # ocr = PaddleOCR(use_angle_cls=True, lang='en') | |
| # result = ocr.ocr(file_path, cls=True) | |
| # for idx in range(len(result)): | |
| # res = result[idx] | |
| # for line in res: | |
| # text += line[1][0] + " " | |
| # logging.info("OCR process completed successfully for file: %s", file_path) | |
| # except Exception as e: | |
| # logging.error("Error occurred during OCR process for file %s: %s", file_path, e) | |
| # text = "Error occurred during OCR process." | |
| # logging.info("Extracted text: %s", text) | |
| # return text | |
| # def convert_docx_to_pdf(input_path): | |
| # html_path = input_path.replace('.docx', '.html') | |
| # output_path = ".".join(input_path.split(".")[:-1]) + ".pdf" | |
| # pypandoc.convert_file(input_path, 'html', outputfile=html_path) | |
| # pdfkit.from_file(html_path, output_path) | |
| # logging.info("DOCX Format Handled") | |
| # return output_path | |
| # async def extract_text_from_mixed_pdf(file_path): | |
| # pdf = PyPDF2.PdfReader(file_path) | |
| # ocr = PaddleOCR(use_angle_cls=True, lang='en') | |
| # pdf_text = "" | |
| # for i, page in enumerate(pdf.pages): | |
| # text = page.extract_text() | |
| # if not has_sufficient_selectable_text(page): | |
| # logging.info(f"Page {i+1} has insufficient selectable text, performing OCR.") | |
| # pdf_document = fitz.open(file_path) | |
| # pdf_page = pdf_document.load_page(i) | |
| # pix = pdf_page.get_pixmap() | |
| # image_path = f"page_{i+1}.png" | |
| # pix.save(image_path) | |
| # result = ocr.ocr(image_path, cls=True) | |
| # for idx in range(len(result)): | |
| # res = result[idx] | |
| # for line in res: | |
| # text += line[1][0] + " " | |
| # pdf_text += text | |
| # return pdf_text | |
| # @cl.on_chat_start | |
| # async def on_chat_start(): | |
| # files = None # Initialize variable to store uploaded files | |
| # # Wait for the user to upload a file | |
| # while files is None: | |
| # files = await cl.AskFileMessage( | |
| # content="Please upload a pdf file to begin!", | |
| # # accept=["application/pdf"], | |
| # accept=["application/pdf", "image/jpeg", "image/png", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"], | |
| # max_size_mb=100, | |
| # timeout=180, | |
| # ).send() | |
| # file = files[0] # Get the first uploaded file | |
| # # Inform the user that processing has started | |
| # msg = cl.Message(content=f"Processing `{file.name}`...") | |
| # await msg.send() | |
| # # Extract text from PDF, checking for selectable and handwritten text | |
| # if file.name.endswith('.pdf'): | |
| # pdf_text = await extract_text_from_mixed_pdf(file.path) | |
| # else: | |
| # pdf_text = await get_text(file.path) | |
| # # Anonymize the text | |
| # anonymized_text = anonymizer.anonymize( | |
| # pdf_text | |
| # ) | |
| # embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5") | |
| # docsearch = await cl.make_async(Chroma.from_texts)( | |
| # [anonymized_text], embeddings, metadatas=[{"source": "0-pl"}] | |
| # ) | |
| # # } | |
| # # Initialize message history for conversation | |
| # message_history = ChatMessageHistory() | |
| # # Memory for conversational context | |
| # memory = ConversationBufferMemory( | |
| # memory_key="chat_history", | |
| # output_key="answer", | |
| # chat_memory=message_history, | |
| # return_messages=True, | |
| # ) | |
| # # Create a chain that uses the Chroma vector store | |
| # chain = ConversationalRetrievalChain.from_llm( | |
| # llm = llm_groq, | |
| # chain_type="stuff", | |
| # retriever=docsearch.as_retriever(), | |
| # memory=memory, | |
| # return_source_documents=True, | |
| # ) | |
| # # Let the user know that the system is ready | |
| # msg.content = f"Processing `{file.name}` done. You can now ask questions!" | |
| # await msg.update() | |
| # # Store the chain in user session | |
| # cl.user_session.set("chain", chain) | |
| # @cl.on_message | |
| # async def main(message: cl.Message): | |
| # # Retrieve the chain from user session | |
| # chain = cl.user_session.get("chain") | |
| # # Callbacks happen asynchronously/parallel | |
| # cb = cl.AsyncLangchainCallbackHandler() | |
| # # Call the chain with user's message content | |
| # res = await chain.ainvoke(message.content, callbacks=[cb]) | |
| # answer = anonymizer.deanonymize( | |
| # res["answer"] | |
| # ) | |
| # text_elements = [] | |
| # # Return results | |
| # await cl.Message(content=answer, elements=text_elements).send() | |
# v2 — active implementation (the commented-out v1 above is kept for reference)
import asyncio
import logging
import os
import re

import chainlit as cl
import fitz
import pdfkit
import pypandoc
import PyPDF2
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
from langchain_groq import ChatGroq
from langchain_nomic.embeddings import NomicEmbeddings
from paddleocr import PaddleOCR
# Groq-hosted chat model used as the LLM behind the retrieval chain.
llm_groq = ChatGroq(
    model_name='llama3-70b-8192'
)
# Initialize anonymizer
# Reversible PII anonymizer: replaces the listed entity types with fake
# values before the text is embedded/indexed, and restores them later via
# deanonymize(). faker_seed fixes the fake values so runs are deterministic.
anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL'], faker_seed=18)
def extract_text_from_pdf(file_path):
    """Return the selectable (non-OCR) text of every page of a PDF.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The concatenated text of all pages as a single string.
    """
    pdf = PyPDF2.PdfReader(file_path)
    # extract_text() may return None/"" for image-only pages; the original
    # `pdf_text += page.extract_text()` raised TypeError on None.  `or ""`
    # guards that, and join avoids quadratic string concatenation.
    return "".join(page.extract_text() or "" for page in pdf.pages)
def has_sufficient_selectable_text(page, threshold=50):
    """Return True if *page* has more than *threshold* chars of selectable text.

    Used to decide whether a PDF page can be read directly or needs OCR.

    Args:
        page: A PyPDF2 page object (anything exposing extract_text()).
        threshold: Minimum stripped character count to accept (default 50).

    Returns:
        bool: True when the page's extracted text exceeds the threshold.
    """
    # extract_text() can return None for image-only pages; the original
    # len(text.strip()) then raised AttributeError.  Treat None as "".
    text = page.extract_text() or ""
    return len(text.strip()) > threshold
async def get_text(file_path):
    """OCR a jpg/jpeg/png/pdf/docx file with PaddleOCR and return its text.

    DOCX files are first converted to PDF via convert_docx_to_pdf().

    Args:
        file_path: Path to the input file; the extension selects handling.

    Returns:
        str: The OCR'd text ("Error occurred during OCR process." on failure),
        or a dict {"error": message} when the extension is not supported.
        NOTE(review): the dict-vs-str return is inconsistent but preserved,
        since callers may already branch on it.
    """
    text = ""
    try:
        logging.info("Starting OCR process for file: %s", file_path)
        extension = file_path.split(".")[-1].lower()
        allowed_extension = ["jpg", "jpeg", "png", "pdf", "docx"]
        if extension not in allowed_extension:
            error = "Not a valid File. Allowed Format are jpg, jpeg, png, pdf, docx"
            logging.error(error)
            return {"error": error}
        if extension == "docx":
            # Convert to PDF first; PaddleOCR consumes images/PDFs, not DOCX.
            file_path = convert_docx_to_pdf(file_path)
        ocr = PaddleOCR(use_angle_cls=True, lang='en')
        result = ocr.ocr(file_path, cls=True)
        for res in result:
            # PaddleOCR yields None instead of an empty list for pages where
            # nothing was detected; the original inner loop raised TypeError.
            if res is None:
                continue
            for line in res:
                # line is [box, (recognized_text, confidence)].
                text += line[1][0] + " "
        logging.info("OCR process completed successfully for file: %s", file_path)
    except Exception as e:
        logging.error("Error occurred during OCR process for file %s: %s", file_path, e)
        text = "Error occurred during OCR process."
    logging.info("Extracted text: %s", text)
    return text
def convert_docx_to_pdf(input_path):
    """Convert a DOCX file to PDF (DOCX -> HTML via pandoc, HTML -> PDF via pdfkit).

    Args:
        input_path: Path to the .docx file.

    Returns:
        str: Path of the generated PDF (same location/stem as the input).
    """
    # Derive both output paths from the stem.  The original used
    # input_path.replace('.docx', '.html'), which replaces EVERY occurrence
    # of ".docx" in the path (e.g. a directory named "reports.docx/"), and
    # was inconsistent with how output_path was built.
    stem = ".".join(input_path.split(".")[:-1])
    html_path = stem + ".html"
    output_path = stem + ".pdf"
    pypandoc.convert_file(input_path, 'html', outputfile=html_path)
    pdfkit.from_file(html_path, output_path)
    logging.info("DOCX Format Handled")
    return output_path
async def extract_text_from_mixed_pdf(file_path):
    """Extract text from a PDF that mixes selectable and scanned pages.

    Pages with enough selectable text are read directly with PyPDF2; the
    rest are rasterized with PyMuPDF (fitz) and run through PaddleOCR.

    Args:
        file_path: Path to the PDF file.

    Returns:
        str: Concatenated text of all pages.
    """
    pdf = PyPDF2.PdfReader(file_path)
    ocr = PaddleOCR(use_angle_cls=True, lang='en')
    pdf_text = ""
    # Open the fitz document once, lazily, only if some page needs OCR.
    # The original re-opened the whole PDF inside the loop for every OCR'd
    # page and never closed it (file-handle leak on large documents).
    pdf_document = None
    try:
        for i, page in enumerate(pdf.pages):
            # May be None for image-only pages; normalize to "".
            text = page.extract_text() or ""
            if not has_sufficient_selectable_text(page):
                logging.info(f"Page {i+1} has insufficient selectable text, performing OCR.")
                if pdf_document is None:
                    pdf_document = fitz.open(file_path)
                pdf_page = pdf_document.load_page(i)
                pix = pdf_page.get_pixmap()
                image_path = f"page_{i+1}.png"
                pix.save(image_path)
                try:
                    result = ocr.ocr(image_path, cls=True)
                finally:
                    # Don't leave temp page rasters behind in the working dir.
                    os.remove(image_path)
                for res in result:
                    # PaddleOCR yields None for pages with no detected text.
                    if res is None:
                        continue
                    for line in res:
                        # line is [box, (recognized_text, confidence)].
                        text += line[1][0] + " "
            pdf_text += text
    finally:
        if pdf_document is not None:
            pdf_document.close()
    return pdf_text
async def delete_chroma_collection(chroma_instance):
    """Drop the collection backing *chroma_instance*.

    Used at chat start so documents indexed in a previous session are not
    retrieved in the new one.

    Args:
        chroma_instance: A Chroma vector store exposing delete_collection().
    """
    chroma_instance.delete_collection()
@cl.on_chat_start
async def on_chat_start():
    """Chainlit session-start handler: ingest one uploaded document.

    Clears any stale Chroma collection, asks the user for a pdf/image/docx,
    extracts its text (direct + OCR), anonymizes PII, indexes the result,
    and stores a ConversationalRetrievalChain in the user session.

    BUG FIXES vs. the previous revision:
      * The @cl.on_chat_start decorator was dropped (it is present in the
        commented-out v1 above), so Chainlit never registered this handler.
      * Chroma.from_texts([], None) raised at startup (empty text list and
        no embedding function) — build a default client instead.
    """
    # Drop any collection left over from a previous session.
    chroma_instance = await cl.make_async(Chroma)()
    await delete_chroma_collection(chroma_instance)
    files = None  # Initialize variable to store uploaded files
    # Wait for the user to upload a file
    while files is None:
        files = await cl.AskFileMessage(
            content="Please upload a pdf file to begin!",
            accept=["application/pdf", "image/jpeg", "image/png", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
            max_size_mb=100,
            timeout=180,
        ).send()
    file = files[0]  # Get the first uploaded file
    # Inform the user that processing has started
    msg = cl.Message(content=f"Processing `{file.name}`...")
    await msg.send()
    # Extract text from PDF, checking for selectable and handwritten text
    if file.name.endswith('.pdf'):
        pdf_text = await extract_text_from_mixed_pdf(file.path)
    else:
        pdf_text = await get_text(file.path)
    # Anonymize the text before it is embedded/stored.
    anonymized_text = anonymizer.anonymize(
        pdf_text
    )
    embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
    # from_texts is blocking; run it off the event loop via make_async.
    docsearch = await cl.make_async(Chroma.from_texts)(
        [anonymized_text], embeddings, metadatas=[{"source": "0-pl"}]
    )
    # Initialize message history for conversation
    message_history = ChatMessageHistory()
    # Memory for conversational context
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=message_history,
        return_messages=True,
    )
    # Create a chain that uses the Chroma vector store
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm_groq,
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )
    # Let the user know that the system is ready
    msg.content = f"Processing `{file.name}` done. You can now ask questions!"
    await msg.update()
    # Store the chain in user session
    cl.user_session.set("chain", chain)
@cl.on_message
async def main(message: cl.Message):
    """Chainlit message handler: answer a question against the indexed doc.

    Runs the retrieval chain stored at chat start, de-anonymizes the PII
    placeholders in the answer, and sends it back to the user.

    BUG FIX vs. the previous revision: the @cl.on_message decorator was
    dropped (it is present in the commented-out v1 above), so Chainlit
    never registered this handler.
    """
    # Retrieve the chain from user session
    chain = cl.user_session.get("chain")
    # Callbacks happen asynchronously/parallel
    cb = cl.AsyncLangchainCallbackHandler()
    # Call the chain with user's message content
    res = await chain.ainvoke(message.content, callbacks=[cb])
    # Restore the real PII values the anonymizer replaced at index time.
    answer = anonymizer.deanonymize(
        res["answer"]
    )
    text_elements = []
    # Return results
    await cl.Message(content=answer, elements=text_elements).send()