# Source provenance (scraped Hugging Face page header, preserved as comments):
# mgbam's picture
# Update app.py
# 92bab75 verified
# raw
# history blame
# 6.07 kB
import streamlit as st
import transformers
import altair as alt
import pandas as pd
import streamlit_authenticator as stauth
import bcrypt
from difflib import SequenceMatcher
# ------------------------------
# User Authentication Setup
# ------------------------------
# SECURITY NOTE(review): the demo password is hardcoded in source and its
# bcrypt hash is recomputed on every Streamlit rerun. For a real deployment,
# store pre-hashed credentials in st.secrets or an external config file.
plain_password = "password123"
hashed_password = bcrypt.hashpw(plain_password.encode('utf-8'), bcrypt.gensalt()).decode('utf-8')
# Configuration for streamlit-authenticator: a single demo account plus the
# cookie settings used to persist the login session across reruns.
config = {
    'credentials': {
        'usernames': {
            'demo_user': {
                'name': 'Demo User',
                'password': hashed_password  # use the manually hashed password
            }
        }
    },
    'cookie': {
        'expiry_days': 30,
        'key': 'some_signature_key',  # NOTE(review): signing key is hardcoded; move to secrets
        'name': 'some_cookie_name'
    },
    'preauthorized': {
        'emails': []
    }
}
authenticator = stauth.Authenticate(
    config['credentials'],
    config['cookie']['name'],
    config['cookie']['key'],
    config['cookie']['expiry_days']
)
# Use positional arguments with a valid location parameter.
# NOTE(review): this positional `login('Login', 'main')` call matches
# streamlit-authenticator < 0.3; newer releases changed the signature and
# return values — confirm the pinned package version.
name, authentication_status, username = authenticator.login('Login', 'main')
# Halt the script for anyone not successfully authenticated (None = form not
# yet submitted, False = wrong credentials).
if authentication_status is None or authentication_status is False:
    st.error('Authentication failed. Please refresh and try again.')
    st.stop()
st.sidebar.write(f"Welcome *{name}*")
authenticator.logout('Logout', 'sidebar')
# ------------------------------
# Load Models
# ------------------------------
@st.cache_resource
def load_qwen():
    """Build and cache the Qwen text2text-generation pipeline.

    `st.cache_resource` ensures the model is loaded once per server
    process rather than on every Streamlit rerun.
    """
    pipeline_kwargs = {
        "model": "Qwen/Qwen2.5-14B",
        "device_map": "auto",
    }
    return transformers.pipeline("text2text-generation", **pipeline_kwargs)
@st.cache_resource
def load_phi():
    """Build and cache the Phi-4 text-generation pipeline.

    `torch_dtype="auto"` lets the library pick the checkpoint's native
    dtype; `device_map="auto"` spreads weights across available devices.
    """
    pipeline_kwargs = {
        "model": "microsoft/phi-4",
        "model_kwargs": {"torch_dtype": "auto"},
        "device_map": "auto",
    }
    return transformers.pipeline("text-generation", **pipeline_kwargs)
# Instantiate both pipelines eagerly at import time; st.cache_resource makes
# repeated reruns reuse the already-loaded models instead of reloading them.
qwen_pipeline = load_qwen()
phi_pipeline = load_phi()
# ------------------------------
# Utility Functions
# ------------------------------
def summarize_document(document_text):
    """Summarize *document_text* with the Qwen pipeline.

    Returns the pipeline's generated text for a summarization prompt that
    asks for key insights to be highlighted.
    """
    prompt = (
        "Summarize the following document and highlight key insights:"
        f"\n\n{document_text}"
    )
    result = qwen_pipeline(prompt, max_new_tokens=1024)
    return result[0]['generated_text']
def answer_question(summary, question):
    """Answer *question* using *summary* as context via the Phi pipeline.

    Returns the pipeline's generated text for a question-answering prompt
    grounded in the provided summary.
    """
    prompt = (
        f"Based on the following summary:\n\n{summary}"
        f"\n\nAnswer the question: {question}"
    )
    result = phi_pipeline(prompt, max_new_tokens=256)
    return result[0]['generated_text']
def find_similar_chunks(original, output):
    """Split *output* into segments marked as matching *original* or not.

    Aligns the model output against the original text with
    difflib.SequenceMatcher and returns an ordered list of dicts
    ``{'text': str, 'match': bool}``; concatenating all ``'text'`` values
    reproduces *output* exactly.

    Fix: ``get_matching_blocks()`` always ends with a zero-length sentinel
    block ``(len(a), len(b), 0)``, which the previous version turned into a
    spurious empty ``match=True`` segment at the end of every result. Empty
    segments are now skipped.
    """
    matcher = SequenceMatcher(None, original, output)
    segments = []
    cursor = 0  # position in *output* up to which segments have been emitted
    for _, j, n in matcher.get_matching_blocks():
        if cursor < j:
            # Text present in the output with no counterpart in the original.
            segments.append({'text': output[cursor:j], 'match': False})
        if n:  # skip the zero-length terminal sentinel block
            segments.append({'text': output[j:j + n], 'match': True})
        cursor = j + n
    return segments
# ------------------------------
# Streamlit App Layout
# ------------------------------
st.title("SmartDoc Analyzer")
st.markdown("Analyze Financial & Health Documents with AI")
# Tabs for different functionalities: each tab below populates one entry.
tabs = st.tabs(["Document Summarization", "Interactive Q&A", "Visualization & Data Extraction"])
# -------- Document Summarization Tab --------
with tabs[0]:
    st.header("Document Summarization")
    document_text = st.text_area("Paste Document Text:", height=300)
    if st.button("Summarize Document"):
        if document_text:
            summary = summarize_document(document_text)
            st.subheader("Summary")
            st.write(summary)
            # Persist the summary so the Q&A tab can prefill its context box.
            st.session_state['last_summary'] = summary
        else:
            st.warning("Please paste document text to summarize.")
# -------- Interactive Q&A Tab --------
with tabs[1]:
    st.header("Interactive Q&A")
    # Prefill with the summary saved by the summarization tab, if any.
    default_summary = st.session_state.get('last_summary', '')
    summary_context = st.text_area("Summary Context:", value=default_summary, height=150)
    question = st.text_input("Enter your question about the document:")
    if st.button("Get Answer"):
        if summary_context and question:
            answer = answer_question(summary_context, question)
            st.subheader("Answer")
            st.write(answer)
            # TODO: Q&A pairs could be persisted in st.session_state or a database.
        else:
            st.warning("Please provide both a summary context and a question.")
# -------- Visualization & Data Extraction Tab --------
with tabs[2]:
    st.header("Visualization & Data Extraction")
    st.subheader("Visualization Placeholder")
    st.markdown("An interactive chart can be displayed here using Altair or Plotly.")
    # Example static Altair chart (replace with dynamic data extraction logic).
    data = pd.DataFrame({
        'Year': [2019, 2020, 2021, 2022],
        'Revenue': [150, 200, 250, 300]
    })
    # 'Year:O' treats years as ordinal categories; 'Revenue:Q' is quantitative.
    chart = alt.Chart(data).mark_line(point=True).encode(
        x='Year:O',
        y='Revenue:Q',
        tooltip=['Year', 'Revenue']
    ).interactive()
    st.altair_chart(chart, use_container_width=True)
    st.subheader("Data Extraction Placeholder")
    st.markdown("Implement NLP techniques or model prompts to extract structured data here.")
    # File uploader example for future data extraction features; the uploaded
    # file is currently acknowledged but not processed.
    uploaded_file = st.file_uploader("Upload a document file for extraction", type=["pdf", "docx", "txt"])
    if uploaded_file is not None:
        st.info("File uploaded successfully. Data extraction logic would process this file.")
        # TODO: extract tables, key figures, etc. from the uploaded file.
# ------------------------------
# Safety & Compliance Layer (Placeholder)
# ------------------------------
# Static disclaimer shown in the sidebar on every page of the app.
st.sidebar.markdown("### Safety & Compliance")
st.sidebar.info(
    "This tool provides AI-driven insights. "
    "Please note that summaries and answers are for informational purposes only and should not be "
    "considered professional financial or medical advice."
)
# ------------------------------
# End of Application
# ------------------------------