# Hugging Face Space: mgbam/SmartDocAnalyzer (status: Sleeping)
# File size: 6,074 bytes

import os
from difflib import SequenceMatcher

import altair as alt
import bcrypt
import pandas as pd
import streamlit as st
import streamlit_authenticator as stauth
import transformers

# ------------------------------
# User Authentication Setup
# ------------------------------

# Demo credentials. Both secrets can be overridden through environment
# variables so a deployment does not have to ship them in source; the
# fallbacks are the original demo values.
# NOTE(review): the fallback password and cookie key are publicly visible demo
# values -- set SMARTDOC_DEMO_PASSWORD and SMARTDOC_COOKIE_KEY for real use.
plain_password = os.environ.get("SMARTDOC_DEMO_PASSWORD", "password123")


@st.cache_resource(show_spinner=False)
def _hash_password(password):
    """Return a salted bcrypt hash of *password*, decoded to a UTF-8 string.

    bcrypt is deliberately slow, and Streamlit re-executes this script on
    every interaction, so the hash is cached instead of being recomputed
    (with a fresh salt) on each rerun.
    """
    return bcrypt.hashpw(password.encode('utf-8'), bcrypt.gensalt()).decode('utf-8')


hashed_password = _hash_password(plain_password)

# Configuration for authentication
config = {
    'credentials': {
        'usernames': {
            'demo_user': {
                'name': 'Demo User',
                'password': hashed_password  # bcrypt hash, never the plain text
            }
        }
    },
    'cookie': {
        'expiry_days': 30,
        # Key used to sign the re-authentication cookie.
        'key': os.environ.get("SMARTDOC_COOKIE_KEY", 'some_signature_key'),
        'name': 'some_cookie_name'
    },
    'preauthorized': {
        'emails': []
    }
}

# Build the authenticator from the credential and cookie settings above.
authenticator = stauth.Authenticate(
    config['credentials'],
    config['cookie']['name'],
    config['cookie']['key'],
    config['cookie']['expiry_days']
)

# Use positional arguments with a valid location parameter.
# NOTE(review): the ('Login', 'main') call shape and the 3-tuple return are
# assumed to match the installed streamlit-authenticator version -- confirm.
name, authentication_status, username = authenticator.login('Login', 'main')

# `False` means credentials were submitted and rejected; `None` means nothing
# has been submitted yet. Only the former is an error -- the latter is the
# normal state on first page load, so just prompt the user. Either way the
# rest of the app must not render for an unauthenticated visitor.
if authentication_status is False:
    st.error('Authentication failed. Please refresh and try again.')
    st.stop()
if authentication_status is None:
    st.warning('Please enter your username and password.')
    st.stop()

st.sidebar.write(f"Welcome *{name}*")
authenticator.logout('Logout', 'sidebar')

# ------------------------------
# Load Models
# ------------------------------
@st.cache_resource
def load_qwen():
    """Create the Hugging Face pipeline used for document summarization.

    Qwen2.5 is a decoder-only (causal) language model, so it has to be loaded
    with the ``"text-generation"`` task; ``"text2text-generation"`` is meant
    for encoder-decoder models and cannot load this checkpoint. Note that
    text-generation pipelines return the prompt followed by the completion
    unless told otherwise.

    ``torch_dtype="auto"`` mirrors ``load_phi`` so the weights are loaded in
    the checkpoint's own precision.

    Returns:
        The pipeline object produced by ``transformers.pipeline``.
    """
    return transformers.pipeline(
        "text-generation",
        model="Qwen/Qwen2.5-14B",
        model_kwargs={"torch_dtype": "auto"},
        device_map="auto"
    )

@st.cache_resource
def load_phi():
    """Create the ``microsoft/phi-4`` text-generation pipeline.

    Returns:
        The pipeline object produced by ``transformers.pipeline``.
    """
    pipeline_options = {
        "model": "microsoft/phi-4",
        "model_kwargs": {"torch_dtype": "auto"},
        "device_map": "auto",
    }
    return transformers.pipeline("text-generation", **pipeline_options)

# Instantiate both pipelines up front; the loaders are wrapped in
# st.cache_resource, so the module-level names are what the helpers below use.
qwen_pipeline, phi_pipeline = load_qwen(), load_phi()

# ------------------------------
# Utility Functions
# ------------------------------
def summarize_document(document_text, *, pipeline=None):
    """Summarize *document_text* and return the generated summary.

    Args:
        document_text: Raw text of the document to summarize.
        pipeline: Optional callable with the Hugging Face pipeline calling
            convention (``pipeline(prompt, **kwargs)`` returning a list of
            dicts with a ``'generated_text'`` key). Defaults to the
            module-level ``qwen_pipeline``.

    Returns:
        The generated summary. If the pipeline echoes the prompt at the start
        of its output (the default for causal text-generation pipelines), the
        echoed prompt and the whitespace after it are removed.
    """
    generator = qwen_pipeline if pipeline is None else pipeline
    prompt = f"Summarize the following document and highlight key insights:\n\n{document_text}"
    generated = generator(prompt, max_new_tokens=1024)[0]['generated_text']
    # Drop an echoed prompt so the caller only sees newly generated text;
    # output that does not start with the prompt is returned untouched.
    if generated.startswith(prompt):
        generated = generated[len(prompt):].lstrip()
    return generated

def answer_question(summary, question, *, pipeline=None):
    """Answer *question* using *summary* as the only context.

    Args:
        summary: Summary text the answer should be based on.
        question: The user's question about the document.
        pipeline: Optional callable with the Hugging Face text-generation
            pipeline calling convention. Defaults to the module-level
            ``phi_pipeline``.

    Returns:
        The generated answer with surrounding whitespace removed.
    """
    generator = phi_pipeline if pipeline is None else pipeline
    prompt = f"Based on the following summary:\n\n{summary}\n\nAnswer the question: {question}"
    # Text-generation pipelines return prompt + completion by default, which
    # would repeat the whole summary in the "answer"; ask for the completion
    # only.
    outputs = generator(prompt, max_new_tokens=256, return_full_text=False)
    return outputs[0]['generated_text'].strip()

def find_similar_chunks(original: str, output: str) -> list:
    """Split *output* into runs that do / do not also occur in *original*.

    The two strings are aligned with ``difflib.SequenceMatcher``. Concatenating
    the ``'text'`` of every returned segment reproduces *output* exactly.

    Args:
        original: Reference text.
        output: Text to be segmented.

    Returns:
        A list of ``{'text': str, 'match': bool}`` dicts in *output* order,
        where ``match`` is True for text that was aligned with *original*.
        Empty segments are never emitted.
    """
    matcher = SequenceMatcher(None, original, output)
    segments = []
    cursor = 0  # index in `output` up to which text has been emitted
    for _, start, size in matcher.get_matching_blocks():
        if cursor < start:
            # Gap between two matching blocks: text unique to `output`.
            segments.append({'text': output[cursor:start], 'match': False})
        # get_matching_blocks() always ends with a zero-length sentinel block;
        # it only marks the end of the strings and is not a real match.
        if size:
            segments.append({'text': output[start:start + size], 'match': True})
        cursor = start + size
    return segments

# ------------------------------
# Streamlit App Layout
# ------------------------------
st.title("SmartDoc Analyzer")
st.markdown("Analyze Financial & Health Documents with AI")

# One tab per feature area; the sections below address them by position.
tab_labels = ["Document Summarization", "Interactive Q&A", "Visualization & Data Extraction"]
tabs = st.tabs(tab_labels)

# -------- Document Summarization Tab --------
with tabs[0]:
    st.header("Document Summarization")
    document_text = st.text_area("Paste Document Text:", height=300)
    if st.button("Summarize Document"):
        # Treat whitespace-only input as empty so a blank document is never
        # sent to the model.
        if document_text and document_text.strip():
            summary = summarize_document(document_text)
            st.subheader("Summary")
            st.write(summary)
            # Save summary in session for use in Q&A tab
            st.session_state['last_summary'] = summary
        else:
            st.warning("Please paste document text to summarize.")

# -------- Interactive Q&A Tab --------
with tabs[1]:
    st.header("Interactive Q&A")
    # Pre-fill the context with the most recent summary, if one was produced.
    default_summary = st.session_state.get('last_summary', '')
    summary_context = st.text_area("Summary Context:", value=default_summary, height=150)
    question = st.text_input("Enter your question about the document:")
    if st.button("Get Answer"):
        # Both fields must contain more than whitespace before the model is
        # queried.
        has_context = bool(summary_context and summary_context.strip())
        has_question = bool(question and question.strip())
        if has_context and has_question:
            answer = answer_question(summary_context, question)
            st.subheader("Answer")
            st.write(answer)
            # For session saving, one could store Q&A pairs in st.session_state or database.
        else:
            st.warning("Please provide both a summary context and a question.")

# -------- Visualization & Data Extraction Tab --------
with tabs[2]:
    st.header("Visualization & Data Extraction")

    st.subheader("Visualization Placeholder")
    st.markdown("An interactive chart can be displayed here using Altair or Plotly.")

    # Hard-coded sample series standing in for values that would eventually be
    # extracted from a document.
    sample_rows = [(2019, 150), (2020, 200), (2021, 250), (2022, 300)]
    data = pd.DataFrame(sample_rows, columns=['Year', 'Revenue'])

    # Line chart with point markers: year as an ordinal axis, revenue as a
    # quantitative axis, both shown in the tooltip.
    line_marks = alt.Chart(data).mark_line(point=True)
    encoded = line_marks.encode(x='Year:O', y='Revenue:Q', tooltip=['Year', 'Revenue'])
    chart = encoded.interactive()
    st.altair_chart(chart, use_container_width=True)

    st.subheader("Data Extraction Placeholder")
    st.markdown("Implement NLP techniques or model prompts to extract structured data here.")

    # Upload widget reserved for future extraction features; for now the file
    # is only acknowledged, not processed.
    accepted_types = ["pdf", "docx", "txt"]
    uploaded_file = st.file_uploader("Upload a document file for extraction", type=accepted_types)
    if uploaded_file is not None:
        st.info("File uploaded successfully. Data extraction logic would process this file.")
        # Add logic to extract tables, key figures, etc. from the uploaded file.

# ------------------------------
# Safety & Compliance Layer (Placeholder)
# ------------------------------
# Static disclaimer shown in the sidebar.
disclaimer = " ".join((
    "This tool provides AI-driven insights.",
    "Please note that summaries and answers are for informational purposes only and should not be",
    "considered professional financial or medical advice.",
))
st.sidebar.markdown("### Safety & Compliance")
st.sidebar.info(disclaimer)

# ------------------------------
# End of Application
# ------------------------------