import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from huggingface_hub import login
import PyPDF2
import pandas as pd
import torch
import os
import re

# Set page configuration
st.set_page_config(
    page_title="WizNerd Insp",
    page_icon="πŸš€",
    layout="centered"
)

# Load Hugging Face token from environment variable
HF_TOKEN = os.getenv("HF_TOKEN")
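# (On Hugging Face Spaces this is typically configured as a repository secret;
# locally, e.g. `export HF_TOKEN=hf_...`.)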

# Model name
MODEL_NAME = "amiguel/instruct_BERT-base-uncased_model"

# Label mapping
LABEL_TO_CLASS = {
    0: "Campaign", 1: "Corrosion Monitoring", 2: "Flare Tip", 3: "Flare TIP",
    4: "FU Items", 5: "Intelligent Pigging", 6: "Lifting", 7: "Non Structural Tank",
    8: "Piping", 9: "Pressure Safety Device", 10: "Pressure Vessel (VIE)",
    11: "Pressure Vessel (VII)", 12: "Structure", 13: "Flame Arrestor"
}
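# NOTE (assumption): the ids above are presumed to mirror the label order the
# classifier head was fine-tuned with; "Flare Tip" (2) and "Flare TIP" (3) are
# kept as distinct classes exactly as they appear in the training labels.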

# Title with rocket emojis
st.title("πŸš€ WizNerd Insp πŸš€")

# Configure Avatars
USER_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/9904d9a0d445ab0488cf7395cb863cce7621d897/USER_AVATAR.png"
BOT_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/991f4c6e4e1dc7a8e24876ca5aae5228bcdb4dba/Ataliba_Avatar.jpg"

# Sidebar configuration
with st.sidebar:
    st.header("Upload Documents πŸ“‚")
    uploaded_file = st.file_uploader(
        "Choose a PDF, XLSX, or CSV file",
        type=["pdf", "xlsx", "csv"],
        label_visibility="collapsed"
    )
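
# Illustrative tabular input (the column header is an assumption; any header
# containing "scope" is picked up by process_file below), e.g. a CSV like:
#
#   Scope of Work
#   clean and inspect flare tip
#   external visual inspection of pressure vessel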

# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []

# File processing function with pre-processing
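# @st.cache_data memoizes the parsed result keyed on the uploaded file, so the
# document is processed once instead of on every Streamlit rerun.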
@st.cache_data
def process_file(uploaded_file):
    if uploaded_file is None:
        return None
    
    try:
        if uploaded_file.type == "application/pdf":
            pdf_reader = PyPDF2.PdfReader(uploaded_file)
            # extract_text() may return None for image-only pages; substitute ""
            text = "\n".join([page.extract_text() or "" for page in pdf_reader.pages])
            # Basic pre-processing: lowercase and collapse whitespace
            text = re.sub(r'\s+', ' ', text.lower().strip())
            return {"type": "text", "content": text}
        
        elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
            df = pd.read_excel(uploaded_file)
        elif uploaded_file.type == "text/csv":
            df = pd.read_csv(uploaded_file)
        else:
            st.error(f"Unsupported file type: {uploaded_file.type}")
            return None

        # For tabular data (xlsx, csv), detect columns whose name contains "scope"
        scope_cols = [col for col in df.columns if "scope" in col.lower()]
        if not scope_cols:
            st.warning("No 'scope' column found in the file. Using all data as context.")
            # to_markdown() requires the optional 'tabulate' dependency
            return {"type": "table", "content": df.to_markdown()}
        # Pre-process scope data cell by cell (DataFrame.apply passes whole
        # columns to the callable, so map over each column's elements)
        scope_data = df[scope_cols].dropna().astype(str).apply(
            lambda col: col.map(lambda x: re.sub(r'\s+', ' ', x.lower().strip()))
        )
        return {"type": "scope", "content": scope_data}
        
    except Exception as e:
        st.error(f"πŸ“„ Error processing file: {str(e)}")
        return None

# Model loading function
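# @st.cache_resource keeps one model/tokenizer instance alive across reruns
# (and sessions), so the weights are downloaded and loaded only once.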
@st.cache_resource
def load_model(hf_token):
    try:
        if not hf_token:
            st.error("πŸ” Authentication required! Please set the HF_TOKEN environment variable.")
            return None
        
        login(token=hf_token)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=len(LABEL_TO_CLASS),
            token=hf_token
        )
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)
        return model, tokenizer
        
    except Exception as e:
        st.error(f"πŸ€– Model loading failed: {str(e)}")
        return None

# Classification function
def classify_instruction(prompt, file_context, model, tokenizer):
    model.eval()
    device = model.device
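    # NOTE (assumption): the "Context:\n...\n\nInstruction: ..." template is
    # presumed to match the prompt format used at fine-tuning time. With
    # max_length=128, long contexts (e.g. an entire PDF) are truncated, so only
    # the first ~128 tokens inform the prediction.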
    
    if file_context["type"] == "scope":
        # Batch prediction for multiple scope entries
        predictions = []
        for scope in file_context["content"].values.flatten():
            full_prompt = f"Context:\n{scope}\n\nInstruction: {prompt}"
            inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)
                prediction = outputs.logits.argmax().item()
                predictions.append(LABEL_TO_CLASS[prediction])
        return predictions
    else:
        # Single prediction for text or table context
        full_prompt = f"Context:\n{file_context['content']}\n\nInstruction: {prompt}"
        inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            prediction = outputs.logits.argmax().item()
        return LABEL_TO_CLASS[prediction]
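
# A batched variant of the scope path above (a sketch, not part of the original
# app flow): tokenizing scope entries in chunks avoids the per-row Python loop.
# Assumes the same prompt template and 128-token limit as classify_instruction.
def classify_scopes_batched(prompt, scopes, model, tokenizer, batch_size=16):
    model.eval()
    device = model.device
    texts = [f"Context:\n{s}\n\nInstruction: {prompt}" for s in scopes]
    predictions = []
    for i in range(0, len(texts), batch_size):
        inputs = tokenizer(
            texts[i:i + batch_size],
            return_tensors="pt", padding=True, truncation=True, max_length=128,
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = model(**inputs).logits
        predictions.extend(LABEL_TO_CLASS[p] for p in logits.argmax(dim=-1).tolist())
    return predictions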

# Display chat messages
for message in st.session_state.messages:
    avatar = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
    with st.chat_message(message["role"], avatar=avatar):
        st.markdown(message["content"])

# Chat input handling
if prompt := st.chat_input("Ask your inspection question..."):
    # Load model if not already loaded
    if "model" not in st.session_state:
        model_data = load_model(HF_TOKEN)
        if model_data is None:
            st.error("Failed to load model. Please ensure HF_TOKEN is set correctly.")
            st.stop()
        st.session_state.model, st.session_state.tokenizer = model_data
    
    model = st.session_state.model
    tokenizer = st.session_state.tokenizer
    
    # Add user message
    with st.chat_message("user", avatar=USER_AVATAR):
        st.markdown(prompt)
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Process file context
    file_context = process_file(uploaded_file)
    if file_context is None:
        st.error("No file uploaded or file processing failed.")
        st.stop()
    
    # Classify the instruction
    if model is not None and tokenizer is not None:
        try:
            with st.chat_message("assistant", avatar=BOT_AVATAR):
                predicted_output = classify_instruction(prompt, file_context, model, tokenizer)
                if file_context["type"] == "scope":
                    # Display multiple predictions in a table
                    scope_values = file_context["content"].values.flatten()
                    result_df = pd.DataFrame({
                        "Scope": scope_values,
                        "Predicted Class": predicted_output
                    })
                    st.write("Predicted Classes:")
                    st.table(result_df)
                    response = "Predictions completed for multiple scope entries."
                else:
                    # Single prediction
                    response = f"The Item Class is: {predicted_output}"
                    st.markdown(response)
                st.session_state.messages.append({"role": "assistant", "content": response})
                
        except Exception as e:
            st.error(f"⚑ Classification error: {str(e)}")
    else:
        st.error("πŸ€– Model not loaded!")