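"""WizNerd Insp: a Streamlit chat app that classifies inspection
instructions into item classes using a fine-tuned BERT model.

Context can come from an uploaded PDF, XLSX, or CSV file; tabular files
with a 'scope' column are classified row by row.
"""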
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from huggingface_hub import login
import PyPDF2
import pandas as pd
import torch
import os
import re
# Set page configuration
st.set_page_config(
    page_title="WizNerd Insp",
    page_icon="🚀",
    layout="centered"
)
# Load Hugging Face token from environment variable
HF_TOKEN = os.getenv("HF_TOKEN")
# Model name
MODEL_NAME = "amiguel/instruct_BERT-base-uncased_model"
# Label mapping
LABEL_TO_CLASS = {
    0: "Campaign", 1: "Corrosion Monitoring", 2: "Flare Tip", 3: "Flare TIP",
    4: "FU Items", 5: "Intelligent Pigging", 6: "Lifting", 7: "Non Structural Tank",
    8: "Piping", 9: "Pressure Safety Device", 10: "Pressure Vessel (VIE)",
    11: "Pressure Vessel (VII)", 12: "Structure", 13: "Flame Arrestor"
}
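# NOTE: these indices are assumed to match the label order used when the
# checkpoint was fine-tuned; reordering them would silently remap classes.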
# Title with rocket emojis
st.title("🚀 WizNerd Insp 🚀")
# Configure Avatars
USER_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/9904d9a0d445ab0488cf7395cb863cce7621d897/USER_AVATAR.png"
BOT_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/991f4c6e4e1dc7a8e24876ca5aae5228bcdb4dba/Ataliba_Avatar.jpg"
# Sidebar configuration
with st.sidebar:
    st.header("Upload Documents 📁")
    uploaded_file = st.file_uploader(
        "Choose a PDF, XLSX, or CSV file",
        type=["pdf", "xlsx", "csv"],
        label_visibility="collapsed"
    )
# Initialize chat history
if "messages" not in st.session_state:
st.session_state.messages = []
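# st.session_state persists across Streamlit reruns, so the chat history
# survives each widget interaction.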
# File processing function with pre-processing
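# st.cache_data memoizes this function on its argument, so the same upload
# is not re-parsed on every rerun.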
@st.cache_data
def process_file(uploaded_file):
    if uploaded_file is None:
        return None
    try:
        if uploaded_file.type == "application/pdf":
            pdf_reader = PyPDF2.PdfReader(uploaded_file)
            # extract_text() can return None for image-only pages
            text = "\n".join([page.extract_text() or "" for page in pdf_reader.pages])
            # Basic pre-processing: lowercase and collapse whitespace
            text = re.sub(r'\s+', ' ', text.lower().strip())
            return {"type": "text", "content": text}
        elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
            df = pd.read_excel(uploaded_file)
        elif uploaded_file.type == "text/csv":
            df = pd.read_csv(uploaded_file)
        else:
            st.error(f"Unsupported file type: {uploaded_file.type}")
            return None
        # For tabular data (xlsx, csv), detect scope columns
        scope_cols = [col for col in df.columns if "scope" in col.lower()]
        if not scope_cols:
            st.warning("No 'scope' column found in the file. Using all data as context.")
            return {"type": "table", "content": df.to_markdown()}
        # Pre-process scope data column by column: lowercase, trim, collapse whitespace
        scope_data = df[scope_cols].dropna().astype(str).apply(
            lambda col: col.str.lower().str.strip().str.replace(r'\s+', ' ', regex=True)
        )
        return {"type": "scope", "content": scope_data}
    except Exception as e:
        st.error(f"📄 Error processing file: {str(e)}")
        return None
# Model loading function
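# st.cache_resource keeps a single model/tokenizer instance alive across
# reruns (and across user sessions), avoiding a reload on every prompt.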
@st.cache_resource
def load_model(hf_token):
    try:
        if not hf_token:
            st.error("🔒 Authentication required! Please set the HF_TOKEN environment variable.")
            return None
        login(token=hf_token)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=len(LABEL_TO_CLASS),
            token=hf_token
        )
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)
        return model, tokenizer
    except Exception as e:
        st.error(f"🤖 Model loading failed: {str(e)}")
        return None
# Classification function
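# Note: inputs are truncated to 128 tokens, so only the start of a long
# context is visible to the model.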
def classify_instruction(prompt, file_context, model, tokenizer):
    model.eval()
    device = model.device

    def predict(context):
        full_prompt = f"Context:\n{context}\n\nInstruction: {prompt}"
        inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        return LABEL_TO_CLASS[outputs.logits.argmax(dim=-1).item()]

    if file_context["type"] == "scope":
        # Batch prediction: one forward pass per scope entry
        return [predict(scope) for scope in file_context["content"].values.flatten()]
    else:
        # Single prediction for text or table context
        return predict(file_context["content"])
# Display chat messages
for message in st.session_state.messages:
    avatar = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
    with st.chat_message(message["role"], avatar=avatar):
        st.markdown(message["content"])
# Chat input handling
if prompt := st.chat_input("Ask your inspection question..."):
    # Load model if not already loaded
    if "model" not in st.session_state:
        model_data = load_model(HF_TOKEN)
        if model_data is None:
            st.error("Failed to load model. Please ensure HF_TOKEN is set correctly.")
            st.stop()
        st.session_state.model, st.session_state.tokenizer = model_data
    model = st.session_state.model
    tokenizer = st.session_state.tokenizer

    # Add user message
    with st.chat_message("user", avatar=USER_AVATAR):
        st.markdown(prompt)
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Process file context
    file_context = process_file(uploaded_file)
    if file_context is None:
        st.error("No file uploaded or file processing failed.")
        st.stop()

    # Classify the instruction
    if model and tokenizer:
        try:
            with st.chat_message("assistant", avatar=BOT_AVATAR):
                predicted_output = classify_instruction(prompt, file_context, model, tokenizer)
                if file_context["type"] == "scope":
                    # Display multiple predictions in a table
                    scope_values = file_context["content"].values.flatten()
                    result_df = pd.DataFrame({
                        "Scope": scope_values,
                        "Predicted Class": predicted_output
                    })
                    st.write("Predicted Classes:")
                    st.table(result_df)
                    response = "Predictions completed for multiple scope entries."
                else:
                    # Single prediction
                    response = f"The Item Class is: {predicted_output}"
                st.markdown(response)
            st.session_state.messages.append({"role": "assistant", "content": response})
        except Exception as e:
            st.error(f"⚡ Classification error: {str(e)}")
    else:
        st.error("🤖 Model not loaded!")