amiguel committed on
Commit
de40422
·
verified ·
1 Parent(s): cd58cfd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -19
app.py CHANGED
@@ -5,6 +5,8 @@ import PyPDF2
5
  import pandas as pd
6
  import torch
7
  import os
 
 
8
 
9
  # Set page configuration
10
  st.set_page_config(
@@ -37,9 +39,10 @@ BOT_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/99
37
  # Sidebar configuration
38
  with st.sidebar:
39
  st.header("Upload Documents 📂")
40
- uploaded_file = st.file_uploader(
41
- "Choose a PDF or XLSX file",
42
- type=["pdf", "xlsx"],
 
43
  label_visibility="collapsed"
44
  )
45
 
@@ -49,20 +52,38 @@ if "messages" not in st.session_state:
49
 
50
  # File processing function
51
  @st.cache_data
52
- def process_file(uploaded_file):
53
- if uploaded_file is None:
54
- return ""
55
 
 
56
  try:
57
- if uploaded_file.type == "application/pdf":
58
- pdf_reader = PyPDF2.PdfReader(uploaded_file)
59
- return "\n".join([page.extract_text() for page in pdf_reader.pages])
60
- elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
61
- df = pd.read_excel(uploaded_file)
62
- return df.to_markdown()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  except Exception as e:
64
  st.error(f"📄 Error processing file: {str(e)}")
65
- return ""
 
 
66
 
67
  # Model loading function
68
  @st.cache_resource
@@ -78,7 +99,7 @@ def load_model(hf_token):
78
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
79
  model = AutoModelForSequenceClassification.from_pretrained(
80
  MODEL_NAME,
81
- num_labels=len(LABEL_TO_CLASS), # Ensure correct number of labels
82
  token=hf_token
83
  )
84
 
@@ -92,7 +113,7 @@ def load_model(hf_token):
92
  st.error(f"🤖 Model loading failed: {str(e)}")
93
  return None
94
 
95
- # Classification function
96
  def classify_instruction(prompt, file_context, model, tokenizer):
97
  full_prompt = f"Context:\n{file_context}\n\nInstruction: {prompt}"
98
 
@@ -109,6 +130,17 @@ def classify_instruction(prompt, file_context, model, tokenizer):
109
 
110
  return class_name
111
 
 
 
 
 
 
 
 
 
 
 
 
112
  # Display chat messages
113
  for message in st.session_state.messages:
114
  try:
@@ -138,16 +170,35 @@ if prompt := st.chat_input("Ask your inspection question..."):
138
  st.markdown(prompt)
139
  st.session_state.messages.append({"role": "user", "content": prompt})
140
 
141
- # Process file context
142
- file_context = process_file(uploaded_file)
 
143
 
144
- # Classify the instruction
145
  if model and tokenizer:
146
  try:
147
  with st.chat_message("assistant", avatar=BOT_AVATAR):
 
148
  predicted_class = classify_instruction(prompt, file_context, model, tokenizer)
 
 
149
  response = f"Predicted class: {predicted_class}"
150
- st.markdown(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  st.session_state.messages.append({"role": "assistant", "content": response})
152
 
153
  except Exception as e:
 
5
  import pandas as pd
6
  import torch
7
  import os
8
+ import time
9
+ import re
10
 
11
  # Set page configuration
12
  st.set_page_config(
 
39
  # Sidebar configuration
40
  with st.sidebar:
41
  st.header("Upload Documents 📂")
42
+ uploaded_files = st.file_uploader(
43
+ "Choose PDF, XLSX, or CSV files",
44
+ type=["pdf", "xlsx", "csv"],
45
+ accept_multiple_files=True, # Allow multiple file uploads
46
  label_visibility="collapsed"
47
  )
48
 
 
52
 
53
  # File processing function
54
  @st.cache_data
55
+ def process_files(uploaded_files):
56
+ if not uploaded_files:
57
+ return []
58
 
59
+ scopes = []
60
  try:
61
+ for uploaded_file in uploaded_files:
62
+ if uploaded_file.type == "application/pdf":
63
+ pdf_reader = PyPDF2.PdfReader(uploaded_file)
64
+ text = "\n".join([page.extract_text() for page in pdf_reader.pages])
65
+ # Split text into potential scope lines (e.g., by newlines or sentences)
66
+ lines = [line.strip() for line in text.split("\n") if line.strip()]
67
+ # Filter lines that look like scope instructions (e.g., contain keywords like "at location", "DAL/")
68
+ scope_lines = [line for line in lines if re.search(r"(at location|DAL/|PSV-|CD-|DA-)", line, re.IGNORECASE)]
69
+ scopes.extend(scope_lines)
70
+
71
+ elif uploaded_file.type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "text/csv"]:
72
+ if uploaded_file.type == "text/csv":
73
+ df = pd.read_csv(uploaded_file)
74
+ else:
75
+ df = pd.read_excel(uploaded_file)
76
+ # Assume the first column contains scope instructions
77
+ if not df.empty:
78
+ scope_column = df.columns[0] # First column
79
+ scope_lines = df[scope_column].dropna().astype(str).tolist()
80
+ scopes.extend([line.strip() for line in scope_lines if line.strip()])
81
+
82
  except Exception as e:
83
  st.error(f"📄 Error processing file: {str(e)}")
84
+ return []
85
+
86
+ return scopes
87
 
88
  # Model loading function
89
  @st.cache_resource
 
99
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
100
  model = AutoModelForSequenceClassification.from_pretrained(
101
  MODEL_NAME,
102
+ num_labels=len(LABEL_TO_CLASS),
103
  token=hf_token
104
  )
105
 
 
113
  st.error(f"🤖 Model loading failed: {str(e)}")
114
  return None
115
 
116
+ # Classification function with streaming simulation
117
  def classify_instruction(prompt, file_context, model, tokenizer):
118
  full_prompt = f"Context:\n{file_context}\n\nInstruction: {prompt}"
119
 
 
130
 
131
  return class_name
132
 
133
+ def stream_classification_output(class_name, delay=0.05):
134
+ """Simulate streaming by displaying the class name character by character."""
135
+ response_container = st.empty()
136
+ full_response = ""
137
+ for char in class_name:
138
+ full_response += char
139
+ response_container.markdown(f"Predicted class: {full_response} ▌")
140
+ time.sleep(delay)
141
+ response_container.markdown(f"Predicted class: {full_response}")
142
+ return full_response
143
+
144
  # Display chat messages
145
  for message in st.session_state.messages:
146
  try:
 
170
  st.markdown(prompt)
171
  st.session_state.messages.append({"role": "user", "content": prompt})
172
 
173
+ # Process file context (if any)
174
+ file_scopes = process_files(uploaded_files)
175
+ file_context = "\n".join(file_scopes) if file_scopes else ""
176
 
177
+ # Classify the user prompt
178
  if model and tokenizer:
179
  try:
180
  with st.chat_message("assistant", avatar=BOT_AVATAR):
181
+ # Classify the user-entered prompt
182
  predicted_class = classify_instruction(prompt, file_context, model, tokenizer)
183
+ # Stream the classification output
184
+ streamed_response = stream_classification_output(predicted_class)
185
  response = f"Predicted class: {predicted_class}"
186
+
187
+ # If there are scopes from files, classify them too
188
+ if file_scopes:
189
+ st.markdown("### Classifications from Uploaded Files")
190
+ results = []
191
+ for scope in file_scopes:
192
+ predicted_class = classify_instruction(scope, file_context, model, tokenizer)
193
+ results.append({"Scope": scope, "Predicted Class": predicted_class})
194
+
195
+ # Display results in a table
196
+ df_results = pd.DataFrame(results)
197
+ st.table(df_results)
198
+
199
+ # Add table to chat history
200
+ response += "\n\n### Classifications from Uploaded Files\n" + df_results.to_markdown(index=False)
201
+
202
  st.session_state.messages.append({"role": "assistant", "content": response})
203
 
204
  except Exception as e: