Spaces:
Running
Running
Commit
·
dbaeac5
1
Parent(s):
67f7ed6
Major update. Support for 15 LLMs, World Flora Online taxonomy validation, geolocation, 2 OCR methods, significant UI changes, stability improvements, consistent JSON parsing
Browse files
- app.py +48 -5
- vouchervision/OCR_google_cloud_vision.py +3 -0
- vouchervision/utils_hf.py +32 -1
app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
import yaml, os, json, random, time, re, torch, random, warnings, shutil, sys
|
| 3 |
import seaborn as sns
|
| 4 |
import plotly.graph_objs as go
|
| 5 |
from PIL import Image
|
|
@@ -14,7 +14,7 @@ from vouchervision.vouchervision_main import voucher_vision
|
|
| 14 |
from vouchervision.general_utils import test_GPU, get_cfg_from_full_path, summarize_expense_report, validate_dir
|
| 15 |
from vouchervision.model_maps import ModelMaps
|
| 16 |
from vouchervision.API_validation import APIvalidation
|
| 17 |
-
from vouchervision.utils_hf import setup_streamlit_config, save_uploaded_file, save_uploaded_local
|
| 18 |
from vouchervision.data_project import convert_pdf_to_jpg
|
| 19 |
from vouchervision.utils_LLM import check_system_gpus
|
| 20 |
|
|
@@ -42,7 +42,7 @@ if 'config' not in st.session_state:
|
|
| 42 |
st.session_state.config, st.session_state.dir_home = build_VV_config(loaded_cfg=None)
|
| 43 |
setup_streamlit_config(st.session_state.dir_home)
|
| 44 |
|
| 45 |
-
|
| 46 |
|
| 47 |
########################################################################################################
|
| 48 |
### Global constants ####
|
|
@@ -273,7 +273,7 @@ def content_input_images(col_left, col_right):
|
|
| 273 |
if st.session_state.is_hf:
|
| 274 |
if uploaded_files:
|
| 275 |
# Clear input image gallery and input list
|
| 276 |
-
|
| 277 |
|
| 278 |
for uploaded_file in uploaded_files:
|
| 279 |
# Determine the file type
|
|
@@ -336,6 +336,45 @@ def content_input_images(col_left, col_right):
|
|
| 336 |
pass
|
| 337 |
# elif st.session_state['input_list_small'] and (st.session_state['dir_images_local_TEMP'] != st.session_state.config['leafmachine']['project']['dir_images_local']):
|
| 338 |
elif (st.session_state['dir_images_local_TEMP'] != st.session_state.config['leafmachine']['project']['dir_images_local']):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
dir_images_local = st.session_state.config['leafmachine']['project']['dir_images_local']
|
| 340 |
count_n_imgs = list_jpg_files(dir_images_local)
|
| 341 |
st.session_state['processing_add_on'] = count_n_imgs
|
|
@@ -412,6 +451,10 @@ def delete_directory(dir_path):
|
|
| 412 |
|
| 413 |
|
| 414 |
def clear_image_gallery():
|
|
|
|
|
|
|
|
|
|
|
|
|
| 415 |
delete_directory(st.session_state['dir_uploaded_images'])
|
| 416 |
delete_directory(st.session_state['dir_uploaded_images_small'])
|
| 417 |
validate_dir(st.session_state['dir_uploaded_images'])
|
|
@@ -423,7 +466,7 @@ def use_test_image():
|
|
| 423 |
st.session_state.config['leafmachine']['project']['dir_images_local'] = os.path.join(st.session_state.dir_home,'demo','demo_images')
|
| 424 |
n_images = len([f for f in os.listdir(st.session_state.config['leafmachine']['project']['dir_images_local']) if os.path.isfile(os.path.join(st.session_state.config['leafmachine']['project']['dir_images_local'], f))])
|
| 425 |
st.session_state['processing_add_on'] = n_images
|
| 426 |
-
|
| 427 |
st.session_state['uploader_idk'] += 1
|
| 428 |
for file in os.listdir(st.session_state.config['leafmachine']['project']['dir_images_local']):
|
| 429 |
file_path = save_uploaded_file(os.path.join(st.session_state.dir_home,'demo','demo_images'), file)
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
import yaml, os, json, random, time, re, torch, random, warnings, shutil, sys, glob
|
| 3 |
import seaborn as sns
|
| 4 |
import plotly.graph_objs as go
|
| 5 |
from PIL import Image
|
|
|
|
| 14 |
from vouchervision.general_utils import test_GPU, get_cfg_from_full_path, summarize_expense_report, validate_dir
|
| 15 |
from vouchervision.model_maps import ModelMaps
|
| 16 |
from vouchervision.API_validation import APIvalidation
|
| 17 |
+
from vouchervision.utils_hf import setup_streamlit_config, save_uploaded_file, save_uploaded_local, save_uploaded_file_local
|
| 18 |
from vouchervision.data_project import convert_pdf_to_jpg
|
| 19 |
from vouchervision.utils_LLM import check_system_gpus
|
| 20 |
|
|
|
|
| 42 |
st.session_state.config, st.session_state.dir_home = build_VV_config(loaded_cfg=None)
|
| 43 |
setup_streamlit_config(st.session_state.dir_home)
|
| 44 |
|
| 45 |
+
st.session_state['is_hf'] = True
|
| 46 |
|
| 47 |
########################################################################################################
|
| 48 |
### Global constants ####
|
|
|
|
| 273 |
if st.session_state.is_hf:
|
| 274 |
if uploaded_files:
|
| 275 |
# Clear input image gallery and input list
|
| 276 |
+
clear_image_uploads()
|
| 277 |
|
| 278 |
for uploaded_file in uploaded_files:
|
| 279 |
# Determine the file type
|
|
|
|
| 336 |
pass
|
| 337 |
# elif st.session_state['input_list_small'] and (st.session_state['dir_images_local_TEMP'] != st.session_state.config['leafmachine']['project']['dir_images_local']):
|
| 338 |
elif (st.session_state['dir_images_local_TEMP'] != st.session_state.config['leafmachine']['project']['dir_images_local']):
|
| 339 |
+
has_pdf = False
|
| 340 |
+
clear_image_uploads()
|
| 341 |
+
|
| 342 |
+
for input_file in os.listdir(st.session_state.config['leafmachine']['project']['dir_images_local']):
|
| 343 |
+
if input_file.split('.')[1].lower() in ['jpg','jpeg']:
|
| 344 |
+
pass
|
| 345 |
+
elif input_file.split('.')[1].lower() in ['pdf',]:
|
| 346 |
+
has_pdf = True
|
| 347 |
+
# Handle PDF files
|
| 348 |
+
file_path = save_uploaded_file_local(st.session_state.config['leafmachine']['project']['dir_images_local'], st.session_state['dir_uploaded_images'], input_file)
|
| 349 |
+
# Convert each page of the PDF to an image
|
| 350 |
+
n_pages = convert_pdf_to_jpg(file_path, st.session_state['dir_uploaded_images'], dpi=200)#st.session_state.config['leafmachine']['project']['dir_images_local'])
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
# pdf_files_pattern = os.path.join(st.session_state['dir_uploaded_images'], '*.pdf')
|
| 354 |
+
# for pdf_file in glob.glob(pdf_files_pattern):
|
| 355 |
+
# os.remove(pdf_file)
|
| 356 |
+
|
| 357 |
+
# # Update the input list for each page image
|
| 358 |
+
# converted_files = os.listdir(st.session_state['dir_uploaded_images'])
|
| 359 |
+
# for file_name in converted_files:
|
| 360 |
+
# if file_name.lower().endswith('.jpg'):
|
| 361 |
+
# jpg_file_path = os.path.join(st.session_state['dir_uploaded_images'], file_name)
|
| 362 |
+
# st.session_state['input_list'].append(jpg_file_path)
|
| 363 |
+
|
| 364 |
+
# # Optionally, create a thumbnail for the gallery
|
| 365 |
+
# img = Image.open(jpg_file_path)
|
| 366 |
+
# img.thumbnail((GALLERY_IMAGE_SIZE, GALLERY_IMAGE_SIZE), Image.Resampling.LANCZOS)
|
| 367 |
+
# file_path_small = save_uploaded_file_local(st.session_state['dir_uploaded_images'], st.session_state['dir_uploaded_images_small'], file_name, img)
|
| 368 |
+
# st.session_state['input_list_small'].append(file_path_small)
|
| 369 |
+
|
| 370 |
+
# st.session_state.config['leafmachine']['project']['dir_images_local'] = st.session_state['dir_uploaded_images']
|
| 371 |
+
|
| 372 |
+
else:
|
| 373 |
+
pass
|
| 374 |
+
# st.warning("Inputs must be '.PDF' or '.jpg' or '.jpeg'")
|
| 375 |
+
if has_pdf:
|
| 376 |
+
st.session_state.config['leafmachine']['project']['dir_images_local'] = st.session_state['dir_uploaded_images']
|
| 377 |
+
|
| 378 |
dir_images_local = st.session_state.config['leafmachine']['project']['dir_images_local']
|
| 379 |
count_n_imgs = list_jpg_files(dir_images_local)
|
| 380 |
st.session_state['processing_add_on'] = count_n_imgs
|
|
|
|
| 451 |
|
| 452 |
|
| 453 |
def clear_image_gallery():
    """Reset the directory stored under ``dir_uploaded_images_small``.

    Runs ``delete_directory`` and then ``validate_dir`` on the path held in
    ``st.session_state['dir_uploaded_images_small']``. Only that directory is
    touched here; ``dir_uploaded_images`` is left alone.
    """
    # NOTE(review): validate_dir presumably recreates the folder that was just
    # deleted -- confirm against vouchervision.general_utils.
    for step in (delete_directory, validate_dir):
        step(st.session_state['dir_uploaded_images_small'])
|
| 456 |
+
|
| 457 |
+
def clear_image_uploads():
|
| 458 |
delete_directory(st.session_state['dir_uploaded_images'])
|
| 459 |
delete_directory(st.session_state['dir_uploaded_images_small'])
|
| 460 |
validate_dir(st.session_state['dir_uploaded_images'])
|
|
|
|
| 466 |
st.session_state.config['leafmachine']['project']['dir_images_local'] = os.path.join(st.session_state.dir_home,'demo','demo_images')
|
| 467 |
n_images = len([f for f in os.listdir(st.session_state.config['leafmachine']['project']['dir_images_local']) if os.path.isfile(os.path.join(st.session_state.config['leafmachine']['project']['dir_images_local'], f))])
|
| 468 |
st.session_state['processing_add_on'] = n_images
|
| 469 |
+
clear_image_uploads()
|
| 470 |
st.session_state['uploader_idk'] += 1
|
| 471 |
for file in os.listdir(st.session_state.config['leafmachine']['project']['dir_images_local']):
|
| 472 |
file_path = save_uploaded_file(os.path.join(st.session_state.dir_home,'demo','demo_images'), file)
|
vouchervision/OCR_google_cloud_vision.py
CHANGED
|
@@ -144,6 +144,9 @@ class OCREngine:
|
|
| 144 |
|
| 145 |
def init_gemini_vision(self):
|
| 146 |
pass
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
|
| 149 |
def detect_text_craft(self):
|
|
|
|
| 144 |
|
| 145 |
def init_gemini_vision(self):
|
| 146 |
pass
|
| 147 |
+
|
| 148 |
+
def init_gpt4_vision(self):
    """No-op placeholder; presumably reserved for GPT-4 Vision OCR setup."""
|
| 150 |
|
| 151 |
|
| 152 |
def detect_text_craft(self):
|
vouchervision/utils_hf.py
CHANGED
|
@@ -6,7 +6,7 @@ import base64
|
|
| 6 |
from PIL import Image
|
| 7 |
from PIL import Image
|
| 8 |
from io import BytesIO
|
| 9 |
-
from shutil import copyfileobj
|
| 10 |
|
| 11 |
# from vouchervision.general_utils import get_cfg_from_full_path
|
| 12 |
|
|
@@ -37,6 +37,37 @@ def setup_streamlit_config(dir_home):
|
|
| 37 |
f.write(config_content.strip())
|
| 38 |
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
def save_uploaded_file(directory, img_file, image=None):
|
| 42 |
if not os.path.exists(directory):
|
|
|
|
| 6 |
from PIL import Image
|
| 7 |
from PIL import Image
|
| 8 |
from io import BytesIO
|
| 9 |
+
from shutil import copyfileobj, copyfile
|
| 10 |
|
| 11 |
# from vouchervision.general_utils import get_cfg_from_full_path
|
| 12 |
|
|
|
|
| 37 |
f.write(config_content.strip())
|
| 38 |
|
| 39 |
|
| 40 |
+
def save_uploaded_file_local(directory_in, directory_out, img_file_name, image=None):
    """Copy or re-encode one file from ``directory_in`` into ``directory_out``.

    A file whose extension is ``.pdf`` (case-insensitive) is copied verbatim.
    Any other file is written in JPEG format under its original file name:
    from ``image`` when one is supplied, otherwise from the file opened at
    ``directory_in/img_file_name``.

    Args:
        directory_in: Directory that holds the source file.
        directory_out: Destination directory; created when it does not exist.
        img_file_name: File name including its extension.
        image: Optional already-loaded image exposing ``save(path, format)``.
            When given, it is saved instead of re-reading the source file.

    Returns:
        The full output path on success, or ``None`` when the image could not
        be read or written (the failure is logged, not raised).

    Raises:
        OSError: If copying a PDF fails; only the image branch is best-effort.
    """
    import logging

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory_out, exist_ok=True)

    full_path_in = os.path.join(directory_in, img_file_name)
    full_path_out = os.path.join(directory_out, img_file_name)

    if os.path.splitext(img_file_name)[1].lower() == '.pdf':
        copyfile(full_path_in, full_path_out)
        return full_path_out

    try:
        if image is None:
            # Use a distinct name so the `image` parameter is not shadowed.
            with Image.open(full_path_in) as opened_image:
                opened_image.save(full_path_out, "JPEG")
        else:
            image.save(full_path_out, "JPEG")
    except Exception as err:
        # Stay best-effort for callers (they get None), but record the cause
        # and let KeyboardInterrupt/SystemExit propagate.
        logging.getLogger(__name__).warning(
            "Could not save %s to %s: %s", full_path_in, full_path_out, err
        )
        return None

    return full_path_out
|
| 70 |
+
|
| 71 |
|
| 72 |
def save_uploaded_file(directory, img_file, image=None):
|
| 73 |
if not os.path.exists(directory):
|