import tempfile

import gradio as gr
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from PIL import Image
from transformers import GitConfig, GitForCausalLM, GitModel, GitProcessor

# Load models and processors.
# The captioning model is loaded as GitForCausalLM (not the bare GitModel)
# because caption generation needs the language-modelling head and
# ``generate()``.
git_config = GitConfig.from_pretrained("microsoft/git-large-r")
git_processor_large_textcaps = GitProcessor.from_pretrained("microsoft/git-large-r")
git_model_large_textcaps = GitForCausalLM.from_pretrained("microsoft/git-large-r")

# NOTE(review): confirm that this TF Hub handle resolves and that the model
# accepts the raw image handed over by Gradio.
itm_model = hub.load("https://tfhub.dev/google/LaViT/1")
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

# List of statements for Image-Text Matching
statements = [
    # (List of statements as provided in the original code)
]


# Function to generate image caption
def generate_caption(processor, model, image, max_length=50):
    """Generate a caption for *image*.

    Args:
        processor: Processor used to build the model inputs and to decode the
            generated token ids.
        model: Generative captioning model exposing ``generate()``.
        image: Image to caption, in any form the processor accepts.
        max_length: Upper bound on the generated sequence length.

    Returns:
        The first decoded caption, with special tokens removed.
    """
    inputs = processor(images=image, return_tensors="pt")
    # Decode autoregressively; an arg-max over a single forward pass does not
    # produce a caption.
    generated_ids = model.generate(
        pixel_values=inputs.pixel_values,
        max_length=max_length,
    )
    captions = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return captions[0]


# Function to compute textual similarity
def compute_textual_similarity(caption, statement):
    """Return the inner product of the sentence embeddings of two texts.

    Args:
        caption: Generated caption text.
        statement: Statement text to compare against the caption.

    Returns:
        The inner product of the two embeddings as a Python float.
    """
    caption_embedding = use_model([caption])[0].numpy()
    statement_embedding = use_model([statement])[0].numpy()
    # Both operands are single embedding vectors, so np.inner yields a 0-d
    # value; convert it directly instead of indexing into it.
    return float(np.inner(caption_embedding, statement_embedding))


# Function to compute ITM score
def compute_itm_score(image, statement):
    """Return the inner product of image features and a statement embedding.

    Args:
        image: Image passed as-is to ``itm_model``.
        statement: Statement text embedded with ``use_model``.

    Returns:
        The first element of the inner-product result as a Python float.
    """
    # NOTE(review): this assumes the image features and the sentence embedding
    # have the same dimensionality and live in a comparable space - confirm.
    image_features = itm_model(image)
    statement_features = use_model([statement])[0].numpy()
    similarity = np.inner(image_features, statement_features)
    # The rank of the result depends on the shape returned by itm_model, so
    # take the first element regardless of how many axes it has.
    return float(np.asarray(similarity).reshape(-1)[0])


# Function to save DataFrame to CSV
def save_dataframe_to_csv(df):
    """Serialize *df* to CSV text (without the index column)."""
    csv_data = df.to_csv(index=False)
    return csv_data


def _write_csv_to_temp_file(csv_text):
    """Write *csv_text* to a new temporary ``.csv`` file and return its path."""
    # delete=False keeps the file on disk after the handle is closed so that it
    # can still be served for download; nothing here removes it afterwards.
    with tempfile.NamedTemporaryFile(
        mode="w",
        suffix=".csv",
        delete=False,
        newline="",
        encoding="utf-8",
    ) as handle:
        handle.write(csv_text)
        return handle.name


# Main function to perform image captioning and image-text matching
def process_image_and_statements(image, file_name="uploaded_image"):
    """Caption *image* and score every entry of ``statements`` against it.

    Args:
        image: Uploaded image as delivered by the Gradio image input.
        file_name: Label stored in the 'Image File Name' column. The interface
            below supplies only the image, so a default is used when omitted.

    Returns:
        A ``(results_df, csv_path)`` tuple: the results table and the path of
        a temporary CSV file holding the same data, suitable for a file
        download output.
    """
    all_results_list = []
    caption = generate_caption(
        git_processor_large_textcaps, git_model_large_textcaps, image
    )

    for statement in statements:
        # Scores are scaled by 100 and rendered with a percent sign.
        textual_similarity_score = compute_textual_similarity(caption, statement) * 100
        itm_score_statement = compute_itm_score(image, statement) * 100
        # Equal-weight average of the two scores.
        final_score = 0.5 * textual_similarity_score + 0.5 * itm_score_statement

        all_results_list.append({
            'Image File Name': file_name,  # Include the image file name
            'Statement': statement,
            'Generated Caption': caption,
            'Textual Similarity Score': f"{textual_similarity_score:.2f}%",
            'ITM Score': f"{itm_score_statement:.2f}%",
            'Final Combined Score': f"{final_score:.2f}%",
        })

    results_df = pd.DataFrame(all_results_list)
    # The file output expects a path on disk rather than raw CSV text.
    csv_path = _write_csv_to_temp_file(save_dataframe_to_csv(results_df))
    return results_df, csv_path


# Gradio interface with a single image input; results are shown as a table and
# offered as a CSV download.
image_input = gr.inputs.Image(label="Upload Image", image_mode='RGB', source="upload")
output_df = gr.outputs.Dataframe(label="Results")
output_csv = gr.outputs.File(label="Download CSV")

iface = gr.Interface(
    fn=process_image_and_statements,
    inputs=image_input,
    outputs=[output_df, output_csv],
    title="Image Captioning and Image-Text Matching",
    theme='sudeepshouche/minimalist',
    css=".output { flex-direction: column; } .output .outputs { width: 100%; }",  # Custom CSS
    # NOTE(review): legacy flag - confirm the installed Gradio version still
    # accepts it.
    capture_session=True,
)

iface.launch(debug=True)