image-captioning

Runtime error

App Files Files Community

NicolasVana committed on Dec 5, 2022

Commit

32162b0

1 Parent(s): 9c96e70

Upload 120 files

Browse files

Adding our own model

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +3 -0
Inception/PretrainedInceptionLSTM/Model/keras_metadata.pb +3 -0
Inception/PretrainedInceptionLSTM/Model/saved_model.pb +3 -0
Inception/PretrainedInceptionLSTM/Model/variables/variables.data-00000-of-00001 +3 -0
Inception/PretrainedInceptionLSTM/Model/variables/variables.index +0 -0
Inception/PretrainedInceptionLSTM/index2Word.npy +0 -0
Inception/PretrainedInceptionLSTM/variable_params.npy +0 -0
Inception/PretrainedInceptionLSTM/word2Index.npy +0 -0
Inception/RetrainedInceptionFeatureExtraction/Model/keras_metadata.pb +3 -0
Inception/RetrainedInceptionFeatureExtraction/Model/saved_model.pb +3 -0
Inception/RetrainedInceptionFeatureExtraction/Model/variables/variables.data-00000-of-00001 +3 -0
Inception/RetrainedInceptionFeatureExtraction/Model/variables/variables.index +0 -0
Inception/RetrainedInceptionLSTM/Model/keras_metadata.pb +3 -0
Inception/RetrainedInceptionLSTM/Model/saved_model.pb +3 -0
Inception/RetrainedInceptionLSTM/Model/variables/variables.data-00000-of-00001 +3 -0
Inception/RetrainedInceptionLSTM/Model/variables/variables.index +0 -0
Inception/RetrainedInceptionLSTM/index2Word.npy +0 -0
Inception/RetrainedInceptionLSTM/variable_params.npy +0 -0
Inception/RetrainedInceptionLSTM/word2Index.npy +0 -0
app.py +27 -32
model.py +149 -57
samples/ROCO_00001.jpg +0 -0
samples/ROCO_00006.jpg +0 -0
samples/ROCO_00016.jpg +0 -0
samples/ROCO_00025.jpg +0 -0
samples/ROCO_00031.jpg +0 -0
samples/ROCO_00036.jpg +0 -0
samples/ROCO_00061.jpg +0 -0
samples/ROCO_00084.jpg +0 -0
samples/ROCO_00138.jpg +0 -0
samples/ROCO_00153.jpg +0 -0
samples/ROCO_00176.jpg +0 -0
samples/ROCO_00185.jpg +0 -0
samples/ROCO_00190.jpg +0 -0
samples/ROCO_00206.jpg +0 -0
samples/ROCO_00218.jpg +0 -0
samples/ROCO_00251.jpg +0 -0
samples/ROCO_00258.jpg +0 -0
samples/ROCO_00261.jpg +0 -0
samples/ROCO_00264.jpg +0 -0
samples/ROCO_00271.jpg +0 -0
samples/ROCO_00300.jpg +0 -0
samples/ROCO_00302.jpg +0 -0
samples/ROCO_00303.jpg +0 -0
samples/ROCO_00307.jpg +0 -0
samples/ROCO_00316.jpg +0 -0
samples/ROCO_00319.jpg +0 -0
samples/ROCO_00328.jpg +0 -0
samples/ROCO_00332.jpg +0 -0
samples/ROCO_00333.jpg +0 -0

.gitattributes CHANGED Viewed

@@ -14,3 +14,6 @@
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text

 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
+Inception/PretrainedInceptionLSTM/Model/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+Inception/RetrainedInceptionFeatureExtraction/Model/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+Inception/RetrainedInceptionLSTM/Model/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text

Inception/PretrainedInceptionLSTM/Model/keras_metadata.pb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:90fe3518b5f0e26908c460bc876abaef2017a5252faea2854e19e6bbc80c1abb
+size 19875

Inception/PretrainedInceptionLSTM/Model/saved_model.pb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5ac9410ec5d75b446ba1913ce546556b276f4f7243c6b84692dfe71d04785eb1
+size 2728089

Inception/PretrainedInceptionLSTM/Model/variables/variables.data-00000-of-00001 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:607eba2255866ff15c9be9dbc271e24c643b9c5650b5b36bd22c6f1ad461c443
+size 23853510

Inception/PretrainedInceptionLSTM/Model/variables/variables.index ADDED Viewed

Binary file (2.07 kB). View file

Inception/PretrainedInceptionLSTM/index2Word.npy ADDED Viewed

Binary file (91.1 kB). View file

Inception/PretrainedInceptionLSTM/variable_params.npy ADDED Viewed

Binary file (327 Bytes). View file

Inception/PretrainedInceptionLSTM/word2Index.npy ADDED Viewed

Binary file (91.1 kB). View file

Inception/RetrainedInceptionFeatureExtraction/Model/keras_metadata.pb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9b304413d09ac695dc11a96b0305ffb4e41f34f145b90a536ed4c929c11c7306
+size 974015

Inception/RetrainedInceptionFeatureExtraction/Model/saved_model.pb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:57d9834d47ee681be13d8ecf60b93770a30feb9d655dea12b78c0f0f7e1c845a
+size 6312206

Inception/RetrainedInceptionFeatureExtraction/Model/variables/variables.data-00000-of-00001 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:12cf113be83ae0bc7024191ae51b1e41c2c016d5543c3711e0bab928904eaeab
+size 279976841

Inception/RetrainedInceptionFeatureExtraction/Model/variables/variables.index ADDED Viewed

Binary file (50.2 kB). View file

Inception/RetrainedInceptionLSTM/Model/keras_metadata.pb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e29ab07736ef18245cac5040bf1dd2100d21e8084ed51db859064026a1a0fba4
+size 19858

Inception/RetrainedInceptionLSTM/Model/saved_model.pb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e40821682b6a5e4b88848c9ec60bd8400cf2a37065137871f59112d77d027c65
+size 2727709

Inception/RetrainedInceptionLSTM/Model/variables/variables.data-00000-of-00001 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4ecad5c20713acfd90563bc562f048e9cc302936b162e2e196f37d38922a0dca
+size 18577366

Inception/RetrainedInceptionLSTM/Model/variables/variables.index ADDED Viewed

Binary file (2.07 kB). View file

Inception/RetrainedInceptionLSTM/index2Word.npy ADDED Viewed

Binary file (91.1 kB). View file

Inception/RetrainedInceptionLSTM/variable_params.npy ADDED Viewed

Binary file (327 Bytes). View file

Inception/RetrainedInceptionLSTM/word2Index.npy ADDED Viewed

Binary file (91.1 kB). View file

app.py CHANGED Viewed

@@ -4,21 +4,19 @@ import io
 # Designing the interface
-st.title("🖼️ Image Captioning Demo 📝")
-st.write("[Yih-Dar SHIEH](https://huggingface.co/ydshieh)")
 st.sidebar.markdown(
     """
-    An image captioning model by combining ViT model with GPT2 model.
-    The encoder (ViT) and decoder (GPT2) are combined using Hugging Face transformers' [Vision-To-Text Encoder-Decoder
-    framework](https://huggingface.co/transformers/master/model_doc/visionencoderdecoder.html).
-    The pretrained weights of both models are loaded, with a set of randomly initialized cross-attention weights.
-    The model is trained on the COCO 2017 dataset for about 6900 steps (batch_size=256).
-    [Follow-up work of [Huggingface JAX/Flax event](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/).]\n
     """
 )
-with st.spinner('Loading and compiling ViT-GPT2 model ...'):
     from model import *
 random_image_id = get_random_image_id()
@@ -29,7 +27,17 @@ sample_image_id = st.sidebar.selectbox(
     sample_image_ids
 )
-if st.sidebar.button("Random COCO 2017 (val) images"):
     random_image_id = get_random_image_id()
     sample_image_id = "None"
@@ -51,47 +59,34 @@ else:
         assert type(sample_image_id) == int
         image_id = sample_image_id
-    sample_name = f"COCO_val2017_{str(image_id).zfill(12)}.jpg"
     sample_path = os.path.join(sample_dir, sample_name)
     if bytes_data is not None:
         image = Image.open(bytes_data)
     elif os.path.isfile(sample_path):
         image = Image.open(sample_path)
-    else:
-        url = f"http://images.cocodataset.org/val2017/{str(image_id).zfill(12)}.jpg"
-        image = Image.open(requests.get(url, stream=True).raw)
-    width, height = image.size
     resized = image.resize(size=(width, height))
-    if height > 384:
-        width = int(width / height * 384)
-        height = 384
-        resized = resized.resize(size=(width, height))
-    width, height = resized.size
-    if width > 512:
-        width = 512
-        height = int(height / width * 512)
-        resized = resized.resize(size=(width, height))
     if bytes_data is None:
-        st.markdown(f"[{str(image_id).zfill(12)}.jpg](http://images.cocodataset.org/val2017/{str(image_id).zfill(12)}.jpg)")
     show = st.image(resized)
     show.image(resized, '\n\nSelected Image')
-    resized.close()
     # For newline
     st.sidebar.write('\n')
     with st.spinner('Generating image caption ...'):
-        caption = predict(image)
-        caption_en = caption
         st.header(f'Predicted caption:\n\n')
-        st.subheader(caption_en)
-    st.sidebar.header("ViT-GPT2 predicts: ")
     st.sidebar.write(f"{caption}")
     image.close()

 # Designing the interface
+st.title("Medical Image Captioning")
 st.sidebar.markdown(
     """
+    This project features 3 different medical image captioning models.
+    Two of them use the InceptionV3 architecture to do feature extraction and then generate the captions using an LSTM model.
+    The difference between these two is that the first one uses InceptionV3 trained on ImageNet data and outputs 2048 features.
+    The second one is based on a retrained version of InceptionV3 that uses the CUI data from the ROCO dataset to extract 745 features from the images.
+    The final model is transformer based on...
     """
 )
+with st.spinner('Loading objects ...'):
     from model import *
 random_image_id = get_random_image_id()
     sample_image_ids
 )
+st.sidebar.title("Select a model Type")
+model_type = st.sidebar.selectbox(
+    "Please choose a model",
+    ['Pretrained Inception', 'Retrained Inception', 'Transformer']
+)
+inception, lstm = fetch_model(model_type)
+word2Index, index2Word, variable_params = fetch_auxiliary_files(model_type)
+max_len = variable_params['max_caption_len']
+if st.sidebar.button("Random ROCO (test) images"):
     random_image_id = get_random_image_id()
     sample_image_id = "None"
         assert type(sample_image_id) == int
         image_id = sample_image_id
+    sample_name = f"ROCO_{str(image_id).zfill(5)}.jpg"
     sample_path = os.path.join(sample_dir, sample_name)
     if bytes_data is not None:
         image = Image.open(bytes_data)
     elif os.path.isfile(sample_path):
         image = Image.open(sample_path)
+    width, height = 299, 299
     resized = image.resize(size=(width, height))
     if bytes_data is None:
+        st.markdown(f"ROCO_{str(image_id).zfill(5)}.jpg")
     show = st.image(resized)
     show.image(resized, '\n\nSelected Image')
     # For newline
     st.sidebar.write('\n')
     with st.spinner('Generating image caption ...'):
         st.header(f'Predicted caption:\n\n')
+        preprocessed_img = preprocess_image_inception(resized)
+        features = extract_features(inception, preprocessed_img)
+        caption = generate_caption(lstm, features, max_len, word2Index, index2Word)
+        st.subheader(caption)
+    st.sidebar.header("Model predicts: ")
     st.sidebar.write(f"{caption}")
     image.close()

model.py CHANGED Viewed

@@ -1,68 +1,160 @@
 import json
 import os, shutil
 import random
 from PIL import Image
-import jax
-from transformers import FlaxVisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
-from huggingface_hub import hf_hub_download
-# create target model directory
-model_dir = './models/'
-os.makedirs(model_dir, exist_ok=True)
-files_to_download = [
-    "config.json",
-    "flax_model.msgpack",
-    "merges.txt",
-    "special_tokens_map.json",
-    "tokenizer.json",
-    "tokenizer_config.json",
-    "vocab.json",
-    "preprocessor_config.json",
-]
-# copy files from checkpoint hub:
-for fn in files_to_download:
-    file_path = hf_hub_download("ydshieh/vit-gpt2-coco-en-ckpts", f"ckpt_epoch_3_step_6900/{fn}")
-    shutil.copyfile(file_path, os.path.join(model_dir, fn))
-model = FlaxVisionEncoderDecoderModel.from_pretrained(model_dir)
-feature_extractor = ViTFeatureExtractor.from_pretrained(model_dir)
-tokenizer = AutoTokenizer.from_pretrained(model_dir)
-max_length = 16
-num_beams = 4
-gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
-@jax.jit
-def generate(pixel_values):
-    output_ids = model.generate(pixel_values, **gen_kwargs).sequences
-    return output_ids
-def predict(image):
     if image.mode != "RGB":
         image = image.convert(mode="RGB")
-    pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
-    output_ids = generate(pixel_values)
-    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-    preds = [pred.strip() for pred in preds]
-    return preds[0]
 def _compile():
-    image_path = 'samples/val_000000039769.jpg'
     image = Image.open(image_path)
-    predict(image)
     image.close()
@@ -70,13 +162,13 @@ _compile()
 sample_dir = './samples/'
-sample_image_ids = tuple(["None"] + [int(f.replace('COCO_val2017_', '').replace('.jpg', '')) for f in os.listdir(sample_dir) if f.startswith('COCO_val2017_')])
-with open(os.path.join(sample_dir, "coco-val2017-img-ids.json"), "r", encoding="UTF-8") as fp:
-    coco_2017_val_image_ids = json.load(fp)
 def get_random_image_id():
-    image_id = random.sample(coco_2017_val_image_ids, k=1)[0]
     return image_id

 import json
 import os, shutil
 import random
+import streamlit as st
+import os
+from pathlib import Path
+import numpy as np
 from PIL import Image
+import tensorflow as tf
+from tensorflow.keras.applications.inception_v3 import preprocess_input
+from tensorflow.keras.preprocessing import image
+from tensorflow.keras.applications.inception_v3 import InceptionV3
+from tensorflow.keras.models import Model
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+root = Path(os.getcwd())
+aux_pre = root / 'Inception' / 'PretrainedInceptionLSTM'
+aux_re = root / 'Inception' / 'RetrainedInceptionLSTM'
+model_re_path = root / 'Inception' / 'RetrainedInceptionLSTM' / 'Model'
+model_inception_path = root / 'Inception' / 'RetrainedInceptionFeatureExtraction' / 'Model'
+model_pre_path = root / 'Inception' / 'PretrainedInceptionLSTM' / 'Model'
+# Must create
+def get_pretrained_inceptionV3():
+    """Build the ImageNet-pretrained InceptionV3 feature extractor.
+
+    Returns a Keras ``Model`` that shares the input of the stock InceptionV3
+    network but ends at its second-to-last layer, so predicting with it
+    yields that layer's activations rather than the final layer's output.
+    """
+    backbone = InceptionV3(weights='imagenet')
+    penultimate_output = backbone.layers[-2].output
+    return Model(backbone.input, penultimate_output)
+def fetch_auxiliary_files(type):
+    """Load the vocabulary mappings and saved parameters for a caption model.
+
+    Args:
+        type: Model type label, either 'Pretrained Inception' or
+            'Retrained Inception'.
+
+    Returns:
+        A ``(word2Index, index2Word, variable_params)`` tuple, each element
+        being the object stored in the corresponding ``.npy`` file.
+
+    Raises:
+        ValueError: If ``type`` has no auxiliary files. Previously the
+            function fell through and returned ``None``, which surfaced in
+            the caller as an opaque tuple-unpacking error.
+    """
+    aux_dirs = {
+        'Pretrained Inception': aux_pre,
+        'Retrained Inception': aux_re,
+    }
+    try:
+        aux_dir = aux_dirs[type]
+    except KeyError:
+        raise ValueError(
+            f"No auxiliary files for model type {type!r}; "
+            f"expected one of {sorted(aux_dirs)}"
+        ) from None
+    # NOTE: allow_pickle=True unpickles arbitrary objects. That is acceptable
+    # only because these files ship with the app; never point it at
+    # user-supplied files.
+    word2Index, index2Word, variable_params = (
+        np.load(aux_dir / file_name, allow_pickle=True).item()
+        for file_name in ("word2Index.npy", "index2Word.npy", "variable_params.npy")
+    )
+    return word2Index, index2Word, variable_params
+@st.cache(allow_output_mutation=True, show_spinner=False)
+def fetch_model(type):
+    """Load the feature extractor and caption model for a model type.
+
+    The result is cached by Streamlit, so each model type is loaded once.
+
+    Args:
+        type: Model type label, either 'Pretrained Inception' or
+            'Retrained Inception'.
+
+    Returns:
+        A ``(feature_extractor, caption_model)`` tuple of Keras models.
+
+    Raises:
+        ValueError: If ``type`` is not a supported label. Previously the
+            function returned ``None`` for such a label, which surfaced in
+            the caller as an opaque tuple-unpacking error.
+    """
+    with st.spinner(text="Fetching Model"):
+        if type == 'Pretrained Inception':
+            model_pre = tf.keras.models.load_model(model_pre_path)
+            model_inc = get_pretrained_inceptionV3()
+            return model_inc, model_pre
+        if type == 'Retrained Inception':
+            model_re = tf.keras.models.load_model(model_re_path)
+            model_inc = tf.keras.models.load_model(model_inception_path)
+            return model_inc, model_re
+    raise ValueError(
+        f"Unsupported model type {type!r}; expected "
+        "'Pretrained Inception' or 'Retrained Inception'"
+    )
+def preprocess_image_inception(image):
+    """Turn a PIL image into a single-image batch for InceptionV3.
+
+    Args:
+        image: A PIL image of any mode and size.
+
+    Returns:
+        An array of shape ``(1, 299, 299, 3)`` that has been passed through
+        the InceptionV3 ``preprocess_input`` function.
+    """
     if image.mode != "RGB":
         image = image.convert(mode="RGB")
+    # Resize here instead of relying on the caller: the previous hard reshape
+    # to (1, 299, 299, 3) failed (or scrambled pixels) for any other size.
+    if image.size != (299, 299):
+        image = image.resize(size=(299, 299))
+    x = np.array(image)
+    x = np.expand_dims(x, axis=0)
+    x = preprocess_input(x)
+    return x
+def extract_features(model, image):
+    """Return ``model.predict`` for the given image batch, with progress output disabled."""
+    return model.predict(image, verbose=0)
+def generate_caption(model, features, max_len, word2Index, index2Word, beam_index = 3):
+    """Generate a caption for the given image features.
+
+    Thin wrapper that forwards every argument to ``beam_search``;
+    ``beam_index`` is the beam width and defaults to 3.
+    """
+    return beam_search(model, features, max_len, word2Index, index2Word, beam_index)
+def beam_search(model, features, max_len, word2Index, index2Word, beam_index):
+    """Decode a caption for one image with beam search.
+
+    Args:
+        model: Caption model; ``model.predict([features, sequences], verbose=0)``
+            must return one row of next-token probabilities per input sequence.
+        features: Features of a single image. They are tiled ``beam_index``
+            times along axis 0 so each live beam gets a copy (assumes a
+            leading batch axis of size 1 -- TODO confirm against the extractor).
+        max_len: Length the token sequences are padded to; a beam that
+            reaches this length is treated as finished.
+        word2Index: Mapping from token string to id; must contain "startseq".
+        index2Word: Mapping from id to token string.
+        beam_index: Beam width, i.e. how many hypotheses are kept in total.
+
+    Returns:
+        The best-scoring caption as a space-separated string, without the
+        leading "startseq" token and without a trailing "endseq".
+
+    Raises:
+        ValueError: If ``beam_index`` is smaller than 1.
+    """
+    if beam_index < 1:
+        raise ValueError("beam_index must be at least 1")
+    # Scores are summed log-probabilities. This ranks beams exactly like the
+    # product of probabilities but cannot underflow to 0 on long captions.
+    smallest = np.finfo(np.float64).tiny
+    live = [([word2Index["startseq"]], 0.0)]
+    finished = []
+    features = np.tile(features, (beam_index, 1))
+    width = beam_index
+    while live:
+        # One padded row per live beam, ready to be fed to the model.
+        padded_seqs = pad_sequences([seq for seq, _ in live], maxlen=max_len, padding='post')
+        preds = model.predict([features[:len(live)], padded_seqs], verbose=0)
+        # Expand every live beam with its `width` most probable next tokens.
+        candidates = []
+        for (seq, score), pred in zip(live, preds):
+            for w in np.argsort(pred)[-width:]:
+                step = float(np.log(max(float(pred[w]), smallest)))
+                candidates.append((seq + [int(w)], score + step))
+        # Keep the `width` best candidates overall.
+        candidates.sort(key=lambda c: c[1])
+        live = []
+        for seq, score in candidates[-width:]:
+            if index2Word[seq[-1]] == 'endseq':
+                # Finished normally: store it without the end marker.
+                finished.append((seq[:-1], score))
+            elif len(seq) >= max_len:
+                # Ran out of room: store it as is.
+                finished.append((seq, score))
+            else:
+                live.append((seq, score))
+        # Every finished beam permanently uses up one slot of the beam width.
+        width = len(live)
+    # Between all the finished sequences, pick the most probable one.
+    best_seq, _ = max(finished, key=lambda c: c[1])
+    # Convert to readable text, skipping the leading "startseq" token.
+    return ' '.join(index2Word[i] for i in best_seq[1:])
+# # create target model directory
+# model_dir = './models/'
+# os.makedirs(model_dir, exist_ok=True)
+#
+# files_to_download = [
+#     "config.json",
+#     "flax_model.msgpack",
+#     "merges.txt",
+#     "special_tokens_map.json",
+#     "tokenizer.json",
+#     "tokenizer_config.json",
+#     "vocab.json",
+#     "preprocessor_config.json",
+# ]
 def _compile():
+    # Opens one hard-coded sample image and closes it again. The prediction
+    # call is commented out, so nothing is actually run on the image.
+    # NOTE(review): confirm that samples/ROCO_00929.jpg ships with the app;
+    # presumably Image.open fails here if the file is missing.
+    image_path = 'samples/ROCO_00929.jpg'
     image = Image.open(image_path)
+    #predict(image)
     image.close()
 sample_dir = './samples/'
+# Selectable sample ids: the string "None" followed by the integer left after
+# removing 'ROCO_' and '.jpg' from every file in sample_dir whose name starts
+# with 'ROCO_'.
+# NOTE(review): int() raises for any such file whose remainder is not purely
+# numeric (e.g. a different extension) -- confirm the folder only holds images.
+sample_image_ids = tuple(["None"] + [int(f.replace('ROCO_', '').replace('.jpg', '')) for f in os.listdir(sample_dir) if f.startswith('ROCO_')])
+# Ids that get_random_image_id draws from, read from a JSON file in sample_dir.
+with open(os.path.join(sample_dir, "Roco-img-ids.json"), "r", encoding="UTF-8") as fp:
+    roco_image_ids = json.load(fp)
 def get_random_image_id():
+    # Draw a single id at random from roco_image_ids.
+    image_id = random.sample(roco_image_ids, k=1)[0]
     return image_id