Commit · e548f8b
1 Parent(s): f569b1b

added stuff

Files changed:
- app.py +47 -11
- images/4645808729_2dfc59b6a5_z.jpg +0 -0
- images/5944609705_4664531909_z.jpg +0 -0
- images/mhJ2yWNwMtNcmijZqVEDDW-320-80.jpg +0 -0
app.py CHANGED

@@ -7,20 +7,22 @@ import os
 login(token=os.environ["HUGGINGFACE_TOKEN"])

 demo_imgs = [
-    ["images/chinchilla_web-1024x683.jpg", "images/shiba-inu-dog-in-the-snow.jpg"],
-    ["images/900.jpeg", "images/hummus.jpg"],
-    ["images/COCO_train2014_000000572279.jpg", "images/COCO_train2014_000000194806.jpg"],
+    ["images/chinchilla_web-1024x683.jpg", "images/shiba-inu-dog-in-the-snow.jpg", "images/900.jpeg", "images/dogs.jpeg"],
+    ["images/900.jpeg", "images/hummus.jpg", "images/london-underground-sign.jpg", "images/COCO_train2014_000000194806.jpg"],
+    ["images/COCO_train2014_000000572279.jpg", "images/COCO_train2014_000000194806.jpg", "images/istockphoto-622434332-1024x1024.jpg", "images/11887_pesto-pasta_Rita-1x1-1-501c953b29074ab193e2b5ad36e64648.jpg"],
     [
         "images/bcee7a-20190225-a-london-underground-sign.jpg",
         "images/istockphoto-622434332-1024x1024.jpg",
     ],
-    ["images/dogs.jpeg", "images/pandas.jpg"],
+    ["images/dogs.jpeg", "images/pandas.jpg", "images/900.jpeg", "images/mhJ2yWNwMtNcmijZqVEDDW-320-80.jpg"],
     ["images/11887_pesto-pasta_Rita-1x1-1-501c953b29074ab193e2b5ad36e64648.jpg", "images/hummus.jpg"],
 ]
 demo_texts = [
     [
         "Output: This is a chinchilla. They are mainly found in Chile.",
         "Output: This is a shiba. They are very popular in Japan.",
+        "Output: This is a flamingo. They are found in South America.",
+        "Output: These are labrador retrievers. They are found in the UK.",
     ],
     [
         "Output: a pink flamingo standing in a body of water.",

@@ -31,9 +33,11 @@ demo_texts = [
     [
         "Question: Describe the scene. Answer: A white airplane being repaired on the runway. 'Cargo' is written on it in red.",
         "Question: What is the man trying to catch? Answer: The man is catching a white kite that his friend is flying. The two men are on a beach.",
+        "Question: What does the sign say? Answer: Congress Ave",
+        "Question: What is this dish? Answer: This is pesto pasta topped with cheese and basil.",
     ],
     ['Output: "Underground"', 'Output: "Congress Ave"'],
-    ["Output: 2 dogs", "Output: 3 pandas"],
+    ["Output: 2 dogs", "Output: 3 pandas", "Output: 1 flamingo", "Output: 5 fingers"],
 ]

 # cd to open_flamingo dir and pip install .

@@ -50,12 +54,12 @@ with open("bad_words.txt", "r") as f:
 model, image_processor, tokenizer = create_model_and_transforms(
     clip_vision_encoder_pretrained="openai",
     clip_vision_encoder_path="ViT-L-14",
-    lang_encoder_path="togethercomputer/RedPajama-INCITE-
-    tokenizer_path="togethercomputer/RedPajama-INCITE-
+    lang_encoder_path="togethercomputer/RedPajama-INCITE-Instruct-3B-v1",
+    tokenizer_path="togethercomputer/RedPajama-INCITE-Instruct-3B-v1",
     cross_attn_every_n_layers=2,
 )

-checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-4B-vitl-rpj3b", "checkpoint.pt")
+checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-4B-vitl-rpj3b-langinstruct", "checkpoint.pt")
 model.load_state_dict(torch.load(checkpoint_path), strict=False)

 model.eval()

@@ -97,6 +101,28 @@ def generate(
         if example_two_text is None
         else f"Output: {example_two_text}"
     )
+
+    if idx != -1:
+        example_three_image = (
+            Image.open(demo_imgs[idx][2])
+            if example_three_image is None
+            else example_three_image
+        )
+        example_three_text = (
+            demo_texts[idx][2]
+            if example_three_text is None
+            else f"Output: {example_three_text}"
+        )
+        example_four_image = (
+            Image.open(demo_imgs[idx][3])
+            if example_four_image is None
+            else example_four_image
+        )
+        example_four_text = (
+            demo_texts[idx][3]
+            if example_four_text is None
+            else f"Output: {example_four_text}"
+        )

     if (
         example_one_image is None

@@ -107,6 +133,10 @@ def generate(
         raise gr.Error("Please fill in all the fields (image and text).")

     demo_plus_text = f"<image>{example_one_text}<|endofchunk|><image>{example_two_text}<|endofchunk|>"
+
+    if idx != -1:
+        demo_plus_text += f"<image>{example_three_text}<|endofchunk|><image>{example_four_text}<|endofchunk|>"
+
     demo_plus_text += (
         "<image>Output:" if idx != 2 else f"<image>Question: {text.strip()} Answer:"
     )

@@ -117,7 +147,14 @@ def generate(
     input_ids = lang_x["input_ids"]
     attention_mask = lang_x["attention_mask"]

-    vision_x = [image_processor(example_one_image).unsqueeze(0), image_processor(example_two_image).unsqueeze(0)
+    vision_x = [image_processor(example_one_image).unsqueeze(0), image_processor(example_two_image).unsqueeze(0)]
+
+    if idx != -1:
+        vision_x.append(image_processor(example_three_image).unsqueeze(0))
+        vision_x.append(image_processor(example_four_image).unsqueeze(0))
+
+    vision_x.append(image_processor(image).unsqueeze(0))
+
     vision_x = torch.cat(vision_x, dim=0)
     vision_x = vision_x.unsqueeze(1).unsqueeze(0)
     print(vision_x.shape)

@@ -165,12 +202,11 @@ def generate(


 with gr.Blocks() as demo:
-    # As a consequence, you should treat this model as a research prototype and not as a production-ready model. Before using this demo please familiarize yourself with our [model card](https://github.com/mlfoundations/open_flamingo/blob/main/MODEL_CARD.md) and [terms and conditions](https://github.com/mlfoundations/open_flamingo/blob/main/TERMS_AND_CONDITIONS.md)
     gr.Markdown(
         """
         # 🦩 OpenFlamingo Demo

-        Blog posts: #1 [An open-source framework for training vision-language models with in-context learning](https://laion.ai/blog/open-flamingo/) // #2 [OpenFlamingo v2: New Models and Enhanced Training Setup]()
+        Blog posts: #1 [An open-source framework for training vision-language models with in-context learning](https://laion.ai/blog/open-flamingo/) // #2 [OpenFlamingo v2: New Models and Enhanced Training Setup]()
         GitHub: [open_flamingo](https://github.com/mlfoundations/open_flamingo)

         In this demo we implement an interactive interface that showcases the in-context learning capabilities of the OpenFlamingo-4B model, a large multimodal model trained on top of
images/4645808729_2dfc59b6a5_z.jpg ADDED
images/5944609705_4664531909_z.jpg ADDED
images/mhJ2yWNwMtNcmijZqVEDDW-320-80.jpg ADDED