Abe committed
Commit f90e7b1 · 1 Parent(s): f9091c4
Files changed (4):
  1. .gitignore +2 -0
  2. README.md +4 -2
  3. app.py +69 -0
  4. requirements.txt +7 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .idea
+ .venv
README.md CHANGED
@@ -7,7 +7,9 @@ sdk: gradio
  sdk_version: 5.27.1
  app_file: app.py
  pinned: false
- short_description: generate video prompts from text or text-image
+ short_description: generate video prompts or captions from text-image
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ A CPU-based image labeller built on `Salesforce/blip-image-captioning-base`, which can be used for training-data generation.
+
+ [Justlab.ai](https://justlab.ai)
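The README's "prompts or captions" wording maps to BLIP's two captioning modes: unconditional (free caption) and conditional (caption steered by a text prefix). A minimal sketch of both outside the Gradio wrapper, assuming a local `frame.jpg` exists:

```python
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

image = Image.open("frame.jpg").convert("RGB")  # hypothetical local file

# Unconditional caption: the model describes the image freely.
inputs = processor(image, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(out[0], skip_special_tokens=True))

# Conditional caption: a text prefix steers the output, e.g. toward a video prompt.
inputs = processor(image, text="a video of", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(out[0], skip_special_tokens=True))
```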
app.py ADDED
@@ -0,0 +1,69 @@
+ import gradio as gr
+ import torch
+ from transformers import BlipProcessor, BlipForConditionalGeneration
+ from PIL import Image
+ import numpy as np
+
+ # Initialize model and processor globally; the base checkpoint is much smaller than the -large variant
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+ model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
+ # Move to GPU if available, otherwise stay on CPU
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model.to(device)
+
+ def process_input(image, text=""):
+     """Process an image and optional text input to generate a description"""
+     try:
+         # Convert numpy array to PIL Image
+         if isinstance(image, np.ndarray):
+             pil_image = Image.fromarray(image)
+         else:
+             return "Please provide a valid image"
+
+         # Use the prompt as a conditional prefix if provided
+         conditional_text = text if text else "a video of"
+
+         # Preprocess the image and prompt
+         inputs = processor(
+             pil_image,
+             text=conditional_text,
+             return_tensors="pt"
+         ).to(device)
+
+         # Generate with beam search and a repetition penalty
+         output = model.generate(
+             **inputs,
+             max_new_tokens=100,
+             num_beams=5,
+             length_penalty=1.0,
+             repetition_penalty=1.5
+         )
+
+         # Decode tokens back to text
+         result = processor.decode(output[0], skip_special_tokens=True)
+
+         return result.strip()
+
+     except Exception as e:
+         return f"Error processing input: {str(e)}"
+
+ # Create Gradio interface
+ demo = gr.Interface(
+     fn=process_input,
+     inputs=[
+         gr.Image(type="numpy", label="Upload Image"),
+         gr.Textbox(
+             label="Prompt (Optional)",
+             placeholder="Guide the description or leave empty for automatic caption",
+             lines=2
+         ),
+     ],
+     outputs=gr.Textbox(label="Generated Description", lines=6),
+     title="Scene Description Generator",
+     description="Upload an image and optionally add a prompt to guide the description. Created by <a href='https://justlab.ai'>Justlab.ai</a>",
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
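Once `demo.launch()` is running, the interface can also be driven programmatically. A sketch using `gradio_client`; the local URL, the test image path, and the `/predict` endpoint name (the `gr.Interface` default) are assumptions, not part of this commit:

```python
from gradio_client import Client, handle_file

client = Client("http://127.0.0.1:7860/")  # assumed local launch URL
result = client.predict(
    handle_file("frame.jpg"),  # hypothetical test image for the gr.Image input
    "a video of",              # the optional prompt textbox
    api_name="/predict",       # gr.Interface's default endpoint name
)
print(result)
```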
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio==5.27.1
+ # Model requirements
+ transformers>=4.45.0
+ Pillow~=11.2.1
+ requests
+ torch~=2.7.0
+ numpy~=2.2.5
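For the training-data generation use case the README mentions, these same pinned dependencies can drive a simple offline batch loop without the Gradio UI. A sketch, where the `images/` input directory and the TSV output file are hypothetical:

```python
from pathlib import Path
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Caption every JPEG and write image/caption pairs as TSV training labels.
with open("labels.tsv", "w") as f:
    for path in sorted(Path("images").glob("*.jpg")):
        inputs = processor(Image.open(path).convert("RGB"), return_tensors="pt")
        out = model.generate(**inputs, max_new_tokens=50)
        caption = processor.decode(out[0], skip_special_tokens=True)
        f.write(f"{path.name}\t{caption}\n")
```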