Spaces:
Sleeping
Sleeping
Update clip_model.py
Browse files: clip_model.py (+15 -8)
clip_model.py
CHANGED
@@ -4,7 +4,7 @@ from PIL import Image
|
|
4 |
from transformers import ChineseCLIPProcessor, ChineseCLIPModel
|
5 |
|
6 |
class ClipModel:
|
7 |
-
def __init__(self, model_name="OFA-Sys/chinese-clip-vit-base-patch16", model_path=None):
|
8 |
# Set device
|
9 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
10 |
|
@@ -19,13 +19,20 @@ class ClipModel:
|
|
19 |
|
20 |
self.processor = ChineseCLIPProcessor.from_pretrained(model_name)
|
21 |
|
22 |
-
|
23 |
-
# Load image
|
24 |
-
image = Image.open(image_path)
|
25 |
|
26 |
# Load Chinese vocabulary
|
27 |
with open(vocab_path, 'r', encoding='utf-8') as f:
|
28 |
-
vocab = [line.strip() for line in f.readlines()]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
# Process images and texts
|
31 |
batch_size = 16 # Process 16 vocab at a time
|
@@ -35,8 +42,8 @@ class ClipModel:
|
|
35 |
torch.cuda.empty_cache()
|
36 |
|
37 |
with torch.no_grad():
|
38 |
-
for i in range(0, len(vocab), batch_size):
|
39 |
-
batch_vocab = vocab[i:i + batch_size]
|
40 |
inputs = self.processor(
|
41 |
text=batch_vocab,
|
42 |
images=image,
|
@@ -56,7 +63,7 @@ class ClipModel:
|
|
56 |
|
57 |
# Find top-3 similarities
|
58 |
top_k_indices = torch.topk(similarity, top_k).indices.tolist()
|
59 |
-
top_k_words = [vocab[idx] for idx in top_k_indices]
|
60 |
|
61 |
# 6. 輸出最接近的前3名中文詞彙
|
62 |
return top_k_words
|
|
|
4 |
from transformers import ChineseCLIPProcessor, ChineseCLIPModel
|
5 |
|
6 |
class ClipModel:
|
7 |
+
def __init__(self, model_name="OFA-Sys/chinese-clip-vit-base-patch16", model_path=None, vocab_path='./chiikawa/word_list.txt'):
|
8 |
# Set device
|
9 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
10 |
|
|
|
19 |
|
20 |
self.processor = ChineseCLIPProcessor.from_pretrained(model_name)
|
21 |
|
22 |
+
print("***** Clip Model LOAD DONE *****")
|
|
|
|
|
23 |
|
24 |
# Load Chinese vocabulary
|
25 |
with open(vocab_path, 'r', encoding='utf-8') as f:
|
26 |
+
self.vocab = [line.strip() for line in f.readlines()]
|
27 |
+
|
28 |
+
def clip_result(self, image_path, top_k=3):
|
29 |
+
"""
|
30 |
+
給定圖片路徑,返回最接近的 top_k 詞彙
|
31 |
+
"""
|
32 |
+
# Load image
|
33 |
+
image = Image.open(image_path)
|
34 |
+
|
35 |
+
print(f"===== Clip Model_clip_result : {image_path} ===== ")
|
36 |
|
37 |
# Process images and texts
|
38 |
batch_size = 16 # Process 16 vocab at a time
|
|
|
42 |
torch.cuda.empty_cache()
|
43 |
|
44 |
with torch.no_grad():
|
45 |
+
for i in range(0, len(self.vocab), batch_size):
|
46 |
+
batch_vocab = self.vocab[i:i + batch_size]
|
47 |
inputs = self.processor(
|
48 |
text=batch_vocab,
|
49 |
images=image,
|
|
|
63 |
|
64 |
# Find top-3 similarities
|
65 |
top_k_indices = torch.topk(similarity, top_k).indices.tolist()
|
66 |
+
top_k_words = [self.vocab[idx] for idx in top_k_indices]
|
67 |
|
68 |
# 6. 輸出最接近的前3名中文詞彙
|
69 |
return top_k_words
|