qqc1989 commited on 6 days ago

Commit

03a2d97

verified ·

1 Parent(s): 451c293

Upload 21 files

Browse files

Files changed (22) hide show

.gitattributes +3 -0
README.md +152 -3
cnclip/cn_vocab.txt +0 -0
cnclip/cnclip_vit_l14_336px_text_u16.axmodel +3 -0
cnclip/cnclip_vit_l14_336px_vision_u16u8.axmodel +3 -0
coco_1000.tar +3 -0
config.json +137 -0
gradio_01.png +3 -0
install/examples/cmdline.hpp +732 -0
install/examples/test_ax_api.cpp +37 -0
install/examples/test_axcl_api.cpp +32 -0
install/examples/test_enum_devices.cpp +59 -0
install/examples/test_load_model.cpp +84 -0
install/examples/test_match_by_text.cpp +131 -0
install/examples/timer.hpp +61 -0
install/include/clip.h +203 -0
install/lib/aarch64/libclip.so +3 -0
pyclip/example.py +51 -0
pyclip/gradio_example.png +3 -0
pyclip/gradio_example.py +80 -0
pyclip/pyclip.py +260 -0
pyclip/requirements.txt +4 -0

.gitattributes CHANGED Viewed

@@ -35,3 +35,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.axmodel filter=lfs diff=lfs merge=lfs -text

 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.axmodel filter=lfs diff=lfs merge=lfs -text
+gradio_01.png filter=lfs diff=lfs merge=lfs -text
+install/lib/aarch64/libclip.so filter=lfs diff=lfs merge=lfs -text
+pyclip/gradio_example.png filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,3 +1,152 @@
----
-license: mit
----

+---
+license: mit
+language:
+- en
+- zh
+base_model:
+- OFA-Sys/chinese-clip-vit-large-patch14-336px
+- AXERA-TECH/cnclip
+tags:
+- CLIP
+- CN_CLIP
+---
+# LibCLIP
+This SDK enables efficient text-to-image retrieval using CLIP (Contrastive Language–Image Pretraining), optimized for Axera’s NPU-based SoC platforms including AX650, AX650C, AX8850, and AX650A, or Axera's dedicated AI accelerator.
+With this SDK, you can:
+- Perform semantic image search by providing natural language queries.
+- Utilize CLIP to embed text queries and compare them against a pre-computed set of image embeddings.
+- Run all inference processes directly on Axera NPUs for low-latency, high-throughput performance at the edge.
+This solution is well-suited for smart cameras, content filtering, AI-powered user interfaces, and other edge AI scenarios where natural language-based image retrieval is required.
+## Reference links
+If you are interested in model conversion, you can export the axmodel yourself with the help of the following resources:
+- [The github repo of libclip's open source](https://github.com/AXERA-TECH/libclip.axera)
+- [Pulsar2 Link, How to Convert ONNX to axmodel](https://pulsar2-docs.readthedocs.io/en/latest/pulsar2/introduction.html)
+- https://huggingface.co/AXERA-TECH/cnclip
+## Supported Platforms
+- AX650
+  - [M4N-Dock(爱芯派Pro)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)
+  - [M.2 Accelerator card](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html)
+## Performance
+| Model                                     | Input Shape       | Latency (ms) | CMM Usage (MB) |
+| ----------------------------------------- | ----------------- | ------------ | -------------- |
+| cnclip_vit_l14_336px_vision_u16u8.axmodel | 1 x 3 x 336 x 336 | 88.475 ms    | 304 MB         |
+| cnclip_vit_l14_336px_text_u16.axmodel     | 1 x 52            | 4.576 ms     | 122 MB         |
+## How to use
+Download all files from this repository to the device
+```
+(base) axera@raspberrypi:~/samples/AXERA-TECH/libclip.axera $ tree -L 2
+.
+├── cnclip
+│   ├── cnclip_vit_l14_336px_text_u16.axmodel
+│   ├── cnclip_vit_l14_336px_vision_u16u8.axmodel
+│   └── cn_vocab.txt
+├── coco_1000.tar
+├── config.json
+├── gradio_01.png
+├── install
+│   ├── examples
+│   ├── include
+│   └── lib
+├── pyclip
+│   ├── example.py
+│   ├── gradio_example.png
+│   ├── gradio_example.py
+│   ├── libclip.so
+│   ├── __pycache__
+│   ├── pyclip.py
+│   └── requirements.txt
+└── README.md
+8 directories, 13 files
+```
+### Python environment requirements
+#### pyaxengine
+https://github.com/AXERA-TECH/pyaxengine
+```
+wget https://github.com/AXERA-TECH/pyaxengine/releases/download/0.1.3.rc1/axengine-0.1.3-py3-none-any.whl
+pip install axengine-0.1.3-py3-none-any.whl
+```
+#### Other dependencies
+```
+pip install -r pyclip/requirements.txt
+```
+#### Inference with AX650 Host, such as M4N-Dock(爱芯派Pro)
+TODO
+#### Inference with M.2 Accelerator card
+[What is the M.2 Accelerator card?](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html) The demo below runs on a Raspberry Pi 5.
+```
+(py312) axera@raspberrypi:~/samples/AXERA-TECH/libclip.axera $ export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libstdc++.so.6
+(py312) axera@raspberrypi:~/samples/AXERA-TECH/libclip.axera $ cp install/lib/aarch64/libclip.so pyclip/
+(py312) axera@raspberrypi:~/samples/AXERA-TECH/libclip.axera $ tar xf coco_1000.tar
+(py312) axera@raspberrypi:~/samples/AXERA-TECH/libclip.axera $ python pyclip/gradio_example.py --ienc cnclip/cnclip_vit_l14_336px_vision_u16u8.axmodel --tenc cnclip/cnclip_vit_l14_336px_text_u16.axmodel --vocab cnclip/cn_vocab.txt --isCN 1 --db_path clip_feat_db_coco --image_folder coco_1000/
+Trying to load: /home/axera/samples/AXERA-TECH/libclip.axera/pyclip/aarch64/libclip.so
+❌ Failed to load: /home/axera/samples/AXERA-TECH/libclip.axera/pyclip/aarch64/libclip.so
+   /home/axera/samples/AXERA-TECH/libclip.axera/pyclip/aarch64/libclip.so: cannot open shared object file: No such file or directory
+🔍 File not found. Please verify that libclip.so exists and the path is correct.
+Trying to load: /home/axera/samples/AXERA-TECH/libclip.axera/pyclip/libclip.so
+open libax_sys.so failed
+open libax_engine.so failed
+✅ Successfully loaded: /home/axera/samples/AXERA-TECH/libclip.axera/pyclip/libclip.so
+可用设备: {'host': {'available': True, 'version': '', 'mem_info': {'remain': 0, 'total': 0}}, 'devices': {'host_version': 'V3.6.2_20250603154858', 'dev_version': 'V3.6.2_20250603154858', 'count': 1, 'devices_info': [{'temp': 37, 'cpu_usage': 1, 'npu_usage': 0, 'mem_info': {'remain': 7022, 'total': 7040}}]}}
+[I][                             run][  31]: AXCLWorker start with devid 0
+input size: 1
+    name:    image [unknown] [unknown]
+        1 x 3 x 336 x 336
+output size: 1
+    name: unnorm_image_features
+        1 x 768
+[I][              load_image_encoder][  50]: nchw 336 336
+[I][              load_image_encoder][  60]: image feature len 768
+input size: 1
+    name:     text [unknown] [unknown]
+        1 x 52
+output size: 1
+    name: unnorm_text_features
+        1 x 768
+[I][               load_text_encoder][  44]: text feature len 768
+[I][                  load_tokenizer][  60]: text token len 52
+100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:40<00:00,  9.93it/s]
+* Running on local URL:  http://0.0.0.0:7860
+```
+If your Raspberry Pi 5's IP address is 192.168.1.100, open `http://192.168.1.100:7860` in a web browser to use the web app.
+![](gradio_01.png)

cnclip/cn_vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

cnclip/cnclip_vit_l14_336px_text_u16.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6f33786ab988ca22dbc883c9fa6ebfa842a425445e4dac945c30069a6d9d5cf8
+size 127341129

cnclip/cnclip_vit_l14_336px_vision_u16u8.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6278e99001c198082b59219b163f7136d3049bca223be9176678ac6031348cde
+size 334708454

coco_1000.tar ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f3e18a198658270e19ced079de7a404e3478e69c2ef94fb47c87ddf056e6a541
+size 163112960

config.json ADDED Viewed

	@@ -0,0 +1,137 @@

+{
+  "_commit_hash": null,
+  "architectures": [
+    "InternVLChatModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_internvl_chat.InternVLChatConfig",
+    "AutoModel": "modeling_internvl_chat.InternVLChatModel",
+    "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel"
+  },
+  "downsample_ratio": 0.5,
+  "dynamic_image_size": true,
+  "force_image_size": 448,
+  "llm_config": {
+    "_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
+    "add_cross_attention": false,
+    "architectures": [
+      "Qwen2ForCausalLM"
+    ],
+    "_attn_implementation": "flash_attention_2",
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 151643,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 151645,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "silu",
+    "hidden_size": 896,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 4864,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 32768,
+    "max_window_layers": 21,
+    "min_length": 0,
+    "model_type": "qwen2",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 14,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 24,
+    "num_key_value_heads": 2,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "rms_norm_eps": 1e-06,
+    "rope_theta": 1000000.0,
+    "sep_token_id": null,
+    "sliding_window": 32768,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": false,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": "bfloat16",
+    "torchscript": false,
+    "transformers_version": "4.37.2",
+    "typical_p": 1.0,
+    "use_bfloat16": true,
+    "use_cache": true,
+    "use_sliding_window": false,
+    "vocab_size": 151674
+  },
+  "max_dynamic_patch": 12,
+  "min_dynamic_patch": 1,
+  "model_type": "internvl_chat",
+  "ps_version": "v2",
+  "select_layer": -1,
+  "template": "internvl2_5",
+  "torch_dtype": "bfloat16",
+  "use_backbone_lora": 0,
+  "use_llm_lora": 0,
+  "use_thumbnail": true,
+  "vision_config": {
+    "architectures": [
+      "InternVisionModel"
+    ],
+    "attention_dropout": 0.0,
+    "drop_path_rate": 0.0,
+    "dropout": 0.0,
+    "hidden_act": "gelu",
+    "hidden_size": 1024,
+    "image_size": 448,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "layer_norm_eps": 1e-06,
+    "model_type": "intern_vit_6b",
+    "norm_type": "layer_norm",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 24,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "patch_size": 14,
+    "qk_normalization": false,
+    "qkv_bias": true,
+    "return_dict": true,
+    "torch_dtype": "bfloat16",
+    "transformers_version": "4.37.2",
+    "use_bfloat16": true,
+    "use_flash_attn": true
+  }
+}

gradio_01.png ADDED Viewed

Git LFS Details

SHA256: 12c161b2eebd187c1d0ddb12a487ad66a376cb460533a1ffd268434af3617c54
Pointer size: 131 Bytes
Size of remote file: 797 kB

install/examples/cmdline.hpp ADDED Viewed

	@@ -0,0 +1,732 @@

+/*
+  Copyright (c) 2009, Hideyuki Tanaka
+  All rights reserved.
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+  * Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+  * Neither the name of the <organization> nor the
+  names of its contributors may be used to endorse or promote products
+  derived from this software without specific prior written permission.
+  THIS SOFTWARE IS PROVIDED BY <copyright holder> ''AS IS'' AND ANY
+  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL <copyright holder> BE LIABLE FOR ANY
+  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#pragma once
+#include <cxxabi.h>
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <typeinfo>
+#include <vector>
+namespace cmdline {
+namespace detail {
+template <typename Target, typename Source, bool Same>
+class lexical_cast_t {
+ public:
+  static Target cast(const Source &arg) {
+    Target ret;
+    std::stringstream ss;
+    if (!(ss << arg && ss >> ret && ss.eof())) throw std::bad_cast();
+    return ret;
+  }
+};
+template <typename Target, typename Source>
+class lexical_cast_t<Target, Source, true> {
+ public:
+  static Target cast(const Source &arg) { return arg; }
+};
+template <typename Source>
+class lexical_cast_t<std::string, Source, false> {
+ public:
+  static std::string cast(const Source &arg) {
+    std::ostringstream ss;
+    ss << arg;
+    return ss.str();
+  }
+};
+template <typename Target>
+class lexical_cast_t<Target, std::string, false> {
+ public:
+  static Target cast(const std::string &arg) {
+    Target ret;
+    std::istringstream ss(arg);
+    if (!(ss >> ret && ss.eof())) throw std::bad_cast();
+    return ret;
+  }
+};
+template <typename T1, typename T2>
+struct is_same {
+  static const bool value = false;
+};
+template <typename T>
+struct is_same<T, T> {
+  static const bool value = true;
+};
+template <typename Target, typename Source>
+Target lexical_cast(const Source &arg) {
+  return lexical_cast_t<Target, Source,
+                        detail::is_same<Target, Source>::value>::cast(arg);
+}
+// Returns the human-readable form of an ABI-mangled type name (such as the
+// result of typeid(T).name()). If the name cannot be demangled, the input
+// string is returned unchanged.
+static inline std::string demangle(const std::string &name) {
+  int status = 0;
+  // With a null output buffer __cxa_demangle malloc()s the result; on
+  // failure it returns NULL and stores a non-zero status.
+  char *p = abi::__cxa_demangle(name.c_str(), 0, 0, &status);
+  if (status != 0 || p == NULL) {
+    // Constructing std::string from NULL is undefined behaviour, so fall
+    // back to the mangled name instead.
+    free(p);  // free(NULL) is a no-op
+    return name;
+  }
+  std::string ret(p);
+  free(p);
+  return ret;
+}
+template <class T>
+std::string readable_typename() {
+  return demangle(typeid(T).name());
+}
+template <class T>
+std::string default_value(T def) {
+  return detail::lexical_cast<std::string>(def);
+}
+template <>
+inline std::string readable_typename<std::string>() {
+  return "string";
+}
+}  // namespace detail
+//-----
+class cmdline_error : public std::exception {
+ public:
+  cmdline_error(const std::string &msg) : msg(msg) {}
+  ~cmdline_error() throw() {}
+  const char *what() const throw() { return msg.c_str(); }
+ private:
+  std::string msg;
+};
+template <class T>
+struct default_reader {
+  T operator()(const std::string &str) { return detail::lexical_cast<T>(str); }
+};
+template <class T>
+struct range_reader {
+  range_reader(const T &low, const T &high) : low(low), high(high) {}
+  T operator()(const std::string &s) const {
+    T ret = default_reader<T>()(s);
+    if (!(ret >= low && ret <= high))
+      throw cmdline::cmdline_error("range_error");
+    return ret;
+  }
+ private:
+  T low, high;
+};
+template <class T>
+range_reader<T> range(const T &low, const T &high) {
+  return range_reader<T>(low, high);
+}
+// Reader that parses a value with default_reader<T> and accepts it only if
+// it equals one of the alternatives registered through add().
+template <class T>
+struct oneof_reader {
+  // Parses `s` and returns the value when it is among the registered
+  // alternatives. Throws cmdline_error when it is not; conversion failures
+  // propagate from default_reader<T>. Const-qualified to match
+  // range_reader, since reading never modifies the alternatives.
+  T operator()(const std::string &s) const {
+    T ret = default_reader<T>()(s);
+    if (std::find(alt.begin(), alt.end(), ret) == alt.end())
+      throw cmdline_error("value is not one of the allowed alternatives: " +
+                          s);
+    return ret;
+  }
+  // Registers `v` as an accepted alternative.
+  void add(const T &v) { alt.push_back(v); }
+ private:
+  std::vector<T> alt;
+};
+template <class T>
+oneof_reader<T> oneof(T a1) {
+  oneof_reader<T> ret;
+  ret.add(a1);
+  return ret;
+}
+template <class T>
+oneof_reader<T> oneof(T a1, T a2) {
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  return ret;
+}
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3) {
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  return ret;
+}
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4) {
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  return ret;
+}
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5) {
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  return ret;
+}
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6) {
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  ret.add(a6);
+  return ret;
+}
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7) {
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  ret.add(a6);
+  ret.add(a7);
+  return ret;
+}
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8) {
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  ret.add(a6);
+  ret.add(a7);
+  ret.add(a8);
+  return ret;
+}
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9) {
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  ret.add(a6);
+  ret.add(a7);
+  ret.add(a8);
+  ret.add(a9);
+  return ret;
+}
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9,
+                      T a10) {
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  ret.add(a6);
+  ret.add(a7);
+  ret.add(a8);
+  ret.add(a9);
+  ret.add(a10);
+  return ret;
+}
+//-----
+class parser {
+ public:
+  parser() {}
+  ~parser() {
+    for (std::map<std::string, option_base *>::iterator p = options.begin();
+         p != options.end(); p++)
+      delete p->second;
+  }
+  void add(const std::string &name, char short_name = 0,
+           const std::string &desc = "") {
+    if (options.count(name))
+      throw cmdline_error("multiple definition: " + name);
+    options[name] = new option_without_value(name, short_name, desc);
+    ordered.push_back(options[name]);
+  }
+  template <class T>
+  void add(const std::string &name, char short_name = 0,
+           const std::string &desc = "", bool need = true, const T def = T()) {
+    add(name, short_name, desc, need, def, default_reader<T>());
+  }
+  template <class T, class F>
+  void add(const std::string &name, char short_name = 0,
+           const std::string &desc = "", bool need = true, const T def = T(),
+           F reader = F()) {
+    if (options.count(name))
+      throw cmdline_error("multiple definition: " + name);
+    options[name] = new option_with_value_with_reader<T, F>(
+        name, short_name, need, def, desc, reader);
+    ordered.push_back(options[name]);
+  }
+  void footer(const std::string &f) { ftr = f; }
+  void set_program_name(const std::string &name) { prog_name = name; }
+  bool exist(const std::string &name) const {
+    if (options.count(name) == 0)
+      throw cmdline_error("there is no flag: --" + name);
+    return options.find(name)->second->has_set();
+  }
+  template <class T>
+  const T &get(const std::string &name) const {
+    if (options.count(name) == 0)
+      throw cmdline_error("there is no flag: --" + name);
+    const option_with_value<T> *p =
+        dynamic_cast<const option_with_value<T> *>(options.find(name)->second);
+    if (p == NULL) throw cmdline_error("type mismatch flag '" + name + "'");
+    return p->get();
+  }
+  const std::vector<std::string> &rest() const { return others; }
+  bool parse(const std::string &arg) {
+    std::vector<std::string> args;
+    std::string buf;
+    bool in_quote = false;
+    for (std::string::size_type i = 0; i < arg.length(); i++) {
+      if (arg[i] == '\"') {
+        in_quote = !in_quote;
+        continue;
+      }
+      if (arg[i] == ' ' && !in_quote) {
+        args.push_back(buf);
+        buf = "";
+        continue;
+      }
+      if (arg[i] == '\\') {
+        i++;
+        if (i >= arg.length()) {
+          errors.push_back("unexpected occurrence of '\\' at end of string");
+          return false;
+        }
+      }
+      buf += arg[i];
+    }
+    if (in_quote) {
+      errors.push_back("quote is not closed");
+      return false;
+    }
+    if (buf.length() > 0) args.push_back(buf);
+    for (size_t i = 0; i < args.size(); i++)
+      std::cout << "\"" << args[i] << "\"" << std::endl;
+    return parse(args);
+  }
+  // Parses an already-tokenized argument list. args[0] plays the role of
+  // argv[0] (the program name); option parsing starts at args[1].
+  // Returns true when no errors were recorded.
+  bool parse(const std::vector<std::string> &args) {
+    int argc = static_cast<int>(args.size());
+    std::vector<const char *> argv(argc);
+    for (int i = 0; i < argc; i++) argv[i] = args[i].c_str();
+    // Indexing element 0 of an empty vector is undefined behaviour, so pass
+    // a null pointer instead; parse(int, ...) rejects argc < 1 before it
+    // ever reads argv.
+    return parse(argc, argc > 0 ? &argv[0] : NULL);
+  }
+  bool parse(int argc, const char *const argv[]) {
+    errors.clear();
+    others.clear();
+    if (argc < 1) {
+      errors.push_back("argument number must be longer than 0");
+      return false;
+    }
+    if (prog_name == "") prog_name = argv[0];
+    std::map<char, std::string> lookup;
+    for (std::map<std::string, option_base *>::iterator p = options.begin();
+         p != options.end(); p++) {
+      if (p->first.length() == 0) continue;
+      char initial = p->second->short_name();
+      if (initial) {
+        if (lookup.count(initial) > 0) {
+          lookup[initial] = "";
+          errors.push_back(std::string("short option '") + initial +
+                           "' is ambiguous");
+          return false;
+        } else
+          lookup[initial] = p->first;
+      }
+    }
+    for (int i = 1; i < argc; i++) {
+      if (strncmp(argv[i], "--", 2) == 0) {
+        const char *p = strchr(argv[i] + 2, '=');
+        if (p) {
+          std::string name(argv[i] + 2, p);
+          std::string val(p + 1);
+          set_option(name, val);
+        } else {
+          std::string name(argv[i] + 2);
+          if (options.count(name) == 0) {
+            errors.push_back("undefined option: --" + name);
+            continue;
+          }
+          if (options[name]->has_value()) {
+            if (i + 1 >= argc) {
+              errors.push_back("option needs value: --" + name);
+              continue;
+            } else {
+              i++;
+              set_option(name, argv[i]);
+            }
+          } else {
+            set_option(name);
+          }
+        }
+      } else if (strncmp(argv[i], "-", 1) == 0) {
+        if (!argv[i][1]) continue;
+        char last = argv[i][1];
+        for (int j = 2; argv[i][j]; j++) {
+          last = argv[i][j];
+          if (lookup.count(argv[i][j - 1]) == 0) {
+            errors.push_back(std::string("undefined short option: -") +
+                             argv[i][j - 1]);
+            continue;
+          }
+          if (lookup[argv[i][j - 1]] == "") {
+            errors.push_back(std::string("ambiguous short option: -") +
+                             argv[i][j - 1]);
+            continue;
+          }
+          set_option(lookup[argv[i][j - 1]]);
+        }
+        if (lookup.count(last) == 0) {
+          errors.push_back(std::string("undefined short option: -") + last);
+          continue;
+        }
+        if (lookup[last] == "") {
+          errors.push_back(std::string("ambiguous short option: -") + last);
+          continue;
+        }
+        if (i + 1 < argc && options[lookup[last]]->has_value()) {
+          set_option(lookup[last], argv[i + 1]);
+          i++;
+        } else {
+          set_option(lookup[last]);
+        }
+      } else {
+        others.push_back(argv[i]);
+      }
+    }
+    for (std::map<std::string, option_base *>::iterator p = options.begin();
+         p != options.end(); p++)
+      if (!p->second->valid())
+        errors.push_back("need option: --" + std::string(p->first));
+    return errors.size() == 0;
+  }
+  void parse_check(const std::string &arg) {
+    if (!options.count("help")) add("help", '?', "print this message");
+    check(0, parse(arg));
+  }
+  void parse_check(const std::vector<std::string> &args) {
+    if (!options.count("help")) add("help", '?', "print this message");
+    check(args.size(), parse(args));
+  }
+  void parse_check(int argc, char *argv[]) {
+    if (!options.count("help")) add("help", '?', "print this message");
+    check(argc, parse(argc, argv));
+  }
+  std::string error() const { return errors.size() > 0 ? errors[0] : ""; }
+  std::string error_full() const {
+    std::ostringstream oss;
+    for (size_t i = 0; i < errors.size(); i++) oss << errors[i] << std::endl;
+    return oss.str();
+  }
+  std::string usage() const {
+    std::ostringstream oss;
+    oss << "usage: " << prog_name << " ";
+    for (size_t i = 0; i < ordered.size(); i++) {
+      if (ordered[i]->must()) oss << ordered[i]->short_description() << " ";
+    }
+    oss << "[options] ... " << ftr << std::endl;
+    oss << "options:" << std::endl;
+    size_t max_width = 0;
+    for (size_t i = 0; i < ordered.size(); i++) {
+      max_width = std::max(max_width, ordered[i]->name().length());
+    }
+    for (size_t i = 0; i < ordered.size(); i++) {
+      if (ordered[i]->short_name()) {
+        oss << "  -" << ordered[i]->short_name() << ", ";
+      } else {
+        oss << "      ";
+      }
+      oss << "--" << ordered[i]->name();
+      for (size_t j = ordered[i]->name().length(); j < max_width + 4; j++)
+        oss << ' ';
+      oss << ordered[i]->description() << std::endl;
+    }
+    return oss.str();
+  }
+ private:
+  void check(int argc, bool ok) {
+    if ((argc == 1 && !ok) || exist("help")) {
+      std::cerr << usage();
+      exit(0);
+    }
+    if (!ok) {
+      std::cerr << error() << std::endl << usage();
+      exit(1);
+    }
+  }
+  void set_option(const std::string &name) {
+    if (options.count(name) == 0) {
+      errors.push_back("undefined option: --" + name);
+      return;
+    }
+    if (!options[name]->set()) {
+      errors.push_back("option needs value: --" + name);
+      return;
+    }
+  }
+  void set_option(const std::string &name, const std::string &value) {
+    if (options.count(name) == 0) {
+      errors.push_back("undefined option: --" + name);
+      return;
+    }
+    if (!options[name]->set(value)) {
+      errors.push_back("option value is invalid: --" + name + "=" + value);
+      return;
+    }
+  }
+  class option_base {
+   public:
+    virtual ~option_base() {}
+    virtual bool has_value() const = 0;
+    virtual bool set() = 0;
+    virtual bool set(const std::string &value) = 0;
+    virtual bool has_set() const = 0;
+    virtual bool valid() const = 0;
+    virtual bool must() const = 0;
+    virtual const std::string &name() const = 0;
+    virtual char short_name() const = 0;
+    virtual const std::string &description() const = 0;
+    virtual std::string short_description() const = 0;
+  };
+  class option_without_value : public option_base {
+   public:
+    option_without_value(const std::string &name, char short_name,
+                         const std::string &desc)
+        : nam(name), snam(short_name), desc(desc), has(false) {}
+    ~option_without_value() {}
+    bool has_value() const { return false; }
+    bool set() {
+      has = true;
+      return true;
+    }
+    bool set(const std::string &) { return false; }
+    bool has_set() const { return has; }
+    bool valid() const { return true; }
+    bool must() const { return false; }
+    const std::string &name() const { return nam; }
+    char short_name() const { return snam; }
+    const std::string &description() const { return desc; }
+    std::string short_description() const { return "--" + nam; }
+   private:
+    std::string nam;
+    char snam;
+    std::string desc;
+    bool has;
+  };
+  template <class T>
+  class option_with_value : public option_base {
+   public:
+    option_with_value(const std::string &name, char short_name, bool need,
+                      const T &def, const std::string &desc)
+        : nam(name),
+          snam(short_name),
+          need(need),
+          has(false),
+          def(def),
+          actual(def) {
+      this->desc = full_description(desc);
+    }
+    ~option_with_value() {}
+    const T &get() const { return actual; }
+    bool has_value() const { return true; }
+    bool set() { return false; }
+    bool set(const std::string &value) {
+      try {
+        actual = read(value);
+        has = true;
+      } catch (const std::exception &e) {
+        return false;
+      }
+      return true;
+    }
+    bool has_set() const { return has; }
+    bool valid() const {
+      if (need && !has) return false;
+      return true;
+    }
+    bool must() const { return need; }
+    const std::string &name() const { return nam; }
+    char short_name() const { return snam; }
+    const std::string &description() const { return desc; }
+    std::string short_description() const {
+      return "--" + nam + "=" + detail::readable_typename<T>();
+    }
+   protected:
+    std::string full_description(const std::string &desc) {
+      return desc + " (" + detail::readable_typename<T>() +
+             (need ? "" : " [=" + detail::default_value<T>(def) + "]") + ")";
+    }
+    virtual T read(const std::string &s) = 0;
+    std::string nam;
+    char snam;
+    bool need;
+    std::string desc;
+    bool has;
+    T def;
+    T actual;
+  };
+  template <class T, class F>
+  class option_with_value_with_reader : public option_with_value<T> {
+   public:
+    option_with_value_with_reader(const std::string &name, char short_name,
+                                  bool need, const T def,
+                                  const std::string &desc, F reader)
+        : option_with_value<T>(name, short_name, need, def, desc),
+          reader(reader) {}
+   private:
+    T read(const std::string &s) { return reader(s); }
+    F reader;
+  };
+  std::map<std::string, option_base *> options;
+  std::vector<option_base *> ordered;
+  std::string ftr;
+  std::string prog_name;
+  std::vector<std::string> others;
+  std::vector<std::string> errors;
+};
+}  // namespace cmdline

install/examples/test_ax_api.cpp ADDED Viewed

	@@ -0,0 +1,37 @@

+#include "runner/ax650/ax_api_loader.h"
+#include "runner/ax650/ax_model_runner_ax650.hpp"
+#include <fstream>
+#include <vector>
+#include <cstring>
+AxSysApiLoader &get_ax_sys_loader();
+AxEngineApiLoader &get_ax_engine_loader();
+// Smoke test for the AX650 host runtime: initializes the SYS and ENGINE
+// APIs through the dynamic loaders, loads the text-encoder model from a
+// path relative to the current working directory into an ax_runner_ax650,
+// then shuts the runtime down again.
+// Returns 0 on completion, -1 if the model file cannot be opened.
+int main()
+{
+    AxSysApiLoader &ax_sys_loader = get_ax_sys_loader();
+    AxEngineApiLoader &ax_engine_loader = get_ax_engine_loader();
+    ax_sys_loader.AX_SYS_Init();
+    AX_ENGINE_NPU_ATTR_T npu_attr;
+    memset(&npu_attr, 0, sizeof(AX_ENGINE_NPU_ATTR_T));
+    npu_attr.eHardMode = AX_ENGINE_VIRTUAL_NPU_DISABLE;
+    ax_engine_loader.AX_ENGINE_Init(&npu_attr);
+    ax_runner_ax650 runner;
+    std::ifstream file("cnclip/cnclip_vit_l14_336px_text_u16.axmodel", std::ios::binary);
+    if (!file.is_open())
+    {
+        printf("open file failed\n");
+        // The runtime was already initialized above; tear it down here so
+        // the failure path does not exit with the engine still open.
+        ax_engine_loader.AX_ENGINE_Deinit();
+        ax_sys_loader.AX_SYS_Deinit();
+        return -1;
+    }
+    // Read the whole model file into memory and hand the buffer to the runner.
+    std::vector<uint8_t> model_data((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+    runner.init(model_data.data(), model_data.size(), 0);
+    ax_engine_loader.AX_ENGINE_Deinit();
+    ax_sys_loader.AX_SYS_Deinit();
+    return 0;
+}

install/examples/test_axcl_api.cpp ADDED Viewed

	@@ -0,0 +1,32 @@

+#include "runner/axcl/axcl_manager.h"
+#include "runner/axcl/ax_model_runner_axcl.hpp"
+#include <fstream>
+// Smoke test for the AXCL (accelerator card) runtime: initialises AXCL and
+// device 0, loads the text-encoder model into a runner, releases it and
+// finalises the runtime. Returns 0 on success, -1 on any failure.
+int main()
+{
+    auto ret = axclInit();
+    if (ret != 0)
+    {
+        printf("axclInit failed\n");
+        return -1;
+    }
+    axcl_Dev_Init(0);
+    ax_runner_axcl runner;
+    // Kept at function scope so the model bytes stay alive until main returns.
+    std::vector<uint8_t> model_data;
+    int exit_code = 0;
+    std::ifstream file("cnclip/cnclip_vit_l14_336px_text_u16.axmodel", std::ios::binary);
+    if (file.is_open())
+    {
+        model_data.assign(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>());
+        runner.init(model_data.data(), model_data.size(), 0);
+        runner.deinit();
+    }
+    else
+    {
+        printf("open file failed\n");
+        exit_code = -1;
+    }
+    // Single exit path: the device and the AXCL runtime are released even
+    // when the model file could not be opened.
+    axcl_Dev_Exit(0);
+    axclFinalize();
+    return exit_code;
+}

install/examples/test_enum_devices.cpp ADDED Viewed

	@@ -0,0 +1,59 @@

+#include "clip.h"
+#include <iostream>
+#include <cstring>
+// Prints everything clip_enum_devices() reports about the host NPU and the
+// AXCL cards, then initialises and de-initialises each detected device once.
+int main()
+{
+    clip_devices_t clip_devices;
+    memset(&clip_devices, 0, sizeof(clip_devices_t));
+    if (clip_enum_devices(&clip_devices) != 0)
+    {
+        printf("enum devices failed\n");
+        return -1;
+    }
+    const auto &host = clip_devices.host;
+    const auto &cards = clip_devices.devices;
+    const int card_count = static_cast<int>(cards.count);
+    const bool host_ready = host.available != 0;
+    std::cout << "host npu avaiable:" << static_cast<int>(host.available) << " version:" << host.version << std::endl;
+    std::cout << "host mem total:" << host.mem_info.total << " MiB remain:" << host.mem_info.remain << " MiB" << std::endl;
+    std::cout << "Host Version: " << cards.host_version << std::endl;
+    std::cout << "Dev Version: " << cards.dev_version << std::endl;
+    std::cout << "Detected Devices Count: " << card_count << std::endl;
+    for (int idx = 0; idx < card_count; ++idx)
+    {
+        const auto &info = cards.devices_info[idx];
+        std::cout << "  Device " << idx << ":" << std::endl;
+        std::cout << "    Temperature: " << info.temp << "C" << std::endl;
+        std::cout << "    CPU Usage: " << info.cpu_usage << "%" << std::endl;
+        std::cout << "    NPU Usage: " << info.npu_usage << "%" << std::endl;
+        std::cout << "    Memory Remaining: " << info.mem_info.remain << " MiB" << std::endl;
+        std::cout << "    Memory Total: " << info.mem_info.total << " MiB" << std::endl;
+    }
+    // Bring-up: host first (device id -1), then every card in index order.
+    if (host_ready)
+    {
+        clip_sys_init(host_device, -1);
+    }
+    for (int idx = 0; idx < card_count; ++idx)
+    {
+        clip_sys_init(axcl_device, static_cast<char>(idx));
+    }
+    // Tear-down follows the same order as bring-up.
+    if (host_ready)
+    {
+        clip_sys_deinit(host_device, -1);
+    }
+    for (int idx = 0; idx < card_count; ++idx)
+    {
+        clip_sys_deinit(axcl_device, static_cast<char>(idx));
+    }
+    return 0;
+}

install/examples/test_load_model.cpp ADDED Viewed

	@@ -0,0 +1,84 @@

+#include "clip.h"
+#include "cmdline.hpp"
+#include <fstream>
+#include <cstring>
+// Creates a CLIP handle on the first usable device (host NPU preferred, else
+// AXCL card 0) from the models given on the command line and destroys it
+// again. Returns 0 on success, -1 on any failure.
+int main(int argc, char *argv[])
+{
+    clip_devices_t clip_devices;
+    memset(&clip_devices, 0, sizeof(clip_devices_t));
+    if (clip_enum_devices(&clip_devices) != 0)
+    {
+        printf("enum devices failed\n");
+        return -1;
+    }
+    // Decide once which device is used so init, create and deinit agree.
+    clip_devive_e dev_type = unknown_device;
+    char sys_devid = 0;
+    if (clip_devices.host.available)
+    {
+        dev_type = host_device;
+        sys_devid = -1;
+    }
+    else if (clip_devices.devices.count > 0)
+    {
+        dev_type = axcl_device;
+        sys_devid = 0;
+    }
+    else
+    {
+        printf("no device available\n");
+        return -1;
+    }
+    clip_sys_init(dev_type, sys_devid);
+    clip_init_t init_info;
+    memset(&init_info, 0, sizeof(init_info));
+    cmdline::parser parser;
+    parser.add<std::string>("ienc", 0, "encoder model(onnx model or axmodel)", true, "cnclip/cnclip_vit_l14_336px_vision_u16u8.axmodel");
+    parser.add<std::string>("tenc", 0, "text encoder model(onnx model or axmodel)", true, "cnclip/cnclip_vit_l14_336px_text_u16.axmodel");
+    parser.add<std::string>("vocab", 'v', "vocab path", true, "cnclip/cn_vocab.txt");
+    parser.add<int>("language", 'l', "language choose, 0:english 1:chinese", false, 1);
+    parser.add<std::string>("db_path", 'd', "db path", false, "");
+    parser.parse_check(argc, argv);
+    // Bounded copy into a CLIP_PATH_LEN buffer; reports false instead of
+    // writing past the end when the argument is too long.
+    auto copy_path = [](char (&dst)[CLIP_PATH_LEN], const std::string &src) -> bool {
+        const int written = snprintf(dst, sizeof(dst), "%s", src.c_str());
+        return written >= 0 && static_cast<size_t>(written) < sizeof(dst);
+    };
+    bool paths_ok = copy_path(init_info.image_encoder_path, parser.get<std::string>("ienc"));
+    paths_ok = copy_path(init_info.text_encoder_path, parser.get<std::string>("tenc")) && paths_ok;
+    paths_ok = copy_path(init_info.tokenizer_path, parser.get<std::string>("vocab")) && paths_ok;
+    init_info.isCN = parser.get<int>("language");
+    paths_ok = copy_path(init_info.db_path, parser.get<std::string>("db_path")) && paths_ok;
+    if (!paths_ok)
+    {
+        printf("path argument too long (max %d bytes)\n", CLIP_PATH_LEN - 1);
+        clip_sys_deinit(dev_type, sys_devid);
+        return -1;
+    }
+    printf("image_encoder_path: %s\n", init_info.image_encoder_path);
+    printf("text_encoder_path: %s\n", init_info.text_encoder_path);
+    printf("tokenizer_path: %s\n", init_info.tokenizer_path);
+    printf("isCN: %d\n", init_info.isCN);
+    printf("db_path: %s\n", init_info.db_path);
+    init_info.dev_type = dev_type;
+    if (dev_type == axcl_device)
+    {
+        init_info.devid = 0;
+    }
+    clip_handle_t handle;
+    int ret = clip_create(&init_info, &handle);
+    if (ret != clip_errcode_success)
+    {
+        printf("clip_create failed\n");
+        // Do not leave the device initialised on the failure path.
+        clip_sys_deinit(dev_type, sys_devid);
+        return -1;
+    }
+    clip_destroy(handle);
+    clip_sys_deinit(dev_type, sys_devid);
+    return 0;
+}

install/examples/test_match_by_text.cpp ADDED Viewed

	@@ -0,0 +1,131 @@

+#include "clip.h"
+#include "cmdline.hpp"
+#include "timer.hpp"
+#include <fstream>
+#include <cstring>
+#include <opencv2/opencv.hpp>
+// Text-to-image retrieval demo: indexes every readable image in a folder into
+// the CLIP feature database (skipping keys that are already present), then
+// prints the top matches for the given text. Returns 0 on success, -1 on any
+// failure.
+int main(int argc, char *argv[])
+{
+    clip_devices_t clip_devices;
+    memset(&clip_devices, 0, sizeof(clip_devices_t));
+    if (clip_enum_devices(&clip_devices) != 0)
+    {
+        printf("enum devices failed\n");
+        return -1;
+    }
+    // Decide once which device is used so init, create and deinit agree.
+    clip_devive_e dev_type = unknown_device;
+    char sys_devid = 0;
+    if (clip_devices.host.available)
+    {
+        dev_type = host_device;
+        sys_devid = -1;
+    }
+    else if (clip_devices.devices.count > 0)
+    {
+        dev_type = axcl_device;
+        sys_devid = 0;
+    }
+    else
+    {
+        printf("no device available\n");
+        return -1;
+    }
+    clip_sys_init(dev_type, sys_devid);
+    clip_init_t init_info;
+    memset(&init_info, 0, sizeof(init_info));
+    cmdline::parser parser;
+    parser.add<std::string>("ienc", 0, "encoder model(onnx model or axmodel)", true, "cnclip/cnclip_vit_l14_336px_vision_u16u8.axmodel");
+    parser.add<std::string>("tenc", 0, "text encoder model(onnx model or axmodel)", true, "cnclip/cnclip_vit_l14_336px_text_u16.axmodel");
+    parser.add<std::string>("vocab", 'v', "vocab path", true, "cnclip/cn_vocab.txt");
+    parser.add<int>("language", 'l', "language choose, 0:english 1:chinese", false, 1);
+    parser.add<std::string>("db_path", 'd', "db path", false, "clip_feat_db");
+    parser.add<std::string>("image", 'i', "image folder(jpg png etc....)", true);
+    parser.add<std::string>("text", 't', "text or txt file", true);
+    parser.parse_check(argc, argv);
+    // Bounded copy into a CLIP_PATH_LEN buffer; reports false instead of
+    // writing past the end when the argument is too long.
+    auto copy_path = [](char (&dst)[CLIP_PATH_LEN], const std::string &src) -> bool {
+        const int written = snprintf(dst, sizeof(dst), "%s", src.c_str());
+        return written >= 0 && static_cast<size_t>(written) < sizeof(dst);
+    };
+    bool paths_ok = copy_path(init_info.image_encoder_path, parser.get<std::string>("ienc"));
+    paths_ok = copy_path(init_info.text_encoder_path, parser.get<std::string>("tenc")) && paths_ok;
+    paths_ok = copy_path(init_info.tokenizer_path, parser.get<std::string>("vocab")) && paths_ok;
+    init_info.isCN = parser.get<int>("language");
+    paths_ok = copy_path(init_info.db_path, parser.get<std::string>("db_path")) && paths_ok;
+    if (!paths_ok)
+    {
+        printf("path argument too long (max %d bytes)\n", CLIP_PATH_LEN - 1);
+        clip_sys_deinit(dev_type, sys_devid);
+        return -1;
+    }
+    printf("image_encoder_path: %s\n", init_info.image_encoder_path);
+    printf("text_encoder_path: %s\n", init_info.text_encoder_path);
+    printf("tokenizer_path: %s\n", init_info.tokenizer_path);
+    printf("isCN: %d\n", init_info.isCN);
+    printf("db_path: %s\n", init_info.db_path);
+    init_info.dev_type = dev_type;
+    if (dev_type == axcl_device)
+    {
+        init_info.devid = 0;
+    }
+    clip_handle_t handle;
+    int ret = clip_create(&init_info, &handle);
+    if (ret != clip_errcode_success)
+    {
+        printf("clip_create failed\n");
+        // Do not leave the device initialised on the failure path.
+        clip_sys_deinit(dev_type, sys_devid);
+        return -1;
+    }
+    std::string image_src = parser.get<std::string>("image");
+    std::string text = parser.get<std::string>("text");
+    std::vector<std::string> image_paths;
+    cv::glob(image_src + "/*.*", image_paths);
+    for (size_t i = 0; i < image_paths.size(); i++)
+    {
+        const std::string &image_path = image_paths[i];
+        // The file name (text after the last '/') is used as the database key.
+        std::string image_name = image_path.substr(image_path.find_last_of("/") + 1);
+        char key[CLIP_KEY_MAX_LEN];
+        const int key_len = snprintf(key, sizeof(key), "%s", image_name.c_str());
+        if (key_len < 0 || static_cast<size_t>(key_len) >= sizeof(key))
+        {
+            printf("skip %s: file name does not fit in a %d-byte key\n", image_path.c_str(), CLIP_KEY_MAX_LEN);
+            continue;
+        }
+        if (clip_contain(handle, key))
+        {
+            continue;
+        }
+        cv::Mat src = cv::imread(image_path);
+        // The "*.*" glob also matches non-image files; imread returns an
+        // empty Mat for those and cvtColor would fail on it.
+        if (src.empty())
+        {
+            printf("skip %s: not a readable image\n", image_path.c_str());
+            continue;
+        }
+        cv::cvtColor(src, src, cv::COLOR_BGR2RGB);
+        clip_image_t image;
+        image.data = src.data;
+        image.width = src.cols;
+        image.height = src.rows;
+        image.channels = src.channels();
+        image.stride = static_cast<int>(src.step);
+        if (clip_add(handle, key, &image, 0) != clip_errcode_success)
+        {
+            printf("add image %s failed\n", image_path.c_str());
+        }
+    }
+    int exit_code = 0;
+    int topk = 10;
+    std::vector<clip_result_item_t> results(topk);
+    timer t;
+    ret = clip_match_text(handle, text.c_str(), results.data(), topk);
+    if (ret != clip_errcode_success)
+    {
+        printf("clip_match_text failed: %d\n", ret);
+        exit_code = -1;
+    }
+    else
+    {
+        printf("match text \"%s\" %6.2fms\n", text.c_str(), t.cost());
+        printf("|%32s | %6s|\n", "key", "score");
+        for (size_t i = 0; i < results.size(); i++)
+        {
+            printf("|%32s | %6.2f|\n", results[i].key, results[i].score);
+        }
+    }
+    clip_destroy(handle);
+    clip_sys_deinit(dev_type, sys_devid);
+    return exit_code;
+}

install/examples/timer.hpp ADDED Viewed

	@@ -0,0 +1,61 @@

+/*
+ * AXERA is pleased to support the open source community by making ax-samples available.
+ *
+ * Copyright (c) 2022, AXERA Semiconductor (Shanghai) Co., Ltd. All rights reserved.
+ *
+ * Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/BSD-3-Clause
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+/*
+ * Author: ls.wang
+ */
+#pragma once
+#include <chrono>
+// Minimal stopwatch. Construction (or start()) marks the beginning of the
+// interval; cost() reports the elapsed time in milliseconds.
+class timer
+{
+private:
+    // steady_clock is monotonic, so the measured interval cannot go negative
+    // when the wall clock is adjusted, and its time_point type matches the
+    // now() call on every standard library.
+    std::chrono::steady_clock::time_point start_time, end_time;
+public:
+    timer()
+    {
+        start();
+    }
+    // Restarts the interval at the current instant.
+    void start()
+    {
+        stop();
+        this->start_time = this->end_time;
+    }
+    // Freezes the end of the interval at the current instant.
+    void stop()
+    {
+        this->end_time = std::chrono::steady_clock::now();
+    }
+    // Returns the elapsed milliseconds. If stop() has not been called since
+    // the last start(), the interval is closed at the moment of this call.
+    float cost()
+    {
+        if (this->end_time <= this->start_time)
+        {
+            this->stop();
+        }
+        auto us = std::chrono::duration_cast<std::chrono::microseconds>(this->end_time - this->start_time).count();
+        return static_cast<float>(us) / 1000.f;
+    }
+};

install/include/clip.h ADDED Viewed

	@@ -0,0 +1,203 @@

+#ifndef __CLIP_H__
+#define __CLIP_H__
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+#define CLIP_DEVICES_COUNT 16
+#define CLIP_VERSION_LEN 32
+#define CLIP_KEY_MAX_LEN 64
+#define CLIP_PATH_LEN 128
+    /**
+     * @brief Status codes used by the clip_* functions.
+     * 0 is success and -1 a generic failure. The small positive values name
+     * pointer and system init/deinit failures. The remaining codes are grouped
+     * by name under a base value: create 0x10000, destroy 0x20000, add 0x30000,
+     * remove 0x40000, match 0x50000.
+     */
+    typedef enum
+    {
+        clip_errcode_failed = -1,
+        clip_errcode_success = 0,
+        clip_errcode_invalid_ptr,
+        clip_errcode_sysinit_failed,
+        clip_errcode_sysdeinit_failed,
+        clip_errcode_axcl_sysinit_failed,
+        clip_errcode_axcl_sysdeinit_failed,
+        clip_errcode_create_failed = 0x10000, // create_* codes start here
+        clip_errcode_create_failed_sys,
+        clip_errcode_create_failed_ienc,
+        clip_errcode_create_failed_tenc,
+        clip_errcode_create_failed_vocab,
+        clip_errcode_create_failed_db,
+        clip_errcode_destroy_failed = 0x20000,
+        clip_errcode_add_failed = 0x30000, // add_* codes start here
+        clip_errcode_add_failed_key_exist,
+        clip_errcode_add_failed_encode_image,
+        clip_errcode_add_failed_push_db,
+        clip_errcode_remove_failed = 0x40000, // remove_* codes start here
+        clip_errcode_remove_failed_key_not_exist,
+        clip_errcode_remove_failed_del_db,
+        clip_errcode_match_failed = 0x50000, // match_* codes start here
+        clip_errcode_match_failed_encode_text,
+        clip_errcode_match_failed_encode_image,
+    } clip_errcode_e;
+    /**
+     * @brief Device kind selector, used by clip_sys_init(), clip_sys_deinit()
+     * and clip_init_t::dev_type. The typedef name is spelled `clip_devive_e`.
+     */
+    typedef enum
+    {
+        unknown_device = 0,
+        host_device = 1,
+        axcl_device = 2
+    } clip_devive_e;
+    typedef void *clip_handle_t;
+    /**
+     * @brief Device inventory written by clip_enum_devices().
+     * `host` describes the host NPU; `devices` describes up to
+     * CLIP_DEVICES_COUNT further devices, of which the first `count` entries
+     * of `devices_info` are printed by the bundled example.
+     */
+    typedef struct
+    {
+        struct
+        {
+            char available; // non-zero is treated as "usable" by the bundled examples
+            char version[CLIP_VERSION_LEN];
+            struct
+            {
+                int remain; // printed as MiB by the bundled example
+                int total;  // printed as MiB by the bundled example
+            } mem_info;
+        } host;
+        struct
+        {
+            char host_version[CLIP_VERSION_LEN];
+            char dev_version[CLIP_VERSION_LEN];
+            unsigned char count; // number of entries used in devices_info
+            struct
+            {
+                int temp;      // printed with a "C" suffix by the bundled example
+                int cpu_usage; // printed as a percentage by the bundled example
+                int npu_usage; // printed as a percentage by the bundled example
+                struct
+                {
+                    int remain; // printed as MiB by the bundled example
+                    int total;  // printed as MiB by the bundled example
+                } mem_info;
+            } devices_info[CLIP_DEVICES_COUNT];
+        } devices;
+    } clip_devices_t;
+    /**
+     * @brief Parameters passed to clip_create().
+     * Every path is a fixed CLIP_PATH_LEN-byte buffer, so a path must be
+     * shorter than CLIP_PATH_LEN bytes to leave room for the terminator.
+     */
+    typedef struct
+    {
+        clip_devive_e dev_type;                 // Device type
+        char devid;                             // axcl device ID
+        char text_encoder_path[CLIP_PATH_LEN];  // Text encoder model path
+        char image_encoder_path[CLIP_PATH_LEN]; // Image encoder model path
+        char tokenizer_path[CLIP_PATH_LEN];     // Tokenizer model path
+        char isCN;                              // Whether it's a Chinese model (0: English, 1: Chinese)
+        char db_path[CLIP_PATH_LEN];            // Database path (if empty path is specified, a folder will be created)
+    } clip_init_t;
+    /**
+     * @brief Image descriptor passed to clip_add() and clip_match_image().
+     * The structure only points at the pixels; the caller keeps the buffer
+     * alive for the duration of the call.
+     * NOTE(review): the bundled examples convert BGR to RGB before filling
+     * this in, so RGB channel order is presumably expected - confirm.
+     */
+    typedef struct
+    {
+        unsigned char *data; // first pixel of the first row
+        int width;           // columns (the C++ example passes cv::Mat::cols)
+        int height;          // rows (the C++ example passes cv::Mat::rows)
+        int channels;        // channels per pixel
+        int stride;          // the C++ example passes cv::Mat::step, i.e. bytes per row
+    } clip_image_t;
+    /**
+     * @brief One entry of the result array filled by clip_match_text() and
+     * clip_match_image().
+     */
+    typedef struct
+    {
+        char key[CLIP_KEY_MAX_LEN]; // key the image was stored under
+        float score;                // match score (see the matching function for the metric)
+    } clip_result_item_t;
+    /**
+     * @brief Enumerate available devices in the current system
+     * @param devices Pointer to device information structure
+     * @return int Returns 0 on success, -1 on failure
+     */
+    int clip_enum_devices(clip_devices_t *devices);
+    /**
+     * @brief Initialize CLIP system resources
+     * @param dev_type Device type
+     * @param devid Device ID
+     * @return clip_errcode_e Returns 0 on success, error codes see clip_errcode_e
+     */
+    int clip_sys_init(clip_devive_e dev_type, char devid);
+    /**
+     * @brief Deinitialize CLIP system resources
+     * @param dev_type Device type
+     * @param devid Device ID
+     * @return clip_errcode_e Returns 0 on success, error codes see clip_errcode_e
+     */
+    int clip_sys_deinit(clip_devive_e dev_type, char devid);
+    /**
+     * @brief Create CLIP handle
+     * @param init_info Pointer to initialization information structure
+     * @param handle Handle pointer
+     * @return clip_errcode_e Returns 0 on success, error codes see clip_errcode_e
+     */
+    int clip_create(clip_init_t *init_info, clip_handle_t *handle);
+    /**
+     * @brief Destroy CLIP handle
+     * @param handle Handle
+     * @return clip_errcode_e Returns 0 on success, error codes see clip_errcode_e
+     */
+    int clip_destroy(clip_handle_t handle);
+    /**
+     * @brief Add image to CLIP database
+     * @param handle Handle
+     * @param key Image key
+     * @param image Pointer to image structure
+     * @param overwrite Whether to overwrite
+     * @return clip_errcode_e Returns 0 on success, error codes see clip_errcode_e
+     */
+    int clip_add(clip_handle_t handle, char key[CLIP_KEY_MAX_LEN], clip_image_t *image, char overwrite);
+    /**
+     * @brief Remove image from CLIP database
+     * @param handle Handle
+     * @param key Image key
+     * @return clip_errcode_e Returns 0 on success, error codes see clip_errcode_e
+     */
+    int clip_remove(clip_handle_t handle, char key[CLIP_KEY_MAX_LEN]);
+    /**
+     * @brief Check if image exists in CLIP database
+     * @param handle Handle
+     * @param key Image key
+     * @return int Presumably 1 when the key exists and 0 when it does not:
+     *         the bundled C++ example skips a file on a non-zero result and
+     *         the Python binding compares the result with 1. TODO confirm
+     *         against the library, including what is returned on error.
+     */
+    int clip_contain(clip_handle_t handle, char key[CLIP_KEY_MAX_LEN]);
+    /**
+     * @brief Text match CLIP database images (softmax)
+     * @param handle Handle
+     * @param text Text
+     * @param results Pointer to result structure
+     * @param top_k Top k results
+     * @return clip_errcode_e Returns 0 on success, error codes see clip_errcode_e
+     */
+    int clip_match_text(clip_handle_t handle, const char *text, clip_result_item_t *results, int top_k);
+    /**
+     * @brief Image match CLIP database images (cosine similarity)
+     * @param handle Handle
+     * @param image Pointer to image structure
+     * @param results Pointer to result structure
+     * @param top_k Top k results
+     * @return clip_errcode_e Returns 0 on success, error codes see clip_errcode_e
+     */
+    int clip_match_image(clip_handle_t handle, clip_image_t *image, clip_result_item_t *results, int top_k);
+#if defined(__cplusplus)
+}
+#endif
+#endif // __CLIP_H__

install/lib/aarch64/libclip.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4d26c8940d14c0d749ece83b4be6ed13eb86487c31acb0c7591fffb6e21fe3ad
+size 4309856

pyclip/example.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import os
+from pyclip import Clip, enum_devices, sys_init, sys_deinit, ClipDeviceType
+import cv2
+import glob
+import argparse
+import tqdm
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--ienc', type=str, default='cnclip/cnclip_vit_l14_336px_vision_u16u8.axmodel')
+    parser.add_argument('--tenc', type=str, default='cnclip/cnclip_vit_l14_336px_text_u16.axmodel')
+    parser.add_argument('--vocab', type=str, default='cnclip/cn_vocab.txt')
+    parser.add_argument('--isCN', type=int, default=1)
+    parser.add_argument('--db_path', type=str, default='clip_feat_db_coco')
+    parser.add_argument('--image_folder', type=str, default='coco_1000')
+    args = parser.parse_args()
+    image_folder = args.image_folder
+    # 枚举设备
+    print("可用设备:", enum_devices())
+    # 初始化系统
+    sys_init(ClipDeviceType.axcl_device, 0)
+    try:
+        # 创建CLIP实例
+        clip = Clip({
+            'text_encoder_path': args.tenc,
+            'image_encoder_path': args.ienc,
+            'tokenizer_path': args.vocab,
+            'db_path': args.db_path,
+            'isCN': args.isCN
+        })
+        # 添加图像
+        image_files = glob.glob(os.path.join(image_folder, '*.jpg'))
+        for image_file in tqdm.tqdm(image_files):
+            img = cv2.imread(image_file)
+            cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
+            filename = os.path.basename(image_file)
+            clip.add_image(filename, img)
+        # 文本匹配
+        results = clip.match_text('dog', top_k=10)
+        print("匹配结果:", results)
+    finally:
+        # 反初始化系统
+        sys_deinit(ClipDeviceType.axcl_device, 0)

pyclip/gradio_example.png ADDED Viewed

Git LFS Details

SHA256: 5462a8506d2c6c335bcc778486b82ab8665958d711545182429addc3565f7fe1
Pointer size: 132 Bytes
Size of remote file: 1.53 MB

pyclip/gradio_example.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import os
+import gradio as gr
+from pyclip import Clip, enum_devices, sys_init, sys_deinit, ClipDeviceType
+import cv2
+import glob
+from PIL import Image
+import tqdm
+import argparse
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--ienc', type=str, default='cnclip/cnclip_vit_l14_336px_vision_u16u8.axmodel')
+    parser.add_argument('--tenc', type=str, default='cnclip/cnclip_vit_l14_336px_text_u16.axmodel')
+    parser.add_argument('--vocab', type=str, default='cnclip/cn_vocab.txt')
+    parser.add_argument('--isCN', type=int, default=1)
+    parser.add_argument('--db_path', type=str, default='clip_feat_db_coco')
+    parser.add_argument('--image_folder', type=str, default='coco_1000')
+    args = parser.parse_args()
+    image_folder = args.image_folder
+    # 初始化
+    print("可用设备:", enum_devices())
+    sys_init(ClipDeviceType.axcl_device, 0)
+    clip = Clip({
+        'text_encoder_path': args.tenc,
+        'image_encoder_path': args.ienc,
+        'tokenizer_path': args.vocab,
+        'db_path': args.db_path,
+        'isCN': args.isCN
+    })
+    # 加载图片数据库（只做一次）
+    image_files = glob.glob(os.path.join(image_folder, '*.jpg'))
+    for image_file in tqdm.tqdm(image_files):
+        filename = os.path.basename(image_file)
+        if clip.contains_image(filename) == 1:
+            continue
+        img = cv2.imread(image_file)
+        cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
+        clip.add_image(filename, img)
+    # 工具函数：图片转 base64
+    def img_to_pil(img_path):
+        return Image.open(img_path).convert("RGB")
+    # 主搜索函数
+    def search_images(query, top_k):
+        results = clip.match_text(query, top_k=top_k)
+        images = []
+        for filename, score in results:
+            img_path = os.path.join(image_folder, filename)
+            if os.path.exists(img_path):
+                img = img_to_pil(img_path)
+                images.append((img, f"{filename}  Score: {score:.4f}"))
+        return images
+    # Gradio界面
+    with gr.Blocks() as demo:
+        gr.Markdown("# 🔍 文搜图 Demo")
+        with gr.Row():
+            query_input = gr.Textbox(label="请输入文本查询")
+            topk_input = gr.Number(value=25, precision=0, label="Top-K")
+        search_btn = gr.Button("搜图")
+        gallery = gr.Gallery(label="匹配结果", show_label=True, columns=4)
+        search_btn.click(fn=search_images, inputs=[query_input, topk_input], outputs=gallery)
+    # 启动
+    ip = "0.0.0.0"
+    demo.launch(server_name=ip, server_port=7860)
+    # 关闭系统（你可加信号处理来自动关闭）
+    import atexit
+    atexit.register(lambda: sys_deinit(ClipDeviceType.axcl_device, 0))

pyclip/pyclip.py ADDED Viewed

	@@ -0,0 +1,260 @@

+import ctypes
+import os
+from typing import List, Tuple, Optional
+import numpy as np
+import platform
+base_dir = os.path.dirname(__file__)
+arch = platform.machine()
+if arch == 'x86_64':
+    arch_dir = 'x86_64'
+elif arch in ('aarch64', 'arm64'):
+    arch_dir = 'aarch64'
+else:
+    raise RuntimeError(f"Unsupported architecture: {arch}")
+lib_paths = [
+    os.path.join(base_dir, arch_dir, 'libclip.so'),
+    os.path.join(base_dir, 'libclip.so')
+]
+last_error = None
+diagnostic_shown = set()
+for lib_path in lib_paths:
+    try:
+        print(f"Trying to load: {lib_path}")
+        _lib = ctypes.CDLL(lib_path)
+        print(f"✅ Successfully loaded: {lib_path}")
+        break
+    except OSError as e:
+        last_error = e
+        err_str = str(e)
+        print(f"\n❌ Failed to load: {lib_path}")
+        print(f"   {err_str}")
+        # Only show GLIBCXX tip once
+        if "GLIBCXX" in err_str and "not found" in err_str:
+            if "missing_glibcxx" not in diagnostic_shown:
+                diagnostic_shown.add("missing_glibcxx")
+                print("🔍 Detected missing GLIBCXX version in libstdc++.so.6")
+                print("💡 This usually happens when your environment (like Conda) uses an older libstdc++")
+                print(f"👉 Try running with system libstdc++ preloaded:")
+                print(f"   export LD_PRELOAD=/usr/lib/{arch_dir}-linux-gnu/libstdc++.so.6\n")
+        elif "No such file" in err_str:
+            if "file_not_found" not in diagnostic_shown:
+                diagnostic_shown.add("file_not_found")
+                print("🔍 File not found. Please verify that libclip.so exists and the path is correct.\n")
+        elif "wrong ELF class" in err_str:
+            if "elf_mismatch" not in diagnostic_shown:
+                diagnostic_shown.add("elf_mismatch")
+                print("🔍 ELF class mismatch — likely due to architecture conflict (e.g., loading x86_64 .so on aarch64).")
+                print(f"👉 Run `file {lib_path}` to verify the binary architecture.\n")
+        else:
+            if "generic_error" not in diagnostic_shown:
+                diagnostic_shown.add("generic_error")
+                print("📎 Tip: Use `ldd` to inspect missing dependencies:")
+                print(f"   ldd {lib_path}\n")
+else:
+    raise RuntimeError(f"\n❗ Failed to load libclip.so.\nLast error:\n{last_error}")
+# Enumeration type: device kinds, with the same names and values as the C enum
+# `clip_devive_e` in clip.h.
+class ClipDeviceType(ctypes.c_int):
+    """c_int subclass whose class attributes name the supported device kinds."""
+    unknown_device = 0
+    host_device = 1
+    axcl_device = 2
+# Structure definitions mirroring the C structs in clip.h.
+class ClipMemInfo(ctypes.Structure):
+    """Memory counters: two C ints, `remain` then `total`."""
+    _fields_ = [
+        ('remain', ctypes.c_int),
+        ('total', ctypes.c_int)
+    ]
+class ClipHostInfo(ctypes.Structure):
+    """Host part of the device inventory (the `host` member of clip_devices_t)."""
+    _fields_ = [
+        # c_char reads back as a 1-byte `bytes` object, not an int or bool.
+        ('available', ctypes.c_char),
+        # 32 == CLIP_VERSION_LEN in clip.h
+        ('version', ctypes.c_char * 32),
+        ('mem_info', ClipMemInfo)
+    ]
+class ClipDeviceInfo(ctypes.Structure):
+    """Per-device entry of the inventory (one element of `devices_info`)."""
+    _fields_ = [
+        ('temp', ctypes.c_int),
+        ('cpu_usage', ctypes.c_int),
+        ('npu_usage', ctypes.c_int),
+        ('mem_info', ClipMemInfo)
+    ]
+class ClipDevices(ctypes.Structure):
+    """Device inventory filled in by clip_enum_devices (clip_devices_t).
+
+    The C header nests host_version, dev_version, count and devices_info in a
+    `devices` sub-struct; here those members are declared directly after `host`.
+    NOTE(review): this relies on the flattened layout having the same offsets
+    as the nested C struct - confirm against the C build.
+    """
+    _fields_ = [
+        ('host', ClipHostInfo),
+        ('host_version', ctypes.c_char * 32),
+        ('dev_version', ctypes.c_char * 32),
+        ('count', ctypes.c_ubyte),
+        # 16 == CLIP_DEVICES_COUNT in clip.h
+        ('devices_info', ClipDeviceInfo * 16)
+    ]
+class ClipInit(ctypes.Structure):
+    """Initialisation parameters passed to clip_create (clip_init_t)."""
+    _fields_ = [
+        ('dev_type', ClipDeviceType),
+        # c_char fields accept a 1-byte bytes object or an int in 0..255.
+        ('devid', ctypes.c_char),
+        # 128 == CLIP_PATH_LEN in clip.h
+        ('text_encoder_path', ctypes.c_char * 128),
+        ('image_encoder_path', ctypes.c_char * 128),
+        ('tokenizer_path', ctypes.c_char * 128),
+        ('isCN', ctypes.c_char),
+        ('db_path', ctypes.c_char * 128)
+    ]
+class ClipImage(ctypes.Structure):
+    """Image descriptor passed to clip_add / clip_match_image (clip_image_t).
+
+    Only a pointer to the pixels is stored, so the Python object that owns the
+    buffer has to stay referenced while the C call runs.
+    """
+    _fields_ = [
+        ('data', ctypes.POINTER(ctypes.c_ubyte)),
+        ('width', ctypes.c_int),
+        ('height', ctypes.c_int),
+        ('channels', ctypes.c_int),
+        ('stride', ctypes.c_int)
+    ]
+class ClipResultItem(ctypes.Structure):
+    """One match result (clip_result_item_t): stored key plus its score."""
+    _fields_ = [
+        # 64 == CLIP_KEY_MAX_LEN in clip.h
+        ('key', ctypes.c_char * 64),
+        ('score', ctypes.c_float)
+    ]
+# 设置函数参数和返回类型
+_lib.clip_enum_devices.argtypes = [ctypes.POINTER(ClipDevices)]
+_lib.clip_enum_devices.restype = ctypes.c_int
+_lib.clip_sys_init.argtypes = [ClipDeviceType, ctypes.c_char]
+_lib.clip_sys_init.restype = ctypes.c_int
+_lib.clip_sys_deinit.argtypes = [ClipDeviceType, ctypes.c_char]
+_lib.clip_sys_deinit.restype = ctypes.c_int
+_lib.clip_create.argtypes = [ctypes.POINTER(ClipInit), ctypes.POINTER(ctypes.c_void_p)]
+_lib.clip_create.restype = ctypes.c_int
+_lib.clip_destroy.argtypes = [ctypes.c_void_p]
+_lib.clip_destroy.restype = ctypes.c_int
+_lib.clip_add.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.POINTER(ClipImage), ctypes.c_char]
+_lib.clip_add.restype = ctypes.c_int
+_lib.clip_remove.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
+_lib.clip_remove.restype = ctypes.c_int
+_lib.clip_contain.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
+_lib.clip_contain.restype = ctypes.c_int
+_lib.clip_match_text.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.POINTER(ClipResultItem), ctypes.c_int]
+_lib.clip_match_text.restype = ctypes.c_int
+_lib.clip_match_image.argtypes = [ctypes.c_void_p, ctypes.POINTER(ClipImage), ctypes.POINTER(ClipResultItem), ctypes.c_int]
+_lib.clip_match_image.restype = ctypes.c_int
+class ClipError(Exception):
+    pass
+def check_error(code: int) -> None:
+    if code != 0:
+        raise ClipError(f"CLIP API错误: {code}")
+class Clip:
+    def __init__(self, init_info: dict):
+        self.handle = None
+        self.init_info = ClipInit()
+        # 设置初始化参数
+        self.init_info.dev_type = init_info.get('dev_type', ClipDeviceType.axcl_device)
+        self.init_info.devid = init_info.get('devid', 0)
+        self.init_info.isCN = init_info.get('isCN', 1)
+        # 设置路径
+        for path_name in ['text_encoder_path', 'image_encoder_path', 'tokenizer_path', 'db_path']:
+            if path_name in init_info:
+                setattr(self.init_info, path_name, init_info[path_name].encode('utf-8'))
+        # 创建CLIP实例
+        handle = ctypes.c_void_p()
+        check_error(_lib.clip_create(ctypes.byref(self.init_info), ctypes.byref(handle)))
+        self.handle = handle
+    def __del__(self):
+        if self.handle:
+            _lib.clip_destroy(self.handle)
+    def add_image(self, key: str, image_data: np.ndarray) -> None:
+        if self.contains_image(key):
+            return
+        image = ClipImage()
+        image.data = ctypes.cast(image_data.ctypes.data, ctypes.POINTER(ctypes.c_ubyte))
+        image.width = image_data.shape[1]
+        image.height = image_data.shape[0]
+        image.channels = image_data.shape[2]
+        image.stride = image_data.shape[1] * image_data.shape[2]
+        check_error(_lib.clip_add(self.handle, key.encode('utf-8'), ctypes.byref(image), 0))
+    def remove_image(self, key: str) -> None:
+        check_error(_lib.clip_remove(self.handle, key.encode('utf-8')))
+    def contains_image(self, key: str) -> bool:
+        return _lib.clip_contain(self.handle, key.encode('utf-8')) == 1
+    def match_text(self, text: str, top_k: int = 10) -> List[Tuple[str, float]]:
+        results = (ClipResultItem * top_k)()
+        check_error(_lib.clip_match_text(self.handle, text.encode('utf-8'), results, top_k))
+        return [(item.key.decode('utf-8'), item.score) for item in results]
+    def match_image(self, image_data: bytes, width: int, height: int, channels: int = 3, top_k: int = 10) -> List[Tuple[str, float]]:
+        image = ClipImage()
+        image.data = ctypes.cast(ctypes.create_string_buffer(image_data), ctypes.POINTER(ctypes.c_ubyte))
+        image.width = width
+        image.height = height
+        image.channels = channels
+        image.stride = width * channels
+        results = (ClipResultItem * top_k)()
+        check_error(_lib.clip_match_image(self.handle, ctypes.byref(image), ctypes.byref(results), top_k))
+        return [(item.key.decode('utf-8'), item.score) for item in results]
+def enum_devices() -> dict:
+    devices = ClipDevices()
+    check_error(_lib.clip_enum_devices(ctypes.byref(devices)))
+    return {
+        'host': {
+            'available': bool(devices.host.available),
+            'version': devices.host.version.decode('utf-8'),
+            'mem_info': {
+                'remain': devices.host.mem_info.remain,
+                'total': devices.host.mem_info.total
+            }
+        },
+        'devices': {
+            'host_version': devices.host_version.decode('utf-8'),
+            'dev_version': devices.dev_version.decode('utf-8'),
+            'count': devices.count,
+            'devices_info': [{
+                'temp': dev.temp,
+                'cpu_usage': dev.cpu_usage,
+                'npu_usage': dev.npu_usage,
+                'mem_info': {
+                    'remain': dev.mem_info.remain,
+                    'total': dev.mem_info.total
+                }
+            } for dev in devices.devices_info[:devices.count]]
+        }
+    }
+def sys_init(dev_type: ClipDeviceType = ClipDeviceType.axcl_device, devid: int = 0) -> None:
+    check_error(_lib.clip_sys_init(dev_type, devid))
+def sys_deinit(dev_type: ClipDeviceType = ClipDeviceType.axcl_device, devid: int = 0) -> None:
+    check_error(_lib.clip_sys_deinit(dev_type, devid))

pyclip/requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio
+opencv-python
+tqdm
+Pillow