admin committed
Commit 4d30baa · Parent: 032244c
Files changed (4)
  1. app.py +51 -45
  2. model.py +7 -4
  3. requirements.txt +5 -3
  4. utils.py +51 -10
app.py CHANGED
@@ -7,13 +7,17 @@ import numpy as np
 import gradio as gr
 import librosa.display
 import matplotlib.pyplot as plt
-from utils import get_modelist, find_audio_files, embed_img
 from model import EvalNet
-
-
-CLASSES = ["Gong", "Shang", "Jue", "Zhi", "Yu"]
-TEMP_DIR = "./__pycache__/tmp"
-SAMPLE_RATE = 44100
+from utils import (
+    get_modelist,
+    find_audio_files,
+    embed_img,
+    _L,
+    SAMPLE_RATE,
+    TEMP_DIR,
+    TRANSLATE,
+    CLASSES,
+)
 
 
 def zero_padding(y: np.ndarray, end: int):
@@ -73,28 +77,29 @@ def audio2chroma(audio_path: str, seg_len=20):
 
 
 def infer(wav_path: str, log_name: str, folder_path=TEMP_DIR):
-    if os.path.exists(folder_path):
-        shutil.rmtree(folder_path)
-
-    if not wav_path:
-        return None, "Please input an audio!"
-
-    spec = log_name.split("_")[-3]
-    os.makedirs(folder_path, exist_ok=True)
-    try:
-        model = EvalNet(log_name, len(CLASSES)).model
+    status = "Success"
+    filename = result = None
+    try:
+        if os.path.exists(folder_path):
+            shutil.rmtree(folder_path)
+
+        if not wav_path:
+            raise ValueError("请输入音频!")
+
+        spec = log_name.split("_")[-3]
+        os.makedirs(folder_path, exist_ok=True)
+        model = EvalNet(log_name, len(TRANSLATE)).model
         eval("audio2%s" % spec)(wav_path)
+        input = embed_img(f"{folder_path}/output.jpg")
+        output: torch.Tensor = model(input)
+        pred_id = torch.max(output.data, 1)[1]
+        filename = os.path.basename(wav_path)
+        result = f"{TRANSLATE[CLASSES[pred_id]]} ({CLASSES[pred_id].capitalize()})"
 
     except Exception as e:
-        return None, f"{e}"
-
-    input = embed_img(f"{folder_path}/output.jpg")
-    output: torch.Tensor = model(input)
-    pred_id = torch.max(output.data, 1)[1]
-    return (
-        os.path.basename(wav_path),
-        CLASSES[pred_id].capitalize(),
-    )
+        status = f"{e}"
+
+    return status, filename, result
 
 
 if __name__ == "__main__":
@@ -109,39 +114,40 @@ if __name__ == "__main__":
     gr.Interface(
         fn=infer,
         inputs=[
-            gr.Audio(label="Upload a recording", type="filepath"),
-            gr.Dropdown(choices=models, label="Select a model", value=models[0]),
+            gr.Audio(label=_L("上传录音"), type="filepath"),
+            gr.Dropdown(choices=models, label=_L("选择模型"), value=models[0]),
         ],
         outputs=[
-            gr.Textbox(label="Audio filename", show_copy_button=True),
+            gr.Textbox(label=_L("状态栏"), show_copy_button=True),
+            gr.Textbox(label=_L("音频文件名"), show_copy_button=True),
             gr.Textbox(
-                label="Chinese pentatonic mode recognition",
+                label=_L("中国五声调式识别"),
                 show_copy_button=True,
             ),
         ],
         examples=examples,
        cache_examples=False,
        flagging_mode="never",
-        title="It is recommended to keep the recording length around 20s.",
+        title=_L("建议录音时长保持在 20s 左右"),
    )

    gr.Markdown(
-        """
-        # Cite
-        ```bibtex
-        @article{Zhou-2025,
-            author = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han},
-            title = {CCMusic: An Open and Diverse Database for Chinese Music Information Retrieval Research},
-            journal = {Transactions of the International Society for Music Information Retrieval},
-            volume = {8},
-            number = {1},
-            pages = {22--38},
-            month = {Mar},
-            year = {2025},
-            url = {https://doi.org/10.5334/tismir.194},
-            doi = {10.5334/tismir.194}
-        }
-        ```"""
+        f"# {_L('引用')}"
+        + """
+```bibtex
+@article{Zhou-2025,
+    author = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han},
+    title = {CCMusic: An Open and Diverse Database for Chinese Music Information Retrieval Research},
+    journal = {Transactions of the International Society for Music Information Retrieval},
+    volume = {8},
+    number = {1},
+    pages = {22--38},
+    month = {Mar},
+    year = {2025},
+    url = {https://doi.org/10.5334/tismir.194},
+    doi = {10.5334/tismir.194}
+}
+```"""
    )

-    demo.launch(ssr_mode=False)
+    demo.launch()
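In app.py, the module-level constants (`CLASSES`, `TEMP_DIR`, `SAMPLE_RATE`) move into utils.py, UI strings are routed through the new `_L` localizer (e.g. `_L("上传录音")`, "Upload a recording"; `ValueError("请输入音频!")` means "Please input an audio!"), and `infer` is reworked so all work happens inside one `try` block and every call returns a `(status, filename, result)` triple matching the three Textbox outputs. A minimal sketch of that status-first pattern, using a hypothetical stub rather than the committed function:

```python
# Minimal sketch of the status-first return contract introduced in this
# commit; infer_stub and its placeholder values are hypothetical.
def infer_stub(wav_path: str):
    status, filename, result = "Success", None, None
    try:
        if not wav_path:
            raise ValueError("Please input an audio!")
        filename = "example.wav"  # stands in for os.path.basename(wav_path)
        result = "Gong"           # stands in for the model prediction
    except Exception as e:
        status = f"{e}"  # any failure lands in the status Textbox
    return status, filename, result

print(infer_stub(""))           # ('Please input an audio!', None, None)
print(infer_stub("input.wav"))  # ('Success', 'example.wav', 'Gong')
```

Because errors are reported through the status box instead of a bare `return None, ...`, the Gradio outputs always receive the arity they declare.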
model.py CHANGED
@@ -1,7 +1,7 @@
 import torch
 import torch.nn as nn
 import torchvision.models as models
-from datasets import load_dataset
+from modelscope.msdatasets import MsDataset
 from utils import MODEL_DIR
 
 
@@ -17,7 +17,7 @@ class EvalNet:
         self.m_type, self.input_size = self._model_info(m_ver)
 
         if not hasattr(models, m_ver):
-            raise Exception("Unsupported model.")
+            raise Exception("不支持的模型")
 
         self.model = eval("models.%s()" % m_ver)
         linear_output = self._set_outsize()
@@ -34,11 +34,14 @@ class EvalNet:
             if ver == bb["ver"]:
                 return bb
 
-        print("Backbone name not found, using default option - alexnet.")
+        print("未找到骨干网络名称,使用默认选项 - alexnet")
        return backbone_list[0]

    def _model_info(self, m_ver: str):
-        backbone_list = load_dataset("monetjoe/cv_backbones", split="train")
+        backbone_list = MsDataset.load(
+            "monetjoe/cv_backbones",
+            split="v1",
+        )
        backbone = self._get_backbone(m_ver, backbone_list)
        m_type = str(backbone["type"])
        input_size = int(backbone["input_size"])
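model.py swaps the backbone-metadata source from the Hugging Face `datasets` library to ModelScope's `MsDataset` (split renamed from `train` to `v1`) and localizes two messages: `"不支持的模型"` ("Unsupported model") and the fallback notice `"未找到骨干网络名称,使用默认选项 - alexnet"` ("Backbone name not found, using default option - alexnet"). A self-contained sketch of the lookup `_get_backbone` performs, assuming each record carries the `ver`, `type`, and `input_size` fields the surrounding code reads:

```python
# Hypothetical standalone version of the backbone lookup; the two sample
# records mimic what monetjoe/cv_backbones is assumed to provide.
def get_backbone(ver: str, backbone_list: list):
    for bb in backbone_list:
        if ver == bb["ver"]:  # exact match on the torchvision model name
            return bb
    print("Backbone name not found, using default option - alexnet")
    return backbone_list[0]  # fall back to the first entry (alexnet)

backbones = [
    {"ver": "alexnet", "type": "classification", "input_size": 224},
    {"ver": "squeezenet1_0", "type": "classification", "input_size": 224},
]
print(get_backbone("squeezenet1_0", backbones)["input_size"])  # 224
print(get_backbone("unknown", backbones)["ver"])  # falls back to alexnet
```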
requirements.txt CHANGED
@@ -1,5 +1,7 @@
-torch
-pillow
+torch==2.6.0+cu118
+-f https://download.pytorch.org/whl/torch
+torchvision==0.21.0+cu118
+-f https://download.pytorch.org/whl/torchvision
 librosa
 matplotlib
-torchvision
+modelscope[framework]==1.21.0
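The pins move the app to CUDA 11.8 builds of torch/torchvision; the `-f` (`--find-links`) lines tell pip where to fetch the `+cu118` wheels, which are hosted on PyTorch's index rather than PyPI, and `modelscope[framework]==1.21.0` backs the new ModelScope code paths. A quick post-install sanity check, with expected values taken from the pins above:

```python
# Post-install sanity check; expected versions come from requirements.txt,
# and the CUDA probe only returns True with a working CUDA 11.8 driver.
import torch
import torchvision

print(torch.__version__)          # expected: 2.6.0+cu118
print(torchvision.__version__)    # expected: 0.21.0+cu118
print(torch.cuda.is_available())  # True if the CUDA runtime is usable
```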
utils.py CHANGED
@@ -1,10 +1,54 @@
 import os
 import torch
 import torchvision.transforms as transforms
-from huggingface_hub import snapshot_download
+import huggingface_hub
+import modelscope
 from PIL import Image
 
-MODEL_DIR = snapshot_download("ccmusic-database/CNPM", cache_dir="./__pycache__")
+EN_US = os.getenv("LANG") != "zh_CN.UTF-8"
+
+ZH2EN = {
+    "上传录音": "Upload a recording",
+    "选择模型": "Select a model",
+    "状态栏": "Status",
+    "音频文件名": "Audio filename",
+    "中国五声调式识别": "Chinese pentatonic mode recognition",
+    "建议录音时长保持在 20s 左右": "It is recommended to keep the recording length around 20s.",
+    "引用": "Cite",
+    "宫": "Gong",
+    "商": "Shang",
+    "角": "Jue",
+    "徵": "Zhi",
+    "羽": "Yu",
+}
+
+MODEL_DIR = (
+    huggingface_hub.snapshot_download(
+        "ccmusic-database/CNPM",
+        cache_dir="./__pycache__",
+    )
+    if EN_US
+    else modelscope.snapshot_download(
+        "ccmusic-database/CNPM",
+        cache_dir="./__pycache__",
+    )
+)
+
+
+def _L(zh_txt: str):
+    return ZH2EN[zh_txt] if EN_US else zh_txt
+
+
+TRANSLATE = {
+    "Gong": _L("宫"),
+    "Shang": _L("商"),
+    "Jue": _L("角"),
+    "Zhi": _L("徵"),
+    "Yu": _L("羽"),
+}
+CLASSES = list(TRANSLATE.keys())
+TEMP_DIR = "./__pycache__/tmp"
+SAMPLE_RATE = 44100
 
 
 def toCUDA(x):
@@ -27,19 +71,16 @@ def find_audio_files(folder_path=f"{MODEL_DIR}/examples"):
 
 
 def get_modelist(model_dir=MODEL_DIR, assign_model=""):
-    try:
-        entries = os.listdir(model_dir)
-    except OSError as e:
-        print(f"Cannot access {model_dir}: {e}")
-        return
-
     output = []
-    for entry in entries:
+    for entry in os.listdir(model_dir):
+        # 获取完整路径
         full_path = os.path.join(model_dir, entry)
+        # 跳过'.git'文件夹
         if entry == ".git" or entry == "examples":
-            print(f"Skip .git / examples dir: {full_path}")
+            print(f"跳过 .git examples 文件夹: {full_path}")
             continue
 
+        # 检查条目是文件还是目录
         if os.path.isdir(full_path):
             model = os.path.basename(full_path)
             if assign_model and assign_model.lower() in model:
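utils.py now keys everything off the `LANG` environment variable: non-`zh_CN.UTF-8` locales download the `ccmusic-database/CNPM` snapshot from the Hugging Face Hub while Chinese locales use the ModelScope mirror, and `_L` maps Chinese UI strings to English through the `ZH2EN` table, with the Chinese text doubling as the lookup key so `CLASSES` can be derived from `TRANSLATE` instead of being hardcoded. The new inline comments in `get_modelist` read "get the full path", "skip the '.git' folder", and "check whether the entry is a file or a directory"; the log line means "Skipping .git / examples folder". A trimmed, standalone sketch of the `_L` pattern:

```python
# Standalone sketch of the _L localization pattern from this commit;
# the table is trimmed to two entries for illustration.
import os

EN_US = os.getenv("LANG") != "zh_CN.UTF-8"  # anything but zh_CN gets English UI

ZH2EN = {
    "上传录音": "Upload a recording",
    "宫": "Gong",
}


def _L(zh_txt: str):
    # The Chinese source string is also the lookup key, so a string
    # missing from ZH2EN raises KeyError under the English locale.
    return ZH2EN[zh_txt] if EN_US else zh_txt


print(_L("上传录音"))  # "Upload a recording" unless LANG=zh_CN.UTF-8
```

One consequence of the rewritten `get_modelist` loop is that the old `try/except OSError` guard around `os.listdir` is gone, so a missing `MODEL_DIR` now raises instead of printing and returning `None`.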