.dockerignore CHANGED
@@ -1,3 +1,15 @@
1
- /downloads
2
- /llama.cpp
3
- /outputs
1
+ # IDE
2
+ .idea/
3
+ .vscode/
4
+
5
+ .git*
6
+ .dockerignore
7
+ docker-compose.yml
8
+ Dockerfile
9
+
10
+ # LLama.cpp
11
+ llama.cpp/
12
+
13
+ # Working files
14
+ downloads/
15
+ outputs/
.gitignore CHANGED
@@ -1,3 +1,142 @@
1
  # Byte-compiled / optimized / DLL files
2
  __pycache__/
3
  *.py[cod]
@@ -11,7 +150,6 @@ __pycache__/
11
  build/
12
  develop-eggs/
13
  dist/
14
- downloads/
15
  eggs/
16
  .eggs/
17
  lib/
@@ -106,10 +244,8 @@ ipython_config.py
106
  #pdm.lock
107
  # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
  # in version control.
109
- # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
  .pdm.toml
111
- .pdm-python
112
- .pdm-build/
113
 
114
  # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
  __pypackages__/
@@ -161,7 +297,66 @@ cython_debug/
161
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
  #.idea/
163
 
164
- /downloads
165
- !/downloads/.keep
166
- /llama.cpp
167
- /outputs
1
+ # Created by https://www.toptal.com/developers/gitignore/api/linux,macos,windows,python,jetbrains+all,visualstudiocode
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=linux,macos,windows,python,jetbrains+all,visualstudiocode
3
+
4
+ ### JetBrains+all ###
5
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
6
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
7
+
8
+ # User-specific stuff
9
+ .idea/**/workspace.xml
10
+ .idea/**/tasks.xml
11
+ .idea/**/usage.statistics.xml
12
+ .idea/**/dictionaries
13
+ .idea/**/shelf
14
+
15
+ # AWS User-specific
16
+ .idea/**/aws.xml
17
+
18
+ # Generated files
19
+ .idea/**/contentModel.xml
20
+
21
+ # Sensitive or high-churn files
22
+ .idea/**/dataSources/
23
+ .idea/**/dataSources.ids
24
+ .idea/**/dataSources.local.xml
25
+ .idea/**/sqlDataSources.xml
26
+ .idea/**/dynamic.xml
27
+ .idea/**/uiDesigner.xml
28
+ .idea/**/dbnavigator.xml
29
+
30
+ # Gradle
31
+ .idea/**/gradle.xml
32
+ .idea/**/libraries
33
+
34
+ # Gradle and Maven with auto-import
35
+ # When using Gradle or Maven with auto-import, you should exclude module files,
36
+ # since they will be recreated, and may cause churn. Uncomment if using
37
+ # auto-import.
38
+ # .idea/artifacts
39
+ # .idea/compiler.xml
40
+ # .idea/jarRepositories.xml
41
+ # .idea/modules.xml
42
+ # .idea/*.iml
43
+ # .idea/modules
44
+ # *.iml
45
+ # *.ipr
46
+
47
+ # CMake
48
+ cmake-build-*/
49
+
50
+ # Mongo Explorer plugin
51
+ .idea/**/mongoSettings.xml
52
+
53
+ # File-based project format
54
+ *.iws
55
+
56
+ # IntelliJ
57
+ out/
58
+
59
+ # mpeltonen/sbt-idea plugin
60
+ .idea_modules/
61
+
62
+ # JIRA plugin
63
+ atlassian-ide-plugin.xml
64
+
65
+ # Cursive Clojure plugin
66
+ .idea/replstate.xml
67
+
68
+ # SonarLint plugin
69
+ .idea/sonarlint/
70
+
71
+ # Crashlytics plugin (for Android Studio and IntelliJ)
72
+ com_crashlytics_export_strings.xml
73
+ crashlytics.properties
74
+ crashlytics-build.properties
75
+ fabric.properties
76
+
77
+ # Editor-based Rest Client
78
+ .idea/httpRequests
79
+
80
+ # Android studio 3.1+ serialized cache file
81
+ .idea/caches/build_file_checksums.ser
82
+
83
+ ### JetBrains+all Patch ###
84
+ # Ignore everything but code style settings and run configurations
85
+ # that are supposed to be shared within teams.
86
+
87
+ .idea/*
88
+
89
+ !.idea/codeStyles
90
+ !.idea/runConfigurations
91
+
92
+ ### Linux ###
93
+ *~
94
+
95
+ # temporary files which can be created if a process still has a handle open of a deleted file
96
+ .fuse_hidden*
97
+
98
+ # KDE directory preferences
99
+ .directory
100
+
101
+ # Linux trash folder which might appear on any partition or disk
102
+ .Trash-*
103
+
104
+ # .nfs files are created when an open file is removed but is still being accessed
105
+ .nfs*
106
+
107
+ ### macOS ###
108
+ # General
109
+ .DS_Store
110
+ .AppleDouble
111
+ .LSOverride
112
+
113
+ # Icon must end with two \r
114
+ Icon
115
+
116
+ # Thumbnails
117
+ ._*
118
+
119
+ # Files that might appear in the root of a volume
120
+ .DocumentRevisions-V100
121
+ .fseventsd
122
+ .Spotlight-V100
123
+ .TemporaryItems
124
+ .Trashes
125
+ .VolumeIcon.icns
126
+ .com.apple.timemachine.donotpresent
127
+
128
+ # Directories potentially created on remote AFP share
129
+ .AppleDB
130
+ .AppleDesktop
131
+ Network Trash Folder
132
+ Temporary Items
133
+ .apdisk
134
+
135
+ ### macOS Patch ###
136
+ # iCloud generated files
137
+ *.icloud
138
+
139
+ ### Python ###
140
  # Byte-compiled / optimized / DLL files
141
  __pycache__/
142
  *.py[cod]
 
150
  build/
151
  develop-eggs/
152
  dist/
 
153
  eggs/
154
  .eggs/
155
  lib/
 
244
  #pdm.lock
245
  # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
246
  # in version control.
247
+ # https://pdm.fming.dev/#use-with-ide
248
  .pdm.toml
 
 
249
 
250
  # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
251
  __pypackages__/
 
297
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
298
  #.idea/
299
 
300
+ ### Python Patch ###
301
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
302
+ poetry.toml
303
+
304
+ # ruff
305
+ .ruff_cache/
306
+
307
+ # LSP config files
308
+ pyrightconfig.json
309
+
310
+ ### VisualStudioCode ###
311
+ .vscode/*
312
+ !.vscode/settings.json
313
+ !.vscode/tasks.json
314
+ !.vscode/launch.json
315
+ !.vscode/extensions.json
316
+ !.vscode/*.code-snippets
317
+
318
+ # Local History for Visual Studio Code
319
+ .history/
320
+
321
+ # Built Visual Studio Code Extensions
322
+ *.vsix
323
+
324
+ ### VisualStudioCode Patch ###
325
+ # Ignore all local history of files
326
+ .history
327
+ .ionide
328
+
329
+ ### Windows ###
330
+ # Windows thumbnail cache files
331
+ Thumbs.db
332
+ Thumbs.db:encryptable
333
+ ehthumbs.db
334
+ ehthumbs_vista.db
335
+
336
+ # Dump file
337
+ *.stackdump
338
+
339
+ # Folder config file
340
+ [Dd]esktop.ini
341
+
342
+ # Recycle Bin used on file shares
343
+ $RECYCLE.BIN/
344
+
345
+ # Windows Installer files
346
+ *.cab
347
+ *.msi
348
+ *.msix
349
+ *.msm
350
+ *.msp
351
+
352
+ # Windows shortcuts
353
+ *.lnk
354
+
355
+ # End of https://www.toptal.com/developers/gitignore/api/linux,macos,windows,python,jetbrains+all,visualstudiocode
356
+
357
+ # Working folders
358
+ downloads/
359
+ outputs/
360
+ llama.cpp/
361
+
362
+ !*/.keep
Dockerfile CHANGED
@@ -1,65 +1,27 @@
1
- FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04
2
-
3
- ENV DEBIAN_FRONTEND=noninteractive
4
- RUN apt-get update && \
5
- apt-get upgrade -y && \
6
- apt-get install -y --no-install-recommends --fix-missing \
7
- git \
8
- git-lfs \
9
- wget \
10
- curl \
11
- cmake \
12
- # python build dependencies \
13
- build-essential \
14
- libssl-dev \
15
- zlib1g-dev \
16
- libbz2-dev \
17
- libreadline-dev \
18
- libsqlite3-dev \
19
- libncursesw5-dev \
20
- xz-utils \
21
- tk-dev \
22
- libxml2-dev \
23
- libxmlsec1-dev \
24
- libffi-dev \
25
- liblzma-dev \
26
- ffmpeg \
27
- nvidia-driver-570
28
 
29
  # Check if user with UID 1000 exists, if not create it
30
  RUN id -u 1000 &>/dev/null || useradd -m -u 1000 user
31
  USER 1000
 
32
  ENV HOME=/home/user \
33
- PATH=/home/user/.local/bin:${PATH}
34
- WORKDIR ${HOME}/app
35
 
36
- RUN curl https://pyenv.run | bash
37
- ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
38
- ARG PYTHON_VERSION=3.11
39
- RUN pyenv install ${PYTHON_VERSION} && \
40
- pyenv global ${PYTHON_VERSION} && \
41
- pyenv rehash && \
42
- pip install --no-cache-dir -U pip setuptools wheel && \
43
- pip install "huggingface-hub" "hf-transfer" "gradio[oauth]" "gradio_huggingfacehub_search" "APScheduler"
44
 
45
- COPY --chown=1000 . ${HOME}/app
46
- RUN git clone https://github.com/ggerganov/llama.cpp
47
- RUN pip install -r llama.cpp/requirements/requirements-convert_hf_to_gguf.txt
48
 
49
- COPY groups_merged.txt ${HOME}/app/llama.cpp/
50
 
51
- ENV PYTHONPATH=${HOME}/app \
52
- PYTHONUNBUFFERED=1 \
53
  HF_HUB_ENABLE_HF_TRANSFER=1 \
54
  GRADIO_ALLOW_FLAGGING=never \
55
  GRADIO_NUM_PORTS=1 \
56
  GRADIO_SERVER_NAME=0.0.0.0 \
57
  GRADIO_THEME=huggingface \
58
- TQDM_POSITION=-1 \
59
- TQDM_MININTERVAL=1 \
60
- SYSTEM=spaces \
61
- LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} \
62
- PATH=/usr/local/nvidia/bin:${PATH}
63
 
64
- ENTRYPOINT /bin/bash start.sh
65
 
 
 
1
+ FROM ghcr.io/ggml-org/llama.cpp:full-cuda
2
 
3
  # Check if user with UID 1000 exists, if not create it
4
  RUN id -u 1000 &>/dev/null || useradd -m -u 1000 user
5
  USER 1000
6
+
7
  ENV HOME=/home/user \
8
+ PATH=${PATH}:/home/user/.local/bin:/usr/local/nvidia/bin:/app
 
9
 
10
+ WORKDIR ${HOME}/app
 
 
 
 
 
 
 
11
 
12
+ COPY --chown=1000 requirements.txt ${HOME}/app
 
 
13
 
14
+ RUN pip install --no-cache-dir -r requirements.txt
15
 
16
+ ENV PYTHONPATH=${PYTHONPATH}:${HOME}/.local/bin \
17
+ LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/app \
18
  HF_HUB_ENABLE_HF_TRANSFER=1 \
19
  GRADIO_ALLOW_FLAGGING=never \
20
  GRADIO_NUM_PORTS=1 \
21
  GRADIO_SERVER_NAME=0.0.0.0 \
22
  GRADIO_THEME=huggingface \
23
+ SYSTEM=spaces
 
 
 
 
24
 
25
+ COPY --chown=1000 . ${HOME}/app
26
 
27
+ ENTRYPOINT ["/bin/bash", "start.sh"]
app.py CHANGED
@@ -1,443 +1,879 @@
1
  import os
2
  import subprocess
3
  import signal
4
- os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
5
- import gradio as gr
6
  import tempfile
 
 
 
 
7
 
 
 
 
8
  from huggingface_hub import HfApi, ModelCard, whoami
9
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
10
- from pathlib import Path
11
- from textwrap import dedent
12
  from apscheduler.schedulers.background import BackgroundScheduler
13
 
14
 
15
- # used for restarting the space
16
- HF_TOKEN = os.environ.get("HF_TOKEN")
17
- CONVERSION_SCRIPT = "./llama.cpp/convert_hf_to_gguf.py"
18
-
19
- # escape HTML for logging
20
- def escape(s: str) -> str:
21
- s = s.replace("&", "&amp;") # Must be done first!
22
- s = s.replace("<", "&lt;")
23
- s = s.replace(">", "&gt;")
24
- s = s.replace('"', "&quot;")
25
- s = s.replace("\n", "<br/>")
26
- return s
27
-
28
- def generate_importance_matrix(model_path: str, train_data_path: str, output_path: str):
29
- imatrix_command = [
30
- "./llama.cpp/llama-imatrix",
31
- "-m", model_path,
32
- "-f", train_data_path,
33
- "-ngl", "99",
34
- "--output-frequency", "10",
35
- "-o", output_path,
36
- ]
37
-
38
- if not os.path.isfile(model_path):
39
- raise Exception(f"Model file not found: {model_path}")
40
-
41
- print("Running imatrix command...")
42
- process = subprocess.Popen(imatrix_command, shell=False)
43
-
44
- try:
45
- process.wait(timeout=60) # added wait
46
- except subprocess.TimeoutExpired:
47
- print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
48
- process.send_signal(signal.SIGINT)
49
  try:
50
- process.wait(timeout=5) # grace period
51
  except subprocess.TimeoutExpired:
52
- print("Imatrix proc still didn't term. Forecfully terming process...")
53
- process.kill()
54
-
55
- print("Importance matrix generation completed.")
56
-
57
- def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
58
- print(f"Model path: {model_path}")
59
- print(f"Output dir: {outdir}")
60
-
61
- if oauth_token is None or oauth_token.token is None:
62
- raise ValueError("You have to be logged in.")
63
-
64
- split_cmd = [
65
- "./llama.cpp/llama-gguf-split",
66
- "--split",
67
- ]
68
- if split_max_size:
69
- split_cmd.append("--split-max-size")
70
- split_cmd.append(split_max_size)
71
- else:
72
- split_cmd.append("--split-max-tensors")
73
- split_cmd.append(str(split_max_tensors))
74
-
75
- # args for output
76
- model_path_prefix = '.'.join(model_path.split('.')[:-1]) # remove the file extension
77
- split_cmd.append(model_path)
78
- split_cmd.append(model_path_prefix)
79
-
80
- print(f"Split command: {split_cmd}")
81
-
82
- result = subprocess.run(split_cmd, shell=False, capture_output=True, text=True)
83
- print(f"Split command stdout: {result.stdout}")
84
- print(f"Split command stderr: {result.stderr}")
85
-
86
- if result.returncode != 0:
87
- stderr_str = result.stderr.decode("utf-8")
88
- raise Exception(f"Error splitting the model: {stderr_str}")
89
- print("Model split successfully!")
90
-
91
- # remove the original model file if needed
92
- if os.path.exists(model_path):
93
- os.remove(model_path)
94
-
95
- model_file_prefix = model_path_prefix.split('/')[-1]
96
- print(f"Model file name prefix: {model_file_prefix}")
97
- sharded_model_files = [f for f in os.listdir(outdir) if f.startswith(model_file_prefix) and f.endswith(".gguf")]
98
- if sharded_model_files:
99
  print(f"Sharded model files: {sharded_model_files}")
100
- api = HfApi(token=oauth_token.token)
101
  for file in sharded_model_files:
102
- file_path = os.path.join(outdir, file)
103
- print(f"Uploading file: {file_path}")
104
  try:
105
- api.upload_file(
106
- path_or_fileobj=file_path,
107
- path_in_repo=file,
108
- repo_id=repo_id,
109
- )
110
  except Exception as e:
111
- raise Exception(f"Error uploading file {file_path}: {e}")
112
- else:
113
- raise Exception("No sharded files found.")
114
-
115
- print("Sharded model has been uploaded successfully!")
116
-
117
- def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
118
- if oauth_token is None or oauth_token.token is None:
119
- raise gr.Error("You must be logged in to use GGUF-my-repo")
120
-
121
- # validate the oauth token
122
- try:
123
- whoami(oauth_token.token)
124
- except Exception as e:
125
- raise gr.Error("You must be logged in to use GGUF-my-repo")
126
-
127
- model_name = model_id.split('/')[-1]
128
-
129
- try:
130
- api = HfApi(token=oauth_token.token)
131
-
132
- dl_pattern = ["*.md", "*.json", "*.model"]
133
-
134
- pattern = (
135
- "*.safetensors"
136
- if any(
137
- file.path.endswith(".safetensors")
138
- for file in api.list_repo_tree(
139
- repo_id=model_id,
140
- recursive=True,
141
  )
 
142
  )
143
- else "*.bin"
144
  )
 
 
 
145
 
146
- dl_pattern += [pattern]
147
-
148
- if not os.path.exists("downloads"):
149
- os.makedirs("downloads")
150
-
151
- if not os.path.exists("outputs"):
152
- os.makedirs("outputs")
153
-
154
- with tempfile.TemporaryDirectory(dir="outputs") as outdir:
155
- fp16 = str(Path(outdir)/f"{model_name}.fp16.gguf")
156
-
157
- with tempfile.TemporaryDirectory(dir="downloads") as tmpdir:
158
- # Keep the model name as the dirname so the model name metadata is populated correctly
159
- local_dir = Path(tmpdir)/model_name
160
- print(local_dir)
161
- api.snapshot_download(repo_id=model_id, local_dir=local_dir, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
162
- print("Model downloaded successfully!")
163
- print(f"Current working directory: {os.getcwd()}")
164
- print(f"Model directory contents: {os.listdir(local_dir)}")
165
-
166
- config_dir = local_dir/"config.json"
167
- adapter_config_dir = local_dir/"adapter_config.json"
168
- if os.path.exists(adapter_config_dir) and not os.path.exists(config_dir):
169
- raise Exception('adapter_config.json is present.<br/><br/>If you are converting a LoRA adapter to GGUF, please use <a href="https://huggingface.co/spaces/ggml-org/gguf-my-lora" target="_blank" style="text-decoration:underline">GGUF-my-lora</a>.')
170
-
171
- result = subprocess.run([
172
- "python", CONVERSION_SCRIPT, local_dir, "--outtype", "f16", "--outfile", fp16
173
- ], shell=False, capture_output=True)
174
- print(result)
175
- if result.returncode != 0:
176
- stderr_str = result.stderr.decode("utf-8")
177
- raise Exception(f"Error converting to fp16: {stderr_str}")
178
- print("Model converted to fp16 successfully!")
179
- print(f"Converted model path: {fp16}")
180
-
181
- imatrix_path = Path(outdir)/"imatrix.dat"
182
-
183
- if use_imatrix:
184
- if train_data_file:
185
- train_data_path = train_data_file.name
186
- else:
187
- train_data_path = "llama.cpp/groups_merged.txt" #fallback calibration dataset
188
-
189
- print(f"Training data file path: {train_data_path}")
190
-
191
- if not os.path.isfile(train_data_path):
192
- raise Exception(f"Training data file not found: {train_data_path}")
193
-
194
- generate_importance_matrix(fp16, train_data_path, imatrix_path)
195
- else:
196
- print("Not using imatrix quantization.")
197
-
198
- # Quantize the model
199
- quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
200
- quantized_gguf_path = str(Path(outdir)/quantized_gguf_name)
201
- if use_imatrix:
202
- quantise_ggml = [
203
- "./llama.cpp/llama-quantize",
204
- "--imatrix", imatrix_path, fp16, quantized_gguf_path, imatrix_q_method
205
- ]
206
- else:
207
- quantise_ggml = [
208
- "./llama.cpp/llama-quantize",
209
- fp16, quantized_gguf_path, q_method
210
- ]
211
- result = subprocess.run(quantise_ggml, shell=False, capture_output=True)
212
- if result.returncode != 0:
213
- stderr_str = result.stderr.decode("utf-8")
214
- raise Exception(f"Error quantizing: {stderr_str}")
215
- print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
216
- print(f"Quantized model path: {quantized_gguf_path}")
217
-
218
- # Create empty repo
219
- username = whoami(oauth_token.token)["name"]
220
- new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
221
- new_repo_id = new_repo_url.repo_id
222
- print("Repo created successfully!", new_repo_url)
223
 
224
  try:
225
- card = ModelCard.load(model_id, token=oauth_token.token)
226
- except:
227
- card = ModelCard("")
228
- if card.data.tags is None:
229
- card.data.tags = []
230
- card.data.tags.append("llama-cpp")
231
- card.data.tags.append("gguf-my-repo")
232
- card.data.base_model = model_id
233
- card.text = dedent(
234
- f"""
235
- # {new_repo_id}
236
- This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
237
- Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
238
-
239
- ## Use with llama.cpp
240
- Install llama.cpp through brew (works on Mac and Linux)
241
-
242
- ```bash
243
- brew install llama.cpp
244
-
245
- ```
246
- Invoke the llama.cpp server or the CLI.
247
-
248
- ### CLI:
249
- ```bash
250
- llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
251
- ```
252
-
253
- ### Server:
254
- ```bash
255
- llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
256
- ```
257
-
258
- Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
259
-
260
- Step 1: Clone llama.cpp from GitHub.
261
- ```
262
- git clone https://github.com/ggerganov/llama.cpp
263
- ```
264
-
265
- Step 2: Move into the llama.cpp folder and build it with `LLAMA_CURL=1` flag along with other hardware-specific flags (for ex: LLAMA_CUDA=1 for Nvidia GPUs on Linux).
266
- ```
267
- cd llama.cpp && LLAMA_CURL=1 make
268
- ```
269
-
270
- Step 3: Run inference through the main binary.
271
- ```
272
- ./llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
273
- ```
274
- or
275
- ```
276
- ./llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
277
- ```
278
- """
279
  )
280
- readme_path = Path(outdir)/"README.md"
281
- card.save(readme_path)
282
 
283
- if split_model:
284
- split_upload_model(str(quantized_gguf_path), outdir, new_repo_id, oauth_token, split_max_tensors, split_max_size)
285
- else:
286
- try:
287
- print(f"Uploading quantized model: {quantized_gguf_path}")
288
- api.upload_file(
289
- path_or_fileobj=quantized_gguf_path,
290
- path_in_repo=quantized_gguf_name,
291
- repo_id=new_repo_id,
292
- )
293
- except Exception as e:
294
- raise Exception(f"Error uploading quantized model: {e}")
295
-
296
- if os.path.isfile(imatrix_path):
297
- try:
298
- print(f"Uploading imatrix.dat: {imatrix_path}")
299
- api.upload_file(
300
- path_or_fileobj=imatrix_path,
301
- path_in_repo="imatrix.dat",
302
- repo_id=new_repo_id,
303
- )
304
- except Exception as e:
305
- raise Exception(f"Error uploading imatrix.dat: {e}")
306
 
307
- api.upload_file(
308
- path_or_fileobj=readme_path,
309
- path_in_repo="README.md",
310
- repo_id=new_repo_id,
311
  )
312
- print(f"Uploaded successfully with {imatrix_q_method if use_imatrix else q_method} option!")
313
 
314
- # end of the TemporaryDirectory(dir="outputs") block; temporary outputs are deleted here
 
 
 
 
315
 
316
- return (
317
- f'<h1>✅ DONE</h1><br/>Find your repo here: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{new_repo_id}</a>',
318
- "llama.png",
319
- )
320
- except Exception as e:
321
- return (f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{escape(str(e))}</pre>', "error.png")
322
-
323
-
324
- css="""/* Custom CSS to allow scrolling */
325
- .gradio-container {overflow-y: auto;}
326
- """
327
- model_id = HuggingfaceHubSearch(
328
- label="Hub Model ID",
329
- placeholder="Search for model id on Huggingface",
330
- search_type="model",
331
- )
332
-
333
- q_method = gr.Dropdown(
334
- ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
335
- label="Quantization Method",
336
- info="GGML quantization type",
337
- value="Q4_K_M",
338
- filterable=False,
339
- visible=True
340
- )
341
-
342
- imatrix_q_method = gr.Dropdown(
343
- ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
344
- label="Imatrix Quantization Method",
345
- info="GGML imatrix quants type",
346
- value="IQ4_NL",
347
- filterable=False,
348
- visible=False
349
- )
350
-
351
- use_imatrix = gr.Checkbox(
352
- value=False,
353
- label="Use Imatrix Quantization",
354
- info="Use importance matrix for quantization."
355
- )
356
-
357
- private_repo = gr.Checkbox(
358
- value=False,
359
- label="Private Repo",
360
- info="Create a private repo under your username."
361
- )
362
-
363
- train_data_file = gr.File(
364
- label="Training Data File",
365
- file_types=["txt"],
366
- visible=False
367
- )
368
-
369
- split_model = gr.Checkbox(
370
- value=False,
371
- label="Split Model",
372
- info="Shard the model using gguf-split."
373
- )
374
-
375
- split_max_tensors = gr.Number(
376
- value=256,
377
- label="Max Tensors per File",
378
- info="Maximum number of tensors per file when splitting model.",
379
- visible=False
380
- )
381
-
382
- split_max_size = gr.Textbox(
383
- label="Max File Size",
384
- info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default. Accepted suffixes: M, G. Example: 256M, 5G",
385
- visible=False
386
- )
387
-
388
- iface = gr.Interface(
389
- fn=process_model,
390
- inputs=[
391
- model_id,
392
- q_method,
393
- use_imatrix,
394
- imatrix_q_method,
395
- private_repo,
396
- train_data_file,
397
- split_model,
398
- split_max_tensors,
399
- split_max_size,
400
- ],
401
- outputs=[
402
- gr.Markdown(label="output"),
403
- gr.Image(show_label=False),
404
- ],
405
- title="Create your own GGUF Quants, blazingly fast ⚡!",
406
- description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.",
407
- api_name=False
408
- )
409
-
410
- # Create Gradio interface
411
- with gr.Blocks(css=css) as demo:
412
- gr.Markdown("You must be logged in to use GGUF-my-repo.")
413
- gr.LoginButton(min_width=250)
414
-
415
- iface.render()
416
-
417
- def update_split_visibility(split_model):
418
- return gr.update(visible=split_model), gr.update(visible=split_model)
419
-
420
- split_model.change(
421
- fn=update_split_visibility,
422
- inputs=split_model,
423
- outputs=[split_max_tensors, split_max_size]
424
- )
425
-
426
- def update_visibility(use_imatrix):
427
- return gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)
428
-
429
- use_imatrix.change(
430
- fn=update_visibility,
431
- inputs=use_imatrix,
432
- outputs=[q_method, imatrix_q_method, train_data_file]
433
- )
434
-
435
- def restart_space():
436
- HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)
437
-
438
- scheduler = BackgroundScheduler()
439
- scheduler.add_job(restart_space, "interval", seconds=21600)
440
- scheduler.start()
441
-
442
- # Launch the interface
443
- demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)
 
1
  import os
2
  import subprocess
3
  import signal
 
 
4
  import tempfile
5
+ from pathlib import Path
6
+ from textwrap import dedent
7
+ from typing import Optional, Tuple, List, Union
8
+ from dataclasses import dataclass, field
9
 
10
+ os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
11
+
12
+ import gradio as gr
13
  from huggingface_hub import HfApi, ModelCard, whoami
14
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
 
 
15
  from apscheduler.schedulers.background import BackgroundScheduler
16
 
17
 
18
+ @dataclass
19
+ class QuantizationConfig:
20
+ """Configuration for model quantization."""
21
+ method: str
22
+ use_imatrix: bool = False
23
+ imatrix_method: str = "IQ4_NL"
24
+ train_data: str = ""
25
+ quant_embedding: bool = False
26
+ embedding_tensor_method: str = "Q8_0"
27
+ leave_output: bool = False
28
+ quant_output: bool = False
29
+ output_tensor_method: str = "Q8_0"
30
+ # Generated values - These will be set during processing
31
+ fp16_model: str = field(default="", init=False)
32
+ quantized_gguf: str = field(default="", init=False)
33
+ imatrix_file: str = field(default="", init=False)
34
+
35
+
36
+ @dataclass
37
+ class SplitConfig:
38
+ """Configuration for model splitting."""
39
+ enabled: bool = False
40
+ max_tensors: int = 256
41
+ max_size: Optional[str] = None
42
+
43
+
44
+ @dataclass
45
+ class OutputConfig:
46
+ """Configuration for output settings."""
47
+ private_repo: bool = False
48
+ repo_name: str = ""
49
+ filename: str = ""
50
+
51
+
52
+ @dataclass
53
+ class ModelProcessingConfig:
54
+ """Configuration for the entire model processing pipeline."""
55
+ token: str
56
+ model_id: str
57
+ model_name: str
58
+ outdir: str
59
+ quant_config: QuantizationConfig
60
+ split_config: SplitConfig
61
+ output_config: OutputConfig
62
+ # Generated values - These will be set during processing
63
+ new_repo_url: str = field(default="", init=False)
64
+ new_repo_id: str = field(default="", init=False)
65
+
66
+
67
+ class GGUFConverterError(Exception):
68
+ """Custom exception for GGUF conversion errors."""
69
+ pass
70
+
71
+
72
+ class HuggingFaceModelProcessor:
73
+ """Handles the processing of Hugging Face models to GGUF format."""
74
+
75
+ ERROR_LOGIN = "You must be logged in to use GGUF-my-repo."
76
+ DOWNLOAD_FOLDER = "./downloads"
77
+ OUTPUT_FOLDER = "./outputs"
78
+ CALIBRATION_FILE = "calibration_data_v5_rc.txt"
79
+
80
+ QUANTIZE_TIMEOUT=86400
81
+ HF_TO_GGUF_TIMEOUT=3600
82
+ IMATRIX_TIMEOUT=86400
83
+ SPLIT_TIMEOUT=3600
84
+ KILL_TIMEOUT=5
85
+
86
+ def __init__(self):
87
+ self.SPACE_ID = os.environ.get("SPACE_ID", "")
88
+ self.SPACE_URL = f"https://{self.SPACE_ID.replace('/', '-')}.hf.space/" if self.SPACE_ID else "http://localhost:7860/"
89
+ self.HF_TOKEN = os.environ.get("HF_TOKEN")
90
+ self.RUN_LOCALLY = os.environ.get("RUN_LOCALLY")
91
+
92
+ # Create necessary folders
93
+ self._create_folder(self.DOWNLOAD_FOLDER)
94
+ self._create_folder(self.OUTPUT_FOLDER)
95
+
96
+ def _create_folder(self, folder_name: str) -> str:
97
+ """Create a folder if it doesn't exist."""
98
+ if not os.path.exists(folder_name):
99
+ print(f"Creating folder: {folder_name}")
100
+ os.makedirs(folder_name)
101
+ return folder_name
102
+
103
+ def _validate_token(self, oauth_token: Optional[gr.OAuthToken]) -> str:
104
+ """Validate the OAuth token and return the token string."""
105
+ if oauth_token is None or oauth_token.token is None:
106
+ raise GGUFConverterError(self.ERROR_LOGIN)
107
+
108
  try:
109
+ whoami(oauth_token.token)
110
+ return oauth_token.token
111
+ except Exception as e:
112
+ raise GGUFConverterError(self.ERROR_LOGIN)
113
+
114
+ def _escape_html(self, s: str) -> str:
115
+ """Escape HTML characters for safe display."""
116
+ replacements = [
117
+ ("&", "&amp;"),
118
+ ("<", "&lt;"),
119
+ (">", "&gt;"),
120
+ ('"', "&quot;"),
121
+ ("\n", "<br/>")
122
+ ]
123
+ for old, new in replacements:
124
+ s = s.replace(old, new)
125
+ return s
126
+
127
+ def _get_model_creator(self, model_id: str) -> str:
128
+ """Extract model creator from model ID."""
129
+ return model_id.split('/')[0]
130
+
131
+ def _get_model_name(self, model_id: str) -> str:
132
+ """Extract model name from model ID."""
133
+ return model_id.split('/')[-1]
134
+
135
+ def _upload_file(self, processing_config: ModelProcessingConfig, path_or_fileobj: str, path_in_repo: str) -> None:
136
+ """Upload a file to Hugging Face repository."""
137
+ if self.RUN_LOCALLY == "1":
138
+ print("Skipping upload...")
139
+ return
140
+
141
+ api = HfApi(token=processing_config.token)
142
+ api.upload_file(
143
+ path_or_fileobj=path_or_fileobj,
144
+ path_in_repo=path_in_repo,
145
+ repo_id=processing_config.new_repo_id,
146
+ )
147
+
148
+ def _generate_importance_matrix(self, quant_config: QuantizationConfig) -> None:
149
+ """Generate importance matrix for quantization."""
150
+ if not os.path.isfile(quant_config.fp16_model):
151
+ raise GGUFConverterError(f"Model file not found: {quant_config.fp16_model}")
152
+
153
+ if quant_config.train_data:
154
+ train_data_path = quant_config.train_data
155
+ else:
156
+ train_data_path = self.CALIBRATION_FILE
157
+
158
+ if not os.path.isfile(train_data_path):
159
+ raise GGUFConverterError(f"Training data file not found: {train_data_path}")
160
+
161
+ print(f"Training data file path: {train_data_path}")
162
+ print("Running imatrix command...")
163
+
164
+ imatrix_command = [
165
+ "llama-imatrix",
166
+ "-m", quant_config.fp16_model,
167
+ "-f", train_data_path,
168
+ "-ngl", "99",
169
+ "--output-frequency", "10",
170
+ "-o", quant_config.imatrix_file,
171
+ ]
172
+
173
+ process = subprocess.Popen(imatrix_command, shell=False, stderr=subprocess.STDOUT)
174
+ try:
175
+ process.wait(timeout=self.IMATRIX_TIMEOUT)
176
+ except subprocess.TimeoutExpired:
177
+ print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
178
+ process.send_signal(signal.SIGINT)
179
+ try:
180
+ process.wait(timeout=self.KILL_TIMEOUT)
181
+ except subprocess.TimeoutExpired:
182
+ print("Imatrix proc still didn't term. Forcefully terminating process...")
183
+ process.kill()
184
+ raise GGUFConverterError("Error generating imatrix: Operation timed out.")
185
+
186
+ if process.returncode != 0:
187
+ raise GGUFConverterError(f"Error generating imatrix: code={process.returncode}.")
188
+
189
+ print(f"Importance matrix generation completed: {os.path.abspath(quant_config.imatrix_file)}")
190
+
191
+ def _split_and_upload_model(self, processing_config: ModelProcessingConfig) -> None:
192
+ """Split large model files and upload shards."""
193
+ quant_config = processing_config.quant_config
194
+ split_config = processing_config.split_config
195
+
196
+ print(f"Model path: {quant_config.quantized_gguf}")
197
+ print(f"Output dir: {processing_config.outdir}")
198
+
199
+ split_cmd = ["llama-gguf-split", "--split"]
200
+
201
+ if split_config.max_size:
202
+ split_cmd.extend(["--split-max-size", split_config.max_size])
203
+ else:
204
+ split_cmd.extend(["--split-max-tensors", str(split_config.max_tensors)])
205
+
206
+ model_path_prefix = '.'.join(quant_config.quantized_gguf.split('.')[:-1])
207
+ split_cmd.extend([quant_config.quantized_gguf, model_path_prefix])
208
+
209
+ print(f"Split command: {split_cmd}")
210
+ process = subprocess.Popen(split_cmd, shell=False, stderr=subprocess.STDOUT)
211
+ try:
212
+ process.wait(timeout=self.SPLIT_TIMEOUT)
213
  except subprocess.TimeoutExpired:
214
+ print("Splitting timed out. Sending SIGINT to allow graceful termination...")
215
+ process.send_signal(signal.SIGINT)
216
+ try:
217
+ process.wait(timeout=self.KILL_TIMEOUT)
218
+ except subprocess.TimeoutExpired:
219
+ print("Splitting timed out. Killing process...")
220
+ process.kill()
221
+ raise GGUFConverterError("Error splitting the model: Operation timed out.")
222
+
223
+ if process.returncode != 0:
224
+ raise GGUFConverterError(f"Error splitting the model: code={process.returncode}")
225
+
226
+ print("Model split successfully!")
227
+
228
+ # Remove original model file
229
+ if os.path.exists(quant_config.quantized_gguf):
230
+ os.remove(quant_config.quantized_gguf)
231
+
232
+ model_file_prefix = model_path_prefix.split('/')[-1]
233
+ print(f"Model file name prefix: {model_file_prefix}")
234
+
235
+ sharded_model_files = [
236
+ f for f in os.listdir(processing_config.outdir)
237
+ if f.startswith(model_file_prefix) and f.endswith(".gguf")
238
+ ]
239
+
240
+ if not sharded_model_files:
241
+ raise GGUFConverterError("No sharded files found.")
242
+
243
  print(f"Sharded model files: {sharded_model_files}")
244
+
245
  for file in sharded_model_files:
246
+ file_path = os.path.join(processing_config.outdir, file)
 
247
  try:
248
+ print(f"Uploading file: {file_path}")
249
+ self._upload_file(processing_config, file_path, file)
 
 
 
250
  except Exception as e:
251
+ raise GGUFConverterError(f"Error uploading file {file_path}: {e}")
252
+
253
+ print("Sharded model has been uploaded successfully!")
254
+
255
+ def _download_base_model(self, processing_config: ModelProcessingConfig) -> str:
256
+ """Download and convert Hugging Face model to GGUF FP16 format."""
257
+ print(f"Downloading model {processing_config.model_name}")
258
+
259
+ if os.path.exists(processing_config.quant_config.fp16_model):
260
+ print("Skipping fp16 conversion...")
261
+ print(f"Converted model path: {os.path.abspath(processing_config.quant_config.fp16_model)}")
262
+ return processing_config.quant_config.fp16_model
263
+
264
+ with tempfile.TemporaryDirectory(dir=self.DOWNLOAD_FOLDER) as tmpdir:
265
+ local_dir = f"{Path(tmpdir)}/{processing_config.model_name}"
266
+ print(f"Local directory: {os.path.abspath(local_dir)}")
267
+
268
+ # Download model
269
+ api = HfApi(token=processing_config.token)
270
+ pattern = (
271
+ "*.safetensors"
272
+ if any(
273
+ file.path.endswith(".safetensors")
274
+ for file in api.list_repo_tree(
275
+ repo_id=processing_config.model_id,
276
+ recursive=True,
277
+ )
 
 
 
278
  )
279
+ else "*.bin"
280
  )
281
+ dl_pattern = ["*.md", "*.json", "*.model"]
282
+ dl_pattern += [pattern]
283
+ api.snapshot_download(repo_id=processing_config.model_id, local_dir=local_dir, allow_patterns=dl_pattern)
284
+ print("Model downloaded successfully!")
285
+ print(f"Model directory contents: {os.listdir(local_dir)}")
286
+
287
+ config_dir = os.path.join(local_dir, "config.json")
288
+ adapter_config_dir = os.path.join(local_dir, "adapter_config.json")
289
+ if os.path.exists(adapter_config_dir) and not os.path.exists(config_dir):
290
+ raise GGUFConverterError(
291
+ 'adapter_config.json is present.<br/><br/>If you are converting a LoRA adapter to GGUF, '
292
+ 'please use <a href="https://huggingface.co/spaces/ggml-org/gguf-my-lora" target="_blank" '
293
+ 'style="text-decoration:underline">GGUF-my-lora</a>.'
294
+ )
295
+
296
+ # Convert HF to GGUF
297
+ print(f"Converting to GGUF FP16: {os.path.abspath(processing_config.quant_config.fp16_model)}")
298
+ convert_command = [
299
+ "python3", "/app/convert_hf_to_gguf.py", local_dir,
300
+ "--outtype", "f16", "--outfile", processing_config.quant_config.fp16_model
301
+ ]
302
+ process = subprocess.Popen(convert_command, shell=False, stderr=subprocess.STDOUT)
303
+ try:
304
+ process.wait(timeout=self.HF_TO_GGUF_TIMEOUT)
305
+ except subprocess.TimeoutExpired:
306
+ print("Conversion timed out. Sending SIGINT to allow graceful termination...")
307
+ process.send_signal(signal.SIGINT)
308
+ try:
309
+ process.wait(timeout=self.KILL_TIMEOUT)
310
+ except subprocess.TimeoutExpired:
311
+ print("Conversion timed out. Killing process...")
312
+ process.kill()
313
+ raise GGUFConverterError("Error converting to fp16: Operation timed out.")
314
+
315
+ if process.returncode != 0:
316
+ raise GGUFConverterError(f"Error converting to fp16: code={process.returncode}")
317
+
318
+ print("Model converted to fp16 successfully!")
319
+ print(f"Converted model path: {os.path.abspath(processing_config.quant_config.fp16_model)}")
320
+ return processing_config.quant_config.fp16_model
321
+
322
+ def _quantize_model(self, quant_config: QuantizationConfig) -> str:
323
+ """Quantize the GGUF model."""
324
+ quantize_cmd = ["llama-quantize"]
325
+
326
+ if quant_config.quant_embedding:
327
+ quantize_cmd.extend(["--token-embedding-type", quant_config.embedding_tensor_method])
328
+
329
+ if quant_config.leave_output:
330
+ quantize_cmd.append("--leave-output-tensor")
331
+ else:
332
+ if quant_config.quant_output:
333
+ quantize_cmd.extend(["--output-tensor-type", quant_config.output_tensor_method])
334
+
335
+ # Set imatrix file path if needed
336
+ if quant_config.use_imatrix:
337
+ self._generate_importance_matrix(quant_config)
338
+ quantize_cmd.extend(["--imatrix", quant_config.imatrix_file])
339
+ else:
340
+ print("Not using imatrix quantization.")
341
+
342
+ quantize_cmd.append(quant_config.fp16_model)
343
+ quantize_cmd.append(quant_config.quantized_gguf)
344
+
345
+ if quant_config.use_imatrix:
346
+ quantize_cmd.append(quant_config.imatrix_method)
347
+ else:
348
+ quantize_cmd.append(quant_config.method)
349
+
350
+ print(f"Quantizing model with {quantize_cmd}")
351
+
352
+ # Use Popen for quantization
353
+ process = subprocess.Popen(quantize_cmd, shell=False, stderr=subprocess.STDOUT)
354
+ try:
355
+ process.wait(timeout=self.QUANTIZE_TIMEOUT)
356
+ except subprocess.TimeoutExpired:
357
+ print("Quantization timed out. Sending SIGINT to allow graceful termination...")
358
+ process.send_signal(signal.SIGINT)
359
+ try:
360
+ process.wait(timeout=self.KILL_TIMEOUT)
361
+ except subprocess.TimeoutExpired:
362
+ print("Quantization timed out. Killing process...")
363
+ process.kill()
364
+ raise GGUFConverterError("Error quantizing: Operation timed out.")
365
+
366
+ if process.returncode != 0:
367
+ raise GGUFConverterError(f"Error quantizing: code={process.returncode}")
368
+
369
+ print(f"Quantized successfully with {quant_config.imatrix_method if quant_config.use_imatrix else quant_config.method} option!")
370
+ print(f"Quantized model path: {os.path.abspath(quant_config.quantized_gguf)}")
371
+ return quant_config.quantized_gguf
372
+
373
+ def _create_empty_repo(self, processing_config: ModelProcessingConfig):
374
+ api = HfApi(token=processing_config.token)
375
+ new_repo_url = api.create_repo(
376
+ repo_id=processing_config.output_config.repo_name,
377
+ exist_ok=True,
378
+ private=processing_config.output_config.private_repo
379
  )
380
+ processing_config.new_repo_url = new_repo_url.url
381
+ processing_config.new_repo_id = new_repo_url.repo_id
382
+ print("Repo created successfully!", processing_config.new_repo_url)
383
 
384
+ return new_repo_url
385
+
386
+ def _generate_readme(self, processing_config: ModelProcessingConfig) -> str:
387
+ """Generate README.md for the quantized model."""
388
+ creator = self._get_model_creator(processing_config.model_id)
389
+ username = whoami(processing_config.token)["name"]
390
 
391
+ try:
392
+ card = ModelCard.load(processing_config.model_id, token=processing_config.token)
393
+ except:
394
+ card = ModelCard("")
395
+
396
+ if card.data.tags is None:
397
+ card.data.tags = []
398
+ card.data.tags.extend(["llama-cpp", "gguf-my-repo"])
399
+ card.data.base_model = processing_config.model_id
400
+
401
+ card.text = dedent(
402
+ f"""
403
+ # {processing_config.model_name}
404
+ **Model creator:** [{creator}](https://huggingface.co/{creator})<br/>
405
+ **Original model**: [{processing_config.model_id}](https://huggingface.co/{processing_config.model_id})<br/>
406
+ **GGUF quantization:** provided by [{username}](https://huggingface.co/{username}) using `llama.cpp`<br/>
407
+ ## Special thanks
408
+ 🙏 Special thanks to [Georgi Gerganov](https://github.com/ggerganov) and the whole team working on [llama.cpp](https://github.com/ggerganov/llama.cpp/) for making all of this possible.
409
+ ## Use with Ollama
410
+ ```bash
411
+ ollama run "hf.co/{processing_config.new_repo_id}:<quantization>"
412
+ ```
413
+ ## Use with LM Studio
414
+ ```bash
415
+ lms load "{processing_config.new_repo_id}"
416
+ ```
417
+ ## Use with llama.cpp CLI
418
+ ```bash
419
+ llama-cli --hf-repo "{processing_config.new_repo_id}" --hf-file "{processing_config.output_config.filename}" -p "The meaning to life and the universe is"
420
+ ```
421
+ ## Use with llama.cpp Server:
422
+ ```bash
423
+ llama-server --hf-repo "{processing_config.new_repo_id}" --hf-file "{processing_config.output_config.filename}" -c 4096
424
+ ```
425
+ """
426
+ )
427
+
428
+ readme_path = f"{processing_config.outdir}/README.md"
429
+ card.save(readme_path)
430
+ return readme_path
431
+
432
+ def process_model(self, processing_config: ModelProcessingConfig) -> None:
433
+ """Main method to process a model through the entire pipeline."""
434
+ quant_config = processing_config.quant_config
435
+ split_config = processing_config.split_config
436
+ output_config = processing_config.output_config
437
+
438
+ print(f"Current working directory: {os.path.abspath(os.getcwd())}")
439
+
440
+ # Download and convert base model
441
+ self._download_base_model(processing_config)
442
+
443
+ # Quantize the model
444
+ self._quantize_model(quant_config)
445
+
446
+ # Create empty repo
447
+ self._create_empty_repo(processing_config)
448
+
449
+ # Upload model
450
+ if split_config.enabled:
451
+ print(f"Splitting quantized model: {os.path.abspath(quant_config.quantized_gguf)}")
452
+ self._split_and_upload_model(processing_config)
453
+ else:
454
+ try:
455
+ print(f"Uploading quantized model: {os.path.abspath(quant_config.quantized_gguf)}")
456
+ self._upload_file(processing_config, quant_config.quantized_gguf, output_config.filename)
457
+ except Exception as e:
458
+ raise GGUFConverterError(f"Error uploading quantized model: {e}")
459
+
460
+ # Upload imatrix if it exists
461
+ if quant_config.use_imatrix and os.path.isfile(quant_config.imatrix_file):
462
  try:
463
+ print(f"Uploading imatrix.dat: {os.path.abspath(quant_config.imatrix_file)}")
464
+ self._upload_file(processing_config, quant_config.imatrix_file, f"{processing_config.model_name}-imatrix.gguf")
465
+ except Exception as e:
466
+ raise GGUFConverterError(f"Error uploading imatrix.dat: {e}")
467
+
468
+ # Upload README.md
469
+ readme_path = self._generate_readme(processing_config)
470
+ self._upload_file(processing_config, readme_path, "README.md")
471
+
472
+ print(f"Uploaded successfully with {quant_config.imatrix_method if quant_config.use_imatrix else quant_config.method} option!")
473
+
474
+
475
+ class GGUFConverterUI:
476
+ """Gradio UI for the GGUF Converter."""
477
+
478
+ def __init__(self):
479
+ self.processor = HuggingFaceModelProcessor()
480
+ self.css = """/* Custom CSS to allow scrolling */
481
+ .gradio-container {overflow-y: auto;}
482
+ """
483
+
484
+ # Initialize components
485
+ self._initialize_components()
486
+ self._setup_interface()
487
+
488
+ def _initialize_components(self):
489
+ """Initialize all UI components."""
490
+ #####
491
+ # Base model section
492
+ #####
493
+ self.model_id = HuggingfaceHubSearch(
494
+ label="Hub Model ID",
495
+ placeholder="Search for model id on Huggingface",
496
+ search_type="model",
497
+ )
498
+
499
+ #####
500
+ # Quantization section
501
+ #####
502
+ self.use_imatrix = gr.Checkbox(
503
+ value=False,
504
+ label="Use Imatrix Quantization",
505
+ info="Use importance matrix for quantization."
506
+ )
507
+ self.q_method = gr.Dropdown(
508
+ choices=["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0", "F16", "BF16"],
509
+ label="Quantization Method",
510
+ info="GGML quantization type",
511
+ value="Q4_K_M",
512
+ filterable=False,
513
+ visible=True
514
+ )
515
+ self.imatrix_q_method = gr.Dropdown(
516
+ choices=["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
517
+ label="Imatrix Quantization Method",
518
+ info="GGML imatrix quants type",
519
+ value="IQ4_NL",
520
+ filterable=False,
521
+ visible=False
522
+ )
523
+ self.train_data_file = gr.File(
524
+ label="Training Data File",
525
+ file_types=[".txt"],
526
+ visible=False
527
+ )
528
+
529
+ #####
530
+ # Advanced Options section
531
+ #####
532
+ self.split_model = gr.Checkbox(
533
+ value=False,
534
+ label="Split Model",
535
+ info="Shard the model using gguf-split."
536
+ )
537
+ self.split_max_tensors = gr.Number(
538
+ value=256,
539
+ label="Max Tensors per File",
540
+ info="Maximum number of tensors per file when splitting model.",
541
+ visible=False
542
+ )
543
+ self.split_max_size = gr.Textbox(
544
+ label="Max File Size",
545
+ info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default. Accepted suffixes: M, G. Example: 256M, 5G",
546
+ visible=False
547
+ )
548
+ self.leave_output = gr.Checkbox(
549
+ value=False,
550
+ label="Leave output tensor",
551
+ info="Leaves output.weight un(re)quantized"
552
+ )
553
+ self.quant_embedding = gr.Checkbox(
554
+ value=False,
555
+ label="Quant embeddings tensor",
556
+ info="Quantize embeddings tensor separately"
557
+ )
558
+ self.embedding_tensor_method = gr.Dropdown(
559
+ choices=["Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0"],
560
+ label="Embeddings Quantization Method",
561
+ info="use a specific quant type for the token embeddings tensor",
562
+ value="Q8_0",
563
+ filterable=False,
564
+ visible=False
565
+ )
566
+ self.quant_output = gr.Checkbox(
567
+ value=False,
568
+ label="Quant output tensor",
569
+ info="Quantize output tensor separately"
570
+ )
571
+ self.output_tensor_method = gr.Dropdown(
572
+ choices=["Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0"],
573
+ label="Output Quantization Method",
574
+ info="use a specific quant type for the output.weight tensor",
575
+ value="Q8_0",
576
+ filterable=False,
577
+ visible=False
578
+ )
579
+
580
+ #####
581
+ # Output Settings section
582
+ #####
583
+ self.private_repo = gr.Checkbox(
584
+ value=False,
585
+ label="Private Repo",
586
+ info="Create a private repo under your username."
587
+ )
588
+ self.repo_name = gr.Textbox(
589
+ label="Output Repository Name",
590
+ info="Set your repository name",
591
+ max_lines=1
592
+ )
593
+ self.gguf_name = gr.Textbox(
594
+ label="Output File Name",
595
+ info="Set output file name",
596
+ max_lines=1
597
+ )
598
+
599
+ #####
600
+ # Buttons section
601
+ #####
602
+ self.clear_btn = gr.ClearButton(
603
+ value="Clear",
604
+ variant="secondary",
605
+ components=[
606
+ self.model_id,
607
+ self.q_method,
608
+ self.use_imatrix,
609
+ self.imatrix_q_method,
610
+ self.private_repo,
611
+ self.train_data_file,
612
+ self.leave_output,
613
+ self.quant_embedding,
614
+ self.embedding_tensor_method,
615
+ self.quant_output,
616
+ self.output_tensor_method,
617
+ self.split_model,
618
+ self.split_max_tensors,
619
+ self.split_max_size,
620
+ self.repo_name,
621
+ self.gguf_name,
622
+ ]
623
+ )
624
+ self.submit_btn = gr.Button(
625
+ value="Submit",
626
+ variant="primary"
627
+ )
628
+
629
+ #####
630
+ # Outputs section
631
+ #####
632
+ self.output_label = gr.Markdown(label="output")
633
+ self.output_image = gr.Image(
634
+ show_label=False,
635
+ show_download_button=False,
636
+ interactive=False
637
+ )
638
+
639
+ @staticmethod
640
+ def _update_output_repo(model_id: str, oauth_token: Optional[gr.OAuthToken]) -> str:
641
+ """Update output repository name based on model and user."""
642
+ if oauth_token is None or not oauth_token.token:
643
+ return ""
644
+ if not model_id:
645
+ return ""
646
+ try:
647
+ username = whoami(oauth_token.token)["name"]
648
+ model_name = model_id.split('/')[-1]
649
+ return f"{username}/{model_name}-GGUF"
650
+ except:
651
+ return ""
652
+
653
+ @staticmethod
654
+ def _update_output_filename(model_id: str, use_imatrix: bool, q_method: str, imatrix_q_method: str) -> str:
655
+ """Update output filename based on model and quantization settings."""
656
+ if not model_id:
657
+ return ""
658
+ model_name = model_id.split('/')[-1]
659
+ if use_imatrix:
660
+ return f"{model_name}-{imatrix_q_method.upper()}-imat.gguf"
661
+ return f"{model_name}-{q_method.upper()}.gguf"
662
+
663
+ def _setup_interface(self):
664
+ """Set up the Gradio interface."""
665
+ with gr.Blocks(css=self.css) as self.demo:
666
+ #####
667
+ # Layout
668
+ #####
669
+ gr.Markdown(HuggingFaceModelProcessor.ERROR_LOGIN)
670
+ gr.LoginButton(min_width=250)
671
+ gr.HTML("<h1 style=\"text-aling:center;\">Create your own GGUF Quants!</h1>")
672
+ gr.Markdown(f"The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.<br/>Use via {self.processor.SPACE_URL}")
673
+
674
+ with gr.Row():
675
+ with gr.Column() as inputs:
676
+ gr.Markdown("### Model Configuration")
677
+ self.model_id.render()
678
+ with gr.Column():
679
+ self.use_imatrix.render()
680
+ self.q_method.render()
681
+ self.imatrix_q_method.render()
682
+ self.train_data_file.render()
683
+ gr.Markdown("### Advanced Options")
684
+ self.quant_embedding.render()
685
+ self.embedding_tensor_method.render()
686
+ self.leave_output.render()
687
+ self.quant_output.render()
688
+ self.output_tensor_method.render()
689
+ self.split_model.render()
690
+ with gr.Row() as split_options:
691
+ self.split_max_tensors.render()
692
+ self.split_max_size.render()
693
+ gr.Markdown("### Output Settings")
694
+ gr.Markdown("You can customize settings for your GGUF repo.")
695
+ self.private_repo.render()
696
+ with gr.Row():
697
+ self.repo_name.render()
698
+ self.gguf_name.render()
699
+ # Buttons
700
+ with gr.Row() as buttons:
701
+ self.clear_btn.render()
702
+ self.submit_btn.render()
703
+ with gr.Column() as outputs:
704
+ self.output_label.render()
705
+ self.output_image.render()
706
+
707
+ #####
708
+ # Event handlers
709
+ #####
710
+ self.submit_btn.click(
711
+ fn=self._process_model_wrapper,
712
+ inputs=[
713
+ self.model_id,
714
+ self.q_method,
715
+ self.use_imatrix,
716
+ self.imatrix_q_method,
717
+ self.private_repo,
718
+ self.train_data_file,
719
+ self.repo_name,
720
+ self.gguf_name,
721
+ self.quant_embedding,
722
+ self.embedding_tensor_method,
723
+ self.leave_output,
724
+ self.quant_output,
725
+ self.output_tensor_method,
726
+ self.split_model,
727
+ self.split_max_tensors,
728
+ self.split_max_size
729
+ ],
730
+ outputs=[
731
+ self.output_label,
732
+ self.output_image,
733
+ ],
734
  )
 
 
735
 
736
+ #####
737
+ # OnChange handlers
738
+ #####
739
+ self.use_imatrix.change(
740
+ fn=lambda use_imatrix: [gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)],
741
+ inputs=self.use_imatrix,
742
+ outputs=[self.q_method, self.imatrix_q_method, self.train_data_file]
743
+ )
744
+ self.split_model.change(
745
+ fn=lambda split_model: [gr.update(visible=split_model), gr.update(visible=split_model)],
746
+ inputs=self.split_model,
747
+ outputs=[self.split_max_tensors, self.split_max_size]
748
+ )
749
+ self.quant_embedding.change(
750
+ fn=lambda quant_embedding: gr.update(visible=quant_embedding),
751
+ inputs=self.quant_embedding,
752
+ outputs=[self.embedding_tensor_method]
753
+ )
754
+ self.leave_output.change(
755
+ fn=lambda leave_output, quant_output: [gr.update(visible=not leave_output), gr.update(visible=not leave_output and quant_output)],
756
+ inputs=[self.leave_output, self.quant_output],
757
+ outputs=[self.quant_output, self.output_tensor_method]
758
+ )
759
+ self.quant_output.change(
760
+ fn=lambda quant_output: [gr.update(visible=not quant_output), gr.update(visible=quant_output)],
761
+ inputs=self.quant_output,
762
+ outputs=[self.leave_output, self.output_tensor_method]
763
+ )
764
+ self.model_id.change(
765
+ fn=self._update_output_repo,
766
+ inputs=[self.model_id],
767
+ outputs=[self.repo_name]
768
+ )
769
+ self.model_id.change(
770
+ fn=self._update_output_filename,
771
+ inputs=[self.model_id, self.use_imatrix, self.q_method, self.imatrix_q_method],
772
+ outputs=[self.gguf_name]
773
+ )
774
+ self.use_imatrix.change(
775
+ fn=self._update_output_filename,
776
+ inputs=[self.model_id, self.use_imatrix, self.q_method, self.imatrix_q_method],
777
+ outputs=[self.gguf_name]
778
+ )
779
+ self.q_method.change(
780
+ fn=self._update_output_filename,
781
+ inputs=[self.model_id, self.use_imatrix, self.q_method, self.imatrix_q_method],
782
+ outputs=[self.gguf_name]
783
+ )
784
+ self.imatrix_q_method.change(
785
+ fn=self._update_output_filename,
786
+ inputs=[self.model_id, self.use_imatrix, self.q_method, self.imatrix_q_method],
787
+ outputs=[self.gguf_name]
788
+ )
789
 
790
+ def _process_model_wrapper(self, model_id: str, q_method: str, use_imatrix: bool,
791
+ imatrix_q_method: str, private_repo: bool, train_data_file,
792
+ repo_name: str, gguf_name: str, quant_embedding: bool,
793
+ embedding_tensor_method: str, leave_output: bool,
794
+ quant_output: bool, output_tensor_method: str,
795
+ split_model: bool, split_max_tensors, split_max_size: str, oauth_token: Optional[gr.OAuthToken]) -> Tuple[str, str]:
796
+ """Wrapper for the process_model method to handle the conversion using ModelProcessingConfig."""
797
+ try:
798
+ # Validate token and get token string
799
+ token = self.processor._validate_token(oauth_token)
800
+
801
+ # Create configuration objects
802
+ quant_config = QuantizationConfig(
803
+ method=q_method,
804
+ use_imatrix=use_imatrix,
805
+ imatrix_method=imatrix_q_method,
806
+ train_data=train_data_file.name if train_data_file else "",
807
+ quant_embedding=quant_embedding,
808
+ embedding_tensor_method=embedding_tensor_method,
809
+ leave_output=leave_output,
810
+ quant_output=quant_output,
811
+ output_tensor_method=output_tensor_method
812
  )
 
813
 
814
+ split_config = SplitConfig(
815
+ enabled=split_model,
816
+ max_tensors=split_max_tensors if isinstance(split_max_tensors, int) else 256,
817
+ max_size=split_max_size
818
+ )
819
 
820
+ output_config = OutputConfig(
821
+ private_repo=private_repo,
822
+ repo_name=repo_name,
823
+ filename=gguf_name
824
+ )
825
+
826
+ model_name = self.processor._get_model_name(model_id)
827
+
828
+ with tempfile.TemporaryDirectory(dir=self.processor.OUTPUT_FOLDER) as outDirObj:
829
+ outdir = (
830
+ self.processor._create_folder(os.path.join(self.processor.OUTPUT_FOLDER, model_name))
831
+ if self.processor.RUN_LOCALLY == "1"
832
+ else Path(outDirObj)
833
+ )
834
+
835
+ quant_config.fp16_model = f"{outdir}/{model_name}-fp16.gguf"
836
+ quant_config.imatrix_file = f"{outdir}/{model_name}-imatrix.gguf"
837
+ quant_config.quantized_gguf = f"{outdir}/{gguf_name}"
838
+
839
+ processing_config = ModelProcessingConfig(
840
+ token=token,
841
+ model_id=model_id,
842
+ model_name=model_name,
843
+ outdir=outdir,
844
+ quant_config=quant_config,
845
+ split_config=split_config,
846
+ output_config=output_config
847
+ )
848
+
849
+ # Call the processor's main method with the config object
850
+ self.processor.process_model(processing_config)
851
+
852
+ return (
853
+ f'<h1>✅ DONE</h1><br/>Find your repo here: <a href="{processing_config.new_repo_url}" target="_blank" style="text-decoration:underline">{processing_config.new_repo_id}</a>',
854
+ "llama.png",
855
+ )
856
+
857
+ except Exception as e:
858
+ print(f"Error processing model: {e}")
859
+ return (f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{self.processor._escape_html(str(e))}</pre>', "error.png")
860
+
861
+
862
+ def launch(self):
863
+ """Launch the Gradio interface."""
864
+ # Set up space restart scheduler
865
+ def restart_space():
866
+ HfApi().restart_space(repo_id=self.processor.SPACE_ID, token=self.processor.HF_TOKEN, factory_reboot=True)
867
+
868
+ scheduler = BackgroundScheduler()
869
+ scheduler.add_job(restart_space, "interval", seconds=21600)
870
+ scheduler.start()
871
+
872
+ # Launch the interface
873
+ self.demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)
874
+
875
+
876
+ # Main execution
877
+ if __name__ == "__main__":
878
+ ui = GGUFConverterUI()
879
+ ui.launch()
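To make the refactor above easier to follow, here is a minimal sketch of how the new dataclasses compose to drive `HuggingFaceModelProcessor` without the Gradio UI; the model ID, repository name, and output paths are placeholder assumptions, not values from this change:

```python
# Headless use of the pipeline defined above (illustrative sketch).
import os
from app import (HuggingFaceModelProcessor, ModelProcessingConfig,
                 OutputConfig, QuantizationConfig, SplitConfig)

processor = HuggingFaceModelProcessor()
model_id = "some-org/some-model"                    # placeholder model ID
model_name = model_id.split("/")[-1]
outdir = processor._create_folder(os.path.join(processor.OUTPUT_FOLDER, model_name))

quant = QuantizationConfig(method="Q4_K_M")
# These fields are declared with init=False, so they are filled in after construction.
quant.fp16_model = f"{outdir}/{model_name}-fp16.gguf"
quant.imatrix_file = f"{outdir}/{model_name}-imatrix.gguf"
quant.quantized_gguf = f"{outdir}/{model_name}-Q4_K_M.gguf"

config = ModelProcessingConfig(
    token=os.environ["HF_TOKEN"],                   # assumes a valid token is exported
    model_id=model_id,
    model_name=model_name,
    outdir=outdir,
    quant_config=quant,
    split_config=SplitConfig(),                     # no sharding
    output_config=OutputConfig(repo_name=f"your-user/{model_name}-GGUF",
                               filename=f"{model_name}-Q4_K_M.gguf"),
)
# Setting RUN_LOCALLY=1 before construction makes _upload_file skip the Hub uploads.
processor.process_model(config)
```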
groups_merged.txt → calibration_data_v5_rc.txt RENAMED
The diff for this file is too large to render. See raw diff
 
docker-compose.yml CHANGED
@@ -1,16 +1,16 @@
1
- # Docker compose file to LOCAL development
2
-
3
  services:
4
  gguf-my-repo:
5
  build:
6
  context: .
7
  dockerfile: Dockerfile
8
- image: gguf-my-repo
9
  container_name: gguf-my-repo
10
  ports:
11
  - "7860:7860"
12
  volumes:
13
  - .:/home/user/app
14
  environment:
15
- - RUN_LOCALLY=1
 
16
  - HF_TOKEN=${HF_TOKEN}
 
 
 
 
1
  services:
2
  gguf-my-repo:
3
  build:
4
  context: .
5
  dockerfile: Dockerfile
6
+ image: gguf-my-repo-cuda
7
  container_name: gguf-my-repo
8
  ports:
9
  - "7860:7860"
10
  volumes:
11
  - .:/home/user/app
12
  environment:
13
+ - RUN_CUDA=1
14
+ - RUN_LOCALLY=0
15
  - HF_TOKEN=${HF_TOKEN}
16
+ - HF_HUB_CACHE=/home/user/app/downloads
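With this compose file, a local run could be as simple as the sketch below (assuming Docker Compose v2 and an `HF_TOKEN` exported in the shell; note that no GPU reservation is declared in the compose file, so one would have to be added separately if CUDA is needed):

```bash
export HF_TOKEN=hf_your_token_here   # placeholder token
docker compose up --build
# The UI is then available at http://localhost:7860
```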
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ huggingface-hub
2
+ hf-transfer
3
+ gradio[oauth]
4
+ gradio_huggingfacehub_search
5
+ APScheduler
start.sh CHANGED
@@ -1,21 +1,9 @@
1
  #!/bin/bash
2
 
3
- if [ ! -d "llama.cpp" ]; then
4
- # only run in dev env
5
- git clone https://github.com/ggerganov/llama.cpp
6
- fi
7
-
8
  export GGML_CUDA=OFF
9
- if [[ -z "${RUN_LOCALLY}" ]]; then
10
- # enable CUDA if NOT running locally
11
  export GGML_CUDA=ON
12
  fi
13
 
14
- cd llama.cpp
15
- cmake -B build -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=${GGML_CUDA} -DLLAMA_CURL=OFF
16
- cmake --build build --config Release -j --target llama-quantize llama-gguf-split llama-imatrix
17
- cp ./build/bin/llama-* .
18
- rm -rf build
19
-
20
- cd ..
21
- python app.py
 
1
  #!/bin/bash
2
 
 
 
 
 
 
3
  export GGML_CUDA=OFF
4
+ # enable CUDA
5
+ if [[ -z "${RUN_CUDA}" ]]; then
6
  export GGML_CUDA=ON
7
  fi
8
 
9
+ python3 app.py