#!/usr/bin/env python3
"""
Use llamafile to serve a (quantized) mistral-7b-instruct-v0.2 model

Usage:
  cd <repo-root>/classic/original_autogpt
  ./scripts/llamafile/serve.py [--llamafile PATH] [--llamafile_url URL]
                               [--host HOST] [--port PORT] [--force-gpu]
"""
import os
import platform
import subprocess
from pathlib import Path
from typing import Optional

import click
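
# A llamafile bundles llama.cpp and model weights into a single executable.
# LLAMAFILE_EXE is the bare, model-less llamafile runner, needed on Windows
# (see below).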
LLAMAFILE = Path("mistral-7b-instruct-v0.2.Q5_K_M.llamafile")
LLAMAFILE_URL = f"https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/{LLAMAFILE.name}"  # noqa
LLAMAFILE_EXE = Path("llamafile.exe")
LLAMAFILE_EXE_URL = "https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.6/llamafile-0.8.6"  # noqa


@click.command()
@click.option("--llamafile", type=click.Path(path_type=Path),
              help=f"Name of the llamafile to serve. Default: {LLAMAFILE.name}")
@click.option("--llamafile_url", help="Download URL for the llamafile to use")
@click.option("--host", help="Address for the llamafile server to listen on")
@click.option("--port", type=int, help="Port for the llamafile server to listen on")
@click.option("--force-gpu", is_flag=True, help="Run the model on the GPU only")
def main(
    llamafile: Optional[Path] = None,
    llamafile_url: Optional[str] = None,
    host: Optional[str] = None,
    port: Optional[int] = None,
    force_gpu: bool = False,
):
    if not llamafile:
        if not llamafile_url:
            llamafile = LLAMAFILE
        else:
            llamafile = Path(llamafile_url.rsplit("/", 1)[1])
            if llamafile.suffix != ".llamafile":
                click.echo(
                    click.style(
                        "The given URL does not end with '.llamafile' -> "
                        "can't get filename from URL. "
                        "Specify the filename using --llamafile.",
                        fg="red",
                    ),
                    err=True,
                )
                return
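
    # Use the default download URL when the default model was chosen; if a
    # non-default URL was paired with the default filename, ask for confirmation.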
    if llamafile == LLAMAFILE and not llamafile_url:
        llamafile_url = LLAMAFILE_URL
    elif llamafile_url != LLAMAFILE_URL:
        if not click.prompt(
            click.style(
                "You seem to have specified a different URL for the default model "
                f"({llamafile.name}). Are you sure this is correct? "
                "If you want to use a different model, also specify --llamafile.",
                fg="yellow",
            ),
            type=bool,
        ):
            return

    # Go to classic/original_autogpt/scripts/llamafile/ so the llamafile (and,
    # on Windows, llamafile.exe) are downloaded next to this script.
    os.chdir(Path(__file__).resolve().parent)

    on_windows = platform.system() == "Windows"

    if not llamafile.is_file():
        if not llamafile_url:
            click.echo(
                click.style(
                    "Please use --llamafile_url to specify a download URL for "
                    f"'{llamafile.name}'. "
                    "This is only needed once, to download the model.",
                    fg="red",
                ),
                err=True,
            )
            return

        download_file(llamafile_url, llamafile)

        if not on_windows:
            llamafile.chmod(0o755)
            subprocess.run([f"./{llamafile}", "--version"], check=True)
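
    # Build the command used to launch the server. The explicit ./ and .\
    # prefixes are needed because the working directory is not on the
    # executable search path.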
    if not on_windows:
        base_command = [f"./{llamafile}"]
    else:
        # Windows does not allow executables over 4GB, so we have to download a
        # model-less llamafile.exe and run that instead.
        if not LLAMAFILE_EXE.is_file():
            download_file(LLAMAFILE_EXE_URL, LLAMAFILE_EXE)
            LLAMAFILE_EXE.chmod(0o755)
            subprocess.run([f".\\{LLAMAFILE_EXE}", "--version"], check=True)

        base_command = [f".\\{LLAMAFILE_EXE}", "-m", llamafile]
    if host:
        base_command.extend(["--host", host])
    if port:
        base_command.extend(["--port", str(port)])
    if force_gpu:
        base_command.extend(["-ngl", "9999"])
    subprocess.run(
        [
            *base_command,
            "--server",
            "--nobrowser",
            # --ctx-size 0 means the prompt context size will be set directly
            # from the underlying model configuration. This may cause slow
            # response times or consume a lot of memory.
            "--ctx-size",
            "0",
            "--n-predict",
            "1024",
        ],
        check=True,
    )


def download_file(url: str, to_file: Path) -> None:
    print(f"Downloading {to_file.name}...")
    import urllib.request

    urllib.request.urlretrieve(url, to_file, reporthook=report_download_progress)
    print()
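

# urllib.request.urlretrieve calls its reporthook with (blocks transferred so
# far, block size in bytes, total file size); the total size is -1 when the
# server does not report a Content-Length.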
def report_download_progress(chunk_number: int, chunk_size: int, total_size: int):
    if total_size != -1:
        downloaded_size = chunk_number * chunk_size
        percent = min(1, downloaded_size / total_size)
        bar = "#" * int(40 * percent)
        print(
            f"\rDownloading: [{bar:<40}] {percent:.0%}"
            f" - {downloaded_size/1e6:.1f}/{total_size/1e6:.1f} MB",
            end="",
        )


if __name__ == "__main__":
    main()