#!/usr/bin/env python3
"""
Use llamafile to serve a (quantized) mistral-7b-instruct-v0.2 model

Usage:
  cd <repo-root>/classic/original_autogpt
  ./scripts/llamafile/serve.py [--llamafile PATH] [--llamafile_url URL]
                               [--host HOST] [--port PORT] [--force-gpu]
"""
import os
import platform
import subprocess
from pathlib import Path
from typing import Optional

import click
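
# A llamafile bundles llama.cpp and model weights into a single executable.
# LLAMAFILE_EXE is the bare, model-less llamafile runner, needed on Windows
# (see below).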
LLAMAFILE = Path("mistral-7b-instruct-v0.2.Q5_K_M.llamafile")
LLAMAFILE_URL = f"https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/{LLAMAFILE.name}"  # noqa
LLAMAFILE_EXE = Path("llamafile.exe")
LLAMAFILE_EXE_URL = "https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.6/llamafile-0.8.6"  # noqa


@click.command()
@click.option("--llamafile", type=click.Path(path_type=Path),
              help=f"Name of the llamafile to serve. Default: {LLAMAFILE.name}")
@click.option("--llamafile_url", help="Download URL for the llamafile to use")
@click.option("--host", help="Address for the llamafile server to listen on")
@click.option("--port", type=int, help="Port for the llamafile server to listen on")
@click.option("--force-gpu", is_flag=True, help="Run the model on the GPU only")
def main(
    llamafile: Optional[Path] = None,
    llamafile_url: Optional[str] = None,
    host: Optional[str] = None,
    port: Optional[int] = None,
    force_gpu: bool = False,
):
    if not llamafile:
        if not llamafile_url:
            llamafile = LLAMAFILE
        else:
            llamafile = Path(llamafile_url.rsplit("/", 1)[1])
            if llamafile.suffix != ".llamafile":
                click.echo(
                    click.style(
                        "The given URL does not end with '.llamafile' -> "
                        "can't get filename from URL. "
                        "Specify the filename using --llamafile.",
                        fg="red",
                    ),
                    err=True,
                )
                return
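
    # Use the default download URL when the default model was chosen; if a
    # non-default URL was paired with the default filename, ask for confirmation.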
    if llamafile == LLAMAFILE and not llamafile_url:
        llamafile_url = LLAMAFILE_URL
    elif llamafile_url != LLAMAFILE_URL:
        if not click.prompt(
            click.style(
                "You seem to have specified a different URL for the default model "
                f"({llamafile.name}). Are you sure this is correct? "
                "If you want to use a different model, also specify --llamafile.",
                fg="yellow",
            ),
            type=bool,
        ):
            return

    # Go to classic/original_autogpt/scripts/llamafile/ so the llamafile (and,
    # on Windows, llamafile.exe) are downloaded next to this script.
    os.chdir(Path(__file__).resolve().parent)

    on_windows = platform.system() == "Windows"

    if not llamafile.is_file():
        if not llamafile_url:
            click.echo(
                click.style(
                    "Please use --llamafile_url to specify a download URL for "
                    f"'{llamafile.name}'. "
                    "This is only needed once, to download the model.",
                    fg="red",
                ),
                err=True,
            )
            return

        download_file(llamafile_url, llamafile)

        if not on_windows:
            llamafile.chmod(0o755)
            subprocess.run([f"./{llamafile}", "--version"], check=True)
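
    # Build the command used to launch the server. The explicit ./ and .\
    # prefixes are needed because the working directory is not on the
    # executable search path.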
    if not on_windows:
        base_command = [f"./{llamafile}"]
    else:
        # Windows does not allow executables over 4GB, so we have to download a
        # model-less llamafile.exe and run that instead.
        if not LLAMAFILE_EXE.is_file():
            download_file(LLAMAFILE_EXE_URL, LLAMAFILE_EXE)
            LLAMAFILE_EXE.chmod(0o755)
            subprocess.run([f".\\{LLAMAFILE_EXE}", "--version"], check=True)

        base_command = [f".\\{LLAMAFILE_EXE}", "-m", llamafile]
    if host:
        base_command.extend(["--host", host])
    if port:
        base_command.extend(["--port", str(port)])
    if force_gpu:
        base_command.extend(["-ngl", "9999"])
    subprocess.run(
        [
            *base_command,
            "--server",
            "--nobrowser",
            # --ctx-size 0 means the prompt context size will be set directly
            # from the underlying model configuration. This may cause slow
            # response times or consume a lot of memory.
            "--ctx-size",
            "0",
            "--n-predict",
            "1024",
        ],
        check=True,
    )


def download_file(url: str, to_file: Path) -> None:
    print(f"Downloading {to_file.name}...")
    import urllib.request

    urllib.request.urlretrieve(url, to_file, reporthook=report_download_progress)
    print()
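

# urllib.request.urlretrieve calls its reporthook with (blocks transferred so
# far, block size in bytes, total file size); the total size is -1 when the
# server does not report a Content-Length.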
def report_download_progress(chunk_number: int, chunk_size: int, total_size: int):
    if total_size != -1:
        downloaded_size = chunk_number * chunk_size
        percent = min(1, downloaded_size / total_size)
        bar = "#" * int(40 * percent)
        print(
            f"\rDownloading: [{bar:<40}] {percent:.0%}"
            f" - {downloaded_size/1e6:.1f}/{total_size/1e6:.1f} MB",
            end="",
        )


if __name__ == "__main__":
    main()