llama-cpp-api2

Sleeping

App Files Files Community

toaster61 committed on Sep 30, 2023

Commit

e3396ba

1 Parent(s): 559ea97

it works!

Browse files

Files changed (5) hide show

Dockerfile +18 -17
__pycache__/app.cpython-311.pyc +0 -0
app.py +15 -6
run-docker.sh +5 -0
system.prompt +1 -0

Dockerfile CHANGED Viewed

@@ -1,27 +1,28 @@
-FROM python:3.11.1-bullseye
 USER root
-ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
-RUN apt install g++ -y
-RUN git clone https://github.com/ggerganov/llama.cpp.git
-RUN cd llama.cpp
-RUN make
-RUN wget https://huggingface.co/OpenBuddy/openbuddy-ggml/resolve/main/openbuddy-openllama-3b-v10-q5_0.bin
-COPY . ./
-RUN chmod -R 777 ./
-WORKDIR ./
 RUN python3 -m pip install -U --no-cache-dir pip setuptools wheel
 RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt
-RUN mkdir -p /.cache/huggingface/hub
-RUN chown -R root:root /.cache/huggingface/hub
-RUN chmod -R 777 /.cache/huggingface/hub
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

+# Loading the base image. I'm using Debian, but you can use whatever you want.
+FROM python:3.11.5-slim-bookworm
+# Running as root, just to be sure everything will work.
 USER root
+# Installing the gcc compiler and the main library.
+RUN apt update && apt install gcc cmake build-essential -y
+RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python==0.1.78
+# Installing wget and downloading model.
+RUN apt install wget -y
+RUN wget -O model.bin https://huggingface.co/OpenBuddy/openbuddy-ggml/resolve/main/openbuddy-openllama-3b-v10-q5_0.bin
+# You can use other models! Visit https://huggingface.co/OpenBuddy/openbuddy-ggml and choose a model that you like!
+# Or you can comment out these two RUNs and include your own model, named "model.bin", in the Space/repo/Docker image.
+# Copying files into folder and making it working dir.
+RUN mkdir app
+COPY . /app
+RUN chmod -R 777 /app
+WORKDIR /app
+# Updating pip and installing everything from requirements
 RUN python3 -m pip install -U --no-cache-dir pip setuptools wheel
 RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt
+# Now it's time to run the Quart app using uvicorn! (It's faster, trust me.)
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

__pycache__/app.cpython-311.pyc ADDED Viewed

Binary file (2.8 kB). View file

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ from quart import Quart, request
 from llama_cpp import Llama
 app = Quart(__name__)
 with open('system.prompt', 'r', encoding='utf-8') as f:
     prompt = f.read()
@@ -10,18 +11,26 @@ with open('system.prompt', 'r', encoding='utf-8') as f:
 async def echo():
     try:
         data = await request.get_json()
-        if data.get("max_tokens") != None and data.get("max_tokens") > 500: data['max_tokens'] = 500
         userPrompt = prompt + "\n\nUser: " + data['request'] + "\nAssistant: "
     except: return {"error": "Not enough data"}, 400
-    return {"output": output}
 @app.get("/")
 async def get():
-    return '''<h1>Hello, world!</h1>
 This is showcase how to make own server with OpenBuddy's model.<br>
 I'm using here 3b model just for example. Also here's only CPU power.<br>
 But you can use GPU power as well!<br>
-<br>
 <h1>How to GPU?</h1>
-'''

 from llama_cpp import Llama
 app = Quart(__name__)
+llm = Llama(model_path="./model.bin")
 with open('system.prompt', 'r', encoding='utf-8') as f:
     prompt = f.read()
 async def echo():
     try:
         data = await request.get_json()
+        maxTokens = data.get("max_tokens", 64)
         userPrompt = prompt + "\n\nUser: " + data['request'] + "\nAssistant: "
     except: return {"error": "Not enough data"}, 400
+    try:
+        output = llm(userPrompt, max_tokens=maxTokens, stop=["User:", "\n"], echo=False)
+        return {"output": output["choices"][0]["text"]}
+    except Exception as e:
+        print(e)
+        return {"error": "Server error"}, 500
 @app.get("/")
 async def get():
+    return '''<style>a:visited{color:black;}</style>
+<h1>Hello, world!</h1>
 This is showcase how to make own server with OpenBuddy's model.<br>
 I'm using here 3b model just for example. Also here's only CPU power.<br>
 But you can use GPU power as well!<br>
 <h1>How to GPU?</h1>
+Change <code>`CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS`</code> in Dockerfile on <code>`CMAKE_ARGS="-DLLAMA_CUBLAS=on"`</code>. Also you can try <code>`DLLAMA_CLBLAST`</code>, <code>`DLLAMA_METAL`</code> or <code>`DLLAMA_METAL`</code>.<br>
+Powered by <a href="https://github.com/abetlen/llama-cpp-python">llama-cpp-python</a>, <a href="https://quart.palletsprojects.com/">Quart</a> and <a href="https://www.uvicorn.org/">Uvicorn</a>.<br>
+<h1>How to test it on own machine?</h1>
+You can install Docker, build image and run it. I made <code>`run-docker.sh`</code> for ya. To stop container run <code>`docker ps`</code>, find name of container and run <code>`docker stop _dockerContainerName_`</code><br>
+Or you can once follow steps in Dockerfile and try it on your machine, not in Docker.<br>'''

run-docker.sh ADDED Viewed

	@@ -0,0 +1,5 @@

+# This is a shell script for building and running the Docker image.
+# Use it for tests. AND INSTALL DOCKER BEFORE YOU RUN IT!!!
+docker build -t llama-server .
+docker run -dp 0.0.0.0:7860:7860 llama-server

system.prompt CHANGED Viewed

	@@ -0,0 +1 @@


1	+ Prompt: Отвечай максимально кратко и по делу.