Create main.py
main.py
ADDED
@@ -0,0 +1,457 @@
from flask import Flask, request, jsonify, make_response, Response
import requests
import time
import uuid
import warnings
from waitress import serve
import json
import tiktoken
import socket
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from pymemcache.client.base import Client
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
import os
import logging
from io import BytesIO
import coloredlogs
import printedcolors
import base64

# Suppress warnings from flask_limiter
warnings.filterwarnings("ignore", category=UserWarning, module="flask_limiter.extension")

# Create a logger object
logger = logging.getLogger("1min-relay")

# Install coloredlogs with the desired log level
coloredlogs.install(level='DEBUG', logger=logger)


def check_memcached_connection(host='memcached', port=11211):
    """Return True if a Memcached server is reachable at host:port, False otherwise."""
    try:
        client = Client((host, port))
        client.set('test_key', 'test_value')
        if client.get('test_key') == b'test_value':
            client.delete('test_key')  # Clean up
            return True
        else:
            return False
    except Exception:
        return False


logger.info('''
 _ __ __ _ ___ _
/ | \/ (_)_ _ | _ \___| |__ _ _ _
| | |\/| | | ' \| / -_) / _` | || |
|_|_| |_|_|_||_|_|_\___|_\__,_|\_, |
                               |__/ ''')


def calculate_token(sentence, model="DEFAULT"):
    """Calculate the number of tokens in a sentence based on the specified model."""

    if model.startswith("mistral"):
        # Use the Mistral tokenizer (defaults to Mistral Nemo)
        model_name = "open-mistral-nemo"
        tokenizer = MistralTokenizer.from_model(model_name)
        tokenized = tokenizer.encode_chat_completion(
            ChatCompletionRequest(
                messages=[
                    UserMessage(content=sentence),
                ],
                model=model_name,
            )
        )
        tokens = tokenized.tokens
        return len(tokens)

    elif model in ["gpt-3.5-turbo", "gpt-4"]:
        # Use OpenAI's tiktoken for GPT models
        encoding = tiktoken.encoding_for_model(model)
        tokens = encoding.encode(sentence)
        return len(tokens)

    else:
        # Default to the OpenAI gpt-4 encoding
        encoding = tiktoken.encoding_for_model("gpt-4")
        tokens = encoding.encode(sentence)
        return len(tokens)
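# Illustrative usage (not part of the original source): the exact count depends on the
# tokenizer chosen above, e.g. calculate_token("Hello, world!", model="gpt-4") returns a
# small integer determined by the tiktoken gpt-4 encoding.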

app = Flask(__name__)

if check_memcached_connection():
    limiter = Limiter(
        get_remote_address,
        app=app,
        storage_uri="memcached://memcached:11211",  # Connect to the Memcached container created with Docker
    )
else:
    # Used for rate limiting without Memcached
    limiter = Limiter(
        get_remote_address,
        app=app,
    )
    logger.warning("Memcached is not available. Using in-memory storage for rate limiting. This is not recommended.")

ONE_MIN_API_URL = "https://api.1min.ai/api/features"
ONE_MIN_CONVERSATION_API_URL = "https://api.1min.ai/api/conversations"
ONE_MIN_CONVERSATION_API_STREAMING_URL = "https://api.1min.ai/api/features?isStreaming=true"
ONE_MIN_ASSET_URL = "https://api.1min.ai/api/assets"

# Define the models that are available for use
ALL_ONE_MIN_AVAILABLE_MODELS = [
    "deepseek-chat",
    "deepseek-reasoner",
    "o1-preview",
    "o1-mini",
    "gpt-4o-mini",
    "gpt-4o",
    "gpt-4-turbo",
    "gpt-4",
    "gpt-3.5-turbo",
    "claude-instant-1.2",
    "claude-2.1",
    "claude-3-7-sonnet-20250219",
    "claude-3-5-sonnet-20240620",
    "claude-3-opus-20240229",
    "claude-3-sonnet-20240229",
    "claude-3-haiku-20240307",
    "gemini-1.0-pro",
    "gemini-1.5-pro",
    "gemini-1.5-flash",
    "mistral-large-latest",
    "mistral-small-latest",
    "mistral-nemo",
    "open-mistral-7b",

    # Replicate
    "meta/llama-2-70b-chat",
    "meta/meta-llama-3-70b-instruct",
    "meta/meta-llama-3.1-405b-instruct",
    "command"
]

# Define the models that support vision inputs
vision_supported_models = [
    "gpt-4o",
    "gpt-4o-mini",
    "gpt-4-turbo"
]

# Default values
SUBSET_OF_ONE_MIN_PERMITTED_MODELS = ["mistral-nemo", "gpt-4o", "deepseek-chat"]
PERMIT_MODELS_FROM_SUBSET_ONLY = False

# Read environment variables
one_min_models_env = os.getenv("SUBSET_OF_ONE_MIN_PERMITTED_MODELS")  # e.g. "mistral-nemo,gpt-4o,deepseek-chat"
permit_not_in_available_env = os.getenv("PERMIT_MODELS_FROM_SUBSET_ONLY")  # e.g. "True" or "False"

# Parse or fall back to defaults
if one_min_models_env:
    SUBSET_OF_ONE_MIN_PERMITTED_MODELS = one_min_models_env.split(",")

if permit_not_in_available_env and permit_not_in_available_env.lower() == "true":
    PERMIT_MODELS_FROM_SUBSET_ONLY = True

# Combine into a single list
AVAILABLE_MODELS = []
AVAILABLE_MODELS.extend(SUBSET_OF_ONE_MIN_PERMITTED_MODELS)
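# Illustrative shell configuration (a sketch of assumed usage, not part of the original file):
#   export SUBSET_OF_ONE_MIN_PERMITTED_MODELS="mistral-nemo,gpt-4o,deepseek-chat"
#   export PERMIT_MODELS_FROM_SUBSET_ONLY="True"
# With both variables set as above, only the listed models are accepted by the relay.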


@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        return ERROR_HANDLER(1212)
    if request.method == 'GET':
        internal_ip = socket.gethostbyname(socket.gethostname())
        return "Congratulations! Your API is working! You can now make requests to the API.\n\nEndpoint: " + internal_ip + ':5001/v1'


@app.route('/v1/models')
@limiter.limit("500 per minute")
def models():
    # Dynamically create the list of models with additional fields
    models_data = []
    if not PERMIT_MODELS_FROM_SUBSET_ONLY:
        one_min_models_data = [
            {
                "id": model_name,
                "object": "model",
                "owned_by": "1minai",
                "created": 1727389042
            }
            for model_name in ALL_ONE_MIN_AVAILABLE_MODELS
        ]
    else:
        one_min_models_data = [
            {"id": model_name, "object": "model", "owned_by": "1minai", "created": 1727389042}
            for model_name in SUBSET_OF_ONE_MIN_PERMITTED_MODELS
        ]
    models_data.extend(one_min_models_data)
    return jsonify({"data": models_data, "object": "list"})
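# A successful GET /v1/models returns an OpenAI-style list, roughly:
#   {"object": "list", "data": [{"id": "gpt-4o", "object": "model", "owned_by": "1minai", "created": 1727389042}, ...]}
# (shape shown for illustration; the exact model ids depend on the configuration above)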


def ERROR_HANDLER(code, model=None, key=None):
    # Handle errors in an OpenAI-structured error format
    error_codes = {  # Internal Error Codes
        1002: {"message": f"The model {model} does not exist.", "type": "invalid_request_error", "param": None, "code": "model_not_found", "http_code": 400},
        1020: {"message": f"Incorrect API key provided: {key}. You can find your API key at https://app.1min.ai/api.", "type": "authentication_error", "param": None, "code": "invalid_api_key", "http_code": 401},
        1021: {"message": "Invalid Authentication", "type": "invalid_request_error", "param": None, "code": None, "http_code": 401},
        1212: {"message": "Incorrect Endpoint. Please use the /v1/chat/completions endpoint.", "type": "invalid_request_error", "param": None, "code": "model_not_supported", "http_code": 400},
        1044: {"message": "This model does not support image inputs.", "type": "invalid_request_error", "param": None, "code": "model_not_supported", "http_code": 400},
        1412: {"message": "No message provided.", "type": "invalid_request_error", "param": "messages", "code": "invalid_request_error", "http_code": 400},
        1423: {"message": "No content in last message.", "type": "invalid_request_error", "param": "messages", "code": "invalid_request_error", "http_code": 400},
    }
    # Remove http_code from the error payload; it is only used for the HTTP status returned below
    error_data = {k: v for k, v in error_codes.get(code, {"message": "Unknown error", "type": "unknown_error", "param": None, "code": None}).items() if k != "http_code"}
    logger.error(f"An error has occurred while processing the user's request. Error code: {code}")
    return jsonify({"error": error_data}), error_codes.get(code, {}).get("http_code", 400)
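# For example, ERROR_HANDLER(1021) produces an HTTP 401 with a body like:
#   {"error": {"message": "Invalid Authentication", "type": "invalid_request_error", "param": null, "code": null}}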


def format_conversation_history(messages, new_input):
    """
    Formats the conversation history into a structured string.

    Args:
        messages (list): List of message dictionaries from the request
        new_input (str): The new user input message

    Returns:
        str: Formatted conversation history
    """
    formatted_history = ["Conversation History:\n"]

    for message in messages:
        role = message.get('role', '').capitalize()
        content = message.get('content', '')

        # Handle potential list content
        if isinstance(content, list):
            content = '\n'.join(item['text'] for item in content if 'text' in item)

        formatted_history.append(f"{role}: {content}")

    # Append additional messages only if there are existing messages
    if messages:  # Save credits if it is the first message.
        formatted_history.append("Respond like normal. The conversation history will be automatically updated on the next MESSAGE. DO NOT ADD User: or Assistant: to your output. Just respond like normal.")
        formatted_history.append("User Message:\n")
        formatted_history.append(new_input)

    return '\n'.join(formatted_history)
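# Illustrative output for an assumed two-message history (abbreviated):
#   Conversation History:
#
#   User: Hi
#   Assistant: Hello!
#   Respond like normal. ... Just respond like normal.
#   User Message:
#
#   <new_input>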


@app.route('/v1/chat/completions', methods=['POST', 'OPTIONS'])
@limiter.limit("500 per minute")
def conversation():
    if request.method == 'OPTIONS':
        return handle_options_request()

    auth_header = request.headers.get('Authorization')
    if not auth_header or not auth_header.startswith("Bearer "):
        logger.error("Invalid Authentication")
        return ERROR_HANDLER(1021)

    api_key = auth_header.split(" ")[1]

    headers = {
        'API-KEY': api_key
    }

    request_data = request.json

    all_messages = format_conversation_history(request_data.get('messages', []), request_data.get('new_input', ''))

    messages = request_data.get('messages', [])
    if not messages:
        return ERROR_HANDLER(1412)

    user_input = messages[-1].get('content')
    if not user_input:
        return ERROR_HANDLER(1423)

    # Check if user_input is a list (multimodal content); combine text parts and upload any images
    image = False
    if isinstance(user_input, list):
        image_paths = []
        combined_text = ""
        for item in user_input:
            if 'text' in item:
                combined_text += item['text'] + "\n"
            try:
                if 'image_url' in item:
                    if request_data.get('model', 'mistral-nemo') not in vision_supported_models:
                        return ERROR_HANDLER(1044, request_data.get('model', 'mistral-nemo'))
                    if item['image_url']['url'].startswith("data:image/png;base64,"):
                        base64_image = item['image_url']['url'].split(",")[1]
                        binary_data = base64.b64decode(base64_image)
                    else:
                        image_response = requests.get(item['image_url']['url'])
                        image_response.raise_for_status()  # Raise an error for bad responses
                        binary_data = BytesIO(image_response.content)
                    files = {
                        'asset': ("relay" + str(uuid.uuid4()), binary_data, 'image/png')
                    }
                    asset = requests.post(ONE_MIN_ASSET_URL, files=files, headers=headers)
                    asset.raise_for_status()  # Raise an error for bad responses
                    image_path = asset.json()['fileContent']['path']
                    image_paths.append(image_path)
                    image = True
            except Exception as e:
                logger.error("An error occurred while handling an image: " + str(e)[:60])
                # Optionally return an appropriate response here

        user_input = str(combined_text)

    prompt_token = calculate_token(str(all_messages))
    if PERMIT_MODELS_FROM_SUBSET_ONLY and request_data.get('model', 'mistral-nemo') not in AVAILABLE_MODELS:
        return ERROR_HANDLER(1002, request_data.get('model', 'mistral-nemo'))  # Handle invalid model

    logger.debug(f"Processing {prompt_token} prompt tokens with model {request_data.get('model', 'mistral-nemo')}")

    if not image:
        payload = {
            "type": "CHAT_WITH_AI",
            "model": request_data.get('model', 'mistral-nemo'),
            "promptObject": {
                "prompt": all_messages,
                "isMixed": False,
                "webSearch": False
            }
        }
    else:
        payload = {
            "type": "CHAT_WITH_IMAGE",
            "model": request_data.get('model', 'mistral-nemo'),
            "promptObject": {
                "prompt": all_messages,
                "isMixed": False,
                "imageList": image_paths
            }
        }

    headers = {"API-KEY": api_key, 'Content-Type': 'application/json'}

    if not request_data.get('stream', False):
        # Non-Streaming Response
        logger.debug("Non-Streaming AI Response")
        response = requests.post(ONE_MIN_API_URL, json=payload, headers=headers)
        response.raise_for_status()
        one_min_response = response.json()

        transformed_response = transform_response(one_min_response, request_data, prompt_token)
        response = make_response(jsonify(transformed_response))
        set_response_headers(response)

        return response, 200

    else:
        # Streaming Response
        logger.debug("Streaming AI Response")
        response_stream = requests.post(ONE_MIN_CONVERSATION_API_STREAMING_URL, data=json.dumps(payload), headers=headers, stream=True)
        if response_stream.status_code != 200:
            if response_stream.status_code == 401:
                return ERROR_HANDLER(1020)
            logger.error(f"An unknown error occurred while processing the user's request. Error code: {response_stream.status_code}")
            return ERROR_HANDLER(response_stream.status_code)
        return Response(stream_response(response_stream, request_data, request_data.get('model', 'mistral-nemo'), int(prompt_token)), content_type='text/event-stream')
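# Illustrative client call (a minimal sketch; the host and key are placeholders, not values
# taken from this file):
#   import requests
#   r = requests.post(
#       "http://localhost:5001/v1/chat/completions",
#       headers={"Authorization": "Bearer YOUR_1MIN_API_KEY"},
#       json={"model": "mistral-nemo", "messages": [{"role": "user", "content": "Hello"}], "stream": False},
#   )
#   print(r.json()["choices"][0]["message"]["content"])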


def handle_options_request():
    response = make_response()
    response.headers.add('Access-Control-Allow-Origin', '*')
    response.headers.add('Access-Control-Allow-Headers', 'Content-Type,Authorization')
    response.headers.add('Access-Control-Allow-Methods', 'POST, OPTIONS')
    return response, 204


def transform_response(one_min_response, request_data, prompt_token):
    completion_token = calculate_token(one_min_response['aiRecord']["aiRecordDetail"]["resultObject"][0])
    logger.debug(f"Finished processing Non-Streaming response. Completion tokens: {str(completion_token)}")
    logger.debug(f"Total tokens: {str(completion_token + prompt_token)}")
    return {
        "id": f"chatcmpl-{uuid.uuid4()}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": request_data.get('model', 'mistral-nemo'),
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": one_min_response['aiRecord']["aiRecordDetail"]["resultObject"][0],
                },
                "finish_reason": "stop"
            }
        ],
        "usage": {
            "prompt_tokens": prompt_token,
            "completion_tokens": completion_token,
            "total_tokens": prompt_token + completion_token
        }
    }


def set_response_headers(response):
    response.headers['Content-Type'] = 'application/json'
    response.headers['Access-Control-Allow-Origin'] = '*'
    response.headers['X-Request-ID'] = str(uuid.uuid4())


def stream_response(response, request_data, model, prompt_tokens):
    all_chunks = ""
    for chunk in response.iter_content(chunk_size=1024):
        finish_reason = None

        return_chunk = {
            "id": f"chatcmpl-{uuid.uuid4()}",
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": request_data.get('model', 'mistral-nemo'),
            "choices": [
                {
                    "index": 0,
                    "delta": {
                        "content": chunk.decode('utf-8')
                    },
                    "finish_reason": finish_reason
                }
            ]
        }
        all_chunks += chunk.decode('utf-8')
        yield f"data: {json.dumps(return_chunk)}\n\n"

    tokens = calculate_token(all_chunks)
    logger.debug(f"Finished processing streaming response. Completion tokens: {str(tokens)}")
    logger.debug(f"Total tokens: {str(tokens + prompt_tokens)}")

    # Final chunk when iteration stops
    final_chunk = {
        "id": f"chatcmpl-{uuid.uuid4()}",
        "object": "chat.completion.chunk",
        "created": int(time.time()),
        "model": request_data.get('model', 'mistral-nemo'),
        "choices": [
            {
                "index": 0,
                "delta": {
                    "content": ""
                },
                "finish_reason": "stop"
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": tokens,
            "total_tokens": tokens + prompt_tokens
        }
    }
    yield f"data: {json.dumps(final_chunk)}\n\n"
    yield "data: [DONE]\n\n"
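# Each yielded event follows the OpenAI server-sent-events convention, for example:
#   data: {"id": "chatcmpl-...", "object": "chat.completion.chunk", "choices": [{"delta": {"content": "Hel"}, ...}]}
# followed by a final chunk carrying usage totals and a terminating "data: [DONE]" line.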


if __name__ == '__main__':
    internal_ip = socket.gethostbyname(socket.gethostname())
    response = requests.get('https://api.ipify.org')
    public_ip = response.text
    logger.info(f"""{printedcolors.Color.fg.lightcyan}
Server is ready to serve at:
Internal IP: {internal_ip}:5001
Public IP: {public_ip} (only if you've set up port forwarding on your router.)
Enter this URL into OpenAI clients that support a custom endpoint:
{internal_ip}:5001/v1
If that does not work, try:
{internal_ip}:5001/v1/chat/completions
{printedcolors.Color.reset}""")
    serve(app, host='0.0.0.0', port=5001, threads=6)  # Waitress defaults to 4 threads; we use 6 to increase performance and allow multiple requests at once.