# NOTE: non-code page chrome (Spaces header, file size, commit hash, and the
# line-number gutter from the file viewer) was captured here by mistake; it is
# not part of the program and has been reduced to this comment.
import random
import json
import aiohttp
import asyncio
from aiohttp import web
from datetime import datetime
# Debug mode switch: when True, full request payloads and headers are printed.
DEBUG_MODE = False
# Fixed upstream configuration. Clients may only choose between the two
# models below; every sampling parameter is pinned server-side.
DEFAULT_MODEL = "llama3.1-8b"
ALTERNATE_MODEL = "llama3.1-70b"
FIXED_URL = "https://api.cerebras.ai/v1/chat/completions"
FIXED_TEMPERATURE = 0.2
FIXED_TOP_P = 1
FIXED_MAX_TOKENS = 4096
def log_basic_info(message):
    """Print *message* to stdout prefixed with a ``[YYYY-MM-DD HH:MM:SS]`` timestamp."""
    now = datetime.now()
    print(f"[{now:%Y-%m-%d %H:%M:%S}] {message}")
# Asynchronous function to send request and print debug information
async def send_request(auth_tokens, data):
    """Forward a chat-completion request to the Cerebras API.

    Args:
        auth_tokens: Non-empty list of bearer tokens. One is picked at
            random per request so multiple API keys are load-balanced
            (previously the first token was always used, making the
            caller's multi-token support dead code).
        data: Parsed client request body. Only ``model`` and ``messages``
            are honored; sampling parameters are pinned to the module
            constants.

    Returns:
        The upstream response body as text.

    Raises:
        Exception: re-raised after logging so the caller can map the
            failure to a proper HTTP error instead of receiving ``None``.
    """
    try:
        headers = {
            "accept": "application/json",
            "authorization": f"Bearer {random.choice(auth_tokens)}",
            "content-type": "application/json"
        }
        requested_model = data.get("model", DEFAULT_MODEL)
        # Only the two known models are allowed; anything else falls back
        # to the default model.
        model_to_use = ALTERNATE_MODEL if requested_model == ALTERNATE_MODEL else DEFAULT_MODEL
        log_basic_info(f"Requested model: {requested_model}, Using model: {model_to_use}")
        payload = {
            "messages": data.get("messages", []),
            "model": model_to_use,
            "temperature": FIXED_TEMPERATURE,
            "top_p": FIXED_TOP_P,
            "max_tokens": FIXED_MAX_TOKENS
        }
        if DEBUG_MODE:
            print("Request Payload:", json.dumps(payload, indent=4))
            print("Request Headers:", headers)
        async with aiohttp.ClientSession() as session:
            async with session.post(FIXED_URL, headers=headers, json=payload) as resp:
                response_text = await resp.text()
                response_json = json.loads(response_text)
                total_tokens = response_json.get('usage', {}).get('total_tokens', 'N/A')
                total_time = response_json.get('time_info', {}).get('total_time', 'N/A')
                # total_time may be the 'N/A' string sentinel; formatting a
                # str with :.3f raises ValueError, so only format numbers.
                time_str = (f"{total_time:.3f} seconds"
                            if isinstance(total_time, (int, float)) else str(total_time))
                log_basic_info(f"Path: {FIXED_URL}, Status Code: {resp.status}, Total Tokens Used: {total_tokens}, Total Time: {time_str}")
                return response_text
    except Exception as e:
        log_basic_info(f"Exception occurred: {str(e)}")
        # Re-raise instead of implicitly returning None: the old behavior
        # made the caller crash later in json.loads(None) with a confusing
        # TypeError rather than reporting the real upstream failure.
        raise
# Main handler function
async def handle_request(request):
    """aiohttp handler: validate auth, relay the JSON body upstream, return JSON.

    Clients may supply several comma-separated keys in one header
    (``Authorization: Bearer k1,k2``); token selection for key rotation
    happens in ``send_request``.

    Returns:
        ``web.json_response`` with the upstream JSON, or an error object
        with status 400 (missing token), 502 (no upstream response), or
        500 (unexpected failure).
    """
    try:
        request_data = await request.json()
        authorization_header = request.headers.get('Authorization', '')
        # "Bearer a,b" -> ["a", "b"]. Empty fragments are dropped because
        # "".split(",") == [""] is truthy, which used to let a missing or
        # blank header slip past the check below.
        auth_tokens = [tok.strip()
                       for tok in authorization_header.replace('Bearer ', '').split(',')
                       if tok.strip()]
        if not auth_tokens:
            return web.json_response({"error": "Missing Authorization token"}, status=400)
        log_basic_info(f"Received request for path: {request.path}")
        if DEBUG_MODE:
            print("Received Request Data:", json.dumps(request_data, indent=4))
            print("Received Headers:", dict(request.headers))
        response_text = await send_request(auth_tokens, request_data)
        # send_request may yield no body on upstream failure; report a
        # gateway error instead of crashing in json.loads(None).
        if response_text is None:
            return web.json_response({"error": "Upstream request failed"}, status=502)
        return web.json_response(json.loads(response_text))
    except Exception as e:
        log_basic_info(f"Exception occurred in handling request: {str(e)}")
        return web.json_response({"error": str(e)}, status=500)
# Application wiring: expose the single OpenAI-compatible endpoint.
app = web.Application()
app.router.add_post('/hf/v1/chat/completions', handle_request)

if __name__ == '__main__':
    # Listen on all interfaces; port 7860 matches the original configuration.
    web.run_app(app, host='0.0.0.0', port=7860)