EnzGamers committed
Commit c06bfc1 · verified · 1 Parent(s): a9f3836

Update app.py

Files changed (1)
  1. app.py +102 -91
app.py CHANGED
@@ -3,122 +3,133 @@ from fastapi.responses import StreamingResponse
  from pydantic import BaseModel, Extra
  import torch
  from transformers import AutoModelForCausalLM, AutoTokenizer
- import time, uuid, json, asyncio, requests
- from bs4 import BeautifulSoup
- from typing import Optional, List, Union
- import re
-
- # ==============================================================================
- # === AGENT CONFIGURATION BLOCK ===
- # ==============================================================================
-
- MODEL_CONFIG = {
-     # The model identifier from the Hugging Face Hub.
-     "MODEL_ID": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
-
-     # Generation parameters
-     "MAX_TOKENS_ANSWER": 1500,
-     "TEMPERATURE": 0.2,
- }
-
- # --- NEW AGENT BRAIN (SYSTEM PROMPT TEMPLATE) ---
- # Build a template with a {user_instructions} placeholder
- SYSTEM_PROMPT_TEMPLATE = """
- This work will be evaluated by a committee of the smartest thinkers to determine if you deserve to be classified as a reliable model. Be worthy of winning and bring out your best, now and in the future,Understanding the intent of user_input & respond in user language only.
-
- The user's specific instructions for this task are provided below. You must follow them precisely.
- ---
- USER INSTRUCTIONS:
- {user_instructions}
- ---
- """
-
- # ==============================================================================
- # === END OF CONFIGURATION BLOCK - DO NOT MODIFY BELOW ===
- # ==============================================================================
-
- # --- Configuration & Model Loading ---
+ import time
+ import uuid
+ import json
+ from typing import Optional, List, Union, Dict, Any
+
+ # --- Configuration ---
+ MODEL_ID = "Qwen/Qwen2.5-Coder-0.5B-Instruct"
  DEVICE = "cpu"
- print(f"Loading model: {MODEL_CONFIG['MODEL_ID']}")
- model = AutoModelForCausalLM.from_pretrained(MODEL_CONFIG['MODEL_ID'], torch_dtype=torch.bfloat16, device_map=DEVICE)
- tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG['MODEL_ID'], padding_side='left')
- tokenizer.pad_token = tokenizer.eos_token
- print("Model and tokenizer loaded successfully.")

+ # --- Model loading ---
+ print(f"Loading model: {MODEL_ID}")
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_ID,
+     torch_dtype=torch.bfloat16,
+     device_map=DEVICE
+ )
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ print("Model and tokenizer loaded successfully on the CPU.")
+
+ # --- Create the API application ---
  app = FastAPI()

- # --- Pydantic Models ---
- class ContentPart(BaseModel): type: str; text: str
- class ChatMessage(BaseModel): role: str; content: Union[str, List[ContentPart]]
+ # --- Data models to accept the extension's complex payload structure ---
+ class ContentPart(BaseModel):
+     type: str
+     text: str
+
+ class ChatMessage(BaseModel):
+     role: str
+     content: Union[str, List[ContentPart]]
+
  class ChatCompletionRequest(BaseModel):
      model: Optional[str] = None
      messages: List[ChatMessage]
      stream: Optional[bool] = False
-     class Config: extra = Extra.ignore
- class ModelData(BaseModel): id: str; object: str = "model"; owned_by: str = "user"
- class ModelList(BaseModel): object: str = "list"; data: List[ModelData]
+
+     class Config:
+         extra = Extra.ignore
+
+ class ModelData(BaseModel):
+     id: str
+     object: str = "model"
+     owned_by: str = "user"
+
+ class ModelList(BaseModel):
+     object: str = "list"
+     data: List[ModelData]
+
+ # --- API definitions ---

- # --- API Endpoints ---
  @app.get("/models", response_model=ModelList)
  async def list_models():
-     return ModelList(data=[ModelData(id=MODEL_CONFIG['MODEL_ID'])])
+     """Responds to GET /models to satisfy the extension."""
+     return ModelList(data=[ModelData(id=MODEL_ID)])

  @app.post("/chat/completions")
  async def create_chat_completion(request: ChatCompletionRequest):
+     """Main endpoint that handles streamed text generation."""
+
+     # Extract the user's prompt from the complex message structure
      user_prompt = ""
      last_message = request.messages[-1]
      if isinstance(last_message.content, list):
          for part in last_message.content:
-             if part.type == 'text': user_prompt += part.text + "\n"
-     elif isinstance(last_message.content, str): user_prompt = last_message.content
-
-     if not user_prompt: return {"error": "Prompt not found."}
-
-     async def stream_direct_response():
+             if part.type == 'text':
+                 user_prompt += part.text + "\n"
+     elif isinstance(last_message.content, str):
+         user_prompt = last_message.content
+
+     if not user_prompt:
+         return {"error": "Prompt not found."}
+
+     # Prepare the inputs for the model
+     messages_for_model = [{'role': 'user', 'content': user_prompt}]
+     inputs = tokenizer.apply_chat_template(messages_for_model, add_generation_prompt=True, return_tensors="pt").to(DEVICE)
+
+     # Generate the full response
+     outputs = model.generate(inputs, max_new_tokens=250, do_sample=True, temperature=0.2, top_k=50, top_p=0.95, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
+     response_text = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
+
+     # Generator function for the streaming response
+     async def stream_generator():
          response_id = f"chatcmpl-{uuid.uuid4()}"
-
-         def stream_chunk(content: str):
-             chunk = {"id": response_id, "object": "chat.completion.chunk", "created": int(time.time()), "model": MODEL_CONFIG['MODEL_ID'], "choices": [{"index": 0, "delta": {"content": content}, "finish_reason": None}]}
-             return f"data: {json.dumps(chunk)}\n\n"
-
-         # --- DYNAMIC LOGIC ---
-         # 1. Inject the user's input into the system prompt template
-         final_system_prompt = SYSTEM_PROMPT_TEMPLATE.format(user_instructions=user_prompt)
-
-         # 2. Build the message for the model. The 'user' role becomes a simple trigger.
-         messages = [
-             {'role': 'system', 'content': final_system_prompt},
-             {'role': 'user', 'content': "Based on the detailed instructions I provided in the system prompt, generate the required response."}
-         ]

-         # Prepare the data for the model
-         formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-         inputs = tokenizer(formatted_prompt, return_tensors="pt", padding=True).to(DEVICE)
-
-         # Generate the response
-         outputs = model.generate(
-             **inputs,
-             max_new_tokens=MODEL_CONFIG['MAX_TOKENS_ANSWER'],
-             do_sample=True,
-             temperature=MODEL_CONFIG['TEMPERATURE'],
-             top_k=50,
-             top_p=0.95,
-             eos_token_id=tokenizer.eos_token_id
-         )
-         response_text = tokenizer.decode(outputs[0][len(inputs['input_ids'][0]):], skip_special_tokens=True)
-
-         # Stream the final response
+         # Send the response character by character, in the expected format
          for char in response_text:
-             yield stream_chunk(char)
-             await asyncio.sleep(0.005)
+             chunk = {
+                 "id": response_id,
+                 "object": "chat.completion.chunk",
+                 "created": int(time.time()),
+                 "model": MODEL_ID,
+                 "choices": [{
+                     "index": 0,
+                     "delta": {"content": char},
+                     "finish_reason": None
+                 }]
+             }
+             yield f"data: {json.dumps(chunk)}\n\n"
+             await asyncio.sleep(0.01)  # Small pause to simulate a stream

-         # --- End of stream ---
-         final_chunk = {"id": response_id, "object": "chat.completion.chunk", "created": int(time.time()), "model": MODEL_CONFIG['MODEL_ID'], "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]}
+         # Send the final end-of-stream chunk
+         final_chunk = {
+             "id": response_id,
+             "object": "chat.completion.chunk",
+             "created": int(time.time()),
+             "model": MODEL_ID,
+             "choices": [{
+                 "index": 0,
+                 "delta": {},
+                 "finish_reason": "stop"
+             }]
+         }
          yield f"data: {json.dumps(final_chunk)}\n\n"
+
+         # Send the [DONE] signal
          yield "data: [DONE]\n\n"

-     return StreamingResponse(stream_direct_response(), media_type="text/event-stream")
+     # If the extension requests a stream, return the generator
+     if request.stream:
+         return StreamingResponse(stream_generator(), media_type="text/event-stream")
+     else:
+         # Fallback if streaming is not requested (unlikely)
+         return {"choices": [{"message": {"role": "assistant", "content": response_text}}]}

  @app.get("/")
  def root():
-     return {"status": "Dynamic Context Agent is online", "model_id": MODEL_CONFIG['MODEL_ID']}
+     return {"status": "OpenAI-compatible API online (with streaming)", "model_id": MODEL_ID}
+
+ # asyncio is needed for the pause in the stream
+ import asyncio
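
For reference, here is a minimal client sketch (not part of the commit) showing how the streamed /chat/completions endpoint added above can be consumed. The base URL http://localhost:8000 and the example prompt are assumptions; adjust them to wherever app.py is actually served.

# Minimal SSE client sketch for the endpoint defined in app.py.
# Assumption: the FastAPI app is served at http://localhost:8000.
import json
import requests

payload = {
    "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
    "stream": True,
    "messages": [{"role": "user", "content": "Write a Python one-liner that reverses a string."}],
}

with requests.post("http://localhost:8000/chat/completions", json=payload, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        # Each event arrives as a line of the form "data: {...json...}"
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break  # end-of-stream marker sent by the server
        chunk = json.loads(data)
        delta = chunk["choices"][0]["delta"]
        print(delta.get("content", ""), end="", flush=True)
print()

Because the server generates the full completion before streaming it character by character, the client sees an initial delay followed by a rapid stream of single-character deltas.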