bpHigh committed
Commit cd40e1b · 1 Parent(s): fc0d268

Revamp stuff: call deployed Modal functions by name, drop the bundled Devstral client, bump modal to 1.0.3

Files changed (3)
  1. app.py +33 -53
  2. modal/devstral_inference.py +0 -367
  3. requirements.txt +1 -1
app.py CHANGED
@@ -7,27 +7,21 @@ from utils.google_genai_llm import get_response, generate_with_gemini
 from utils.utils import parse_json_codefences
 from prompts.requirements_gathering import requirements_gathering_system_prompt
 from prompts.planning import hf_query_gen_prompt, hf_context_gen_prompt
-from utils.huggingface_mcp_llamaindex import get_hf_tools, call_hf_tool, diagnose_connection_advanced, get_hf_tools_robust,call_hf_tool_robust
 from prompts.devstral_coding_prompt import devstral_code_gen_sys_prompt, devstral_code_gen_user_prompt
 from dotenv import load_dotenv
 import os
 import asyncio
 load_dotenv()
 
-# Import Modal inference function
-import sys
-sys.path.append(os.path.join(os.path.dirname(__file__), 'modal'))
 try:
-    from modal import App
+    import modal
     # Import the Modal inference function and app from separate file
     import subprocess
-    from devstral_inference import run_devstral_inference, app as devstral_app
     MODAL_AVAILABLE = True
 
 except ImportError:
     MODAL_AVAILABLE = False
-    devstral_app = None
-    print("Warning: Modal not available. Code generation will be disabled.")
+    print("Warning: Modal not available. Code generation will be disabled. MCP Server will be disabled.")
 
 from PIL import Image
 import tempfile
@@ -44,14 +38,6 @@ except ImportError:
     MARKER_AVAILABLE = False
     print("Warning: Marker library not available. PDF, PPT, and DOCX processing will be limited.")
 
-# Load environment variables
-MODAL_API_URL = os.getenv("MODAL_API_URL")
-BEARER_TOKEN = os.getenv("BEARER_TOKEN")
-CODING_MODEL = os.getenv("CODING_MODEL")
-
-MCP_TOKEN = os.getenv("MCP_TOKEN")
-if not MCP_TOKEN:
-    print("Please set MCP_TOKEN")
 
 def get_file_hash(file_path):
     """Generate a hash of the file for caching purposes"""
@@ -248,20 +234,13 @@ async def generate_plan(history, file_cache):
         if ai_msg:
             conversation_history += f"Assistant: {ai_msg}\n"
 
-    print("Running advanced connection diagnostics...")
-    diagnostics = await diagnose_connection_advanced(MCP_TOKEN)
-    print(f"Diagnostics: {json.dumps(diagnostics, indent=2)}")
-
-    if not diagnostics["tests"]["basic_connection"]:
-        print("Basic connection failed - check token and network")
-
-
-
-    # try:
-    hf_query_gen_tool_details = await get_hf_tools_robust(hf_token=MCP_TOKEN)
-    # except Exception as e:
-    #     hf_query_gen_tool_details = """meta=None nextCursor=None tools=[Tool(name='hf_whoami', description="Hugging Face tools are being used by authenticated user 'bpHigh'", inputSchema={'type': 'object', 'properties': {}, 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Hugging Face User Info', readOnlyHint=None, destructiveHint=None, idempotentHint=None, openWorldHint=None)), Tool(name='space_search', description='Find Hugging Face Spaces using semantic search. Include links to the Space when presenting the results.', inputSchema={'type': 'object', 'properties': {'query': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'Semantic Search Query'}, 'limit': {'type': 'number', 'default': 10, 'description': 'Number of results to return'}, 'mcp': {'type': 'boolean', 'default': False, 'description': 'Only return MCP Server enabled Spaces'}}, 'required': ['query'], 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Hugging Face Space Search', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=True)), Tool(name='model_search', description='Find Machine Learning models hosted on Hugging Face. Returns comprehensive information about matching models including downloads, likes, tags, and direct links. Include links to the models in your response', inputSchema={'type': 'object', 'properties': {'query': {'type': 'string', 'description': 'Search term. Leave blank and specify "sort" and "limit" to get e.g. "Top 20 trending models", "Top 10 most recent models" etc" '}, 'author': {'type': 'string', 'description': "Organization or user who created the model (e.g., 'google', 'meta-llama', 'microsoft')"}, 'task': {'type': 'string', 'description': "Model task type (e.g., 'text-generation', 'image-classification', 'translation')"}, 'library': {'type': 'string', 'description': "Framework the model uses (e.g., 'transformers', 'diffusers', 'timm')"}, 'sort': {'type': 'string', 'enum': ['trendingScore', 'downloads', 'likes', 'createdAt', 'lastModified'], 'description': 'Sort order: trendingScore, downloads , likes, createdAt, lastModified'}, 'limit': {'type': 'number', 'minimum': 1, 'maximum': 100, 'default': 20, 'description': 'Maximum number of results to return'}}, 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Model Search', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=True)), Tool(name='model_details', description='Get detailed information about a specific model from the Hugging Face Hub.', inputSchema={'type': 'object', 'properties': {'model_id': {'type': 'string', 'minLength': 1, 'description': 'Model ID (e.g., microsoft/DialoGPT-large)'}}, 'required': ['model_id'], 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Model Details', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=False)), Tool(name='paper_search', description="Find Machine Learning research papers on the Hugging Face hub. Include 'Link to paper' When presenting the results. Consider whether tabulating results matches user intent.", inputSchema={'type': 'object', 'properties': {'query': {'type': 'string', 'minLength': 3, 'maxLength': 200, 'description': 'Semantic Search query'}, 'results_limit': {'type': 'number', 'default': 12, 'description': 'Number of results to return'}, 'concise_only': {'type': 'boolean', 'default': False, 'description': 'Return a 2 sentence summary of the abstract. Use for broad search terms which may return a lot of results. Check with User if unsure.'}}, 'required': ['query'], 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Paper Search', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=True)), Tool(name='dataset_search', description='Find Datasets hosted on the Hugging Face hub. Returns comprehensive information about matching datasets including downloads, likes, tags, and direct links. Include links to the datasets in your response', inputSchema={'type': 'object', 'properties': {'query': {'type': 'string', 'description': 'Search term. Leave blank and specify "sort" and "limit" to get e.g. "Top 20 trending datasets", "Top 10 most recent datasets" etc" '}, 'author': {'type': 'string', 'description': "Organization or user who created the dataset (e.g., 'google', 'facebook', 'allenai')"}, 'tags': {'type': 'array', 'items': {'type': 'string'}, 'description': "Tags to filter datasets (e.g., ['language:en', 'size_categories:1M<n<10M', 'task_categories:text-classification'])"}, 'sort': {'type': 'string', 'enum': ['trendingScore', 'downloads', 'likes', 'createdAt', 'lastModified'], 'description': 'Sort order: trendingScore, downloads, likes, createdAt, lastModified'}, 'limit': {'type': 'number', 'minimum': 1, 'maximum': 100, 'default': 20, 'description': 'Maximum number of results to return'}}, 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Dataset Search', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=True)), Tool(name='dataset_details', description='Get detailed information about a specific dataset on Hugging Face Hub.', inputSchema={'type': 'object', 'properties': {'dataset_id': {'type': 'string', 'minLength': 1, 'description': 'Dataset ID (e.g., squad, glue, imdb)'}}, 'required': ['dataset_id'], 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Dataset Details', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=False)), Tool(name='gr1_evalstate_flux1_schnell', description='Generate an image using the Flux 1 Schnell Image Generator. (from evalstate/flux1_schnell)', inputSchema={'type': 'object', 'properties': {'prompt': {'type': 'string'}, 'seed': {'type': 'number', 'description': 'numeric value between 0 and 2147483647'}, 'randomize_seed': {'type': 'boolean', 'default': True}, 'width': {'type': 'number', 'description': 'numeric value between 256 and 2048', 'default': 1024}, 'height': {'type': 'number', 'description': 'numeric value between 256 and 2048', 'default': 1024}, 'num_inference_steps': {'type': 'number', 'description': 'numeric value between 1 and 50', 'default': 4}}, 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='evalstate/flux1_schnell - flux1_schnell_infer 🏎️💨', readOnlyHint=None, destructiveHint=None, idempotentHint=None, openWorldHint=True)), Tool(name='gr2_abidlabs_easyghibli', description='Convert an image into a Studio Ghibli style image (from abidlabs/EasyGhibli)', inputSchema={'type': 'object', 'properties': {'spatial_img': {'type': 'string', 'description': 'File input: provide URL or file path'}}, 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='abidlabs/EasyGhibli - abidlabs_EasyGhiblisingle_condition_generate_image 🦀', readOnlyHint=None, destructiveHint=None, idempotentHint=None, openWorldHint=True)), Tool(name='gr3_linoyts_framepack_f1', description='FramePack_F1_end_process tool from linoyts/FramePack-F1', inputSchema={'type': 'object', 'properties': {}, 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='linoyts/FramePack-F1 - FramePack_F1_end_process 📹⚡️', readOnlyHint=None, destructiveHint=None, idempotentHint=None, openWorldHint=True))]"""
-    #     print(str(e))
+    try:
+        mcp_tool_func = modal.Function.from_name("HuggingFace-MCP", "connect_and_get_tools")
+        hf_query_gen_tool_details = mcp_tool_func.remote()
+        print(hf_query_gen_tool_details)
+    except Exception as e:
+        hf_query_gen_tool_details = """meta=None nextCursor=None tools=[Tool(name='hf_whoami', description="Hugging Face tools are being used by authenticated user 'bpHigh'", inputSchema={'type': 'object', 'properties': {}, 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Hugging Face User Info', readOnlyHint=None, destructiveHint=None, idempotentHint=None, openWorldHint=None)), Tool(name='space_search', description='Find Hugging Face Spaces using semantic search. Include links to the Space when presenting the results.', inputSchema={'type': 'object', 'properties': {'query': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'Semantic Search Query'}, 'limit': {'type': 'number', 'default': 10, 'description': 'Number of results to return'}, 'mcp': {'type': 'boolean', 'default': False, 'description': 'Only return MCP Server enabled Spaces'}}, 'required': ['query'], 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Hugging Face Space Search', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=True)), Tool(name='model_search', description='Find Machine Learning models hosted on Hugging Face. Returns comprehensive information about matching models including downloads, likes, tags, and direct links. Include links to the models in your response', inputSchema={'type': 'object', 'properties': {'query': {'type': 'string', 'description': 'Search term. Leave blank and specify "sort" and "limit" to get e.g. "Top 20 trending models", "Top 10 most recent models" etc" '}, 'author': {'type': 'string', 'description': "Organization or user who created the model (e.g., 'google', 'meta-llama', 'microsoft')"}, 'task': {'type': 'string', 'description': "Model task type (e.g., 'text-generation', 'image-classification', 'translation')"}, 'library': {'type': 'string', 'description': "Framework the model uses (e.g., 'transformers', 'diffusers', 'timm')"}, 'sort': {'type': 'string', 'enum': ['trendingScore', 'downloads', 'likes', 'createdAt', 'lastModified'], 'description': 'Sort order: trendingScore, downloads , likes, createdAt, lastModified'}, 'limit': {'type': 'number', 'minimum': 1, 'maximum': 100, 'default': 20, 'description': 'Maximum number of results to return'}}, 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Model Search', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=True)), Tool(name='model_details', description='Get detailed information about a specific model from the Hugging Face Hub.', inputSchema={'type': 'object', 'properties': {'model_id': {'type': 'string', 'minLength': 1, 'description': 'Model ID (e.g., microsoft/DialoGPT-large)'}}, 'required': ['model_id'], 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Model Details', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=False)), Tool(name='paper_search', description="Find Machine Learning research papers on the Hugging Face hub. Include 'Link to paper' When presenting the results. Consider whether tabulating results matches user intent.", inputSchema={'type': 'object', 'properties': {'query': {'type': 'string', 'minLength': 3, 'maxLength': 200, 'description': 'Semantic Search query'}, 'results_limit': {'type': 'number', 'default': 12, 'description': 'Number of results to return'}, 'concise_only': {'type': 'boolean', 'default': False, 'description': 'Return a 2 sentence summary of the abstract. Use for broad search terms which may return a lot of results. Check with User if unsure.'}}, 'required': ['query'], 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Paper Search', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=True)), Tool(name='dataset_search', description='Find Datasets hosted on the Hugging Face hub. Returns comprehensive information about matching datasets including downloads, likes, tags, and direct links. Include links to the datasets in your response', inputSchema={'type': 'object', 'properties': {'query': {'type': 'string', 'description': 'Search term. Leave blank and specify "sort" and "limit" to get e.g. "Top 20 trending datasets", "Top 10 most recent datasets" etc" '}, 'author': {'type': 'string', 'description': "Organization or user who created the dataset (e.g., 'google', 'facebook', 'allenai')"}, 'tags': {'type': 'array', 'items': {'type': 'string'}, 'description': "Tags to filter datasets (e.g., ['language:en', 'size_categories:1M<n<10M', 'task_categories:text-classification'])"}, 'sort': {'type': 'string', 'enum': ['trendingScore', 'downloads', 'likes', 'createdAt', 'lastModified'], 'description': 'Sort order: trendingScore, downloads, likes, createdAt, lastModified'}, 'limit': {'type': 'number', 'minimum': 1, 'maximum': 100, 'default': 20, 'description': 'Maximum number of results to return'}}, 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Dataset Search', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=True)), Tool(name='dataset_details', description='Get detailed information about a specific dataset on Hugging Face Hub.', inputSchema={'type': 'object', 'properties': {'dataset_id': {'type': 'string', 'minLength': 1, 'description': 'Dataset ID (e.g., squad, glue, imdb)'}}, 'required': ['dataset_id'], 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='Dataset Details', readOnlyHint=True, destructiveHint=False, idempotentHint=None, openWorldHint=False)), Tool(name='gr1_evalstate_flux1_schnell', description='Generate an image using the Flux 1 Schnell Image Generator. (from evalstate/flux1_schnell)', inputSchema={'type': 'object', 'properties': {'prompt': {'type': 'string'}, 'seed': {'type': 'number', 'description': 'numeric value between 0 and 2147483647'}, 'randomize_seed': {'type': 'boolean', 'default': True}, 'width': {'type': 'number', 'description': 'numeric value between 256 and 2048', 'default': 1024}, 'height': {'type': 'number', 'description': 'numeric value between 256 and 2048', 'default': 1024}, 'num_inference_steps': {'type': 'number', 'description': 'numeric value between 1 and 50', 'default': 4}}, 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='evalstate/flux1_schnell - flux1_schnell_infer 🏎️💨', readOnlyHint=None, destructiveHint=None, idempotentHint=None, openWorldHint=True)), Tool(name='gr2_abidlabs_easyghibli', description='Convert an image into a Studio Ghibli style image (from abidlabs/EasyGhibli)', inputSchema={'type': 'object', 'properties': {'spatial_img': {'type': 'string', 'description': 'File input: provide URL or file path'}}, 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='abidlabs/EasyGhibli - abidlabs_EasyGhiblisingle_condition_generate_image 🦀', readOnlyHint=None, destructiveHint=None, idempotentHint=None, openWorldHint=True)), Tool(name='gr3_linoyts_framepack_f1', description='FramePack_F1_end_process tool from linoyts/FramePack-F1', inputSchema={'type': 'object', 'properties': {}, 'additionalProperties': False, '$schema': 'http://json-schema.org/draft-07/schema#'}, annotations=ToolAnnotations(title='linoyts/FramePack-F1 - FramePack_F1_end_process 📹⚡️', readOnlyHint=None, destructiveHint=None, idempotentHint=None, openWorldHint=True))]"""
+        print(str(e))
     # Format the prompt
     formatted_prompt = hf_query_gen_prompt.format(
         Tool_Details=hf_query_gen_tool_details
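Note: the added lookup assumes a separately deployed Modal app named "HuggingFace-MCP" exposing a connect_and_get_tools function; that deployment is not part of this commit. A minimal sketch of what the deployed side would need to look like for the hunk above to work (names taken from the diff, body hypothetical):

import modal

app = modal.App("HuggingFace-MCP")

@app.function()
def connect_and_get_tools():
    # Hypothetical body: connect to the Hugging Face MCP server and
    # return the serialized tool list that app.py formats into its prompt.
    ...

Once modal deploy has published that app, modal.Function.from_name("HuggingFace-MCP", "connect_and_get_tools") resolves the function by name at call time, so app.py no longer needs to import a helper module directly.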
@@ -271,11 +250,15 @@ async def generate_plan(history, file_cache):
 
     # Parse the plan
     parsed_plan = parse_json_codefences(plan)
-
+    print(parsed_plan)
     # Call tool to get tool calls
     try:
-        tool_calls = await asyncio.gather(*[call_hf_tool_robust(MCP_TOKEN, step['tool'], step['args']) for step in parsed_plan])
+        mcp_call_tool_func = modal.Function.from_name(app_name="HuggingFace-MCP", name="call_tool")
+        tool_calls = []
+        async for tool_call in mcp_call_tool_func.starmap.aio([(tool['tool'], tool['args']) for tool in parsed_plan]):
+            tool_calls.append(tool_call)
     except Exception as e:
+        print(str(e))
         tool_calls = []
     print(tool_calls)
     if tool_calls!=[]:
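For the fan-out above: Function.starmap maps an iterable of argument tuples over the deployed function, unpacking each tuple into positional arguments, and the .aio variant is its async form. A minimal sketch under the same "HuggingFace-MCP" deployment assumption:

import modal

async def run_plan(parsed_plan):
    # Each plan step is a dict like {"tool": ..., "args": ...}; starmap
    # unpacks each (tool, args) tuple into call_tool's two parameters.
    call_tool = modal.Function.from_name("HuggingFace-MCP", "call_tool")
    results = []
    async for result in call_tool.starmap.aio(
        [(step["tool"], step["args"]) for step in parsed_plan]
    ):
        results.append(result)
    return results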
@@ -344,30 +327,27 @@ def generate_code_with_devstral(plan_text, history, file_cache):
 
     # Use Modal app.run() pattern like in the examples
 
-
+    base_url = os.getenv("DEVSTRAL_BASE_URL")
+    api_key = os.getenv("DEVSTRAL_API_KEY")
     print(f"🚀 Generating code using Devstral...")
     print(f"📡 Connecting to: {base_url}")
 
-    return ""
-
-    # # Call Modal inference using the proper app.run() context
-    # with devstral_app.run():
-    #     result = run_devstral_inference.remote(
-    #         base_url=base_url,
-    #         api_key=api_key,
-    #         prompts=[formatted_user_prompt],
-    #         system_prompt=devstral_code_gen_sys_prompt,
-    #         mode="single"
-    #     )
-
-    #     if result and "response" in result:
-    #         code_output = result["response"]
-    #         return f"🚀 **Generated Code:**\n\n{code_output}"
-    #     else:
-    #         return "❌ **Error:** No response received from Devstral model."
-
-    # except Exception as e:
-    #     return f"❌ **Error:** {str(e)}"
+    try:
+        devstral_inference_func = modal.Function.from_name("devstral-inference-client", "run_devstral_inference")
+        result = devstral_inference_func.remote(
+            base_url=base_url,
+            api_key=api_key,
+            prompts=[formatted_user_prompt],
+            system_prompt=devstral_code_gen_sys_prompt,
+            mode="single"
+        )
+        if result and "response" in result:
+            code_output = result["response"]
+            return f"🚀 **Generated Code:**\n\n{code_output}"
+        else:
+            return "❌ **Error:** No response received from Devstral model."
+    except Exception as e:
+        return f"❌ **Error:** {str(e)}"
 
 # Custom CSS for a sleek design
 custom_css = """
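The rewritten generate_code_with_devstral reads its endpoint from the environment (picked up by load_dotenv at the top of app.py, via os.getenv) instead of the module-level constants deleted in the second hunk. A sketch of the expected configuration, with placeholder values:

import os
from dotenv import load_dotenv

load_dotenv()  # reads DEVSTRAL_BASE_URL / DEVSTRAL_API_KEY from a local .env

# Placeholder default; the real value points at the deployed vLLM endpoint.
base_url = os.getenv("DEVSTRAL_BASE_URL", "https://<your-deployment>.modal.run")
api_key = os.getenv("DEVSTRAL_API_KEY")
if not api_key:
    raise RuntimeError("Set DEVSTRAL_API_KEY before generating code")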
 
modal/devstral_inference.py DELETED
@@ -1,367 +0,0 @@
-# Optimized Devstral Inference Client
-# Connects to deployed model without restarting server for each request
-# Implements Modal best practices for lowest latency
-
-import modal
-import asyncio
-import time
-from typing import List, Dict, Any, Optional, AsyncGenerator
-import json
-
-# Connect to the deployed app
-app = modal.App("devstral-inference-client")
-
-# Image with OpenAI client for making requests
-client_image = modal.Image.debian_slim(python_version="3.12").pip_install(
-    "openai>=1.76.0",
-    "aiohttp>=3.9.0",
-    "asyncio-throttle>=1.0.0"
-)
-
-class DevstralClient:
-    """Optimized client for Devstral model with persistent connections and caching"""
-
-    def __init__(self, base_url: str, api_key: str):
-        self.base_url = base_url
-        self.api_key = api_key
-        self._session = None
-        self._response_cache = {}
-        self._conversation_cache = {}
-
-    async def __aenter__(self):
-        """Async context manager entry - create persistent HTTP session"""
-        import aiohttp
-        connector = aiohttp.TCPConnector(
-            limit=100,  # Connection pool size
-            keepalive_timeout=300,  # Keep connections alive
-            enable_cleanup_closed=True
-        )
-        self._session = aiohttp.ClientSession(
-            connector=connector,
-            timeout=aiohttp.ClientTimeout(total=120)
-        )
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        """Clean up session on exit"""
-        if self._session:
-            await self._session.close()
-
-    def _get_cache_key(self, messages: List[Dict], **kwargs) -> str:
-        """Generate cache key for deterministic requests"""
-        key_data = {
-            "messages": json.dumps(messages, sort_keys=True),
-            "temperature": kwargs.get("temperature", 0.1),
-            "max_tokens": kwargs.get("max_tokens", 500),
-            "top_p": kwargs.get("top_p", 0.95)
-        }
-        return hash(json.dumps(key_data, sort_keys=True))
-
-    async def generate_response(
-        self,
-        prompt: str,
-        system_prompt: Optional[str] = None,
-        temperature: float = 0.1,
-        max_tokens: int = 10000,
-        stream: bool = False,
-        use_cache: bool = True
-    ) -> str:
-        """Generate response from Devstral model with optimizations"""
-
-        # Build messages
-        messages = []
-        if system_prompt:
-            messages.append({"role": "system", "content": system_prompt})
-        messages.append({"role": "user", "content": prompt})
-
-        # Check cache for deterministic requests
-        if use_cache and temperature == 0.0:
-            cache_key = self._get_cache_key(messages, temperature=temperature, max_tokens=max_tokens)
-            if cache_key in self._response_cache:
-                print("📎 Cache hit - returning cached response")
-                return self._response_cache[cache_key]
-
-        # Prepare request payload
-        payload = {
-            "model": "mistralai/Devstral-Small-2505",
-            "messages": messages,
-            "temperature": temperature,
-            "max_tokens": max_tokens,
-            # "top_p": 0.95,
-            "stream": stream
-        }
-
-        headers = {
-            "Authorization": f"Bearer {self.api_key}",
-            "Content-Type": "application/json"
-        }
-
-        start_time = time.perf_counter()
-
-        if stream:
-            return await self._stream_response(payload, headers)
-        else:
-            return await self._complete_response(payload, headers, use_cache, start_time)
-
-    async def _complete_response(self, payload: Dict, headers: Dict, use_cache: bool, start_time: float) -> str:
-        """Handle complete (non-streaming) response"""
-        async with self._session.post(
-            f"{self.base_url}/v1/chat/completions",
-            json=payload,
-            headers=headers
-        ) as response:
-            if response.status != 200:
-                error_text = await response.text()
-                raise Exception(f"API Error {response.status}: {error_text}")
-
-            result = await response.json()
-            latency = (time.perf_counter() - start_time) * 1000
-
-            generated_text = result["choices"][0]["message"]["content"]
-
-            # Cache deterministic responses
-            if use_cache and payload["temperature"] == 0.0:
-                cache_key = self._get_cache_key(payload["messages"], **payload)
-                self._response_cache[cache_key] = generated_text
-
-            print(f"⚡ Response generated in {latency:.2f}ms")
-            return generated_text
-
-    async def _stream_response(self, payload: Dict, headers: Dict) -> AsyncGenerator[str, None]:
-        """Handle streaming response"""
-        payload["stream"] = True
-
-        async with self._session.post(
-            f"{self.base_url}/v1/chat/completions",
-            json=payload,
-            headers=headers
-        ) as response:
-            if response.status != 200:
-                error_text = await response.text()
-                raise Exception(f"API Error {response.status}: {error_text}")
-
-            buffer = ""
-            async for chunk in response.content.iter_chunks():
-                if chunk[0]:
-                    buffer += chunk[0].decode()
-                    while "\n" in buffer:
-                        line, buffer = buffer.split("\n", 1)
-                        if line.startswith("data: "):
-                            data = line[6:]
-                            if data == "[DONE]":
-                                return
-                            try:
-                                json_data = json.loads(data)
-                                if "choices" in json_data and json_data["choices"]:
-                                    delta = json_data["choices"][0].get("delta", {})
-                                    if "content" in delta:
-                                        yield delta["content"]
-                            except json.JSONDecodeError:
-                                continue
-
-    async def batch_generate(
-        self,
-        prompts: List[str],
-        system_prompt: Optional[str] = None,
-        temperature: float = 0.1,
-        max_tokens: int = 500,
-        max_concurrent: int = 5
-    ) -> List[str]:
-        """Generate responses for multiple prompts with concurrency control"""
-        from asyncio_throttle import Throttler
-
-        # Throttle requests to avoid overwhelming the server
-        throttler = Throttler(rate_limit=max_concurrent, period=1.0)
-
-        async def generate_single(prompt: str) -> str:
-            async with throttler:
-                return await self.generate_response(
-                    prompt=prompt,
-                    system_prompt=system_prompt,
-                    temperature=temperature,
-                    max_tokens=max_tokens
-                )
-
-        # Execute all requests concurrently
-        tasks = [generate_single(prompt) for prompt in prompts]
-        results = await asyncio.gather(*tasks, return_exceptions=True)
-
-        # Handle any exceptions
-        processed_results = []
-        for result in results:
-            if isinstance(result, Exception):
-                processed_results.append(f"Error: {str(result)}")
-            else:
-                processed_results.append(result)
-
-        return processed_results
-
-@app.function(
-    image=client_image,
-    timeout=600,  # 10 minutes
-)
-async def run_devstral_inference(
-    base_url: str,
-    api_key: str,
-    prompts: List[str],
-    system_prompt: Optional[str] = None,
-    mode: str = "single"  # "single", "batch", "stream"
-):
-    """Main function to run optimized Devstral inference"""
-
-    async with DevstralClient(base_url, api_key) as client:
-
-        if mode == "single":
-            # Single prompt inference
-            if len(prompts) > 0:
-                response = await client.generate_response(
-                    prompt=prompts[0],
-                    system_prompt=system_prompt,
-                    temperature=0.1,
-                    max_tokens=10000
-                )
-                return {"response": response}
-
-        elif mode == "batch":
-            # Batch inference for multiple prompts
-            responses = await client.batch_generate(
-                prompts=prompts,
-                system_prompt=system_prompt,
-                temperature=0.1,
-                max_tokens=10000,
-                max_concurrent=5
-            )
-            return {"responses": responses}
-
-        elif mode == "stream":
-            # Streaming inference
-            if len(prompts) > 0:
-                full_response = ""
-                async for chunk in client.generate_response(
-                    prompt=prompts[0],
-                    system_prompt=system_prompt,
-                    temperature=0.1,
-                    stream=True
-                ):
-                    full_response += chunk
-                    print(chunk, end="", flush=True)
-                print()  # New line after streaming
-                return {"response": full_response}
-
-        return {"error": "No prompts provided"}
-
-# Convenient wrapper functions for different use cases
-@app.function(image=client_image)
-async def code_generation(prompt: str, base_url: str, api_key: str) -> str:
-    """Optimized for code generation tasks"""
-    system_prompt = """You are an expert software engineer. Generate clean, efficient, and well-documented code.
-    Focus on best practices, performance, and maintainability. Include brief explanations for complex logic."""
-
-    async with DevstralClient(base_url, api_key) as client:
-        return await client.generate_response(
-            prompt=prompt,
-            system_prompt=system_prompt,
-            temperature=0.0,  # Deterministic for code
-            max_tokens=10000,
-            use_cache=True  # Cache code responses
-        )
-
-@app.function(image=client_image)
-async def chat_response(prompt: str, base_url: str, api_key: str) -> str:
-    """Optimized for conversational responses"""
-    system_prompt = """You are a helpful, knowledgeable AI assistant. Provide clear, concise, and accurate responses.
-    Be conversational but professional."""
-
-    async with DevstralClient(base_url, api_key) as client:
-        return await client.generate_response(
-            prompt=prompt,
-            system_prompt=system_prompt,
-            temperature=0.3,  # Slightly creative
-            max_tokens=10000
-        )
-
-@app.function(image=client_image)
-async def document_analysis(prompt: str, base_url: str, api_key: str) -> str:
-    """Optimized for document analysis and summarization"""
-    system_prompt = """You are an expert document analyst. Provide thorough, structured analysis with key insights,
-    summaries, and actionable recommendations. Use clear formatting and bullet points."""
-
-    async with DevstralClient(base_url, api_key) as client:
-        return await client.generate_response(
-            prompt=prompt,
-            system_prompt=system_prompt,
-            temperature=0.1,  # Factual and consistent
-            max_tokens=800
-        )
-
-# Local test client for development
-@app.local_entrypoint()
-def main(
-    base_url: str = "https://abhinav-bhatnagar--devstral-vllm-deployment-serve.modal.run",
-    api_key: str = "ak-zMwhIPjqvBj30jbm1DmKqx",
-    mode: str = "single"
-):
-    """Test the optimized Devstral inference client"""
-
-    test_prompts = [
-        "Write a Python function to calculate the Fibonacci sequence using memoization.",
-        "Explain the difference between REST and GraphQL APIs.",
-        "What are the key benefits of using Docker containers?",
-        "How does machine learning differ from traditional programming?",
-        "Write a SQL query to find the top 5 customers by total order value."
-    ]
-
-    print(f"🚀 Testing Devstral inference in {mode} mode...")
-    print(f"📡 Connecting to: {base_url}")
-
-    if mode == "single":
-        # Test single inference
-        result = run_devstral_inference.remote(
-            base_url=base_url,
-            api_key=api_key,
-            prompts=[test_prompts[0]],
-            system_prompt="You are a helpful coding assistant.",
-            mode="single"
-        )
-        print("✅ Single inference result:")
-        print(result["response"])
-
-    elif mode == "batch":
-        # Test batch inference
-        result = run_devstral_inference.remote(
-            base_url=base_url,
-            api_key=api_key,
-            prompts=test_prompts[:3],  # Test with 3 prompts
-            system_prompt="You are a knowledgeable AI assistant.",
-            mode="batch"
-        )
-        print("✅ Batch inference results:")
-        for i, response in enumerate(result["responses"]):
-            print(f"\nPrompt {i+1}: {test_prompts[i]}")
-            print(f"Response: {response}")
-
-    elif mode == "specialized":
-        # Test specialized functions
-        print("\n📝 Testing Code Generation:")
-        code_result = code_generation.remote(
-            prompt="Create a Python class for a binary search tree with insert, search, and delete methods.",
-            base_url=base_url,
-            api_key=api_key
-        )
-        print(code_result)
-
-        print("\n💬 Testing Chat Response:")
-        chat_result = chat_response.remote(
-            prompt="What's the best way to learn machine learning for beginners?",
-            base_url=base_url,
-            api_key=api_key
-        )
-        print(chat_result)
-
-    print("\n🎉 Testing completed!")
-
-if __name__ == "__main__":
-    # This allows running the client locally for testing
-    import sys
-    mode = sys.argv[1] if len(sys.argv) > 1 else "single"
-    main(mode=mode)
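With this module deleted, callers are expected to reach the client through the separately deployed "devstral-inference-client" Modal app rather than a local import. A sketch mirroring the new app.py call path (endpoint and key are placeholders):

import modal

run_devstral = modal.Function.from_name("devstral-inference-client", "run_devstral_inference")
result = run_devstral.remote(
    base_url="https://<deployment>.modal.run",  # placeholder
    api_key="<api-key>",                        # placeholder
    prompts=["Write a hello-world script in Python."],
    system_prompt="You are a helpful coding assistant.",
    mode="single",
)
print(result.get("response"))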
requirements.txt CHANGED
@@ -5,4 +5,4 @@ python-dotenv==1.0.1
 openpyxl==3.1.5
 Pillow==10.4.0
 marker-pdf==1.7.4
-modal==0.75.6
+modal==1.0.3
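The pin moves from the 0.x client to the 1.x line, matching the modal.Function.from_name / .remote / .starmap.aio call sites introduced in app.py. A quick startup sanity check, assuming the package exposes __version__ in the usual way:

import modal

# Fail fast if an old 0.x modal client is still installed.
assert modal.__version__.startswith("1."), f"modal 1.x required, found {modal.__version__}"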