Update app.py
app.py CHANGED
@@ -7,6 +7,20 @@ import psutil
 import multiprocessing
 import time
 import tiktoken # For estimating token count
+import logging # Import the logging module
+
+# === Configure Logging ===
+# Get the root logger
+logger = logging.getLogger(__name__)
+# Set the logging level (e.g., INFO, DEBUG, WARNING, ERROR, CRITICAL)
+logger.setLevel(logging.INFO)
+# Create a console handler and set its format
+handler = logging.StreamHandler()
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+handler.setFormatter(formatter)
+# Add the handler to the logger if it's not already added
+if not logger.handlers:
+    logger.addHandler(handler)
 
 app = FastAPI()
 
@@ -18,7 +32,7 @@ MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
 
 # === Download if model not available ===
 if not os.path.exists(MODEL_PATH):
-
+    logger.info(f"⬇️ Downloading {FILENAME} from Hugging Face...")
     try:
         model_path = hf_hub_download(
             repo_id=REPO_ID,
@@ -27,13 +41,13 @@ if not os.path.exists(MODEL_PATH):
             local_dir=MODEL_DIR,
             local_dir_use_symlinks=False
         )
-
+        logger.info(f"✅ Model downloaded to: {model_path}")
     except Exception as e:
-
+        logger.error(f"❌ Error downloading model: {e}")
         # Exit or handle error appropriately if model download fails
         exit(1)
 else:
-
+    logger.info(f"✅ Model already available at: {MODEL_PATH}")
     model_path = MODEL_PATH
 
 # === Optimal thread usage ===
@@ -41,8 +55,8 @@ logical_cores = psutil.cpu_count(logical=True)
 physical_cores = psutil.cpu_count(logical=False)
 recommended_threads = max(1, physical_cores) # Ensure at least 1 thread
 
-
-
+logger.info(f"Detected physical cores: {physical_cores}, logical cores: {logical_cores}")
+logger.info(f"Using n_threads: {recommended_threads}")
 
 # === Load the model ===
 try:
@@ -53,18 +67,18 @@ try:
         use_mlock=True, # Lock model in RAM for faster access
         n_gpu_layers=0, # CPU only
         chat_format="chatml", # TinyLlama Chat uses ChatML format
-        verbose=False
+        verbose=False # Keep llama.cpp's internal verbose logging off
     )
-
+    logger.info("✅ Llama model loaded successfully!")
 except Exception as e:
-
+    logger.error(f"❌ Error loading Llama model: {e}")
     exit(1)
 
 # Initialize tiktoken encoder for token counting
 try:
     encoding = tiktoken.get_encoding("cl100k_base")
 except Exception:
-
+    logger.warning("⚠️ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
     encoding = None
 
 def count_tokens_in_text(text):
@@ -75,14 +89,15 @@ def count_tokens_in_text(text):
     # Fallback for when tiktoken isn't available or for simple estimation
     return len(text) // 4 # Rough estimate: 1 token ~ 4 characters
 
-
 @app.get("/")
 def root():
+    logger.info("Root endpoint accessed.")
     return {"message": "✅ Data Analysis AI API is live and optimized for speed (no context retention)!"}
 
 @app.get("/get_sys")
 def get_sys_specs():
     """Returns system specifications including CPU, RAM, and OS details."""
+    logger.info("System specs endpoint accessed.")
     memory = psutil.virtual_memory()
     return {
         "CPU": {
@@ -112,6 +127,7 @@ def get_sys_specs():
 @app.get("/process_list")
 def process_list():
     """Returns a list of processes consuming significant CPU."""
+    logger.info("Process list endpoint accessed.")
     time.sleep(1) # Let CPU settle for accurate measurement
     processes = []
     for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
@@ -138,10 +154,12 @@ async def generate(request: Request):
     Generates a response from the LLM without retaining chat context.
     Expects a JSON body with 'prompt'.
     """
+    logger.info("➡️ /generate endpoint received a request.") # Log at the very beginning
    data = await request.json()
     prompt = data.get("prompt", "").strip()
 
     if not prompt:
+        logger.warning("Prompt cannot be empty in /generate request.")
         return {"error": "Prompt cannot be empty"}, 400
 
     # Define the system prompt - sent with every request
@@ -160,8 +178,8 @@ async def generate(request: Request):
     # Calculate tokens in the user's prompt
     prompt_tokens = count_tokens_in_text(prompt)
 
-
-
+    logger.info(f"🧾 Prompt received: {prompt}")
+    logger.info(f"Tokens in prompt: {prompt_tokens}")
 
     try:
         response = llm.create_chat_completion(
@@ -170,13 +188,12 @@ async def generate(request: Request):
             temperature=0.7, # Adjust temperature for creativity vs. coherence (0.0-1.0)
             stop=["</s>"] # Stop sequence for TinyLlama Chat
         )
-
         ai_response_content = response["choices"][0]["message"]["content"].strip()
-
+        logger.info("✅ Response generated successfully.")
        return {
             "response": ai_response_content,
             "prompt_tokens": prompt_tokens # Return tokens in the prompt
         }
     except Exception as e:
-
-        return {"error": f"Failed to generate response: {e}. Please try again."}, 500
+        logger.error(f"❌ Error during generation: {e}", exc_info=True) # Log exception details
+        return {"error": f"Failed to generate response: {e}. Please try again."}, 500
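The commit wires a StreamHandler and formatter onto the module logger by hand and guards the attachment with "if not logger.handlers:", so repeated imports (for example under uvicorn's auto-reload) do not add duplicate handlers and double-print every record. For comparison only, the same console output could be produced with the standard-library shortcut below; this is a sketch of an alternative, not what the commit does:

import logging

# Sketch of an equivalent, more compact setup using logging.basicConfig
# (the commit itself attaches a StreamHandler manually, as shown in the diff above).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
logger.info("Logging configured.")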
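A minimal sketch of exercising the updated endpoints once this app.py is running, to see the new log lines appear on the server console. The module name app, the port 7860, and the requests client library are assumptions for illustration, not part of the commit:

# Start the API first (assumed invocation):
#   uvicorn app:app --host 0.0.0.0 --port 7860
import requests  # assumed client-side dependency; not used by app.py itself

BASE_URL = "http://localhost:7860"  # assumption: local run on port 7860

# A normal request; per the diff, the server should log the request arrival,
# the prompt, its token count, and "✅ Response generated successfully."
resp = requests.post(
    f"{BASE_URL}/generate",
    json={"prompt": "Summarize the main trends in this quarter's sales data."},
)
print(resp.json())

# An empty prompt exercises the new logger.warning path and the error payload.
print(requests.post(f"{BASE_URL}/generate", json={"prompt": ""}).json())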