File size: 7,933 Bytes
43cc365
 
 
 
2d70789
 
 
048628f
01e79df
8a98314
 
 
 
 
 
 
 
 
 
 
 
 
 
048628f
43cc365
 
 
d9ba98f
191a9f9
43cc365
 
 
a49d7b2
43cc365
8a98314
191a9f9
 
 
 
 
 
 
 
8a98314
191a9f9
8a98314
191a9f9
 
43cc365
8a98314
43cc365
 
a49d7b2
 
 
191a9f9
 
8a98314
 
a49d7b2
 
191a9f9
 
 
280099a
191a9f9
01e79df
 
191a9f9
8a98314
191a9f9
8a98314
191a9f9
8a98314
191a9f9
 
280099a
01e79df
 
 
8a98314
01e79df
 
280099a
 
01e79df
280099a
01e79df
 
280099a
01e79df
43cc365
 
8a98314
280099a
43cc365
2d70789
 
191a9f9
8a98314
2d70789
 
a49d7b2
 
 
191a9f9
 
a49d7b2
 
 
 
 
 
 
 
 
191a9f9
 
 
 
 
 
 
a49d7b2
2d70789
 
ab6809d
8afce56
191a9f9
8a98314
191a9f9
a49d7b2
191a9f9
8ac0dd3
a49d7b2
191a9f9
 
 
a49d7b2
8ac0dd3
 
191a9f9
 
8ac0dd3
191a9f9
ab6809d
191a9f9
 
a49d7b2
ab6809d
43cc365
 
191a9f9
280099a
 
191a9f9
8a98314
43cc365
a49d7b2
314bed8
191a9f9
8a98314
191a9f9
1fb027f
280099a
b03785b
630d5e2
 
 
 
 
 
 
b03785b
 
280099a
 
 
 
 
191a9f9
280099a
 
01e79df
8a98314
 
191a9f9
 
 
280099a
6c97f34
191a9f9
 
 
 
8a98314
191a9f9
 
280099a
191a9f9
 
8a98314
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
from fastapi import FastAPI, Request
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import multiprocessing
import time
import tiktoken # For estimating token count
import logging # Import the logging module

# === Configure Logging ===
# Module-level logger, named after this module; records below INFO are dropped.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Console handler using a "time - logger name - level - message" layout.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)

# Attach the handler only when the logger has none yet, so log lines are not
# emitted twice if this logger was already configured.
if not logger.handlers:
    logger.addHandler(handler)

# FastAPI application instance; the route decorators further down register
# their handlers on this object.
app = FastAPI()

# === Model Config ===
# Hugging Face repository the model file is downloaded from.
REPO_ID = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
# Q4_K_M is a good balance of size and quality.
FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
# Local directory for the weights, and the full path of the file inside it.
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# === Download if model not available ===
# Reuse the file at MODEL_PATH when it already exists; otherwise fetch it from
# the Hugging Face Hub. Either way `model_path` ends up holding the path that
# is later handed to the model loader.
if os.path.exists(MODEL_PATH):
    logger.info(f"✅ Model already available at: {MODEL_PATH}")
    model_path = MODEL_PATH
else:
    logger.info(f"⬇️ Downloading {FILENAME} from Hugging Face...")
    try:
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            cache_dir=MODEL_DIR,
            local_dir=MODEL_DIR,
            local_dir_use_symlinks=False
        )
    except Exception as e:
        # Include the traceback: the bare message is often not enough to tell
        # a network failure from an auth or disk problem.
        logger.error(f"❌ Error downloading model: {e}", exc_info=True)
        # The service cannot work without the weights, so stop with a non-zero
        # status. `raise SystemExit` is used instead of the `exit()` helper,
        # which is injected by the `site` module for interactive use and is
        # not guaranteed to exist in every runtime.
        raise SystemExit(1) from e
    logger.info(f"✅ Model downloaded to: {model_path}")

# === Optimal thread usage ===
# Use one inference thread per physical core.
logical_cores = psutil.cpu_count(logical=True)
physical_cores = psutil.cpu_count(logical=False)
# psutil returns None when a core count cannot be determined (seen in some
# containers/VMs); `max(1, None)` would raise TypeError, so fall back to the
# logical count and finally to a single thread.
recommended_threads = max(1, physical_cores or logical_cores or 1)

logger.info(f"Detected physical cores: {physical_cores}, logical cores: {logical_cores}")
logger.info(f"Using n_threads: {recommended_threads}")

# === Load the model ===
# Build the single, module-wide Llama instance used by the request handlers.
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=1024,  # Context window size for the model (still needed, but not fully utilized for history)
        n_threads=recommended_threads,
        use_mlock=True,  # Lock model in RAM for faster access
        n_gpu_layers=0,  # CPU only
        # NOTE(review): confirm "chatml" matches the prompt template this
        # TinyLlama chat build was trained with; the /generate handler stops
        # on "</s>", which is not a ChatML delimiter.
        chat_format="chatml",
        verbose=False # Keep llama.cpp's internal verbose logging off
    )
    # The original message started with U+FFFD (a mangled emoji); use the same
    # success marker as the other log lines in this file.
    logger.info("✅ Llama model loaded successfully!")
except Exception as e:
    logger.error(f"❌ Error loading Llama model: {e}", exc_info=True)
    # No model means no service: stop with a non-zero status. `raise SystemExit`
    # replaces the `exit()` helper, which the `site` module provides for
    # interactive sessions and is not guaranteed to exist in every runtime.
    raise SystemExit(1) from e

# Initialize tiktoken encoder for token counting.
# `encoding` stays None when the encoder cannot be loaded.
encoding = None
try:
    encoding = tiktoken.get_encoding("cl100k_base")
except Exception:
    logger.warning("⚠️ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")

def count_tokens_in_text(text):
    """Return an estimated token count for ``text``.

    Uses the module-level tiktoken ``encoding`` when one is loaded; otherwise
    approximates with one token per four characters.
    """
    if not encoding:
        # No encoder available: rough estimate, 1 token ~ 4 characters.
        return len(text) // 4
    token_ids = encoding.encode(text)
    return len(token_ids)

@app.get("/")
def root():
    """Liveness endpoint: logs the access and returns a fixed status message."""
    logger.info("Root endpoint accessed.")
    status_message = "✅ Data Analysis AI API is live and optimized for speed (no context retention)!"
    return {"message": status_message}

@app.get("/get_sys")
def get_sys_specs():
    """Return system specifications: CPU, RAM, OS and model settings.

    Returns:
        dict with four sections: "CPU", "RAM", "System" and "Model_Config".

    Note:
        ``psutil.cpu_percent(interval=1)`` blocks for one second, so this
        endpoint takes at least that long to answer.
    """
    logger.info("System specs endpoint accessed.")
    memory = psutil.virtual_memory()
    # Query once; psutil returns None where the frequency cannot be read.
    cpu_freq = psutil.cpu_freq()

    # The original code called llm.n_threads() and llm.use_mlock(). In
    # llama-cpp-python, n_ctx() is a method, but n_threads appears to be a
    # plain attribute and use_mlock to live on llm.model_params, so those
    # calls would fail. Read them defensively so the endpoint keeps working
    # whichever form the installed version exposes.
    n_threads = getattr(llm, "n_threads", recommended_threads)
    if callable(n_threads):
        n_threads = n_threads()
    model_params = getattr(llm, "model_params", None)
    use_mlock = getattr(model_params, "use_mlock", None)  # None if not exposed

    return {
        "CPU": {
            "physical_cores": physical_cores,
            "logical_cores": logical_cores,
            "max_freq_mhz": cpu_freq.max if cpu_freq else "N/A",
            "cpu_usage_percent": psutil.cpu_percent(interval=1) # CPU usage over 1 second
        },
        "RAM": {
            "total_GB": round(memory.total / (1024 ** 3), 2),
            "available_GB": round(memory.available / (1024 ** 3), 2),
            "usage_percent": memory.percent
        },
        "System": {
            "platform": platform.platform(),
            "architecture": platform.machine(),
            "python_version": platform.python_version()
        },
        "Model_Config": {
            "model_name": FILENAME,
            "n_ctx": llm.n_ctx(),
            "n_threads": n_threads,
            "use_mlock": use_mlock
        }
    }

@app.get("/process_list")
def process_list():
    """Return the processes currently using significant CPU or memory.

    A process is listed when it used more than 5% CPU during a one-second
    sampling window or holds more than 2% of memory.

    Returns:
        dict: ``{"heavy_processes": [...]}`` where each entry has "pid",
        "name", "cpu_percent" and "memory_percent", sorted by CPU usage,
        highest first.
    """
    logger.info("Process list endpoint accessed.")
    unavailable = (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess)

    # Process.cpu_percent(interval=None) reports usage since the previous call
    # on the same Process object, and the first reading is a meaningless 0.0.
    # So: take a baseline reading for every process, wait, then read again.
    # (The original slept *before* sampling and read twice back-to-back, which
    # measured a near-zero interval.)
    sampled = []
    for proc in psutil.process_iter():
        try:
            proc.cpu_percent(interval=None)
        except unavailable:
            continue
        sampled.append(proc)

    time.sleep(1)  # Sampling window for the CPU measurement

    processes = []
    for proc in sampled:
        try:
            cpu = proc.cpu_percent(interval=None)
            mem = proc.memory_percent()
            # Filter processes using more than 5% CPU or 2% memory
            if cpu > 5 or mem > 2:
                processes.append({
                    "pid": proc.pid,
                    "name": proc.name(),
                    "cpu_percent": round(cpu, 2),
                    "memory_percent": round(mem, 2)
                })
        except unavailable:
            # Process exited or became inaccessible during the window; skip it.
            continue

    # Sort by CPU usage descending
    processes.sort(key=lambda entry: entry['cpu_percent'], reverse=True)
    return {"heavy_processes": processes}

# System prompt sent with every request. It must be ONE string: the original
# code built it with trailing commas, which made it a tuple of fragments that
# was then passed as the message content.
_SYSTEM_PROMPT = " ".join((
    "You are a helpful AI assistant for data analysis.",
    "Provide concise and actionable suggestions based on the data provided or questions asked.",
    "Focus on data insights and actionable steps for report generation.",
    "Be concise and professional in your responses.",
    "Avoid unnecessary verbosity and focus on key insights.",
    "Ensure your responses are clear and directly address the questions asked.",
    "Alway follow the instructions provided in the prompt and respond within instructed word limits.",
))


def _error_response(message, status_code):
    """Build a JSON ``{"error": message}`` reply carrying a real HTTP status."""
    # Imported here so this block is self-contained; fastapi is already a
    # dependency of this file.
    from fastapi.responses import JSONResponse
    return JSONResponse(status_code=status_code, content={"error": message})


@app.post("/generate")
async def generate(request: Request):
    """
    Generates a response from the LLM without retaining chat context.
    Expects a JSON body with 'prompt'.

    Returns:
        On success, a dict with "response" (generated text) and
        "prompt_tokens" (estimated token count of the prompt).
        On failure, a JSON body ``{"error": ...}`` with status 400 for a bad
        request or 500 when generation fails.
    """
    logger.info("➡️ /generate endpoint received a request.") # Log at the very beginning

    # request.json() raises a ValueError subclass on a malformed body.
    try:
        data = await request.json()
    except ValueError:
        logger.warning("Invalid JSON body in /generate request.")
        return _error_response("Request body must be valid JSON", 400)

    # Valid JSON is not necessarily an object, and 'prompt' not necessarily text.
    if not isinstance(data, dict):
        logger.warning("JSON body of /generate request is not an object.")
        return _error_response("Request body must be a JSON object with a 'prompt' field", 400)
    prompt = data.get("prompt", "")
    if not isinstance(prompt, str):
        logger.warning("Non-string prompt in /generate request.")
        return _error_response("Prompt must be a string", 400)
    prompt = prompt.strip()

    if not prompt:
        logger.warning("Prompt cannot be empty in /generate request.")
        # FastAPI does not treat a `(body, status)` tuple as a status code; it
        # would be serialized as a JSON array with HTTP 200.
        return _error_response("Prompt cannot be empty", 400)

    # Construct messages for the current request only
    messages_for_llm = [
        {"role": "system", "content": _SYSTEM_PROMPT},
        {"role": "user", "content": prompt}
    ]

    # Calculate tokens in the user's prompt
    prompt_tokens = count_tokens_in_text(prompt)

    logger.info(f"🧾 Prompt received: {prompt}")
    logger.info(f"Tokens in prompt: {prompt_tokens}")

    try:
        # NOTE(review): this call is synchronous, so the event loop is blocked
        # while the model generates. Left unchanged here; moving it to a worker
        # thread would need a decision on concurrent access to `llm`.
        response = llm.create_chat_completion(
            messages=messages_for_llm,
            max_tokens=300,  # Keep response length short for maximum speed
            temperature=0.7, # Adjust temperature for creativity vs. coherence (0.0-1.0)
            stop=["</s>"] # Stop sequence for TinyLlama Chat
        )
        ai_response_content = response["choices"][0]["message"]["content"].strip()
    except Exception as e:
        logger.error(f"❌ Error during generation: {e}", exc_info=True) # Log exception details
        return _error_response(f"Failed to generate response: {e}. Please try again.", 500)

    logger.info("✅ Response generated successfully.")
    return {
        "response": ai_response_content,
        "prompt_tokens": prompt_tokens # Return tokens in the prompt
    }