Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Chat turn: ensure the model is loaded, echo the user prompt, then stream a
# generated reply while tracking token / latency / cost metrics.
#
# NOTE(review): this is a mid-file chunk — `model_type`, `selected_model`,
# `hf_token`, `prompt`, `uploaded_file`, `USER_AVATAR`, `BOT_AVATAR`,
# `load_model`, `process_file` and `generate_with_kv_cache` are defined
# earlier in the file. Emoji below were reconstructed from mojibake in the
# scraped source — confirm against the original app.py.
# ---------------------------------------------------------------------------

# Hypothetical pricing model, hoisted so the rates live in one place.
USD_PER_MILLION_INPUT_TOKENS = 5    # $5 per million input tokens
USD_PER_MILLION_OUTPUT_TOKENS = 15  # $15 per million output tokens
USD_TO_AOA_RATE = 1160              # Angolan Kwanza — TODO confirm rate source

# (Re)load the model when none is cached or the user switched model types.
if "model" not in st.session_state or st.session_state.get("model_type") != model_type:
    model_data = load_model(hf_token, model_type, selected_model)
    if model_data is None:
        st.error("Failed to load model. Please check your token and try again.")
        st.stop()

    st.session_state.model, st.session_state.tokenizer = model_data
    st.session_state.model_type = model_type

model = st.session_state.model
tokenizer = st.session_state.tokenizer

# Echo the user message and record it in the conversation history.
with st.chat_message("user", avatar=USER_AVATAR):
    st.markdown(prompt)
st.session_state.messages.append({"role": "user", "content": prompt})

# Extract text context from the (optional) uploaded file.
file_context = process_file(uploaded_file)

# Generate the assistant response with KV caching.
if model and tokenizer:
    try:
        with st.chat_message("assistant", avatar=BOT_AVATAR):
            start_time = time.time()
            streamer = generate_with_kv_cache(
                prompt, file_context, model, tokenizer, use_cache=True
            )

            response_container = st.empty()
            full_response = ""

            for chunk in streamer:
                # Drop chain-of-thought markers but keep the chunk's own
                # whitespace. FIX: the original `.strip()`-ed every chunk and
                # re-joined with single spaces, destroying newlines and
                # intra-word spacing in the streamed output.
                full_response += chunk.replace("<think>", "").replace("</think>", "")
                # "▌" is the streaming-cursor glyph; plain markdown suffices.
                # FIX: the original passed unsafe_allow_html=True, which lets
                # untrusted model output inject raw HTML into the page.
                response_container.markdown(full_response + "▌")

            # Performance metrics.
            end_time = time.time()
            # FIX: guard against division by zero on a (near-)instant reply.
            elapsed = max(end_time - start_time, 1e-9)
            input_tokens = len(tokenizer(prompt)["input_ids"])
            output_tokens = len(tokenizer(full_response)["input_ids"])
            speed = output_tokens / elapsed

            # Hypothetical cost estimate (see constants above).
            input_cost = (input_tokens / 1_000_000) * USD_PER_MILLION_INPUT_TOKENS
            output_cost = (output_tokens / 1_000_000) * USD_PER_MILLION_OUTPUT_TOKENS
            total_cost_usd = input_cost + output_cost
            total_cost_aoa = total_cost_usd * USD_TO_AOA_RATE

            # Display metrics.
            st.caption(
                f"📊 Input Tokens: {input_tokens} | Output Tokens: {output_tokens} | "
                f"🚀 Speed: {speed:.1f}t/s | 💰 Cost (USD): ${total_cost_usd:.4f} | "
                f"💵 Cost (AOA): {total_cost_aoa:.4f}"
            )

            # Replace the streaming cursor with the final text and persist it.
            response_container.markdown(full_response)
            st.session_state.messages.append(
                {"role": "assistant", "content": full_response}
            )

    except Exception as e:  # surface generation failures to the user
        st.error(f"⚡ Generation error: {str(e)}")
else:
    st.error("🤖 Model not loaded!")