amiguel committed on
Commit
037c4ae
Β·
verified Β·
1 Parent(s): 4543832

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -0
app.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Load model if not already loaded or if model type changed
2
+ if "model" not in st.session_state or st.session_state.get("model_type") != model_type:
3
+ model_data = load_model(hf_token, model_type, selected_model)
4
+ if model_data is None:
5
+ st.error("Failed to load model. Please check your token and try again.")
6
+ st.stop()
7
+
8
+ st.session_state.model, st.session_state.tokenizer = model_data
9
+ st.session_state.model_type = model_type
10
+
11
+ model = st.session_state.model
12
+ tokenizer = st.session_state.tokenizer
13
+
14
+ # Add user message
15
+ with st.chat_message("user", avatar=USER_AVATAR):
16
+ st.markdown(prompt)
17
+ st.session_state.messages.append({"role": "user", "content": prompt})
18
+
19
+ # Process file
20
+ file_context = process_file(uploaded_file)
21
+
22
+ # Generate response with KV caching
23
+ if model and tokenizer:
24
+ try:
25
+ with st.chat_message("assistant", avatar=BOT_AVATAR):
26
+ start_time = time.time()
27
+ streamer = generate_with_kv_cache(prompt, file_context, model, tokenizer, use_cache=True)
28
+
29
+ response_container = st.empty()
30
+ full_response = ""
31
+
32
+ for chunk in streamer:
33
+ cleaned_chunk = chunk.replace("<think>", "").replace("</think>", "").strip()
34
+ full_response += cleaned_chunk + " "
35
+ response_container.markdown(full_response + "β–Œ", unsafe_allow_html=True)
36
+
37
+ # Calculate performance metrics
38
+ end_time = time.time()
39
+ input_tokens = len(tokenizer(prompt)["input_ids"])
40
+ output_tokens = len(tokenizer(full_response)["input_ids"])
41
+ speed = output_tokens / (end_time - start_time)
42
+
43
+ # Calculate costs (hypothetical pricing model)
44
+ input_cost = (input_tokens / 1000000) * 5 # $5 per million input tokens
45
+ output_cost = (output_tokens / 1000000) * 15 # $15 per million output tokens
46
+ total_cost_usd = input_cost + output_cost
47
+ total_cost_aoa = total_cost_usd * 1160 # Convert to AOA (Angolan Kwanza)
48
+
49
+ # Display metrics
50
+ st.caption(
51
+ f"πŸ”‘ Input Tokens: {input_tokens} | Output Tokens: {output_tokens} | "
52
+ f"πŸ•’ Speed: {speed:.1f}t/s | πŸ’° Cost (USD): ${total_cost_usd:.4f} | "
53
+ f"πŸ’΅ Cost (AOA): {total_cost_aoa:.4f}"
54
+ )
55
+
56
+ response_container.markdown(full_response)
57
+ st.session_state.messages.append({"role": "assistant", "content": full_response})
58
+
59
+ except Exception as e:
60
+ st.error(f"⚑ Generation error: {str(e)}")
61
+ else:
62
+ st.error("πŸ€– Model not loaded!")