gaur3009 committed on
Commit 768d1ad · verified · 1 Parent(s): 4543af6

Update llm.py

Files changed (1)
  1. llm.py +26 -59
llm.py CHANGED
@@ -1,74 +1,41 @@
- from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
- import time

- # Option 1: Ultra-fast CPU model (best for response speed)
- def generate_answer_fast(context, question):
-     """
-     Uses DistilGPT-2 (smallest version) with optimizations
-     ~2-5 seconds on CPU for 60 tokens
-     """
-     tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
-     model = AutoModelForCausalLM.from_pretrained("distilgpt2")
-
-     prompt = f"""You are a helpful AI assistant. Using the context, answer the question conversationally.
-
  Context:
- {context[:1500]}

  Question: {question}
  Answer:"""

-     inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)

-     # Fast generation parameters for CPU
      outputs = model.generate(
          inputs.input_ids,
-         max_new_tokens=80,
-         num_beams=1,  # Disable beam search (faster)
-         do_sample=False,  # Disable sampling (faster)
          pad_token_id=tokenizer.eos_token_id,
-         temperature=0.7,
-         top_k=20,
          early_stopping=True
      )

      # Extract only the new text
      full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-     return full_text.split("Answer:")[-1].strip()
-
- # Option 2: Better quality CPU model (balance between speed & quality)
- def generate_answer_quality(context, question):
-     """
-     Uses DialoGPT-small - conversational but still CPU-friendly
-     ~5-10 seconds on CPU for 80 tokens
-     """
-     qa_pipeline = pipeline(
-         "text-generation",
-         model="microsoft/DialoGPT-small",
-         tokenizer="microsoft/DialoGPT-small"
-     )
-
-     prompt = f"""Context: {context[:1200]}
- Question: {question}
- Assistant:"""
-
-     response = qa_pipeline(
-         prompt,
-         max_new_tokens=80,
-         num_beams=1,
-         temperature=0.8,
-         top_k=30,
-         do_sample=True,
-         pad_token_id=50256,  # DialoGPT pad token
-         no_repeat_ngram_size=2
-     )
-
-     return response[0]['generated_text'].split("Assistant:")[-1].strip()
-
- # Choose one based on priority
- def generate_answer(context, question):
-     start_time = time.time()
-     result = generate_answer_fast(context, question)  # For fastest response
-     # result = generate_answer_quality(context, question)  # For better conversation
-     print(f"Generation took: {time.time() - start_time:.2f}s")
-     return result
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ # Load CPU-optimized model
+ tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+ model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+
+ def generate_answer(context, question, max_new_tokens=100):
+     """Generate answer with CPU optimizations"""
+     # Create concise prompt
+     prompt = f"""Based on the context, answer the question conversationally.

  Context:
+ {context[:1000]}

  Question: {question}
  Answer:"""

+     # Tokenize with truncation
+     inputs = tokenizer(
+         prompt,
+         return_tensors="pt",
+         max_length=512,
+         truncation=True
+     )

+     # Generate with CPU-optimized settings
      outputs = model.generate(
          inputs.input_ids,
+         max_new_tokens=max_new_tokens,
+         num_beams=1,  # Faster than beam search
+         do_sample=True,  # More natural responses
+         temperature=0.7,  # Balance creativity/focus
+         top_k=40,  # Focus on likely tokens
+         top_p=0.9,  # Nucleus sampling
          pad_token_id=tokenizer.eos_token_id,
          early_stopping=True
      )

      # Extract only the new text
      full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return full_text.split("Answer:")[-1].strip()
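
For reference, a minimal usage sketch of the updated generate_answer follows. It is not part of the commit; the import path (llm) and the sample context/question strings are assumptions for illustration.

    from llm import generate_answer  # assumes the updated llm.py above is importable

    # Hypothetical retrieved passage and user question
    context = "Text of the retrieved document goes here."
    question = "What does the document describe?"

    answer = generate_answer(context, question, max_new_tokens=80)
    print(answer)

Because the new settings use do_sample=True, output varies between calls; calling transformers.set_seed() beforehand makes runs reproducible if needed.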