Vinitha2004 committed on
Commit b2270c7 · verified · 1 Parent(s): 0e39f49

Upload folder using huggingface_hub

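For context, a commit like this one is normally produced by the upload_folder helper in huggingface_hub, as the commit message says. The sketch below is a minimal, hypothetical reconstruction of that call; the repo_id and local folder path are placeholders, not values taken from this commit.

# Hypothetical sketch of the upload_folder call behind a commit like this one.
# repo_id and folder_path are placeholders, not taken from the commit itself.
from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` / HF_TOKEN
api.upload_folder(
    folder_path="./gguf_export",        # local folder holding the .gguf files and fast_inference.py
    repo_id="Vinitha2004/your-model",   # placeholder repository id
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)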
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model_f16.gguf filter=lfs diff=lfs merge=lfs -text
+model_q4_0.gguf filter=lfs diff=lfs merge=lfs -text
+model_q5_0.gguf filter=lfs diff=lfs merge=lfs -text
+model_q8_0.gguf filter=lfs diff=lfs merge=lfs -text
fast_inference.py ADDED
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+"""
+Optimized inference script for GGUF models
+Supports llama-cpp-python for maximum speed
+"""
+
+import argparse
+import time
+from pathlib import Path
+import multiprocessing
+
+try:
+    from llama_cpp import Llama
+    LLAMA_CPP_AVAILABLE = True
+except ImportError:
+    LLAMA_CPP_AVAILABLE = False
+    print("llama-cpp-python not available.")
+    print("Install with: pip install llama-cpp-python")
+
+class FastInference:
+    """Optimized inference class for GGUF models"""
+
+    def __init__(self, model_path: str, n_ctx: int = 4096, n_threads: int = -1):
+        self.model_path = model_path
+
+        if not LLAMA_CPP_AVAILABLE:
+            raise ImportError("llama-cpp-python required for GGUF inference")
+
+        # Use all CPU threads if not specified
+        if n_threads == -1:
+            n_threads = multiprocessing.cpu_count()
+
+        # Initialize model with optimized settings
+        self.model = Llama(
+            model_path=model_path,
+            n_ctx=n_ctx,
+            n_threads=n_threads,
+            n_batch=512,  # Batch size for prompt processing
+            n_gpu_layers=-1 if self._has_gpu() else 0,  # Use GPU if available
+            use_mmap=True,  # Memory-mapped files
+            use_mlock=True,  # Lock memory
+            verbose=False
+        )
+
+        print(f"Model loaded: {model_path}")
+        print(f"Context length: {n_ctx}")
+        print(f"Threads: {n_threads}")
+        print(f"GPU layers: {-1 if self._has_gpu() else 0}")
+
+    def _has_gpu(self) -> bool:
+        """Check if GPU is available"""
+        try:
+            import torch
+            return torch.cuda.is_available()
+        except ImportError:
+            return False
+
+    def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.7) -> str:
+        """Generate text with optimized settings"""
+
+        start_time = time.time()
+
+        # Optimized generation parameters
+        response = self.model(
+            prompt,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=0.9,
+            repeat_penalty=1.1,
+            stop=["</code>", "\n\n\n"],  # Stop sequences
+            stream=False
+        )
+
+        generation_time = time.time() - start_time
+        generated_text = response['choices'][0]['text']
+
+        # Calculate tokens per second
+        estimated_tokens = len(generated_text.split())
+        tokens_per_sec = estimated_tokens / generation_time if generation_time > 0 else 0
+
+        print(f"\n📊 Performance:")
+        print(f" Time: {generation_time:.2f}s")
+        print(f" Speed: {tokens_per_sec:.1f} tokens/sec")
+        print(f" Tokens: {estimated_tokens}")
+
+        return generated_text
+
+    def generate_stream(self, prompt: str, max_tokens: int = 512, temperature: float = 0.7):
+        """Generate text with streaming"""
+
+        print("\n🚀 Streaming response:")
+        start_time = time.time()
+        total_tokens = 0
+
+        stream = self.model(
+            prompt,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=0.9,
+            repeat_penalty=1.1,
+            stop=["</code>", "\n\n\n"],
+            stream=True
+        )
+
+        for chunk in stream:
+            text = chunk['choices'][0]['text']
+            print(text, end='', flush=True)
+            total_tokens += 1
+
+        generation_time = time.time() - start_time
+        tokens_per_sec = total_tokens / generation_time if generation_time > 0 else 0
+
+        print(f"\n\n📊 Streaming Performance:")
+        print(f" Time: {generation_time:.2f}s")
+        print(f" Speed: {tokens_per_sec:.1f} tokens/sec")
+
+    def chat_mode(self):
+        """Interactive chat mode"""
+        print("\n🤖 Interactive Chat Mode")
+        print("Commands: 'exit' to quit, 'stream' to toggle streaming")
+        print("-" * 50)
+
+        use_streaming = False
+
+        while True:
+            try:
+                prompt = input("\n👤 You: ")
+
+                if prompt.lower() == 'exit':
+                    print("👋 Goodbye!")
+                    break
+                elif prompt.lower() == 'stream':
+                    use_streaming = not use_streaming
+                    print(f"🔄 Streaming {'enabled' if use_streaming else 'disabled'}")
+                    continue
+
+                print("🤖 Assistant:", end=" ")
+
+                if use_streaming:
+                    self.generate_stream(prompt)
+                else:
+                    response = self.generate(prompt)
+                    print(response)
+
+            except KeyboardInterrupt:
+                print("\n\n👋 Goodbye!")
+                break
+
+def main():
+    parser = argparse.ArgumentParser(description="Fast GGUF Model Inference")
+    parser.add_argument("--model", required=True, help="Path to GGUF model file")
+    parser.add_argument("--prompt", help="Text prompt for generation")
+    parser.add_argument("--max-tokens", type=int, default=512, help="Maximum tokens to generate")
+    parser.add_argument("--temperature", type=float, default=0.7, help="Generation temperature")
+    parser.add_argument("--ctx-size", type=int, default=4096, help="Context size")
+    parser.add_argument("--threads", type=int, default=-1, help="Number of threads (-1 for auto)")
+    parser.add_argument("--interactive", action="store_true", help="Start interactive chat mode")
+    parser.add_argument("--stream", action="store_true", help="Use streaming generation")
+
+    args = parser.parse_args()
+
+    # Initialize inference
+    print(f"🚀 Loading model: {args.model}")
+    inferencer = FastInference(
+        args.model,
+        n_ctx=args.ctx_size,
+        n_threads=args.threads
+    )
+
+    if args.interactive:
+        inferencer.chat_mode()
+    elif args.prompt:
+        if args.stream:
+            inferencer.generate_stream(args.prompt, args.max_tokens, args.temperature)
+        else:
+            response = inferencer.generate(args.prompt, args.max_tokens, args.temperature)
+            print("\n🤖 Generated text:")
+            print(response)
+    else:
+        print("Please provide --prompt or use --interactive mode")
+        print("Example: python fast_inference.py --model model.gguf --prompt 'def hello():' --interactive")
+
+if __name__ == "__main__":
+    main()
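Beyond the argparse CLI above (e.g. python fast_inference.py --model model_q4_0.gguf --interactive), the FastInference class can also be driven programmatically. A minimal sketch, assuming the q4_0 file from this commit has been downloaded next to the script:

# Minimal programmatic usage sketch for the FastInference class above.
# Assumes model_q4_0.gguf is present in the working directory.
from fast_inference import FastInference

engine = FastInference("model_q4_0.gguf", n_ctx=2048)  # smaller context to reduce RAM use
completion = engine.generate("def fibonacci(n):", max_tokens=128, temperature=0.2)
print(completion)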
model_f16.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:23d2dc15401498de50fea28be8984d65632953d9113c5223739edf0babf7b879
+size 3093666208
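Only the Git LFS pointer stub (version / oid / size) is committed here; the roughly 3 GB of f16 weights themselves live in LFS storage. A small sketch of what those three fields encode, using the pointer shown above:

# Sketch: parsing the LFS pointer stub above into its three fields.
def parse_lfs_pointer(text: str) -> dict:
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "sha256": fields["oid"].removeprefix("sha256:"),
        "size_bytes": int(fields["size"]),
    }

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:23d2dc15401498de50fea28be8984d65632953d9113c5223739edf0babf7b879
size 3093666208"""
print(parse_lfs_pointer(pointer))  # size_bytes 3093666208 ≈ 3.09 GB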
model_q4_0.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45b84ed0c83a923fcd26862884126df5694db05963a353b82c3dc069076500b5
+size 934951840
model_q5_0.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b5e821453f084445ac49ea97aae0fb39360737e44f7130d49eb69115acd0499
+size 1098726304
model_q8_0.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:998e5e2b5d868adfbfb679c10ff096d6d6831ba60952d5216cef29b78db47aa4
+size 1646569888
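The four GGUF files added in this commit trade size for fidelity: q4_0 (~0.93 GB) is the smallest, q5_0 (~1.10 GB) and q8_0 (~1.65 GB) sit in between, and the f16 export (~3.09 GB) is the unquantized reference. A hedged sketch of pulling one of them from the Hub and loading it with llama-cpp-python; the repo_id is a placeholder, not taken from this commit:

# Hypothetical download-and-load sketch; repo_id is a placeholder.
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

model_path = hf_hub_download(
    repo_id="Vinitha2004/your-model",   # placeholder repository id
    filename="model_q4_0.gguf",         # smallest of the four GGUF files above
)
llm = Llama(model_path=model_path, n_ctx=2048)
print(llm("def hello():", max_tokens=64)["choices"][0]["text"])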