```python
# external imports
from transformers import pipeline
from huggingface_hub import InferenceClient
from llama_cpp import Llama
import torch

# local imports
import config


class Phi3_Mini_4k_Instruct:
    def __init__(self):
        pass

    def generate_text(self, messages, use_local_llm):
        # Route the request either to a local GGUF model or to the Hugging Face Inference API.
        if use_local_llm:
            return self.generate_text_llama_cpp(messages)
        else:
            return self.generate_text_api(messages)

    def generate_text_llama_cpp(self, messages):
        # Download (cached after the first run) and load the quantized Phi-3 GGUF model.
        model = Llama.from_pretrained(
            repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
            filename="Phi-3-mini-4k-instruct-q4.gguf"
        )
        response = model.create_chat_completion(messages)
        generated_message = response['choices'][0]['message']['content']
        return generated_message

    def generate_text_local_pipeline(self, messages):
        # Run the chat through a local transformers text-generation pipeline.
        self.local_pipeline = pipeline(
            "text-generation",
            model=config.LLM_MODEL,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )
        self.local_pipeline.model.config.max_length = config.LLM_MAX_LENGTH
        self.local_pipeline.model.config.max_new_tokens = config.LLM_MAX_NEW_TOKENS
        self.local_pipeline.model.config.temperature = config.LLM_TEMPERATURE
        self.local_pipeline.model.config.top_p = config.LLM_TOP_P
        result = self.local_pipeline(messages)[-1]['generated_text'][-1]['content']
        return result

    def generate_text_api(self, messages):
        # Call the hosted model through the Hugging Face Inference API.
        client = InferenceClient(config.LLM_MODEL, token=config.HF_API_TOKEN)
        try:
            result = client.chat_completion(
                messages,
                max_tokens=config.LLM_MAX_NEW_TOKENS,
                temperature=config.LLM_TEMPERATURE,
                top_p=config.LLM_TOP_P
            ).choices[0].message.content
        except Exception as e:
            result = f"Error: {e}"
        return result
```
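Calling the class looks roughly like the sketch below. Both code paths take OpenAI-style chat messages; the API path additionally needs `config.HF_API_TOKEN` to hold a valid Hugging Face token, and the module name in the import is a guess, since the source doesn't show the file name.

```python
# Minimal usage sketch, assuming the class lives in phi3_mini_4k_instruct.py
# (hypothetical module name) and config.py defines the constants used above.
from phi3_mini_4k_instruct import Phi3_Mini_4k_Instruct

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize what a GGUF file is in one sentence."},
]

llm = Phi3_Mini_4k_Instruct()

# Local inference via llama.cpp (downloads the GGUF on first use)...
print(llm.generate_text(messages, use_local_llm=True))

# ...or remote inference via the Hugging Face Inference API.
print(llm.generate_text(messages, use_local_llm=False))
```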