arjunanand13 committed
Commit 102910d · verified · 1 Parent(s): bd94fef

Delete app_llama3_27_4_24.py

Files changed (1):
  1. app_llama3_27_4_24.py +0 -205
app_llama3_27_4_24.py DELETED
@@ -1,205 +0,0 @@
- import os
- import torch
- from torch import cuda, bfloat16
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig, StoppingCriteria, StoppingCriteriaList
- from langchain.llms import HuggingFacePipeline
- from langchain.vectorstores import FAISS
- from langchain.chains import ConversationalRetrievalChain
- import gradio as gr
- from langchain.embeddings import HuggingFaceEmbeddings
-
-
- # Load the Hugging Face token from environment
- HF_TOKEN = os.environ.get("HF_TOKEN", None)
-
- # Define stopping criteria
- class StopOnTokens(StoppingCriteria):
-     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-         for stop_ids in stop_token_ids:
-             if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
-                 return True
-         return False
-
- # Load the LLaMA model and tokenizer
- model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'
- # model_id = 'mistralai/Mistral-7B-Instruct-v0.3'
- device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
-
- # Set quantization configuration
- bnb_config = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_quant_type='nf4',
-     bnb_4bit_use_double_quant=True,
-     bnb_4bit_compute_dtype=bfloat16
- )
-
- tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
- model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", token=HF_TOKEN, quantization_config=bnb_config)
-
- # Define stopping criteria
- stop_list = ['\nHuman:', '\n```\n']
- stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
- stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
- stopping_criteria = StoppingCriteriaList([StopOnTokens()])
-
- # Create text generation pipeline
- generate_text = pipeline(
-     model=model,
-     tokenizer=tokenizer,
-     return_full_text=True,
-     task='text-generation',
-     stopping_criteria=stopping_criteria,
-     temperature=0.1,
-     max_new_tokens=512,
-     repetition_penalty=1.1
- )
-
- llm = HuggingFacePipeline(pipeline=generate_text)
-
- # Load the stored FAISS index
- try:
-     vectorstore = FAISS.load_local('faiss_index', HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={"device": "cuda"}))
-     print("Loaded embedding successfully")
- except ImportError as e:
-     print("FAISS could not be imported. Make sure FAISS is installed correctly.")
-     raise e
-
- # Set up the Conversational Retrieval Chain
- chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True)
-
- chat_history = []
-
- def format_prompt(query):
-     prompt = f"""
-     You are a knowledgeable assistant with access to a comprehensive database.
-     I need you to answer my question and provide related information in a specific format.
-     Here's what I need:
-     1. A brief, general response to my question based on related answers retrieved.
-     2. A JSON-formatted output containing:
-        - "question": The original question.
-        - "answer": The detailed answer.
-        - "related_questions": A list of related questions and their answers, each as a dictionary with the keys:
-          - "question": The related question.
-          - "answer": The related answer.
-     Here's my question:
-     {query}
-     Include a brief final answer without additional comments, sign-offs, or extra phrases. Be direct and to the point.
-     """
-     return prompt
-
- def qa_infer(query):
-     formatted_prompt = format_prompt(query)
-     result = chain({"question": formatted_prompt, "chat_history": chat_history})
-     for doc in result['source_documents']:
-         print("-"*50)
-         print("Retrieved Document:", doc.page_content)
-     print("#"*100)
-     print(result['answer'])
-     return result['answer']
-
- EXAMPLES = ["How to use IPU1_0 instead of A15_0 to process NDK in TDA2x-EVM",
-             "Can BQ25896 support I2C interface?",
-             "Does TDA2 vout support bt656 8-bit mode?"]
-
- demo = gr.Interface(fn=qa_infer, inputs="text", allow_flagging='never', examples=EXAMPLES, cache_examples=False, outputs="text")
- demo.launch()
-
- # import os
- # import torch
- # from torch import cuda, bfloat16
- # from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig, StoppingCriteria, StoppingCriteriaList
- # from langchain.llms import HuggingFacePipeline
- # from langchain.vectorstores import FAISS
- # from langchain.chains import ConversationalRetrievalChain
- # import gradio as gr
- # from langchain.embeddings import HuggingFaceEmbeddings
-
- # # Load the Hugging Face token from environment
- # HF_TOKEN = os.environ.get("HF_TOKEN", None)
-
- # # Define stopping criteria
- # class StopOnTokens(StoppingCriteria):
- #     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
- #         for stop_ids in stop_token_ids:
- #             if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
- #                 return True
- #         return False
-
- # # Load the LLaMA model and tokenizer
- # model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'
- # device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
-
- # # Set quantization configuration
- # bnb_config = BitsAndBytesConfig(
- #     load_in_4bit=True,
- #     bnb_4bit_quant_type='nf4',
- #     bnb_4bit_use_double_quant=True,
- #     bnb_4bit_compute_dtype=bfloat16
- # )
-
- # tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
- # model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", token=HF_TOKEN, quantization_config=bnb_config)
-
- # # Define stopping criteria
- # stop_list = ['\nHuman:', '\n```\n']
- # stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
- # stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
- # stopping_criteria = StoppingCriteriaList([StopOnTokens()])
-
- # # Create text generation pipeline
- # generate_text = pipeline(
- #     model=model,
- #     tokenizer=tokenizer,
- #     return_full_text=True,
- #     task='text-generation',
- #     stopping_criteria=stopping_criteria,
- #     temperature=0.1,
- #     max_new_tokens=512,
- #     repetition_penalty=1.1
- # )
-
- # llm = HuggingFacePipeline(pipeline=generate_text)
-
- # # Load the stored FAISS index
- # try:
- #     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={"device": "cuda"})
- #     vectorstore = FAISS.load_local('faiss_index', embeddings)
- #     print("Loaded embedding successfully")
- # except ImportError as e:
- #     print("FAISS could not be imported. Make sure FAISS is installed correctly.")
- #     raise e
-
- # # Set up the Conversational Retrieval Chain
- # chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True)
-
- # chat_history = []
-
- # def format_prompt(query):
- #     prompt = f"""
- #     You are a knowledgeable assistant with access to a comprehensive database.
- #     I need you to answer my question and provide related information in a specific format.
- #     Here's what I need:
- #     1. A brief, general response to my question based on related answers retrieved.
- #     2. A JSON-formatted output containing:
- #        - "question": The original question.
- #        - "answer": The detailed answer.
- #        - "related_questions": A list of related questions and their answers, each as a dictionary with the keys:
- #          - "question": The related question.
- #          - "answer": The related answer.
- #     Here's my question:
- #     {query}
- #     Include a brief final answer without additional comments, sign-offs, or extra phrases. Be direct and to the point.
- #     """
- #     return prompt
-
- # def qa_infer(query):
- #     formatted_prompt = format_prompt(query)
- #     result = chain({"question": formatted_prompt, "chat_history": chat_history})
- #     return result['answer']
-
- # EXAMPLES = ["How to use IPU1_0 instead of A15_0 to process NDK in TDA2x-EVM",
- #             "Can BQ25896 support I2C interface?",
- #             "Does TDA2 vout support bt656 8-bit mode?"]
-
- # demo = gr.Interface(fn=qa_infer, inputs="text", allow_flagging='never', examples=EXAMPLES, cache_examples=False, outputs="text")
- # demo.launch()
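
For reference, the deleted script loads a prebuilt FAISS index from a 'faiss_index' directory using sentence-transformers/all-mpnet-base-v2 embeddings. The following is a minimal sketch of how a compatible index could be built with the same classic LangChain API; the example documents, their metadata, and the use of FAISS.from_documents / save_local are illustrative assumptions, not part of this commit.

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

# Hypothetical corpus; in practice this would be the real support-thread dump the app indexes.
docs = [
    Document(page_content="Example support thread text about TDA2x NDK processing.", metadata={"source": "thread_1"}),
    Document(page_content="Example support thread text about BQ25896 I2C configuration.", metadata={"source": "thread_2"}),
]

# Same embedding model the app passes to FAISS.load_local ("cpu" here; "cuda" if a GPU is available).
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"device": "cpu"},
)

# Build the index and write out the 'faiss_index' directory that FAISS.load_local() expects.
vectorstore = FAISS.from_documents(docs, embeddings)
vectorstore.save_local("faiss_index")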