Mojo3 committed on
Commit 7e63c87 · verified · 1 Parent(s): ec25e8b

Update app.py

Files changed (1):
  1. app.py +1 -208
app.py CHANGED
@@ -1,212 +1,5 @@
- import streamlit as st
- from docx import Document
- import os
- from langchain_core.prompts import PromptTemplate
- from transformers import AutoTokenizer, AutoModelForCausalLM
- import torch
- import time
- from sentence_transformers import SentenceTransformer
- from langchain.vectorstores import Chroma
- from langchain.docstore.document import Document as Document2
  from langchain_community.embeddings import HuggingFaceEmbeddings
 
- import cohere
-
- # Load the Hugging Face token from an environment variable
- token = os.getenv("HF_TOKEN")
- print("my token is ", token)
-
- docs_folder = "./converted_docs"
-
-
- # Load .docx files from a local folder (originally a Google Drive mount)
- def load_docx_files_from_drive(drive_folder):
-     docx_files = [f for f in os.listdir(drive_folder) if f.endswith(".docx")]
-     documents = []
-
-     for file_name in docx_files:
-         file_path = os.path.join(drive_folder, file_name)
-         doc = Document(file_path)
-         content = "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
-         documents.append(content)
-
-     return documents
-
-
- documents = load_docx_files_from_drive(docs_folder)
-
-
- def split_extracted_text_into_chunks(documents):
-     print("Splitting text into chunks")
-     chunks = []  # List to hold all chunks
-
-     for doc_text in documents:
-         lines = doc_text.splitlines()
-
-         current_chunk = []
-         for line in lines:
-             # A line starting with "File Name:" marks the beginning of a new chunk
-             if line.startswith("File Name:"):
-                 # Save the current chunk before starting a new one
-                 if current_chunk:
-                     chunks.append("\n".join(current_chunk))
-                     current_chunk = []
-
-             current_chunk.append(line)
-
-         # Add the last chunk of the current document
-         if current_chunk:
-             chunks.append("\n".join(current_chunk))
-
-     return chunks
-
-
- # Split the extracted documents into chunks
- chunks = split_extracted_text_into_chunks(documents)
-
-
- def save_chunks_to_file(chunks, output_file_path):
-     print("Saving chunks to file")
-     with open(output_file_path, "w", encoding="utf-8") as file:
-         for i, chunk in enumerate(chunks, start=1):
-             # Write each chunk with a header for easy identification
-             file.write(f"Chunk {i}:\n")
-             file.write(chunk)
-             file.write("\n" + "=" * 50 + "\n")
-
-
- # Save the chunks to a file for inspection
- output_file_path = "./chunks_output.txt"
- save_chunks_to_file(chunks, output_file_path)
-
-
- # Step 1: Load the embedding model through LangChain's wrapper
  embedding_model = HuggingFaceEmbeddings(
      model_name="Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2"
- )
- print("#0")
-
- # Step 2: Embed the chunks
- def embed_chunks(chunks):
-     print("Embedding the chunks")
-     return [
-         {"chunk": chunk, "embedding": embedding_model.embed_query(chunk)}
-         for chunk in chunks
-     ]
-
-
- embeddings = embed_chunks(chunks)
- print("#1")
-
- # Step 3: Wrap the chunks as LangChain documents for Chroma
- def prepare_documents_for_chroma(embeddings):
-     print("Preparing documents for chroma")
-     return [
-         Document2(page_content=entry["chunk"], metadata={"chunk_index": i})
-         for i, entry in enumerate(embeddings, start=1)
-     ]
-
-
- print("#2")
- documents = prepare_documents_for_chroma(embeddings)
-
- # Step 4: Create the Chroma vector store
- print("Creating the vector store")
- vectorstore = Chroma.from_documents(
-     documents=documents,
-     embedding=embedding_model,  # Proper embedding object
-     persist_directory="./chroma_db",  # Optional persistence
- )
-
-
- class RAGPipeline:
-     def __init__(self, vectorstore, api_key, model_name="c4ai-aya-expanse-8b", k=3):
-         print("Initializing RAG Pipeline")
-         self.vectorstore = vectorstore
-         self.model_name = model_name
-         self.k = k
-         self.api_key = api_key
-         self.client = cohere.Client(api_key)  # Initialize the Cohere client
-         self.retriever = self.vectorstore.as_retriever(
-             search_type="mmr", search_kwargs={"k": 3}
-         )
-         self.prompt_template = PromptTemplate.from_template(self._get_template())
-
-     def _get_template(self):
-         # Arabic system prompt: "You are a helpful assistant that answers in Arabic
-         # based on the provided context. Answer only in Arabic; if the answer is not
-         # in the context, say you don't know; be precise and clear in your answers;
-         # answer strictly from the context."
-         return """<s>[INST] <<SYS>>
- أنت مساعد مفيد يقدم إجابات باللغة العربية بناءً على السياق المقدم.
- - أجب فقط باللغة العربية
- - إذا لم تجد إجابة في السياق، قل أنك لا تعرف
- - كن دقيقاً وواضحاً في إجاباتك
- - جاوب من السياق حصريا
- <</SYS>>
-
- السياق: {context}
-
- السؤال: {question}
- الإجابة: [/INST]\
-
- """
-
-     def generate_response(self, question):
-         retrieved_docs = self._retrieve_documents(question)
-         prompt = self._create_prompt(retrieved_docs, question)
-         return self._generate_response_cohere(prompt)
-
-     def _retrieve_documents(self, question):
-         retrieved_docs = self.retriever.invoke(question)
-         # Merge the retrieved texts into a single context string
-         return " ".join([doc.page_content for doc in retrieved_docs])
-
-     def _create_prompt(self, docs, question):
-         return self.prompt_template.format(context=docs, question=question)
-
-     def _generate_response_cohere(self, prompt):
-         # Call Cohere's generate API
-         response = self.client.generate(
-             model=self.model_name,
-             prompt=prompt,
-             max_tokens=2000,  # Adjust token limit based on requirements
-             temperature=0.3,  # Control creativity
-             stop_sequences=None,
-         )
-
-         if response.generations:
-             return response.generations[0].text.strip()
-         else:
-             raise Exception("No response generated by Cohere API.")
-
-
- st.title("Simple Text Generator")
- api_key = os.getenv("API_KEY")
- print("KEY: ", api_key[:5])
- rag_pipeline = RAGPipeline(vectorstore=vectorstore, api_key=api_key)
- question = st.text_input("أدخل سؤالك هنا")  # "Enter your question here"
- if st.button("Generate Answer"):
-     response = rag_pipeline.generate_response(question)
-     st.write(response)
-     print("Question: ", question)
-     print("Response: ", response)
+ )
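The removed pipeline persisted its Chroma index to ./chroma_db. If that directory survives this commit, the retrieval half of the deleted code can be rebuilt without re-running the embedding step. A minimal sketch under that assumption, reusing the same wrapper classes and retriever settings as the removed code (the sample query string is a placeholder):

    from langchain.vectorstores import Chroma
    from langchain_community.embeddings import HuggingFaceEmbeddings

    embedding_model = HuggingFaceEmbeddings(
        model_name="Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2"
    )

    # Reopen the index the removed code persisted (assumes ./chroma_db still exists).
    vectorstore = Chroma(
        persist_directory="./chroma_db",
        embedding_function=embedding_model,
    )

    # Same retriever configuration as the deleted RAGPipeline: MMR search, top 3 chunks.
    retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3})
    docs = retriever.invoke("سؤال تجريبي")  # "a test question"; placeholder only
    print(" ".join(doc.page_content for doc in docs))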
app.py (after this commit):

from langchain_community.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(
    model_name="Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2"
)
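The surviving app.py only constructs the LangChain embedding wrapper. A minimal sketch of exercising it directly (embed_query is the same call the removed embed_chunks helper relied on; the sample string is a placeholder):

    from langchain_community.embeddings import HuggingFaceEmbeddings

    embedding_model = HuggingFaceEmbeddings(
        model_name="Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2"
    )

    # Embed a single string; returns one vector as a list of floats.
    vector = embedding_model.embed_query("مرحبا")  # "hello"; placeholder text
    print(len(vector))  # embedding dimensionality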