tferhan committed on
Commit
9c513c1
·
verified ·
1 Parent(s): 2a2760a

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -137
app.py DELETED
@@ -1,137 +0,0 @@
1
- import gradio as gr
2
- import os
3
-
4
- from langchain_community.document_loaders import PyPDFLoader
5
- from langchain.text_splitter import RecursiveCharacterTextSplitter
6
- from langchain_community.vectorstores import Chroma
7
- from langchain.chains import ConversationalRetrievalChain
8
- from langchain_community.embeddings import HuggingFaceEmbeddings
9
- from langchain_community.llms import HuggingFacePipeline
10
- from langchain.chains import ConversationChain
11
- from langchain.memory import ConversationBufferMemory
12
- from langchain_community.llms import HuggingFaceEndpoint
13
-
14
- from pathlib import Path
15
- import chromadb
16
- from unidecode import unidecode
17
-
18
- from transformers import AutoTokenizer
19
- from transformers import pipeline
20
- import transformers
21
- import torch
22
- import tqdm
23
- import accelerate
24
-
25
def load_doc(file_path):
    """Load a PDF and split its pages into overlapping chunks.

    Args:
        file_path: Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` on
        the loaded pages, using ``chunk_size=1024`` and ``chunk_overlap=120``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=120)
    return splitter.split_documents(PyPDFLoader(file_path).load())
31
-
32
-
33
-
34
# Split 'data.pdf' (resolved against the working directory) into chunks when the module is loaded.
- splt = load_doc('data.pdf')
35
-
36
def initialize_database(file_path):
    """Build a vector database for one PDF, naming the collection after the file.

    The collection name is derived from the file stem: spaces become hyphens,
    the name is cut to 50 characters, and a non-alphanumeric first or last
    character is replaced by ``'A'`` / ``'Z'`` respectively.

    Args:
        file_path: Path of the PDF file to index.

    Returns:
        A ``(vector_db, collection_name, status)`` tuple, where ``status`` is
        the literal string ``"Complete!"``.
    """
    collection_name = Path(file_path).stem.replace(" ", "-")[:50]

    # Strings are immutable, so the ends are patched by rebuilding the string;
    # item assignment (``name[0] = 'A'``) raises TypeError. The truthiness
    # guard keeps an empty stem from raising IndexError on ``[0]`` / ``[-1]``.
    if collection_name and not collection_name[0].isalnum():
        collection_name = "A" + collection_name[1:]
    if collection_name and not collection_name[-1].isalnum():
        collection_name = collection_name[:-1] + "Z"

    print('Collection name: ', collection_name)

    # Load the document, split it, and index the chunks under that name.
    vector_db = create_db(load_doc(file_path), collection_name)
    return vector_db, collection_name, "Complete!"
56
-
57
def create_db(splits, collection_name):
    """Embed document chunks into a Chroma collection on an ephemeral client.

    Args:
        splits: Document chunks to index.
        collection_name: Name of the Chroma collection to populate.

    Returns:
        The ``Chroma`` vector store returned by ``Chroma.from_documents``.
    """
    return Chroma.from_documents(
        documents=splits,
        embedding=HuggingFaceEmbeddings(),
        client=chromadb.EphemeralClient(),
        collection_name=collection_name,
    )
67
-
68
# Build the vector database for 'data.pdf' once. initialize_database returns
# a (vector_db, collection_name, status) tuple.
vec = initialize_database('data.pdf')

# Reuse that database instead of embedding the same chunks a second time
# under the same collection name ('data'), which doubled the start-up work.
# NOTE(review): EphemeralClient instances presumably share one in-memory
# store, in which case the second build also duplicated every chunk in the
# collection -- confirm against the chromadb version in use.
vec_cre = vec[0]
71
-
72
-
73
def initialize_llmchain(temperature, max_tokens, top_k, vector_db):
    """Create the retrieval question-answering chain over ``vector_db``.

    No memory object is attached, so the chain only answers from the
    document and callers pass ``chat_history`` explicitly. To make the bot
    conversational, a ``ConversationBufferMemory(memory_key="chat_history",
    output_key='answer', return_messages=True)`` could be passed as
    ``memory=`` to ``from_llm``.

    Args:
        temperature: Forwarded to the endpoint as ``temperature``.
        max_tokens: Forwarded to the endpoint as ``max_new_tokens``.
        top_k: Forwarded to the endpoint as ``top_k``.
        vector_db: Vector store whose ``as_retriever()`` supplies the context.

    Returns:
        A ``ConversationalRetrievalChain`` with chain type ``"stuff"`` that
        also returns its source documents.
    """
    endpoint_options = dict(
        repo_id='mistralai/Mixtral-8x7B-Instruct-v0.1',
        temperature=temperature,
        max_new_tokens=max_tokens,
        top_k=top_k,
        # NOTE(review): looks like a local-loading quantization flag rather
        # than a hosted-endpoint option -- confirm it is accepted/needed.
        load_in_8bit=True,
    )
    llm = HuggingFaceEndpoint(**endpoint_options)

    return ConversationalRetrievalChain.from_llm(
        llm,
        retriever=vector_db.as_retriever(),
        chain_type="stuff",
        return_source_documents=True,
        verbose=False,
    )
98
-
99
- qa = initialize_llmchain(0.6, 1024, 40, vec_cre) # Question-answering chain: temperature=0.6, max_tokens=1024, top_k=40
100
-
101
- pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr") # English-to-French translation pipeline; not advised, as translating answers adds latency
102
-
103
-
104
- # def format_chat_history(message, chat_history):
105
- # formatted_chat_history = []
106
- # for user_message, bot_message in chat_history:
107
- # formatted_chat_history.append(f"User: {user_message}")
108
- # formatted_chat_history.append(f"Assistant: {bot_message}")
109
- # return formatted_chat_history
110
-
111
def conversation(message, history):
    """Answer one chat message from the indexed document.

    Args:
        message: The user's question.
        history: Chat history passed in by the chat interface. It is ignored:
            every question is sent to the chain with an empty
            ``chat_history``.

    Returns:
        The chain's answer text, reduced to whatever follows the last
        ``"Helpful Answer:"`` marker when that marker is present.
    """
    # Generate the response using the QA chain.
    response = qa({"question": message + " According to the document", "chat_history": []})

    answer = response["answer"]
    marker = "Helpful Answer:"
    if marker in answer:
        answer = answer.split(marker)[-1]

    # response["source_documents"] holds the retrieved chunks (page_content
    # plus a zero-based metadata["page"]) and is useful when tuning the
    # model. It is deliberately not unpacked here: the retriever may return
    # fewer than three chunks, so indexing [0], [1], [2] unconditionally
    # raises IndexError, and the extracted values were never used.

    # To answer in French instead of English, return:
    #   pipe(answer)[0]['translation_text'] + " (Traduis d'anglais en français)"
    return answer
132
-
133
-
134
-
135
-
136
-
137
# Wrap `conversation` in Gradio's chat interface and launch it when the module is loaded.
- gr.ChatInterface(conversation).launch()