# sqlite3 workaround for Hugging Face Spaces: chromadb requires a newer sqlite
# than the system one, so swap in pysqlite3 before chromadb is first imported.
import sys
__import__('pysqlite3')
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

import os
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms.base import LLM
# Load documents from the multiple_docs folder
docs = []
for f in os.listdir("multiple_docs"):
    if f.endswith(".pdf"):
        loader = PyPDFLoader(os.path.join("multiple_docs", f))
        docs.extend(loader.load())
    elif f.endswith(".docx") or f.endswith(".doc"):
        loader = Docx2txtLoader(os.path.join("multiple_docs", f))
        docs.extend(loader.load())
    elif f.endswith(".txt"):
        loader = TextLoader(os.path.join("multiple_docs", f))
        docs.extend(loader.load())
# Split documents into chunks
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
docs = splitter.split_documents(docs)
# Embeddings: use LangChain's sentence-transformers wrapper so that documents
# and retrieval queries are embedded with the same model.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
texts = [doc.page_content for doc in docs]
metadatas = [{"id": i} for i in range(len(texts))]
# Vectorstore: Chroma embeds the texts via the embedding_function above
vectorstore = Chroma(persist_directory="./db", embedding_function=embeddings)
vectorstore.add_texts(texts=texts, metadatas=metadatas)
vectorstore.persist()
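# Optional sanity check of the store (sketch only; the query string below is a
# hypothetical example, not part of the original app):
# print(vectorstore.similarity_search("What is Thierry's experience?", k=2))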
# model_name = "deepseek-ai/deepseek-llm-7b-instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
# Flan-T5 is an encoder-decoder model, so it must be loaded with
# AutoModelForSeq2SeqLM rather than AutoModelForCausalLM.
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
def generate(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=512)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# ConversationalRetrievalChain expects a LangChain LLM, so wrap generate() in a
# minimal subclass of langchain's base LLM class.
class HuggingFaceLLMWrapper(LLM):
    @property
    def _llm_type(self):
        return "local_flan_t5"
    def _call(self, prompt, stop=None, **kwargs):
        return generate(prompt)
llm = HuggingFaceLLMWrapper()
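# Alternative sketch (not used here): LangChain also ships a HuggingFacePipeline
# wrapper, so the generate()/LLM wrapper above could be replaced with roughly:
#   from transformers import pipeline
#   from langchain.llms import HuggingFacePipeline
#   pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)
#   llm = HuggingFacePipeline(pipeline=pipe)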
# Conversational retrieval (QA) chain
chain = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=vectorstore.as_retriever(search_kwargs={'k': 6}),
    return_source_documents=True,
    verbose=False,
)
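# Optional smoke test of the chain outside the UI (sketch only; the question is
# a hypothetical example):
# result = chain({"question": "What are your main skills?", "chat_history": []})
# print(result["answer"])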
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(
        [("", "Hello, I'm Thierry Decae's chatbot. Ask me about my experience, skills, eligibility, etc.")],
        avatar_images=["./multiple_docs/Guest.jpg", "./multiple_docs/Thierry Picture.jpg"],
    )
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def user(query, chat_history):
        # Re-run the chain with the accumulated history, then append the new turn.
        chat_history_tuples = [(m[0], m[1]) for m in chat_history]
        result = chain({"question": query, "chat_history": chat_history_tuples})
        chat_history.append((query, result["answer"]))
        return gr.update(value=""), chat_history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False)
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch(debug=True)
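# Assumed Space dependencies (sketch only; exact package names and versions are
# an assumption inferred from the imports above, not pinned by this file):
# requirements.txt would need roughly: langchain, chromadb, pysqlite3-binary,
# sentence-transformers, transformers, torch, gradio, pypdf, docx2txt.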