|
from langchain_community.document_loaders import PyPDFLoader |
|
import os |
|
from langchain_openai import ChatOpenAI |
|
from langchain_chroma import Chroma |
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
from langchain.chains.combine_documents import create_stuff_documents_chain |
|
from langchain_core.prompts import ChatPromptTemplate |
|
from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings |
|
from setup.environment import default_model |
|
from uuid import uuid4 |
|
|
|
|
|
# NOTE(review): the original code called os.environ.get("OPENAI_API_KEY") and
# os.environ.get("HUGGINGFACEHUB_API_TOKEN") at import time and discarded the
# results — pure no-ops. The OpenAI / HuggingFace SDKs read those environment
# variables themselves, so the dead lookups are removed here.

# Shared embedding model used to vectorize document chunks for the vector store.
embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Module-level registry of every chunk id handed out by getPDF.
# NOTE: this grows across calls for the lifetime of the process.
allIds = []
|
|
|
def getPDF(file_paths):
    """Load PDFs, split them into chunks, and tag each chunk with a uuid4 id.

    Args:
        file_paths: iterable of filesystem paths to PDF files.

    Returns:
        A flat list of Document chunks (chunk_size=1000, chunk_overlap=200).
        Each chunk gets a fresh ``str(uuid4())`` assigned to ``.id``; the same
        id is appended to the module-level ``allIds`` list.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    pages = []
    for file_path in file_paths:
        loader = PyPDFLoader(file_path, extract_images=False)
        # load_and_split applies the splitter, so chunks arrive pre-split.
        # extend() avoids the O(n^2) re-copy of `pages = pages + pagesDoc`.
        pages.extend(loader.load_and_split(text_splitter))

    for page in pages:
        document_id = str(uuid4())
        allIds.append(document_id)
        page.id = document_id

    return pages
|
|
|
def create_retriever(documents, vectorstore):
    """Index documents into the vector store and return a retriever over it.

    Args:
        documents: list of Document chunks to embed and store.
        vectorstore: a vector store instance (e.g. Chroma) that supports
            ``add_documents`` and ``as_retriever``.

    Returns:
        A retriever built from the populated vector store with default
        search settings.
    """
    vectorstore.add_documents(documents=documents)
    return vectorstore.as_retriever()
|
|
|
def create_prompt_llm_chain(system_prompt, modelParam):
    """Build a stuff-documents question-answering chain.

    Args:
        system_prompt: base system instructions; the retrieved documents are
            appended via the ``{context}`` placeholder.
        modelParam: model identifier. When it equals ``default_model`` a
            ChatOpenAI model is used; otherwise it is treated as a
            HuggingFace repo id served through a text-generation endpoint.

    Returns:
        A chain (from ``create_stuff_documents_chain``) that stuffs retrieved
        documents into the prompt and queries the selected model.
    """
    if modelParam == default_model:
        llm = ChatOpenAI(model=modelParam)
    else:
        llm = HuggingFaceEndpoint(
            repo_id=modelParam,
            task="text-generation",
            do_sample=False,
            huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
        )

    # Retrieved documents are injected where {context} appears in the prompt.
    full_system_prompt = "\n\n".join([system_prompt, "{context}"])
    chat_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", full_system_prompt),
            ("human", "{input}"),
        ]
    )
    return create_stuff_documents_chain(llm, chat_prompt)