# NOTE: page-scrape residue (Spaces status banner + line-number gutter) removed.
import os

import gradio as gr
import numpy as np
import spacy
from langchain import OpenAI
from langchain.embeddings import CohereEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader, PdfWriter
from sklearn.metrics.pairwise import cosine_similarity
# Sentence splitter used by chatbot() to segment the uploaded document.
nlp = spacy.load('en_core_web_md')

# ~200-character, non-overlapping chunks; the second retrieval stage in
# chatbot() runs over these chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=0)

# Fix: COHERE_API_KEY was referenced but never defined anywhere in the file,
# which raised NameError at import time. Read it from the environment instead;
# a missing key now fails at first API use rather than at module load.
embedding = CohereEmbeddings(
    model='embed-multilingual-v3.0',
    cohere_api_key=os.environ.get('COHERE_API_KEY'),
)
def recieve_pdf(filename):
    """Handle a PDF upload: save a processed copy, cache its text, and
    return a one-line LLM summary.

    Parameters
    ----------
    filename : str
        Path of the uploaded PDF (supplied by the gr.File upload event).

    Returns
    -------
    str
        One-line summary produced by `llm` via `summary_prompt`.

    Side effects: writes 'processed_file.pdf' in the working directory and
    stores the full extracted text in the module-level `file` read by
    chatbot().
    """
    reader = PdfReader(filename)
    writer = PdfWriter()
    for page in reader.pages:
        writer.add_page(page)
    with open('processed_file.pdf', 'wb') as f:
        writer.write(f)
    # Fix: the original re-opened and re-parsed the file it had just written;
    # extract directly from the already-parsed reader instead.
    # NOTE(review): extract_text(0) restricts extraction to one orientation —
    # confirm this is intended rather than an accidental positional argument.
    extracted_text = ''.join(page.extract_text(0) for page in reader.pages)
    global file
    file = extracted_text  # cached document text consumed by chatbot()
    summary_prompt_formated = summary_prompt.format(document=extracted_text)
    return llm(summary_prompt_formated)
def chatbot(query, history):
    """Answer `query` from the cached document and extend the chat history.

    Two-stage retrieval over the module-level `file` set by recieve_pdf():
    (1) embed every spaCy sentence of the document and pick the one most
    similar to the query; (2) embed the ~200-char chunks of the document
    and pick the chunk most similar to that sentence. The winning chunk is
    the context handed to the LLM.

    Parameters
    ----------
    query : str
        The user's question from the textbox.
    history : list
        Gradio chat history; mutated in place with (query, response).

    Returns
    -------
    tuple
        ('', history) — the empty string clears the input textbox.
    """
    # Stage 1: find the document sentence most similar to the query.
    embedded_query = embedding.embed_documents([query])
    sentences = [str(sentence) for sentence in nlp(file).sents]
    embedded_sentences = embedding.embed_documents(sentences)
    # Fix: dropped the pointless one-element `similarity_array` list the
    # original wrapped around this score matrix before argmax.
    sentence_scores = cosine_similarity(embedded_query, embedded_sentences)
    most_similar_sentence = sentences[np.argmax(sentence_scores)]

    # Stage 2: find the 200-char chunk most similar to that sentence.
    chunks = text_splitter.split_text(file)
    embedded_chunks = embedding.embed_documents(chunks)
    embedded_sentence = embedding.embed_documents([most_similar_sentence])
    chunk_scores = cosine_similarity(embedded_sentence, embedded_chunks)
    context_chunk = chunks[np.argmax(chunk_scores)]

    prompt_formated = prompt.format(context=context_chunk, query=query)
    response = llm(prompt_formated)  # fix: local was misspelled `repsonse`
    history.append((query, response))
    return '', history
# Prompt used by recieve_pdf() to produce a one-line document summary.
# Fix: corrected grammar/typos in the instruction text sent to the LLM
# ("You an" -> "You are", "provides file" -> "provided file").
summary_template = """You are an article summarizer and have been provided with this file
{document}
Provide a one-line summary of the content of the provided file.
"""
summary_prompt = PromptTemplate(input_variables=['document'], template=summary_template)
# Prompt used by chatbot(): answer strictly from the retrieved context chunk.
# Fix: corrected grammar in the instruction sent to the LLM
# ("Answer to the following question" -> "Answer the following question").
template = """You are a knowledgeable chatbot that gently answers questions.
You know the following context information.
{context}
Answer the following question from a user. Use only information from the previous context. Do not invent or assume stuff.
Question: {query}
Answer:"""
prompt = PromptTemplate(input_variables=['context', 'query'], template=template)

# Completion-style model; temperature 0 for deterministic, grounded answers.
llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)
# Gradio UI: upload a PDF -> recieve_pdf() returns a summary; then chat over
# the cached document text via chatbot().
with gr.Blocks(theme='finlaymacklon/smooth_slate') as demo:
    # Fix: user-facing typo "recieves" -> "receives".
    signal = gr.Markdown('''# Welcome to Chat with Docs
I am an AI that receives a document and can answer questions on the content of the document.''')
    inp = gr.File()
    out = gr.Textbox(label='Summary')
    # Upload event: processes the PDF and shows the one-line summary.
    inp.upload(fn=recieve_pdf, inputs=inp, outputs=out, show_progress=True)
    signal_1 = gr.Markdown('Use the Textbox below to chat. **Ask** questions regarding the pdf you uploaded')
    chat = gr.Chatbot()
    msg = gr.Textbox(info='input your chat')
    with gr.Row():
        submit = gr.Button('Send')
        clear = gr.ClearButton([msg, chat])
    # Both Enter and the Send button route through chatbot(); it returns
    # ('', history) so the textbox clears after each turn.
    msg.submit(chatbot, [msg, chat], [msg, chat])
    submit.click(chatbot, [msg, chat], [msg, chat])
    feedback = gr.Markdown('# [Please use this to provide feedback](https://forms.gle/oNZKx4nL7DmmJ64g8)')
demo.launch()