Spaces:

tferhan
/

data_gov_ma

Sleeping

App Files Files Community

tferhan committed on Apr 3, 2024

Commit

9c513c1

verified ·

1 Parent(s): 2a2760a

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -137

app.py DELETED Viewed

@@ -1,137 +0,0 @@
-import gradio as gr
-import os
-from langchain_community.document_loaders import PyPDFLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.vectorstores import Chroma
-from langchain.chains import ConversationalRetrievalChain
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_community.llms import HuggingFacePipeline
-from langchain.chains import ConversationChain
-from langchain.memory import ConversationBufferMemory
-from langchain_community.llms import HuggingFaceEndpoint
-from pathlib import Path
-import chromadb
-from unidecode import unidecode
-from transformers import AutoTokenizer
-from transformers import pipeline
-import transformers
-import torch
-import tqdm
-import accelerate
-def load_doc(file_path):
-    """Load a PDF and split its pages into overlapping text chunks.
-
-    Args:
-        file_path: Path of the PDF file handed to ``PyPDFLoader``.
-
-    Returns:
-        The chunks produced by ``RecursiveCharacterTextSplitter`` (configured
-        with ``chunk_size=1024`` and ``chunk_overlap=120``) for the loaded
-        pages.
-    """
-    loaded_pages = PyPDFLoader(file_path).load()
-    splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=120)
-    return splitter.split_documents(loaded_pages)
-# Load and chunk 'data.pdf' (resolved relative to the working directory)
-# as soon as the module is executed.
-splt = load_doc('data.pdf')
-def initialize_database(file_path):
-    """Create a Chroma vector store from the PDF at ``file_path``.
-
-    The collection name is derived from the file name: the stem has spaces
-    replaced by hyphens, is cut to 50 characters, and is forced to start and
-    end with an alphanumeric character.
-
-    Args:
-        file_path: Path of the PDF to load, split and embed.
-
-    Returns:
-        A ``(vector_db, collection_name, status)`` tuple, where ``status`` is
-        the string ``"Complete!"``.
-    """
-    # Derive the collection name from the file name without its extension.
-    collection_name = Path(file_path).stem.replace(" ", "-")[:50]
-    # Strings are immutable, so the first/last character is swapped by
-    # rebuilding the string rather than assigning to an index. The emptiness
-    # guard avoids an IndexError when the stem is empty.
-    if collection_name and not collection_name[0].isalnum():
-        collection_name = 'A' + collection_name[1:]
-    if collection_name and not collection_name[-1].isalnum():
-        collection_name = collection_name[:-1] + 'Z'
-    print('Collection name: ', collection_name)
-    # Load the document, split it, and embed the chunks.
-    doc_splits = load_doc(file_path)
-    vector_db = create_db(doc_splits, collection_name)
-    return vector_db, collection_name, "Complete!"
-def create_db(splits, collection_name):
-    """Embed document chunks into a new Chroma collection.
-
-    Args:
-        splits: Document chunks to store, e.g. the output of ``load_doc``.
-        collection_name: Name given to the Chroma collection.
-
-    Returns:
-        The ``Chroma`` vector store built from ``splits``, using a default
-        ``HuggingFaceEmbeddings()`` and a fresh ``chromadb.EphemeralClient()``.
-    """
-    return Chroma.from_documents(
-        documents=splits,
-        embedding=HuggingFaceEmbeddings(),
-        client=chromadb.EphemeralClient(),
-        collection_name=collection_name,
-    )
-# Build the vector store once. `vec` keeps the full
-# (vector_db, collection_name, status) tuple returned by
-# initialize_database; `vec_cre` is the Chroma store taken from that tuple,
-# so the same PDF is not loaded and embedded a second time into a
-# collection with the same name ('data').
-vec = initialize_database('data.pdf')
-vec_cre = vec[0]
-def initialize_llmchain(temperature, max_tokens, top_k, vector_db):
-    """Build a retrieval question-answering chain over ``vector_db``.
-
-    Args:
-        temperature: Forwarded to ``HuggingFaceEndpoint`` as ``temperature``.
-        max_tokens: Forwarded to ``HuggingFaceEndpoint`` as ``max_new_tokens``.
-        top_k: Forwarded to ``HuggingFaceEndpoint`` as ``top_k``.
-        vector_db: Vector store whose ``as_retriever()`` feeds the chain.
-
-    Returns:
-        The ``ConversationalRetrievalChain`` created by ``from_llm``, set up
-        to return its source documents along with the answer.
-    """
-    # No memory is attached: the chain only answers from the document. To make
-    # the chatbot conversational, create
-    # ConversationBufferMemory(memory_key="chat_history", output_key='answer',
-    # return_messages=True) and pass it to from_llm as memory=...
-    # NOTE(review): load_in_8bit is kept as in the existing call; confirm that
-    # HuggingFaceEndpoint actually honours it for a hosted endpoint.
-    endpoint = HuggingFaceEndpoint(
-        repo_id='mistralai/Mixtral-8x7B-Instruct-v0.1',
-        temperature=temperature,
-        max_new_tokens=max_tokens,
-        top_k=top_k,
-        load_in_8bit=True,
-    )
-    return ConversationalRetrievalChain.from_llm(
-        endpoint,
-        retriever=vector_db.as_retriever(),
-        chain_type="stuff",
-        return_source_documents=True,
-        verbose=False,
-    )
-# Question-answering chain over the PDF store: temperature 0.6,
-# max_tokens 1024, top_k 40.
-qa = initialize_llmchain(0.6, 1024, 40, vec_cre)
-# English-to-French translation pipeline; using it is not advised because it
-# adds latency.
-pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr")
-# def format_chat_history(message, chat_history):
-#     formatted_chat_history = []
-#     for user_message, bot_message in chat_history:
-#         formatted_chat_history.append(f"User: {user_message}")
-#         formatted_chat_history.append(f"Assistant: {bot_message}")
-#     return formatted_chat_history
-def conversation(message, history):
-    """Answer one chat message using the document QA chain.
-
-    Args:
-        message: The user's question; " According to the document" is
-            appended before it is sent to the chain.
-        history: Chat history supplied by the chat interface. It is not used:
-            the chain is always called with an empty ``chat_history``.
-
-    Returns:
-        The chain's answer in English, with everything up to and including
-        the last "Helpful Answer:" marker removed when that marker is present.
-    """
-    response = qa({"question": message + " According to the document", "chat_history": []})
-    # split() returns the whole string as its only element when the marker is
-    # absent, so taking the last piece covers both cases.
-    response_answer = response["answer"].split("Helpful Answer:")[-1]
-    # The passages the answer was drawn from are available in
-    # response["source_documents"] (page_content, metadata["page"]) if they
-    # are needed for tuning. They are deliberately not indexed here, because
-    # the retriever may return fewer than three documents.
-    # To answer in French instead, return:
-    #   pipe(response_answer)[0]['translation_text'] + " (Traduis d'anglais en français)"
-    return response_answer
-# Serve `conversation` through a Gradio chat interface and start the app.
-gr.ChatInterface(conversation).launch()