from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
import gradio as gr
import os
from googletrans import Translator
import requests
from dotenv import load_dotenv
import numpy as np
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import UnstructuredPDFLoader, PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain, ConversationalRetrievalChain
from langchain.schema import Document
from langchain.memory import ConversationBufferMemory
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms.base import LLM
from typing import List, Dict, Any, Optional
from pydantic import BaseModel
from tqdm import tqdm
import torch
import logging

# Logging configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Updated initialization of HuggingFaceEmbeddings
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Available LLMs: display names and the corresponding Hugging Face repo IDs
list_llm_simple = ["Gemma 7B (Italian)", "Mistral 7B"]
list_llm = ["google/gemma-7b-it", "mistralai/Mistral-7B-Instruct-v0.2"]


def initialize_database(document, chunk_size, chunk_overlap, progress=gr.Progress()):
    logger.info("Initializing database...")
    documents = []
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    for file in document:
        # Prefer UnstructuredPDFLoader; fall back to PyPDFLoader if 'unstructured' is missing.
        try:
            loader = UnstructuredPDFLoader(file.name)
            docs = loader.load()
        except ImportError:
            logger.warning("UnstructuredPDFLoader is not available. Trying PyPDFLoader instead.")
            try:
                loader = PyPDFLoader(file.name)
                docs = loader.load()
            except ImportError:
                logger.error("Unable to load the PDF document. Make sure 'unstructured' or 'pypdf' is installed.")
                return None, None, "Error: required packages are not installed. Run 'pip install unstructured pypdf' and try again."
        # Split every page into chunks, keeping the source filename and page number as metadata.
        for doc in docs:
            text_chunks = splitter.split_text(doc.page_content)
            for chunk in text_chunks:
                documents.append(
                    Document(
                        page_content=chunk,
                        metadata={"filename": file.name, "page": doc.metadata.get("page", 0)},
                    )
                )
    if not documents:
        return None, None, "Error: no document was loaded correctly."
    vectorstore = Chroma.from_documents(documents, embedding_function)
    progress(0.5)
    logger.info("Database initialized successfully.")
    return vectorstore, None, "Initialized"  # None added as second output to match the Gradio outputs


def initialize_LLM(llm_option, llm_temperature, max_tokens, top_k, vector_db, progress=gr.Progress(), language="italian"):
    logger.info("Initializing LLM chain...")
    llm_name = list_llm[llm_option]
    logger.info(f"llm_name: {llm_name}")
    # Force a language-appropriate default model.
    if language == "italian":
        default_llm = "google/gemma-7b-it"
    else:
        default_llm = "mistralai/Mistral-7B-Instruct-v0.2"
    if llm_name != default_llm:
        logger.info(f"Using default LLM {default_llm} for {language}")
        llm_name = default_llm
    # Build the LLM from the Hugging Face Inference API
    # (requires HUGGINGFACEHUB_API_TOKEN in the environment).
    llm = HuggingFaceEndpoint(
        repo_id=llm_name,
        temperature=llm_temperature,
        max_new_tokens=max_tokens,
        top_k=top_k,
    )
    # ConversationalRetrievalChain accepts {"question", "chat_history"} and, with
    # return_source_documents=True, returns "answer" and "source_documents",
    # which is exactly what conversation() below expects. get_chat_history is
    # the identity because format_chat_history() already produces a string.
    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=vector_db.as_retriever(),
        return_source_documents=True,
        get_chat_history=lambda h: h,
    )
    progress(1.0)
    logger.info("LLM chain initialized successfully.")
    return qa_chain, "Complete!"
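# A minimal, illustrative sketch of how the two initializers above fit together
# outside the Gradio UI. The file name "manual.pdf" and the _UploadedFile
# wrapper are hypothetical stand-ins for the file objects Gradio passes to the
# event handlers; the Hugging Face endpoint also needs HUGGINGFACEHUB_API_TOKEN set.
#
#   class _UploadedFile:
#       name = "manual.pdf"
#
#   vector_db, _, status = initialize_database([_UploadedFile()], chunk_size=600, chunk_overlap=40)
#   qa_chain, status = initialize_LLM(llm_option=0, llm_temperature=0.7, max_tokens=1024, top_k=3, vector_db=vector_db)
#   result = qa_chain({"question": "Di cosa parla il documento?", "chat_history": ""})
#   print(result["answer"])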
def format_chat_history(message, history):
    # Flatten the (user, assistant) pairs into a single plain-text transcript.
    chat_history = ""
    for item in history:
        chat_history += f"\nUser: {item[0]}\nAI: {item[1]}"
    chat_history += f"\n\nUser: {message}"
    return chat_history


def translate_text(text, src_lang, dest_lang):
    translator = Translator()
    result = translator.translate(text, src=src_lang, dest=dest_lang)
    return result.text


def conversation(qa_chain, message, history, language):
    formatted_chat_history = format_chat_history(message, history)
    response = qa_chain({"question": message, "chat_history": formatted_chat_history})
    response_answer = response["answer"]
    if response_answer.find("Helpful Answer:") != -1:
        response_answer = response_answer.split("Helpful Answer:")[-1]
    # The models answer in English; translate into Italian when the UI language is Italian.
    if language == "italian":
        try:
            translated_response = translate_text(response_answer, "en", "it")
        except Exception as e:
            logger.error(f"Error translating response: {e}")
            translated_response = response_answer
    else:
        translated_response = response_answer
    # Surface the top three retrieved chunks and their (1-based) page numbers.
    response_sources = response["source_documents"]
    response_source1 = response_sources[0].page_content.strip()
    response_source2 = response_sources[1].page_content.strip()
    response_source3 = response_sources[2].page_content.strip()
    response_source1_page = response_sources[0].metadata["page"] + 1
    response_source2_page = response_sources[1].metadata["page"] + 1
    response_source3_page = response_sources[2].metadata["page"] + 1
    new_history = history + [(message, translated_response)]
    return qa_chain, gr.update(value=""), new_history, response_source1, response_source1_page, response_source2, response_source2_page, response_source3, response_source3_page


def demo():
    with gr.Blocks(theme="base") as demo:
        vector_db = gr.State()
        qa_chain = gr.State()
        collection_name = gr.State()
        language = gr.State(value="italian")  # Default UI language
        gr.Markdown(
            """