farmax committed on
Commit
93d7cc3
·
verified ·
1 Parent(s): 08f20d9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -13
app.py CHANGED
@@ -6,7 +6,7 @@ import requests
6
  from dotenv import load_dotenv
7
  import numpy as np
8
  from langchain_community.vectorstores import Chroma
9
- from langchain_community.document_loaders import UnstructuredPDFLoader
10
  from langchain.text_splitter import CharacterTextSplitter
11
  from langchain.chains import RetrievalQAWithSourcesChain
12
  from langchain.schema import Document
@@ -20,6 +20,10 @@ from tqdm import tqdm
20
  import torch
21
  import logging
22
 
 
 
 
 
23
  # Aggiornamento dell'inizializzazione di HuggingFaceEmbeddings
24
  embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
25
 
@@ -27,26 +31,30 @@ embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all
27
  list_llm_simple = ["Gemma 7B (Italian)", "Mistral 7B"]
28
  list_llm = ["google/gemma-7b-it", "mistralai/Mistral-7B-Instruct-v0.2"]
29
 
30
- logging.basicConfig(level=logging.INFO)
31
- logger = logging.getLogger(__name__)
32
-
33
- class PDFDocument(Document):
34
- def _extract_metadata(self, **kwargs) -> Dict[str, Any]:
35
- metadata = super()._extract_metadata(**kwargs)
36
- metadata["filename"] = self.page_content
37
- return metadata
38
-
39
  def initialize_database(document, chunk_size, chunk_overlap, progress=gr.Progress()):
40
  logger.info("Initializing database...")
41
  documents = []
42
  for file in document:
43
- loader = UnstructuredPDFLoader(file.name)
44
- docs = loader.load()
 
 
 
 
 
 
 
 
 
 
45
  splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
46
  for doc in docs:
47
  pages = splitter.split_document(doc)
48
  for page in pages:
49
- documents.append(PDFDocument(page_content=page.page_content, metadata={"filename": file.name}))
 
 
 
50
 
51
  vectorstore = Chroma.from_documents(documents, embedding_function)
52
  progress.update(0.5)
@@ -121,6 +129,17 @@ def conversation(qa_chain, message, history, language):
121
 
122
  def demo():
123
  with gr.Blocks(theme="base") as demo:
 
 
 
 
 
 
 
 
 
 
 
124
  vector_db = gr.State()
125
  qa_chain = gr.State()
126
  collection_name = gr.State()
 
6
  from dotenv import load_dotenv
7
  import numpy as np
8
  from langchain_community.vectorstores import Chroma
9
+ from langchain_community.document_loaders import UnstructuredPDFLoader, PyPDFLoader
10
  from langchain.text_splitter import CharacterTextSplitter
11
  from langchain.chains import RetrievalQAWithSourcesChain
12
  from langchain.schema import Document
 
20
  import torch
21
  import logging
22
 
23
+ # Configurazione del logging
24
+ logging.basicConfig(level=logging.INFO)
25
+ logger = logging.getLogger(__name__)
26
+
27
  # Aggiornamento dell'inizializzazione di HuggingFaceEmbeddings
28
  embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
29
 
 
31
  list_llm_simple = ["Gemma 7B (Italian)", "Mistral 7B"]
32
  list_llm = ["google/gemma-7b-it", "mistralai/Mistral-7B-Instruct-v0.2"]
33
 
 
 
 
 
 
 
 
 
 
34
  def initialize_database(document, chunk_size, chunk_overlap, progress=gr.Progress()):
35
  logger.info("Initializing database...")
36
  documents = []
37
  for file in document:
38
+ try:
39
+ loader = UnstructuredPDFLoader(file.name)
40
+ docs = loader.load()
41
+ except ImportError:
42
+ logger.warning("UnstructuredPDFLoader non disponibile. Tentativo di utilizzo di PyPDFLoader.")
43
+ try:
44
+ loader = PyPDFLoader(file.name)
45
+ docs = loader.load()
46
+ except ImportError:
47
+ logger.error("Impossibile caricare il documento PDF. Assicurati di aver installato 'unstructured' o 'pypdf'.")
48
+ return None, "Errore: Pacchetti necessari non installati. Esegui 'pip install unstructured pypdf' e riprova."
49
+
50
  splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
51
  for doc in docs:
52
  pages = splitter.split_document(doc)
53
  for page in pages:
54
+ documents.append(Document(page_content=page.page_content, metadata={"filename": file.name}))
55
+
56
+ if not documents:
57
+ return None, "Errore: Nessun documento caricato correttamente."
58
 
59
  vectorstore = Chroma.from_documents(documents, embedding_function)
60
  progress.update(0.5)
 
129
 
130
  def demo():
131
  with gr.Blocks(theme="base") as demo:
132
+ gr.Markdown(
133
+ """
134
+ ## Importante: Installazione dei pacchetti necessari
135
+ Prima di utilizzare questa applicazione, assicurati di aver installato i seguenti pacchetti:
136
+ ```
137
+ pip install unstructured pypdf
138
+ ```
139
+ Questi pacchetti sono necessari per il corretto funzionamento del caricamento dei documenti PDF.
140
+ """
141
+ )
142
+
143
  vector_db = gr.State()
144
  qa_chain = gr.State()
145
  collection_name = gr.State()