masadonline committed on
Commit
f5fc1c4
·
verified ·
1 Parent(s): 5db89f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -54
app.py CHANGED
@@ -7,6 +7,7 @@ from sentence_transformers import SentenceTransformer
7
  import faiss
8
  import numpy as np
9
  from groq import Groq
 
10
 
11
  # --- Helper Functions ---
12
 
@@ -25,6 +26,17 @@ def extract_text_from_pdf(pdf_path):
25
  st.warning(f"PyPDF2 failed with error: {e}. Trying pdfminer.six...")
26
  return extract_text(pdf_path)
27
 
 
 
 
 
 
 
 
 
 
 
 
28
  def chunk_text_with_tokenizer(text, tokenizer, chunk_size=150, chunk_overlap=30):
29
  tokens = tokenizer.tokenize(text)
30
  chunks = []
@@ -51,9 +63,9 @@ def generate_answer_with_groq(question, context):
51
  response = groq_client.chat.completions.create(
52
  model=model_name,
53
  messages=[
54
- {"role": "system", "content": "You are an AI Assistant for Small Businesses.You are an SME expert."},
55
- {"role": "user", "content": prompt},
56
- ]
57
  )
58
  return response.choices[0].message.content
59
  except Exception as e:
@@ -73,58 +85,62 @@ if not GROQ_API_KEY:
73
 
74
  os.environ["GROQ_API_KEY"] = GROQ_API_KEY
75
 
76
- # File uploader
77
- uploaded_pdf = st.file_uploader("πŸ“ Upload PDF document(s) for SME knowledge base", type=["pdf"], accept_multiple_files=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
- # Text input for question
80
  user_question = st.text_input("πŸ’¬ Ask your question about SME documents:")
81
 
82
- # Button to trigger processing
83
- if st.button("Get Answer") or (user_question and uploaded_pdf):
84
- if not uploaded_pdf:
85
- st.warning("Please upload a PDF file first.")
86
- elif not user_question:
87
- st.warning("Please enter a question.")
88
  else:
89
- with st.spinner("Processing PDF and generating answer..."):
90
- # Save uploaded file temporarily for PyPDF2/pdfminer
91
- temp_path = f"/tmp/{uploaded_pdf.name}"
92
- with open(temp_path, "wb") as f:
93
- f.write(uploaded_pdf.getbuffer())
94
-
95
- # Extract text
96
- pdf_text = extract_text_from_pdf(temp_path)
97
-
98
- # Tokenizer + Chunk
99
- tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
100
- text_chunks = chunk_text_with_tokenizer(pdf_text, tokenizer)
101
-
102
- # Embeddings
103
- embedding_model = SentenceTransformer('all-mpnet-base-v2')
104
- all_embeddings = embedding_model.encode(text_chunks) if text_chunks else None
105
-
106
- if all_embeddings is None or len(all_embeddings) == 0:
107
- st.error("No text chunks found to create embeddings.")
108
- else:
109
- # Create FAISS index
110
- embedding_dim = all_embeddings[0].shape[0]
111
- index = faiss.IndexFlatL2(embedding_dim)
112
- index.add(np.array(all_embeddings))
113
-
114
- # Retrieve relevant chunks
115
- relevant_chunks = retrieve_relevant_chunks(user_question, index, embedding_model, text_chunks)
116
- context = "\n\n".join(relevant_chunks)
117
-
118
- # Generate answer with Groq
119
- answer = generate_answer_with_groq(user_question, context)
120
-
121
- # Display outputs
122
- #st.markdown("### Extracted Text Snippet:")
123
- #st.write(pdf_text[:500] + "...")
124
-
125
- #st.markdown("### Sample Text Chunks:")
126
- #for i, chunk in enumerate(text_chunks[:3]):
127
- # st.write(f"Chunk {i+1}: {chunk[:200]}...")
128
-
129
- st.markdown("### Answer:")
130
- st.success(answer)
 
7
  import faiss
8
  import numpy as np
9
  from groq import Groq
10
+ import docx # to read .docx files
11
 
12
  # --- Helper Functions ---
13
 
 
26
  st.warning(f"PyPDF2 failed with error: {e}. Trying pdfminer.six...")
27
  return extract_text(pdf_path)
28
 
29
def extract_text_from_docx(docx_path):
    """Extract plain text from a .docx file.

    Args:
        docx_path: Filesystem path to the .docx document.

    Returns:
        All paragraph texts joined with newlines, or an empty string if
        the file cannot be read (a Streamlit warning is shown instead of
        raising, so the app keeps running).
    """
    try:
        doc = docx.Document(docx_path)
        # Join paragraph texts directly — no intermediate append() list.
        return '\n'.join(para.text for para in doc.paragraphs)
    except Exception as e:
        # Best-effort: surface the problem in the UI but don't crash;
        # callers treat "" as "no text extracted from this file".
        st.warning(f"Failed to read DOCX {docx_path}: {e}")
        return ""
40
  def chunk_text_with_tokenizer(text, tokenizer, chunk_size=150, chunk_overlap=30):
41
  tokens = tokenizer.tokenize(text)
42
  chunks = []
 
63
  response = groq_client.chat.completions.create(
64
  model=model_name,
65
  messages=[
66
+ {"role": "system", "content": "You are an AI Assistant for Small Businesses. You are an SME expert."},
67
+ {"role": "user", "content": prompt},
68
+ ]
69
  )
70
  return response.choices[0].message.content
71
  except Exception as e:
 
85
 
86
  os.environ["GROQ_API_KEY"] = GROQ_API_KEY
87
 
88
# Load and process all docs at startup
@st.cache_data(show_spinner=True)
def load_and_prepare_docs(folder_path="docs"):
    """Build the retrieval knowledge base from every document in *folder_path*.

    Reads all PDF/DOCX files in the folder, concatenates their text,
    chunks it with a BERT tokenizer, embeds the chunks, and stores the
    embeddings in an exact-L2 FAISS index. Cached by Streamlit so the
    heavy work runs once per session/inputs.

    Args:
        folder_path: Directory scanned for .pdf/.docx/.doc files.

    Returns:
        Tuple ``(index, embedding_model, text_chunks)`` on success, or
        ``(None, None, None)`` after showing a Streamlit error on failure.
    """
    if not os.path.exists(folder_path):
        st.error(f"Folder '{folder_path}' does not exist!")
        return None, None, None

    # Collect all pdf and docx files
    files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.pdf', '.docx', '.doc'))]
    if not files:
        st.error(f"No PDF or DOCX files found in folder '{folder_path}'.")
        return None, None, None

    # Accumulate per-file text in a list and join once at the end —
    # repeated `all_text += text + "\n\n"` is quadratic in total size.
    texts = []
    for file in files:
        path = os.path.join(folder_path, file)
        name = file.lower()  # hoist: .lower() was computed per-branch before
        if name.endswith('.pdf'):
            text = extract_text_from_pdf(path)
        elif name.endswith(('.docx', '.doc')):
            # NOTE(review): python-docx cannot parse legacy binary .doc
            # files; extract_text_from_docx warns and returns "" for those.
            text = extract_text_from_docx(path)
        else:
            text = ""
        if text:
            texts.append(text)
    # Same result as the original += loop: each file's text followed by "\n\n".
    all_text = "\n\n".join(texts) + "\n\n" if texts else ""

    if not all_text.strip():
        st.error("No text extracted from documents.")
        return None, None, None

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    text_chunks = chunk_text_with_tokenizer(all_text, tokenizer)

    embedding_model = SentenceTransformer('all-mpnet-base-v2')
    all_embeddings = embedding_model.encode(text_chunks) if text_chunks else None

    if all_embeddings is None or len(all_embeddings) == 0:
        st.error("No text chunks found to create embeddings.")
        return None, None, None

    # FAISS needs the vector dimensionality up front; IndexFlatL2 does
    # exact (non-approximate) L2 search, fine for small corpora.
    embedding_dim = all_embeddings[0].shape[0]
    index = faiss.IndexFlatL2(embedding_dim)
    index.add(np.array(all_embeddings))

    return index, embedding_model, text_chunks
133
# Build (or fetch from cache) the FAISS index, embedder, and chunk list once.
index, embedding_model, text_chunks = load_and_prepare_docs()

user_question = st.text_input("πŸ’¬ Ask your question about SME documents:")

# Answer only when the button is pressed AND a question was typed.
if st.button("Get Answer") and user_question:
    kb_ready = all(part is not None for part in (index, embedding_model, text_chunks))
    if not kb_ready:
        st.error("The document knowledge base is not ready. Please check the errors above.")
    else:
        with st.spinner("Searching for relevant information and generating answer..."):
            hits = retrieve_relevant_chunks(user_question, index, embedding_model, text_chunks)
            answer = generate_answer_with_groq(user_question, "\n\n".join(hits))
            st.markdown("### Answer:")
            st.success(answer)