###############################################################################
# Filename   : app.py
# Description: A Streamlit application to showcase how RAG works.
# Author     : Georgios Ioannou
#
# Copyright © 2024 by Georgios Ioannou
###############################################################################
import os
import re
from typing import Any, Dict, List
from urllib.parse import quote_plus

import streamlit as st
from huggingface_hub import InferenceClient
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from pymongo import MongoClient
from PyPDF2 import PdfReader
# =================== Secure Env via Hugging Face Secrets ===================
user = quote_plus(os.getenv("MONGO_USERNAME", ""))
password = quote_plus(os.getenv("MONGO_PASSWORD", ""))
cluster = os.getenv("MONGO_CLUSTER", "")
db_name = os.getenv("MONGO_DB_NAME", "files")
collection_name = os.getenv("MONGO_COLLECTION", "files_collection")
index_name = os.getenv("MONGO_VECTOR_INDEX", "vector_index")
HF_TOKEN = os.getenv("HF_TOKEN", "")
# MONGO_URI = f"mongodb+srv://{user}:{password}@{cluster}/{db_name}?retryWrites=true&w=majority"
MONGO_URI = f"mongodb+srv://{user}:{password}@{cluster}/{db_name}?retryWrites=true&w=majority&tls=true&tlsAllowInvalidCertificates=true"
# =================== Prompt ===================
grantbuddy_prompt = PromptTemplate.from_template(
    """You are Grant Buddy, a specialized language model fine-tuned with instruction-tuning and RLHF.
You help a nonprofit focused on social entrepreneurship, BIPOC empowerment, and edtech write clear, mission-aligned grant responses.

**Instructions:**
- Start with reasoning or context for your answer.
- Always align with the nonprofit's mission.
- Use structured formatting: headings, bullet points, numbered lists.
- Include impact data or examples if relevant.
- Do NOT repeat the same sentence or answer multiple times.
- If no answer exists in the context, say: "This information is not available in the current context."

CONTEXT:
{context}

QUESTION:
{question}
"""
)
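# Usage sketch: grantbuddy_prompt.format(context="<retrieved chunks>",
# question="<user question>") renders the full instruction string that
# generate_response() below sends as the system message.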
# =================== Vector Search Setup ===================
@st.cache_resource
def init_embedding_model():
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
@st.cache_resource
def init_vector_search() -> MongoDBAtlasVectorSearch:
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    st.write(f"🔌 Connecting to Hugging Face model: `{model_name}`")
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)

    # ✅ Manual MongoClient with TLS settings
    user = quote_plus(os.getenv("MONGO_USERNAME", "").strip())
    password = quote_plus(os.getenv("MONGO_PASSWORD", "").strip())
    cluster = os.getenv("MONGO_CLUSTER", "").strip()
    db_name = os.getenv("MONGO_DB_NAME", "files").strip()
    collection_name = os.getenv("MONGO_COLLECTION", "files_collection").strip()
    index_name = os.getenv("MONGO_VECTOR_INDEX", "vector_index").strip()

    mongo_uri = f"mongodb+srv://{user}:{password}@{cluster}/?retryWrites=true&w=majority"

    try:
        client = MongoClient(mongo_uri, tls=True, tlsAllowInvalidCertificates=True, serverSelectionTimeoutMS=20000)
        db = client[db_name]
        collection = db[collection_name]
        st.success("✅ MongoClient connected successfully")
        return MongoDBAtlasVectorSearch(
            collection=collection,
            embedding=embedding_model,
            index_name=index_name,
        )
    except Exception as e:
        st.error("❌ Failed to connect to MongoDB Atlas manually")
        st.error(str(e))
        raise
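# Assumed shape of documents in the collection (MongoDBAtlasVectorSearch defaults
# to text_key="text" and embedding_key="embedding"; the ingestion pipeline is not
# shown here, so treat this as a sketch):
#   { "text": "<chunk text>", "embedding": [<384 floats>], "chunk_id": "<id>" }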
# =================== Question/Headers Extraction ===================
# def extract_questions_and_headers(text: str) -> List[str]:
#     header_patterns = [
#         r'\d+\.\s+\*\*([^\*]+)\*\*',
#         r'\*\*([^*]+)\*\*',
#         r'^([A-Z][^a-z]*[A-Z])$',
#         r'^([A-Z][A-Za-z\s]{3,})$',
#         r'^[A-Z][A-Za-z\s]+:$'
#     ]
#     question_patterns = [
#         r'^.+\?$',
#         r'^\*?Please .+',
#         r'^How .+',
#         r'^What .+',
#         r'^Describe .+',
#     ]
#     combined_header_re = re.compile("|".join(header_patterns), re.MULTILINE)
#     combined_question_re = re.compile("|".join(question_patterns), re.MULTILINE)
#     headers = [match for group in combined_header_re.findall(text) for match in group if match]
#     questions = combined_question_re.findall(text)
#     return headers + questions
def extract_with_llm(text: str) -> List[str]:
    client = InferenceClient(api_key=HF_TOKEN.strip())
    try:
        response = client.chat.completions.create(
            model="mistralai/Mistral-Nemo-Instruct-2407",  # or "HuggingFaceH4/zephyr-7b-beta"
            messages=[
                {
                    "role": "system",
                    "content": "You are an assistant helping extract questions and headers from grant applications.",
                },
                {
                    "role": "user",
                    "content": (
                        "Please extract all the grant application headers and questions from the following text. "
                        "Include section titles, prompts, and any question-like content. Return them as a numbered list.\n\n"
                        f"{text[:3000]}"
                    ),
                },
            ],
            temperature=0.2,
            max_tokens=512,
        )
        # Strip bullet/number markers from each non-empty line of the reply.
        return [
            line.strip("•-1234567890. ").strip()
            for line in response.choices[0].message.content.strip().split("\n")
            if line.strip()
        ]
    except Exception as e:
        st.error("❌ LLM extraction failed")
        st.error(str(e))
        return []
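# Example (sketch): if the model replies "1. Organization Background\n2. What is
# your annual budget?", the list comprehension above yields
# ["Organization Background", "What is your annual budget?"].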
# =================== Format Retrieved Chunks ===================
def format_docs(docs: List[Document]) -> str:
    # Join chunk texts with blank lines; fall back to the "content" metadata field when page_content is empty.
    return "\n\n".join(doc.page_content or doc.metadata.get("content", "") for doc in docs)
# =================== Generate Response from Hugging Face Model ===================
def generate_response(input_dict: Dict[str, Any]) -> str:
    client = InferenceClient(api_key=HF_TOKEN.strip())
    prompt = grantbuddy_prompt.format(**input_dict)
    try:
        response = client.chat.completions.create(
            model="HuggingFaceH4/zephyr-7b-beta",
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": input_dict["question"]},
            ],
            max_tokens=1000,
            temperature=0.2,
        )
        return response.choices[0].message.content
    except Exception as e:
        st.error(f"❌ Error from model: {e}")
        return "⚠️ Failed to generate response. Please check your model, HF token, or request format."
# =================== RAG Chain ===================
def get_rag_chain(retriever):
    return {
        "context": retriever | RunnableLambda(format_docs),
        "question": RunnablePassthrough(),
    } | RunnableLambda(generate_response)
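# Usage sketch (mirrors main() below; names are illustrative):
#   retriever = init_vector_search().as_retriever(search_kwargs={"k": 10})
#   rag_chain = get_rag_chain(retriever)
#   answer = rag_chain.invoke("What is the organization's mission?")
# The dict branch runs retriever -> format_docs to build "context" while
# RunnablePassthrough() forwards the raw question; both feed generate_response().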
# =================== Streamlit UI ===================
def main():
    st.set_page_config(page_title="Grant Buddy RAG", page_icon="🤖")
    st.title("🤖 Grant Buddy: Grant-Writing Assistant")

    uploaded_file = st.file_uploader("Upload PDF or TXT for extra context (optional)", type=["pdf", "txt"])
    uploaded_text = ""

    retriever = init_vector_search().as_retriever(search_kwargs={"k": 10, "score_threshold": 0.75})
    rag_chain = get_rag_chain(retriever)  # ✅ Initialize before usage

    # 📄 Process uploaded file
    if uploaded_file:
        with st.spinner("📄 Processing uploaded file..."):
            if uploaded_file.name.endswith(".pdf"):
                reader = PdfReader(uploaded_file)
                uploaded_text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
            elif uploaded_file.name.endswith(".txt"):
                uploaded_text = uploaded_file.read().decode("utf-8")

        questions = extract_with_llm(uploaded_text)
        st.success(f"✅ Found {len(questions)} questions or headers.")
        with st.expander("🧠 Extracted Prompts from Upload"):
            st.write(questions)

        # Generate answers
        answers = []
        for q in questions:
            full_query = f"{q}\n\nAdditional context:\n{uploaded_text}"
            response = rag_chain.invoke(full_query)
            answers.append({"question": q, "answer": response})

        for item in answers:
            st.markdown(f"### ❓ {item['question']}")
            st.markdown(f"💬 {item['answer']}")

    # ✅ Manual query box
    query = st.text_input("Ask a grant-related question")
    if st.button("Submit"):
        if not query and not uploaded_file:
            st.warning("Please enter a question.")
            return

        full_query = f"{query}\n\nAdditional context:\n{uploaded_text}" if uploaded_text else query
        with st.spinner("🤔 Thinking..."):
            response = rag_chain.invoke(full_query)
            st.text_area("Grant Buddy says:", value=response, height=250, disabled=True)

            with st.expander("🔍 Retrieved Chunks"):
                context_docs = retriever.get_relevant_documents(full_query)
                for doc in context_docs:
                    st.markdown(f"**Chunk ID:** {doc.metadata.get('chunk_id', 'unknown')}")
                    st.markdown(doc.page_content[:700] + "...")
                    st.markdown("---")


if __name__ == "__main__":
    main()