Spaces:

ajnx014
/

Langchain-RAG-QA-BOT

Running

App Files Files Community

ajnx014 committed 6 days ago

Commit

80344cf

verified ·

1 Parent(s): e6cd443

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -64

app.py CHANGED Viewed

@@ -9,13 +9,10 @@ from langchain_community.vectorstores import FAISS
 from langchain.chains import RetrievalQA
 from langchain.chat_models import ChatOpenAI
-# ✅ Read OpenRouter API key from Hugging Face secret
 OPENROUTER_API_KEY = os.environ.get("ArjunHF")
 class OpenRouterChatModel(ChatOpenAI):
-    """
-    A wrapper around ChatOpenAI to use OpenRouter with a Mistral 3.2 model.
-    """
     def __init__(self, **kwargs):
         super().__init__(
             openai_api_base="https://openrouter.ai/api/v1",
@@ -24,60 +21,52 @@ class OpenRouterChatModel(ChatOpenAI):
             **kwargs
         )
def clean_math(text: str) -> str:
    r"""
    Clean and standardize LaTeX math delimiters in LLM output for Gradio Markdown.

    Steps, applied in this order:

    1. ``\( ... \)`` becomes ``$ ... $`` (inline math).
    2. ``\[ ... \]`` becomes ``$$ ... $$`` (display math).
    3. A hyphen directly in front of an inline ``$...$`` span is dropped.
    4. A backslash directly in front of ``$`` or ``$$`` is dropped.
    5. Whitespace padding just inside ``$$ ... $$`` and ``$ ... $`` is removed.
    6. After a whitespace character, ``-$ x $`` / ``$ x $`` are tightened to
       ``$x$`` (covers math inside list items and running text).

    Args:
        text: The text to clean. Falsy input (``""`` or ``None``) is returned
            unchanged.

    Returns:
        The cleaned text.
    """
    if not text:
        return text

    # `\\+` accepts one or more backslashes before the delimiter. The previous
    # patterns demanded exactly two, so ordinary `\( x \)` / `\[ x \]` was never
    # converted; doubled backslashes still match as before.
    text = re.sub(r"\\+\(([\s\S]+?)\\+\)", r"$\1$", text)
    text = re.sub(r"\\+\[([\s\S]+?)\\+\]", r"$$\1$$", text)

    # Replace -$ ... $ with $ ... $
    text = re.sub(r"-\$(.+?)\$", r"$\1$", text)

    # Remove a backslash sitting directly before $$ or $
    text = re.sub(r"\\\$\$", "$$", text)
    text = re.sub(r"\\\$", "$", text)

    # Remove padding inside display math first, then inline math
    text = re.sub(r"\$\$\s+([^\$]+?)\s+\$\$", r"$$\1$$", text)
    text = re.sub(r"\$\s+([^\$]+?)\s+\$", r"$\1$", text)

    # Tighten math that follows whitespace, e.g. "- $ y_i $" -> "- $y_i$"
    text = re.sub(r"(\s)-\$\s*([^\$]+?)\s*\$", r"\1$\2$", text)
    text = re.sub(r"(\s)\$\s*([^\$]+?)\s*\$", r"\1$\2$", text)

    return text
-def qa_on_url(url: str, question: str) -> str:
-    """
-    Performs a question-answer retrieval on the content of a given webpage URL.
-    Steps:
-    1. Loads the webpage content.
-    2. Splits the text into manageable chunks.
-    3. Embeds chunks using HuggingFace embeddings.
-    4. Builds a FAISS vector store for semantic search.
-    5. Uses OpenRouter Mistral 3.2 LLM to answer the query.
-    6. Cleans and formats all LaTeX/math formulas for Gradio Markdown.
-    """
     try:
         loader = WebBaseLoader(url)
         docs = loader.load()
         splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
         split_docs = splitter.split_documents(docs)
@@ -89,30 +78,20 @@ def qa_on_url(url: str, question: str) -> str:
         qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)
         raw_answer = qa_chain.run(question)
-        # Clean all math formulas for proper rendering in Gradio Markdown
-        formatted_answer = clean_math(raw_answer)
         return formatted_answer
     except Exception as e:
         return f"❌ Error: {e}"
-# --- Gradio Interface ---
 iface = gr.Interface(
     fn=qa_on_url,
-    inputs=[
-        gr.Textbox(label="Enter Web URL", placeholder="https://example.com"),
-        gr.Textbox(label="Your Question", placeholder="Ask anything about the webpage")
-    ],
-    outputs=gr.Markdown(),  # ✅ Allows LaTeX/math rendering
-    title="🔎 Ask Questions About Any Webpage",
-    description=(
-        "Use this tool to ask questions about any webpage content. "
-        "It fetches the page, creates semantic embeddings, and uses a "
-        "Mistral 3.2 LLM via OpenRouter to answer your question. "
-        "Inline and display math formulas are automatically cleaned for proper rendering. May crash sometimes."
-        "⚠️ Depending on page length and LLM response, this may take 10–20 seconds."
-    )
 )
 if __name__ == "__main__":
-    iface.launch()

 from langchain.chains import RetrievalQA
 from langchain.chat_models import ChatOpenAI
+# ✅ Read OpenRouter API key from HF secret
 OPENROUTER_API_KEY = os.environ.get("ArjunHF")
 class OpenRouterChatModel(ChatOpenAI):
     def __init__(self, **kwargs):
         super().__init__(
             openai_api_base="https://openrouter.ai/api/v1",
             **kwargs
         )
+import re
+import re
def format_math(text):
    r"""
    Convert LaTeX math delimiters in ``text`` to the ``$`` / ``$$`` Markdown form.

    Steps, applied in this order:

    1. ``\[ ... \]`` becomes ``$$ ... $$`` and ``\( ... \)`` becomes ``$ ... $``.
       One or more backslashes are accepted in front of each delimiter.
    2. Outside of math that is already delimited, a bare ``[ ... ]`` or
       ``( ... )`` whose content contains one of ``\ ^ _ { } =`` is wrapped in
       ``$$`` / ``$``. Markdown links (``[label](target)``) are left alone.
    3. A backslash directly in front of ``$`` or ``$$`` is dropped.
    4. A math span padded with whitespace on both sides has the padding removed.

    Args:
        text: The text to normalize. Falsy input (``""`` or ``None``) is
            returned unchanged.

    Returns:
        The normalized text.
    """
    if not text:
        return text

    math_span = re.compile(r"(\$\$[^$]+\$\$|\$[^$]+\$)")
    math_hint = re.compile(r"[\\^_{}=]")

    def outside_math(transform, source):
        # re.split with a single capture group returns the captured math spans
        # at odd indexes, so only the even-indexed (plain text) pieces are
        # transformed and existing math is never re-wrapped.
        pieces = math_span.split(source)
        pieces[::2] = [transform(piece) for piece in pieces[::2]]
        return "".join(pieces)

    def wrap_with(delimiter):
        def replace(match):
            content = match.group(1)
            if math_hint.search(content):
                return f"{delimiter}{content}{delimiter}"
            return match.group(0)
        return replace

    def trim_padding(match):
        display, inline = match.group(1), match.group(2)
        delimiter, content = ("$$", display) if display is not None else ("$", inline)
        # Only spans padded on both sides are touched, so text such as
        # "$5 and $10" is not rewritten.
        if content[0].isspace() and content[-1].isspace() and content.strip():
            return f"{delimiter}{content.strip()}{delimiter}"
        return match.group(0)

    # 1. Explicit LaTeX delimiters. The previous patterns required exactly two
    #    backslashes, so ordinary `\[ x \]` / `\( x \)` never matched here.
    text = re.sub(r"\\+\[([\s\S]+?)\\+\]", r"$$\1$$", text)
    text = re.sub(r"\\+\(([\s\S]+?)\\+\)", r"$\1$", text)

    # 2. Bare brackets / parentheses that look like math. Each pass re-splits
    #    the text so spans produced by the bracket pass are protected from the
    #    parenthesis pass. The lookahead / lookbehind skip Markdown links.
    text = outside_math(
        lambda plain: re.sub(r"\[([^\[\]]+)\](?!\()", wrap_with("$$"), plain),
        text,
    )
    text = outside_math(
        lambda plain: re.sub(r"(?<!\])\(([^()]+)\)", wrap_with("$"), plain),
        text,
    )

    # 3. Remove a backslash sitting directly before $$ or $
    text = re.sub(r"\\\$\$", "$$", text)
    text = re.sub(r"\\\$", "$", text)

    # 4. Trim padding per math span. Matching whole spans left to right pairs
    #    each opening delimiter with its own closing one, so the gap between
    #    two neighbouring spans is never mistaken for math.
    return re.sub(r"\$\$([^$]+)\$\$|\$([^$]+)\$", trim_padding, text)
+def qa_on_url(url, question):
     try:
         loader = WebBaseLoader(url)
         docs = loader.load()
         splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
         split_docs = splitter.split_documents(docs)
         qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)
         raw_answer = qa_chain.run(question)
+        # Format all math formulas nicely
+        formatted_answer = format_math(raw_answer)
         return formatted_answer
     except Exception as e:
         return f"❌ Error: {e}"
 iface = gr.Interface(
     fn=qa_on_url,
+    inputs=[gr.Textbox(label="Enter Web URL"), gr.Textbox(label="Your Question")],
+    outputs=gr.Markdown(),  # Markdown allows LaTeX rendering
+    title="🔎 Ask Questions About Any Webpage (Mistral 3.2 via OpenRouter + LangChain)",
+    description="⚠️ This may take 10–20 seconds depending on the page length and LLM response time. Please be patient!"
 )
 if __name__ == "__main__":
+    iface.launch()