Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -9,13 +9,10 @@ from langchain_community.vectorstores import FAISS
|
|
9 |
from langchain.chains import RetrievalQA
|
10 |
from langchain.chat_models import ChatOpenAI
|
11 |
|
12 |
-
# ✅ Read OpenRouter API key from
|
13 |
OPENROUTER_API_KEY = os.environ.get("ArjunHF")
|
14 |
|
15 |
class OpenRouterChatModel(ChatOpenAI):
|
16 |
-
"""
|
17 |
-
A wrapper around ChatOpenAI to use OpenRouter with a Mistral 3.2 model.
|
18 |
-
"""
|
19 |
def __init__(self, **kwargs):
|
20 |
super().__init__(
|
21 |
openai_api_base="https://openrouter.ai/api/v1",
|
@@ -24,60 +21,52 @@ class OpenRouterChatModel(ChatOpenAI):
|
|
24 |
**kwargs
|
25 |
)
|
26 |
|
27 |
-
|
|
|
|
|
|
|
|
|
28 |
"""
|
29 |
-
|
30 |
-
|
31 |
-
Features:
|
32 |
-
- Converts `\( ... \)` → `$ ... $` for inline math.
|
33 |
-
- Converts `\[ ... \]` → `$$ ... $$` for display math.
|
34 |
-
- Replaces malformed `-$...$` → `$...$`.
|
35 |
-
- Removes trailing backslashes in both `$...$` and `$$...$$`.
|
36 |
-
- Removes unnecessary spaces inside inline or display math.
|
37 |
-
- Cleans math inside lists, paragraphs, or any text context.
|
38 |
"""
|
39 |
-
|
40 |
-
return text
|
41 |
-
|
42 |
-
# Convert \( ... \) -> $ ... $
|
43 |
-
text = re.sub(r"\\\\\(([\s\S]+?)\\\\\)", r"$\1$", text)
|
44 |
-
|
45 |
-
# Convert \[ ... \] -> $$ ... $$
|
46 |
text = re.sub(r"\\\\\[([\s\S]+?)\\\\\]", r"$$\1$$", text)
|
47 |
-
|
48 |
-
# Replace -$ ... $ with $ ... $
|
49 |
-
text = re.sub(r"-\$(.+?)\$", r"$\1$", text)
|
50 |
-
|
51 |
-
# Remove trailing backslashes before $ or $$
|
52 |
-
text = re.sub(r"\\\$\$", "$$", text)
|
53 |
-
text = re.sub(r"\\\$", "$", text)
|
54 |
|
55 |
-
#
|
56 |
-
text = re.sub(r"
|
57 |
-
text = re.sub(r"\$\s+([^\$]+?)\s+\$", r"$\1$", text)
|
58 |
|
59 |
-
#
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
return text
|
64 |
|
65 |
-
|
66 |
-
|
67 |
-
Performs a question-answer retrieval on the content of a given webpage URL.
|
68 |
-
|
69 |
-
Steps:
|
70 |
-
1. Loads the webpage content.
|
71 |
-
2. Splits the text into manageable chunks.
|
72 |
-
3. Embeds chunks using HuggingFace embeddings.
|
73 |
-
4. Builds a FAISS vector store for semantic search.
|
74 |
-
5. Uses OpenRouter Mistral 3.2 LLM to answer the query.
|
75 |
-
6. Cleans and formats all LaTeX/math formulas for Gradio Markdown.
|
76 |
-
"""
|
77 |
try:
|
78 |
loader = WebBaseLoader(url)
|
79 |
docs = loader.load()
|
80 |
-
|
81 |
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|
82 |
split_docs = splitter.split_documents(docs)
|
83 |
|
@@ -89,30 +78,20 @@ def qa_on_url(url: str, question: str) -> str:
|
|
89 |
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)
|
90 |
raw_answer = qa_chain.run(question)
|
91 |
|
92 |
-
#
|
93 |
-
formatted_answer =
|
94 |
return formatted_answer
|
95 |
|
96 |
except Exception as e:
|
97 |
return f"❌ Error: {e}"
|
98 |
|
99 |
-
# --- Gradio Interface ---
|
100 |
iface = gr.Interface(
|
101 |
fn=qa_on_url,
|
102 |
-
inputs=[
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
outputs=gr.Markdown(), # ✅ Allows LaTeX/math rendering
|
107 |
-
title="π Ask Questions About Any Webpage",
|
108 |
-
description=(
|
109 |
-
"Use this tool to ask questions about any webpage content. "
|
110 |
-
"It fetches the page, creates semantic embeddings, and uses a "
|
111 |
-
"Mistral 3.2 LLM via OpenRouter to answer your question. "
|
112 |
-
"Inline and display math formulas are automatically cleaned for proper rendering. May crash sometimes."
|
113 |
-
"⚠️ Depending on page length and LLM response, this may take 10–20 seconds."
|
114 |
-
)
|
115 |
)
|
116 |
|
117 |
if __name__ == "__main__":
|
118 |
-
iface.launch()
|
|
|
9 |
from langchain.chains import RetrievalQA
|
10 |
from langchain.chat_models import ChatOpenAI
|
11 |
|
12 |
+
# ✅ Read OpenRouter API key from HF secret
|
13 |
OPENROUTER_API_KEY = os.environ.get("ArjunHF")
|
14 |
|
15 |
class OpenRouterChatModel(ChatOpenAI):
|
|
|
|
|
|
|
16 |
def __init__(self, **kwargs):
|
17 |
super().__init__(
|
18 |
openai_api_base="https://openrouter.ai/api/v1",
|
|
|
21 |
**kwargs
|
22 |
)
|
23 |
|
24 |
+
import re
|
25 |
+
|
26 |
+
import re
|
27 |
+
|
28 |
+
def format_math(text):
|
29 |
"""
|
30 |
+
Convert LaTeX/math patterns to proper Markdown for Gradio.
|
31 |
+
Cleans up trailing backslashes and ensures formulas render correctly.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
"""
|
33 |
+
# 1️⃣ Fix \[ ... \] -> $$ ... $$
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
text = re.sub(r"\\\\\[([\s\S]+?)\\\\\]", r"$$\1$$", text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
+
# 2️⃣ Fix \( ... \) -> $ ... $
|
37 |
+
text = re.sub(r"\\\\\(([\s\S]+?)\\\\\)", r"$\1$", text)
|
|
|
38 |
|
39 |
+
# 3️⃣ Fix [ ... ] -> $$ ... $$ if contains math symbols
|
40 |
+
def replace_brackets(match):
|
41 |
+
content = match.group(1)
|
42 |
+
if re.search(r"[\\^_{}=]", content):
|
43 |
+
return f"$${content}$$"
|
44 |
+
return match.group(0)
|
45 |
+
text = re.sub(r"\[([^\[\]]+)\]", replace_brackets, text)
|
46 |
+
|
47 |
+
# 4️⃣ Fix ( ... ) -> $ ... $ if contains math symbols
|
48 |
+
def replace_parentheses(match):
|
49 |
+
content = match.group(1)
|
50 |
+
if re.search(r"[\\^_{}=]", content):
|
51 |
+
return f"${content}$"
|
52 |
+
return match.group(0)
|
53 |
+
text = re.sub(r"\(([^()]+)\)", replace_parentheses, text)
|
54 |
+
|
55 |
+
# 5️⃣ Remove trailing backslash before $ in both inline and display math
|
56 |
+
text = re.sub(r"\\\$\$", "$$", text) # trailing \$$ -> $$
|
57 |
+
text = re.sub(r"\\\$", "$", text) # trailing \$ -> $
|
58 |
+
|
59 |
+
# 6️⃣ Clean up extra spaces around $$
|
60 |
+
text = re.sub(r"\$\$\s+([^\$]+)\s+\$\$", r"$$\1$$", text)
|
61 |
+
text = re.sub(r"\$\s+([^\$]+)\s+\$", r"$\1$", text)
|
62 |
|
63 |
return text
|
64 |
|
65 |
+
|
66 |
+
def qa_on_url(url, question):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
try:
|
68 |
loader = WebBaseLoader(url)
|
69 |
docs = loader.load()
|
|
|
70 |
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|
71 |
split_docs = splitter.split_documents(docs)
|
72 |
|
|
|
78 |
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)
|
79 |
raw_answer = qa_chain.run(question)
|
80 |
|
81 |
+
# Format all math formulas nicely
|
82 |
+
formatted_answer = format_math(raw_answer)
|
83 |
return formatted_answer
|
84 |
|
85 |
except Exception as e:
|
86 |
return f"❌ Error: {e}"
|
87 |
|
|
|
88 |
iface = gr.Interface(
|
89 |
fn=qa_on_url,
|
90 |
+
inputs=[gr.Textbox(label="Enter Web URL"), gr.Textbox(label="Your Question")],
|
91 |
+
outputs=gr.Markdown(), # Markdown allows LaTeX rendering
|
92 |
+
title="π Ask Questions About Any Webpage (Mistral 3.2 via OpenRouter + LangChain)",
|
93 |
+
description="⚠️ This may take 10–20 seconds depending on the page length and LLM response time. Please be patient!"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
)
|
95 |
|
96 |
if __name__ == "__main__":
|
97 |
+
iface.launch()
|