Spaces: Create app.py
app.py
ADDED
@@ -0,0 +1,577 @@
import gradio as gr
import re
import json
import requests
import os
import tempfile
from bs4 import BeautifulSoup  # For web scraping
from newspaper import Article  # For smarter article extraction
from tavily import TavilyClient  # For web search

# --- build_logic.py is NO LONGER a hard requirement for the research agent core ---
# We might repurpose some utility functions or remove its direct use if focusing purely on research.
# For this transformation, we'll comment out most build_logic specific interactions
# but keep parsing functions if they are general enough.

# from build_logic import (
#     create_space as build_logic_create_space,
#     _get_api_token as build_logic_get_api_token,
#     whoami as build_logic_whoami,
#     list_space_files_for_browsing,
#     get_space_repository_info,
#     get_space_file_content,
#     update_space_file,
#     parse_markdown as build_logic_parse_markdown,  # May still be useful for report generation
#     delete_space_file as build_logic_delete_space_file,
#     get_space_runtime_status
# )
# print("build_logic.py related functions commented out for Research Agent mode.")
# --- End build_logic import ---


bbb = chr(96) * 3  # A triple-backtick string, for embedding code fences in f-strings
parsed_research_outputs_cache = []  # Renamed from parsed_code_blocks_state_cache
BOT_ROLE_NAME = "assistant"  # LLM's role
TOOL_ROLE_NAME = "tool"  # Role for tool execution results
GROQ_API_ENDPOINT = "https://api.groq.com/openai/v1/chat/completions"
MAX_WEBPAGE_CONTENT_LENGTH = 6000  # Max characters to extract from a webpage
MAX_SEARCH_RESULTS_TO_PROCESS = 3  # Max search results to browse by default

# --- New System Prompt for Research Agent ---
DEFAULT_SYSTEM_PROMPT = f"""You are an expert AI Research Assistant. Your goal is to answer user questions and perform research tasks by intelligently using the tools available to you.

Available Tools:
1. **`search_web`**: Use this tool to search the internet for information.
   - Input: A JSON object with a "query" key (e.g., `{{"query": "latest advancements in AI"}}`)
2. **`browse_web_page`**: Use this tool to get the content of a specific URL.
   - Input: A JSON object with a "url" key (e.g., `{{"url": "https://example.com/article"}}`)

Tool Usage Instructions:
- When you need to use a tool, respond ONLY with a JSON object describing the tool call.
  Example for search:
  `{{"tool_calls": [{{"id": "call_abc123", "type": "function", "function": {{"name": "search_web", "arguments": "{{\\"query\\": \\"your search query\\"}}"}}}}]}}`
  Example for browsing a URL:
  `{{"tool_calls": [{{"id": "call_xyz789", "type": "function", "function": {{"name": "browse_web_page", "arguments": "{{\\"url\\": \\"https://www.example.com/page\\"}}"}}}}]}}`
- The `id` for each tool call must be unique, e.g., "call_randomstring123".
- After you make a tool call, the system will execute it and provide you with the results. Use these results to formulate your answer or decide on the next step.
- If you have enough information from the conversation history or previous tool responses to answer the user's query, provide a comprehensive answer directly.
- When providing an answer, cite your sources (URLs) if you used information from specific web pages.
- If a web search returns multiple promising links, you may need to use `browse_web_page` on a few of them to gather more detailed information. Prioritize relevant and reputable sources.
- If a webpage is too long or you cannot access it, note that in your reasoning.
- If the user's request is ambiguous, ask clarifying questions.
- The role name for your responses in the chat history must be '{BOT_ROLE_NAME}'.

Output Format for Final Answers (not tool calls):
- Provide clear, concise, and well-structured answers.
- If you are summarizing information from web pages, mention the source URLs.
- Example:
  "Based on my research:
  - Finding 1 (Source: [url1])
  - Finding 2 (Source: [url2])
  For more details, you can visit the source pages."

File/Report Generation (Optional - if you generate a structured report):
If you generate a structured text report, use this format:
### Report: research_summary.md
{bbb}markdown
# Research Topic: [User's Query]

## Key Findings:
- Point 1
- Point 2

## Detailed Information:
### [Source Title 1 (URL)]
- Summary of content from this source...

### [Source Title 2 (URL)]
- Summary of content from this source...
{bbb}
"""

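# For clarity, one full tool round-trip in the OpenAI-style message list looks
# roughly like this (illustrative values, not produced by this module):
#   {"role": "user", "content": "Who founded Groq?"}
#   {"role": "assistant", "content": None, "tool_calls": [{"id": "call_1",
#       "type": "function", "function": {"name": "search_web",
#       "arguments": "{\"query\": \"Groq founder\"}"}}]}
#   {"role": "tool", "tool_call_id": "call_1", "name": "search_web",
#       "content": "[{\"title\": ..., \"url\": ..., \"content\": ...}]"}
#   {"role": "assistant", "content": "Groq was founded by ... (Source: ...)"}
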
# --- Core Utility, Parsing, API Call functions (some adapted) ---
def escape_html_for_markdown(text):
    if not isinstance(text, str): return ""
    return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

# _infer_lang_from_filename might be less used, but kept for potential report formatting
def _infer_lang_from_filename(filename):
    if not filename: return "plaintext"
    if '.' in filename:
        ext = filename.split('.')[-1].lower()
        mapping = {
            'py': 'python', 'js': 'javascript', 'ts': 'typescript', 'jsx': 'javascript', 'tsx': 'typescript',
            'html': 'html', 'htm': 'html', 'css': 'css', 'scss': 'scss', 'sass': 'sass', 'less': 'less',
            'json': 'json', 'xml': 'xml', 'yaml': 'yaml', 'yml': 'yaml', 'toml': 'toml',
            'md': 'markdown', 'rst': 'rst',
            'sh': 'bash', 'bash': 'bash', 'zsh': 'bash', 'bat': 'batch', 'cmd': 'batch', 'ps1': 'powershell',
            'c': 'c', 'h': 'c', 'cpp': 'cpp', 'hpp': 'cpp', 'cs': 'csharp', 'java': 'java',
            'rb': 'ruby', 'php': 'php', 'go': 'go', 'rs': 'rust', 'swift': 'swift', 'kt': 'kotlin', 'kts': 'kotlin',
            'sql': 'sql', 'dockerfile': 'docker', 'tf': 'terraform', 'hcl': 'terraform',
            'txt': 'plaintext', 'log': 'plaintext', 'ini': 'ini', 'conf': 'plaintext', 'cfg': 'plaintext',
            'csv': 'plaintext', 'tsv': 'plaintext', 'err': 'plaintext',
            '.env': 'plaintext', '.gitignore': 'plaintext', '.npmrc': 'plaintext', '.gitattributes': 'plaintext',
            'makefile': 'makefile',
        }
        return mapping.get(ext, "plaintext")
    base_filename = os.path.basename(filename)
    if base_filename == 'Dockerfile': return 'docker'
    if base_filename == 'Makefile': return 'makefile'
    if base_filename.startswith('.'): return 'plaintext'
    return "plaintext"


# _clean_filename might be less used if not parsing filenames from LLM for code
def _clean_filename(filename_line_content):
    text = filename_line_content.strip()
    text = re.sub(r'[`\*_]+', '', text)  # Remove markdown emphasis characters
    path_match = re.match(r'^([\w\-\.\s\/\\]+)', text)
    if path_match:
        parts = re.split(r'\s*\(', path_match.group(1).strip(), 1)
        return parts[0].strip() if parts else ""
    backtick_match = re.search(r'`([^`]+)`', text)
    if backtick_match:
        potential_fn = backtick_match.group(1).strip()
        parts = re.split(r'\s*\(|\s{2,}', potential_fn, 1)
        cleaned_fn = parts[0].strip() if parts else ""
        cleaned_fn = cleaned_fn.strip('`\'":;,')
        if cleaned_fn: return cleaned_fn
    parts = re.split(r'\s*\(|\s{2,}', text, 1)
    filename_candidate = parts[0].strip() if parts else text.strip()
    filename_candidate = filename_candidate.strip('`\'":;,')
    return filename_candidate if filename_candidate else text.strip()

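# Illustrative expectations for the helpers above (assumed examples, not exhaustive):
#   _clean_filename("**`research_summary.md`** (final)") -> "research_summary.md"
#   _infer_lang_from_filename("research_summary.md")      -> "markdown"
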
# _parse_chat_stream_logic: Adapting for potential structured report output from LLM
def _parse_chat_stream_logic(chat_json_string, existing_outputs_state=None):
    global parsed_research_outputs_cache
    latest_outputs_dict = {}
    if existing_outputs_state:
        for item in existing_outputs_state: latest_outputs_dict[item["filename"]] = item.copy()

    results = {"parsed_outputs": [], "preview_md": "", "error_message": None}
    try:
        ai_chat_history = json.loads(chat_json_string)
        if not isinstance(ai_chat_history, list): raise ValueError("JSON input must be a list of chat messages.")
    except json.JSONDecodeError as e: results["error_message"] = f"JSON Parsing Error: {e}."; return results
    except ValueError as e: results["error_message"] = str(e); return results

    message_obj = None
    if ai_chat_history and isinstance(ai_chat_history[-1], dict) and ai_chat_history[-1].get("role", "").lower() == BOT_ROLE_NAME:
        message_obj = ai_chat_history[-1]

    if not message_obj:
        results["parsed_outputs"] = list(latest_outputs_dict.values())
        return results

    role, content = message_obj.get("role", "").lower(), message_obj.get("content", "")

    # Check for report format
    report_pattern = re.compile(r"### Report:\s*(?P<filename_line>[^\n]+)\n```(?P<lang>[\w\.\-\+]*)\n(?P<code>[\s\S]*?)\n```")

    if role == BOT_ROLE_NAME:
        for match in report_pattern.finditer(content):
            filename = _clean_filename(match.group("filename_line"))
            if not filename: continue
            lang, code_block = match.group("lang"), match.group("code")
            item_data = {
                "filename": filename,
                "code": code_block.strip(),
                "language": (lang.strip().lower() if lang else _infer_lang_from_filename(filename)),
                "is_report": True
            }
            latest_outputs_dict[filename] = item_data  # Overwrite if exists

    current_parsed_outputs = list(latest_outputs_dict.values())
    parsed_research_outputs_cache = current_parsed_outputs  # Update global cache
    results["parsed_outputs"] = current_parsed_outputs
    return results

# _generate_ui_outputs_from_cache: Adapting for research reports
def _generate_ui_outputs_from_cache():
    global parsed_research_outputs_cache
    preview_md_val = "*No structured reports generated by AI yet.*"
    formatted_md_val = "# Research Agent Output\n\n*No structured reports generated yet.*"
    download_file = None

    if parsed_research_outputs_cache:
        preview_md_lines = ["## Generated Reports/Structured Outputs:"]
        main_report_content = ""
        for item in parsed_research_outputs_cache:
            if item.get("is_report"):
                preview_md_lines.append(f"\n----\n**Report:** `{escape_html_for_markdown(item['filename'])}` (Language: `{item['language']}`)\n")
                preview_md_lines.append(f"\n{bbb}{item.get('language', 'plaintext') or 'plaintext'}\n{item.get('code','')}\n{bbb}\n")
                if not main_report_content:  # Take the first report as the main one for formatted output
                    main_report_content = f"# Report: {item['filename']}\n\n{bbb}{item.get('language', 'plaintext') or 'plaintext'}\n{item.get('code','')}\n{bbb}"

        preview_md_val = "\n".join(preview_md_lines)
        if main_report_content:
            formatted_md_val = main_report_content
            try:
                # Use the report filename for download if available, else generic
                report_filename_for_download = "research_report.md"
                if parsed_research_outputs_cache and parsed_research_outputs_cache[0].get("filename"):
                    report_filename_for_download = parsed_research_outputs_cache[0]["filename"]

                with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md", prefix=report_filename_for_download.split('.')[0] + "_", encoding='utf-8') as tmpfile:
                    tmpfile.write(main_report_content); download_file = tmpfile.name
            except Exception as e: print(f"Error creating temp file for report: {e}")
        else:  # If no structured report, but there's other content in cache (future use)
            formatted_md_val = "# Research Agent Output\n\n*No specific report found, showing raw cache if any.*"
            # Potentially list other non-report items here if the cache structure evolves

    return formatted_md_val, preview_md_val, gr.update(value=download_file, interactive=download_file is not None)


def _convert_gr_history_to_api_messages(system_prompt, gr_history, current_user_message=None):
    messages = [{"role": "system", "content": system_prompt}] if system_prompt else []
    for user_msg, bot_msg_or_tool_resp in gr_history:
        if user_msg: messages.append({"role": "user", "content": user_msg})
        if bot_msg_or_tool_resp:
            # Check if it's a tool call from the assistant or a tool response
            try:
                # Attempt to parse as JSON, in case it's a tool_calls object from the assistant
                # or a tool response object we constructed.
                potential_json = json.loads(bot_msg_or_tool_resp)
                if isinstance(potential_json, dict) and "tool_calls" in potential_json and isinstance(potential_json["tool_calls"], list):
                    # This is an assistant's message with tool calls
                    messages.append({
                        "role": BOT_ROLE_NAME,
                        "content": None,  # OpenAI expects content to be null for a tool_calls-only message
                        "tool_calls": potential_json["tool_calls"]
                    })
                elif isinstance(potential_json, dict) and "tool_call_id" in potential_json and "role" in potential_json and potential_json["role"] == TOOL_ROLE_NAME:
                    # This is a tool response message we constructed
                    messages.append(potential_json)  # Already in correct format
                else:  # Not a special JSON, treat as regular bot message
                    messages.append({"role": BOT_ROLE_NAME, "content": str(bot_msg_or_tool_resp)})
            except json.JSONDecodeError:  # Not JSON, treat as regular bot message
                messages.append({"role": BOT_ROLE_NAME, "content": str(bot_msg_or_tool_resp)})

    if current_user_message: messages.append({"role": "user", "content": current_user_message})
    return messages

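# A minimal sketch of the conversion above (values are illustrative):
#   gr_history = [("Who founded Groq?", '{"tool_calls": [...]}'),
#                 (None, '{"tool_call_id": "call_1", "role": "tool", "name": "search_web", "content": "[...]"}')]
#   _convert_gr_history_to_api_messages(DEFAULT_SYSTEM_PROMPT, gr_history, "Summarize that.")
#   -> [system message, user question, assistant tool_calls message,
#       tool response message, new user message]
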
# --- New Tool Functions ---
def search_web(query: str, tavily_api_key: str):
    """Performs a web search using the Tavily API."""
    if not tavily_api_key:
        return json.dumps({"error": "Tavily API key not provided."})
    try:
        client = TavilyClient(api_key=tavily_api_key)
        response = client.search(query=query, search_depth="basic", max_results=5)  # "basic" is often enough
        # `response` includes 'results', a list of dicts: {'title': ..., 'url': ..., 'content': ...}
        # We return the stringified JSON of the results for the LLM.
        return json.dumps(response.get("results", []))
    except Exception as e:
        return json.dumps({"error": f"Tavily search failed: {str(e)}"})

def browse_web_page(url: str):
    """Fetches and extracts text content from a web page."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Try Newspaper3k first for cleaner article text
        try:
            article = Article(url)
            article.download(input_html=response.text)  # Reuse the HTML we already fetched
            article.parse()
            content = article.text
            if content and len(content.strip()) > 100:  # Newspaper3k extracted substantial content
                return json.dumps({"url": url, "content": content[:MAX_WEBPAGE_CONTENT_LENGTH]})
        except Exception as e:
            print(f"Newspaper3k failed for {url}: {e}. Falling back to BeautifulSoup.")

        # Fallback to BeautifulSoup if Newspaper3k fails or gets minimal content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove script and style elements
        for script_or_style in soup(["script", "style"]):
            script_or_style.decompose()

        text = soup.get_text(separator='\n', strip=True)
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)

        if not text:
            return json.dumps({"url": url, "content": "[No text content found or page is primarily non-textual]"})

        return json.dumps({"url": url, "content": text[:MAX_WEBPAGE_CONTENT_LENGTH]})
    except requests.exceptions.RequestException as e:
        return json.dumps({"url": url, "error": f"Failed to fetch URL: {str(e)}"})
    except Exception as e:
        return json.dumps({"url": url, "error": f"Error processing page: {str(e)}"})

available_tools = {
    "search_web": search_web,
    "browse_web_page": browse_web_page,
}

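# Hypothetical local smoke test for the tools (not executed by the app):
#   os.environ.setdefault("TAVILY_API_KEY", "tvly-...")
#   print(search_web("Groq LPU inference", os.environ["TAVILY_API_KEY"]))
#   print(browse_web_page("https://example.com"))
# Both helpers always return a JSON string, so failures surface as
# {"error": ...} payloads the LLM can read rather than raised exceptions.
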
# --- Main Chat Handler ---
def handle_research_chat_submit(user_message, chat_history, groq_api_key, tavily_api_key, model_select, system_prompt):
    global parsed_research_outputs_cache
    _chat_msg_in, _chat_hist, _status = "", list(chat_history), "Initializing..."
    _detected_outputs_update, _formatted_output_update, _download_btn_update = gr.update(), gr.update(), gr.update(interactive=False, value=None)

    if not user_message.strip():
        _status = "Cannot send an empty message."
        yield (user_message, _chat_hist, _status, _detected_outputs_update, _formatted_output_update, _download_btn_update); return

    _chat_hist.append((user_message, None))
    yield (_chat_msg_in, _chat_hist, _status, _detected_outputs_update, _formatted_output_update, _download_btn_update)

    effective_groq_api_key = groq_api_key or os.environ.get("GROQ_API_KEY")
    effective_tavily_api_key = tavily_api_key or os.environ.get("TAVILY_API_KEY")

    if not effective_groq_api_key:
        _chat_hist[-1] = (user_message, "Error: Groq API Key not set."); _status = "Groq API Key missing."
        yield (_chat_msg_in, _chat_hist, _status, _detected_outputs_update, _formatted_output_update, _download_btn_update); return

    current_sys_prompt = system_prompt.strip() or DEFAULT_SYSTEM_PROMPT

    # Tool definitions for the API
    tools_for_api = [
        {
            "type": "function",
            "function": {
                "name": "search_web",
                "description": "Searches the web for a given query using Tavily.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "query": {"type": "string", "description": "The search query."},
                    },
                    "required": ["query"],
                },
            },
        },
        {
            "type": "function",
            "function": {
                "name": "browse_web_page",
                "description": "Fetches and extracts text content from a given URL.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "url": {"type": "string", "description": "The URL of the web page to browse."},
                    },
                    "required": ["url"],
                },
            },
        },
    ]

    # Convert the current chat history for the API. For the user's new message,
    # the history is _chat_hist[:-1] and current_user_message is user_message.
    api_msgs = _convert_gr_history_to_api_messages(current_sys_prompt, _chat_hist[:-1], user_message)

    max_tool_iterations = 5  # Prevent infinite tool-call loops
    current_iteration = 0

    while current_iteration < max_tool_iterations:
        current_iteration += 1
        headers = {"Authorization": f"Bearer {effective_groq_api_key}", "Content-Type": "application/json"}
        payload = {"model": model_select, "messages": api_msgs, "tools": tools_for_api, "tool_choice": "auto"}

        try:
            _status = f"Waiting for {model_select} (Iteration {current_iteration})..."
            # Stream intermediate status to the user via the chat history
            if _chat_hist[-1] and _chat_hist[-1][1] is None:  # Last bot slot is empty (first iteration of this turn)
                _chat_hist[-1] = (_chat_hist[-1][0], f"<i>{_status}</i>")
            else:  # There's already a bot message (e.g. a tool response was added)
                _chat_hist.append((None, f"<i>{_status}</i>"))

            yield (_chat_msg_in, _chat_hist, _status, _detected_outputs_update, _formatted_output_update, _download_btn_update)

            response = requests.post(GROQ_API_ENDPOINT, headers=headers, json=payload, timeout=180)
            response.raise_for_status()
            api_resp_json = response.json()

            # Clean up the "Waiting..." message once a real response arrives
            if _chat_hist and _chat_hist[-1][1] and _chat_hist[-1][1].startswith("<i>Waiting for"):
                if _chat_hist[-1][0] is None:  # It was a status-only message
                    _chat_hist.pop()
                else:  # It was part of a user-bot turn
                    _chat_hist[-1] = (_chat_hist[-1][0], None)  # Clear the status for now

            if not api_resp_json.get("choices") or not api_resp_json["choices"][0]:
                raise ValueError("API response missing choices.")

            message = api_resp_json["choices"][0].get("message")
            finish_reason = api_resp_json["choices"][0].get("finish_reason")

            if not message:
                raise ValueError("API response missing message object in choice.")

            # Add the assistant's response (or tool call) to the API message list
            # for the next potential iteration
            api_msgs.append(message)

            if message.get("tool_calls"):
                _status = "AI requested to use tools. Executing..."
                # Show the tool call request itself in the chat history for visibility.
                # The actual tool response will be added later.
                tool_call_request_str = json.dumps({"tool_calls": message["tool_calls"]})
                if _chat_hist[-1] and _chat_hist[-1][1] is None:
                    _chat_hist[-1] = (_chat_hist[-1][0], f"🤖 Requesting tools:\n```json\n{tool_call_request_str}\n```")
                else:
                    _chat_hist.append((None, f"🤖 Requesting tools:\n```json\n{tool_call_request_str}\n```"))
                yield (_chat_msg_in, _chat_hist, _status, _detected_outputs_update, _formatted_output_update, _download_btn_update)

                for tool_call in message["tool_calls"]:
                    function_name = tool_call["function"]["name"]
                    function_args = json.loads(tool_call["function"]["arguments"])
                    tool_call_id = tool_call["id"]

                    if function_name not in available_tools:
                        tool_response_content = json.dumps({"error": f"Tool '{function_name}' not found."})
                        _status = f"Error: Tool '{function_name}' not found."
                    else:
                        _status = f"Executing tool: {function_name} with args: {function_args}"
                        # Show the tool execution status in the chat history
                        _chat_hist.append((None, f"🛠️ Executing: {function_name}({json.dumps(function_args)})"))
                        yield (_chat_msg_in, _chat_hist, _status, _detected_outputs_update, _formatted_output_update, _download_btn_update)

                        tool_function = available_tools[function_name]
                        if function_name == "search_web":
                            if not effective_tavily_api_key:
                                tool_response_content = json.dumps({"error": "Tavily API key not configured by user."})
                                _status = "Error: Tavily API Key not set by user."
                            else:
                                tool_response_content = tool_function(query=function_args["query"], tavily_api_key=effective_tavily_api_key)
                        elif function_name == "browse_web_page":
                            tool_response_content = tool_function(url=function_args["url"])
                        else:  # Unreachable while function_name is in available_tools
                            tool_response_content = json.dumps({"error": "Unknown tool execution path."})

                    # Add the tool response to the API message list for the LLM
                    tool_response_message = {
                        "tool_call_id": tool_call_id,
                        "role": TOOL_ROLE_NAME,
                        "name": function_name,
                        "content": tool_response_content,  # The JSON string result from the tool
                    }
                    api_msgs.append(tool_response_message)

                    # Add the tool response to the Gradio chat history for visibility,
                    # truncating long content for display
                    display_content = tool_response_content
                    if len(display_content) > 500:
                        display_content = display_content[:500] + "... (truncated for display)"
                    _chat_hist.append((None, f"⚙️ Tool Result ({function_name}):\n```json\n{display_content}\n```"))
                    yield (_chat_msg_in, _chat_hist, _status, _detected_outputs_update, _formatted_output_update, _download_btn_update)

                # If finish_reason is 'tool_calls', continue the loop to let the LLM process the tool results
                if finish_reason == "tool_calls":
                    continue
                else:  # The LLM called tools AND responded in the same turn (unusual for the OpenAI spec, but handled)
                    if message.get("content"):
                        bot_response_actual = message.get("content", "")
                        _chat_hist.append((None, bot_response_actual))  # Add the text response as well
                        yield (_chat_msg_in, _chat_hist, _status, _detected_outputs_update, _formatted_output_update, _download_btn_update)
                        break  # Exit the loop as the LLM also provided content
                    else:  # Only tool calls, continue the loop
                        continue

            elif message.get("content"):  # Standard text response from the LLM
                bot_response_actual = message.get("content", "")
                if _chat_hist[-1] and _chat_hist[-1][1] is None:
                    _chat_hist[-1] = (_chat_hist[-1][0], bot_response_actual)
                else:
                    _chat_hist.append((None, bot_response_actual))
                _status = "AI response received."

                # Try to parse the response for structured reports
                latest_bot_message_json = json.dumps([{"role": BOT_ROLE_NAME, "content": bot_response_actual}], indent=2)
                parsing_res = _parse_chat_stream_logic(latest_bot_message_json, existing_outputs_state=parsed_research_outputs_cache)
                if parsing_res["error_message"]:
                    _status = f"Parsing Error: {parsing_res['error_message']}"
                    _detected_outputs_update = gr.Markdown(f"## Parsing Error\n`{escape_html_for_markdown(parsing_res['error_message'])}`")
                else:
                    _formatted_output_update, _detected_outputs_update, _download_btn_update = _generate_ui_outputs_from_cache()
                    _status = "Processing complete. Previews updated."

                yield (_chat_msg_in, _chat_hist, _status, _detected_outputs_update, _formatted_output_update, _download_btn_update)
                return  # End of processing for this user message

            else:  # No tool_calls and no content: unusual
                _status = "AI response was empty or malformed."
                _chat_hist.append((None, f"<i>{_status}</i>"))
                yield (_chat_msg_in, _chat_hist, _status, _detected_outputs_update, _formatted_output_update, _download_btn_update)
                return

        except requests.exceptions.HTTPError as e: error_msg = f"API HTTP Error: {e} - {e.response.text if e.response else 'No details'}"
        except requests.exceptions.RequestException as e: error_msg = f"API Request Error: {e}"
        except Exception as e: error_msg = f"Unexpected error in chat submit: {str(e)}"

        # Error handling for the loop (reached only when an exception set error_msg)
        _chat_hist.append((None, error_msg))
        _status = error_msg
        yield (_chat_msg_in, _chat_hist, _status, _detected_outputs_update, _formatted_output_update, _download_btn_update)
        return  # Exit on error

    if current_iteration >= max_tool_iterations:
        _status = "Max tool iterations reached. AI may be in a loop."
        _chat_hist.append((None, f"<i>{_status}</i>"))
        yield (_chat_msg_in, _chat_hist, _status, _detected_outputs_update, _formatted_output_update, _download_btn_update)

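# Design note (an assumption based on the loop above): each iteration issues one
# chat-completion request; "tool_calls" responses are executed locally and fed
# back as role="tool" messages, so max_tool_iterations bounds the number of LLM
# round-trips per user message at five.
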
# --- UI Definition ---
custom_theme = gr.themes.Base(primary_hue="teal", secondary_hue="purple", neutral_hue="zinc", text_size="sm", spacing_size="md", radius_size="sm", font=["System UI", "sans-serif"])
custom_css = """ /* ... existing custom CSS ... */ """

with gr.Blocks(theme=custom_theme, css=custom_css) as demo:
    gr.Markdown("# 🌐 Internet Research Mega Agent")
    gr.Markdown("Ask questions or research topics. The AI will use web search and browsing tools to find answers.")
    with gr.Row():
        with gr.Sidebar():
            gr.Markdown("## ⚙️ Configuration")
            with gr.Group():
                gr.Markdown("### API Keys")
                groq_api_key_input = gr.Textbox(label="Groq API Key", type="password", placeholder="gsk_...", info="Needed for LLM.")
                tavily_api_key_input = gr.Textbox(label="Tavily API Key", type="password", placeholder="tvly-...", info="Needed for web search tool.")
            with gr.Group():
                gr.Markdown("### AI Model Settings")
                groq_model_select = gr.Dropdown(label="Groq Model", choices=["mixtral-8x7b-32768", "llama3-8b-8192", "llama3-70b-8192", "gemma-7b-it"], value="llama3-70b-8192", info="llama3-70b is recommended for better reasoning with tools.")
                groq_system_prompt_input = gr.Textbox(label="System Prompt", lines=10, value=DEFAULT_SYSTEM_PROMPT, interactive=True)

        with gr.Column(scale=3):
            gr.Markdown("## 💬 AI Research Assistant Chat")
            research_chatbot_display = gr.Chatbot(label="AI Research Chat", height=500, bubble_full_width=False, avatar_images=(None, "https://raw.githubusercontent.com/groq/groq-api-cookbook/main/groq.png"))
            with gr.Row():
                research_chat_message_input = gr.Textbox(show_label=False, placeholder="Ask your research question...", scale=7)
                research_send_chat_button = gr.Button("Send", variant="primary", scale=1, size="lg")
            research_status_output = gr.Textbox(label="Agent Status", interactive=False, lines=1, value="Ready.")

    gr.Markdown("---")
    with gr.Tabs():
        with gr.TabItem("📄 Generated Report/Output"):
            gr.Markdown("If the AI generates a structured report, it will appear here.")
            formatted_research_output_display = gr.Textbox(label="Current Research Report", lines=15, interactive=True, show_copy_button=True, value="*Research reports will appear here...*")
            download_report_button = gr.DownloadButton(label="Download Report", interactive=False, size="sm")
        with gr.TabItem("🔍 Intermediate Outputs Preview"):
            detected_outputs_preview = gr.Markdown(value="*Intermediate outputs or tool call details might show here...*")

    # --- Event Handlers ---
    chat_outputs = [research_chat_message_input, research_chatbot_display, research_status_output, detected_outputs_preview, formatted_research_output_display, download_report_button]
    chat_inputs = [research_chat_message_input, research_chatbot_display, groq_api_key_input, tavily_api_key_input, groq_model_select, groq_system_prompt_input]

    research_send_chat_button.click(fn=handle_research_chat_submit, inputs=chat_inputs, outputs=chat_outputs)
    research_chat_message_input.submit(fn=handle_research_chat_submit, inputs=chat_inputs, outputs=chat_outputs)

    # Removed Hugging Face specific buttons and their handlers:
    # - load_space_button, build_space_button, refresh_status_button
    # - file_browser_dropdown, file_content_editor, commit_message_input, update_file_button, delete_file_button
    # and their corresponding output components, since they are not repurposed here.

if __name__ == "__main__":
    # For local testing you may set the API keys as environment variables
    # (do not hard-code real keys in a shared script):
    # os.environ["GROQ_API_KEY"] = "your_groq_key"
    # os.environ["TAVILY_API_KEY"] = "your_tavily_key"
    demo.launch(debug=True, share=False)
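
Note: app.py alone will not start on Spaces without its dependencies. Judging from the imports above, the accompanying requirements.txt would need roughly:

    gradio
    requests
    beautifulsoup4
    newspaper3k
    tavily-python

(Package names are assumed from the imports; newspaper3k also pulls in lxml and nltk as its own dependencies.)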