wt002 committed
Commit 51e50b5 · verified · 1 Parent(s): a6c4989

Update app.py

Files changed (1)
  1. app.py +169 -313
app.py CHANGED
@@ -1,366 +1,222 @@
  import os
- import re
  import gradio as gr
  import requests
  import pandas as pd
  import heapq
  from collections import Counter
  from io import BytesIO
  from youtube_transcript_api import YouTubeTranscriptApi
- from smolagents import tool, Tool, CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, SpeechToTextTool, FinalAnswerTool
- from langchain_community.document_loaders import WikipediaLoader, PyPDFLoader, TextLoader
- from dotenv import load_dotenv
- import tempfile
- import mimetypes
- import logging
 
- # --- Initialize logging ---
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- logger = logging.getLogger(__name__)
 
- # --- Load environment variables ---
  load_dotenv()
- HF_API_TOKEN = os.getenv("HF_API_TOKEN")
- if not HF_API_TOKEN:
-     logger.error("HF_API_TOKEN not found in environment variables! Please set it to use the HfApiModel.")
-     # Exit or raise an error if the token is critical for functionality
-     # sys.exit(1) # Uncomment if you want to exit the script if token is missing
-
- # --- Utility Functions ---
- def extract_youtube_id(url: str) -> str:
-     """Extract YouTube ID from various URL formats"""
-     patterns = [
-         r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([^&]+)',
-         r'(?:https?:\/\/)?youtu\.be\/([^?]+)',
-         r'([a-zA-Z0-9_-]{11})' # Catches just the ID if provided directly
-     ]
-     for pattern in patterns:
-         match = re.search(pattern, url)
-         if match:
-             return match.group(1)
-     return ""
-
- # --- Enhanced Tools ---
  class WikiSearchTool(Tool):
-     """Enhanced Wikipedia search with better formatting and error handling"""
      name = "wiki_search"
-     description = "Search Wikipedia for a query. Returns up to 2 results with metadata."
-     inputs = {"query": {"type": "string", "description": "Search term for Wikipedia"}}
      output_type = "string"
 
      def forward(self, query: str) -> str:
-         try:
-             logger.info(f"Searching Wikipedia for: {query}")
-             docs = WikipediaLoader(query=query, load_max_docs=2).load()
-             if not docs:
-                 logger.info(f"No Wikipedia articles found for: {query}")
-                 return "No Wikipedia articles found."
-
-             formatted_results = []
-             for i, doc in enumerate(docs):
-                 # Limit page content length to avoid overwhelming the model, but provide enough context
-                 summary = doc.page_content[:1000] + "..." if len(doc.page_content) > 1000 else doc.page_content
-                 formatted_results.append(
-                     f"--- Wikipedia Result {i+1} ---\n"
-                     f"Title: {doc.metadata.get('title', 'N/A')}\n"
-                     f"URL: {doc.metadata.get('source', 'N/A')}\n"
-                     f"Summary: {summary}\n"
-                 )
-             return "\n\n".join(formatted_results)
-         except Exception as e:
-             logger.error(f"Wikipedia search error for '{query}': {e}")
-             return f"Wikipedia search error: {str(e)}"
-
- class FileAnalysisTool(Tool):
-     """Universal file analyzer for text/PDF/Excel files"""
-     name = "file_analysis"
-     description = "Analyze text, PDF, and Excel files. Returns extracted content."
-     inputs = {"file_path": {"type": "string", "description": "Path to the local file"}}
      output_type = "string"
 
-     def forward(self, file_path: str) -> str:
-         if not os.path.exists(file_path):
-             return f"File not found: {file_path}"
 
          try:
-             mime_type, _ = mimetypes.guess_type(file_path)
-             logger.info(f"Analyzing file: {file_path} with MIME type: {mime_type}")
-
-             if mime_type == "application/pdf":
-                 return self._process_pdf(file_path)
-             elif mime_type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "application/vnd.ms-excel"]:
-                 return self._process_excel(file_path)
-             elif mime_type and ("text" in mime_type or "csv" in mime_type):
-                 return self._process_text(file_path)
-             else:
-                 return f"Unsupported file type for analysis: {mime_type}. Only PDF, Excel, and text/CSV files are supported."
          except Exception as e:
-             logger.error(f"File analysis error for '{file_path}': {e}")
-             return f"File analysis error: {str(e)}"
-
-     def _process_pdf(self, path: str) -> str:
-         loader = PyPDFLoader(path)
-         docs = loader.load()
-         content = "\n\n".join([doc.page_content for doc in docs])
-         # Truncate to avoid excessive token usage, provide a warning if truncated
-         if len(content) > 8000:
-             logger.warning(f"PDF content truncated from {len(content)} to 8000 characters for {path}")
-             return content[:8000] + "\n... [Content truncated]"
-         return content
-
-     def _process_excel(self, path: str) -> str:
-         df = pd.read_excel(path)
-         # Provide a sample of the data and its basic info
-         info = BytesIO()
-         df.info(buf=info)
-         info_str = info.getvalue().decode('utf-8')
 
-         return (f"Excel file loaded. First 10 rows:\n{df.head(10).to_markdown()}\n\n"
-                 f"DataFrame Info:\n{info_str}")
 
-     def _process_text(self, path: str) -> str:
-         with open(path, 'r', encoding='utf-8') as f:
-             content = f.read()
-         if len(content) > 8000:
-             logger.warning(f"Text file content truncated from {len(content)} to 8000 characters for {path}")
-             return content[:8000] + "\n... [Content truncated]"
-         return content
 
  class VideoTranscriptionTool(Tool):
-     """Enhanced YouTube transcription with multilingual support and better output"""
      name = "transcript_video"
-     description = "Fetch YouTube video transcripts with optional timestamps. Supports English, French, Spanish, German."
      inputs = {
-         "url": {"type": "string", "description": "YouTube URL or ID"},
-         "include_timestamps": {"type": "boolean", "description": "Include timestamps? (default: False)"}
      }
      output_type = "string"
 
      def forward(self, url: str, include_timestamps: bool = False) -> str:
          try:
-             video_id = extract_youtube_id(url)
-             if not video_id:
-                 return "Invalid YouTube URL or ID format. Please provide a valid YouTube URL or an 11-character video ID."
-
-             logger.info(f"Attempting to transcribe video ID: {video_id}")
-             transcript = YouTubeTranscriptApi.get_transcript(
-                 video_id,
-                 languages=['en', 'fr', 'es', 'de'] # Prioritize common languages
-             )
-
-             if not transcript:
-                 return f"No transcript found for video ID: {video_id} in supported languages (en, fr, es, de)."
 
              if include_timestamps:
-                 formatted_transcript = "\n".join(
-                     f"[{int(seg['start']//60):02d}:{int(seg['start']%60):02d}] {seg['text']}"
-                     for seg in transcript
-                 )
              else:
-                 formatted_transcript = " ".join(seg['text'] for seg in transcript)
-
-             return formatted_transcript
-         except Exception as e:
-             logger.error(f"Transcription error for '{url}': {e}")
-             return f"Transcription error: {str(e)}. This might be due to no available transcript or an unsupported video."
 
- class DataAnalysisTool(Tool):
-     """Perform data analysis using pandas on structured data (CSV/Excel)"""
-     name = "data_analysis"
-     description = "Analyze CSV/Excel data using pandas operations. Supported operations: 'describe', 'groupby:column:aggfunc' (e.g., 'groupby:Category:mean')."
-     inputs = {
-         "file_path": {"type": "string", "description": "Path to the local data file (CSV or Excel)"},
-         "operation": {"type": "string", "description": "Pandas operation (e.g., 'describe', 'groupby:column_name:mean')"}
-     }
-     output_type = "string"
-
-     def forward(self, file_path: str, operation: str) -> str:
-         if not os.path.exists(file_path):
-             return f"File not found: {file_path}"
-
-         try:
-             if file_path.endswith('.csv'):
-                 df = pd.read_csv(file_path)
-             elif file_path.endswith('.xlsx') or file_path.endswith('.xls'):
-                 df = pd.read_excel(file_path)
-             else:
-                 return "Unsupported file format for data analysis. Please provide a .csv or .xlsx file."
-
-             logger.info(f"Performing data analysis operation '{operation}' on {file_path}")
-
-             if operation == "describe":
-                 return "Descriptive Statistics:\n" + str(df.describe())
-             elif operation.startswith("groupby:"):
-                 parts = operation.split(":")
-                 if len(parts) == 3:
-                     _, col, agg = parts
-                     if col not in df.columns:
-                         return f"Column '{col}' not found in the DataFrame."
-                     try:
-                         result = df.groupby(col).agg(agg)
-                         return f"Groupby operation '{agg}' on column '{col}':\n" + str(result)
-                     except Exception as agg_e:
-                         return f"Error performing aggregation '{agg}' on column '{col}': {str(agg_e)}"
-                 else:
-                     return "Invalid 'groupby' operation format. Use 'groupby:column_name:agg_function'."
-             else:
-                 return "Unsupported operation. Try: 'describe' or 'groupby:column_name:agg_function'."
          except Exception as e:
-             logger.error(f"Data analysis error for '{file_path}' with operation '{operation}': {e}")
-             return f"Data analysis error: {str(e)}. Please check file content and operation."
 
- # --- Agent Initialization ---
  class BasicAgent:
      def __init__(self):
-         self.model = HfApiModel(
-             temperature=0.1, # Slightly increased temperature for more creative responses if appropriate
-             token=HF_API_TOKEN,
-             max_tokens=2000
          )
-
-         self.tools = self._initialize_tools()
-         self.agent = self._create_agent()
-
-     def _initialize_tools(self) -> list:
-         """Initialize all tools with enhanced capabilities"""
-         return [
-             DuckDuckGoSearchTool(),
-             WikiSearchTool(),
-             VisitWebpageTool(),
-             SpeechToTextTool(), # Might be less relevant for a text-based research agent but kept if needed
-             FinalAnswerTool(),
-             VideoTranscriptionTool(),
-             FileAnalysisTool(),
-             DataAnalysisTool(),
-             self._create_excel_download_tool(), # Renamed for clarity
-             self._create_keywords_tool()
-         ]
-
-     def _create_excel_download_tool(self):
-         """Tool to download and parse Excel files from a specific URL"""
-         @tool
-         def download_and_parse_excel(task_id: str) -> dict:
-             """
-             Downloads an Excel file from a predefined URL using a task_id and parses its content.
-             Returns a dictionary with status and data (first 20 rows).
-             """
-             try:
-                 url = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
-                 logger.info(f"Attempting to download Excel from: {url}")
-                 response = requests.get(url, timeout=60) # Increased timeout for larger files
-                 response.raise_for_status() # Raise an exception for HTTP errors (4xx or 5xx)
-
-                 with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
-                     tmp.write(response.content)
-                     temp_file_path = tmp.name
-
-                 df = pd.read_excel(temp_file_path)
-                 os.unlink(temp_file_path) # Clean up the temporary file
-
-                 logger.info(f"Successfully downloaded and parsed Excel for task_id: {task_id}")
-                 return {
-                     "task_id": task_id,
-                     "data_sample": df.head(10).to_dict(orient="records"), # Reduced to 10 for conciseness
-                     "status": "Success",
-                     "columns": df.columns.tolist(), # Added column names for context
-                     "shape": df.shape # Added shape for context
-                 }
-             except requests.exceptions.RequestException as req_err:
-                 logger.error(f"Network or HTTP error downloading Excel for task_id '{task_id}': {req_err}")
-                 return {"status": f"Download error: {str(req_err)}"}
-             except Exception as e:
-                 logger.error(f"Error parsing Excel for task_id '{task_id}': {e}")
-                 return {"status": f"Parsing error: {str(e)}"}
-         return download_and_parse_excel
-
-     def _create_keywords_tool(self):
-         """Keywords extractor with TF-IDF like scoring (basic frequency for now)"""
-         @tool
-         def extract_keywords(text: str, top_n: int = 5) -> list:
-             """
-             Extracts the most frequent keywords from a given text, excluding common stopwords.
-             Args:
-                 text (str): The input text to extract keywords from.
-                 top_n (int): The number of top keywords to return.
-             Returns:
-                 list: A list of the most frequent keywords.
-             """
-             if not text:
-                 return []
-
-             # Use a more comprehensive list of English stopwords
-             stopwords = set([
-                 "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
-                 "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
-                 "they", "this", "to", "was", "will", "with", "he", "she", "it's", "i", "we", "you", "my",
-                 "your", "our", "us", "him", "her", "his", "hers", "its", "them", "their", "what", "when",
-                 "where", "why", "how", "which", "who", "whom", "can", "could", "would", "should", "may",
-                 "might", "must", "have", "has", "had", "do", "does", "did", "am", "are", "is", "were", "been",
-                 "being", "from", "up", "down", "out", "off", "over", "under", "again", "further", "then",
-                 "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few",
-                 "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
-                 "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"
-             ])
-
-             words = re.findall(r'\b\w+\b', text.lower()) # Relaxed regex to capture all words
-             filtered = [w for w in words if w not in stopwords and len(w) > 2] # Filter words less than 3 chars
-             counter = Counter(filtered)
-             return [word for word, _ in counter.most_common(top_n)]
-         return extract_keywords
-
-     def _create_agent(self) -> CodeAgent:
-         """Create agent with improved system prompt"""
-         system_prompt = """
-         You are an advanced, helpful, and highly analytical research assistant. Your goal is to provide accurate, comprehensive, and well-structured answers to user queries, leveraging all available tools efficiently.
-
-         **Follow this robust process:**
-
-         1. **Understand the User's Need:** Carefully analyze the user's question, including any attached files or specific requests (e.g., "summarize," "analyze data," "find facts").
-         2. **Formulate a Detailed Plan:** Before acting, create a clear, step-by-step plan. This plan should outline:
-             * What information needs to be gathered.
-             * Which tools are most appropriate for each step (e.g., `duckduckgo_search` for general web search, `wiki_search` for encyclopedic facts, `transcript_video` for YouTube, `file_analysis` or `data_analysis` for local files).
-             * How you will combine information from different sources.
-             * How you will verify or synthesize the findings.
-         3. **Execute the Plan Using Tools:** Call the necessary tools, providing clear and correct arguments. If a tool fails, try to understand why and adapt your plan (e.g., try a different search query or tool).
-         4. **Synthesize and Verify Information:** Once you have gathered sufficient information, synthesize it into a coherent answer. Do not just list facts; explain their significance and how they relate to the original question. If there are contradictions or uncertainties, mention them.
-         5. **Formulate the Final Answer:**
-             * Present your answer clearly and concisely.
-             * Always begin your ultimate response with "FINAL ANSWER:".
-             * If the answer is a single number, provide only the number.
-             * If the answer is a list, provide comma-separated values.
-             * For complex answers, use structured formats like bullet points or JSON where appropriate to enhance readability.
-             #* **Crucially, always include sources or references (e.g., URLs, Wikipedia titles, file names) where you obtained the information.** This builds trust and allows for verification.
-             * If you used `file_analysis` or `data_analysis` tools on an uploaded file, explicitly state that you analyzed the provided file.
-
-         **Important Considerations:**
-             * **Prioritize:** If the query involves a specific file, start by analyzing that file if appropriate.
-             * **Ambiguity:** If the question is ambiguous, ask for clarification.
-             * **Limitations:** If you cannot answer a question with the available tools, state that clearly.
-             * **Conciseness:** Be as concise as possible while providing a complete and accurate answer.
          """
-         agent = CodeAgent(
-             model=self.model,
-             tools=self.tools,
              add_base_tools=True
          )
-         agent.prompt_templates["system_prompt"] = system_prompt
-         return agent
 
      def __call__(self, question: str) -> str:
          print(f"Agent received question (first 50 chars): {question[:50]}...")
          answer = self.agent.run(question)
         print(f"Agent returning answer: {answer}")
          return answer
-
-         logger.info(f"Received question: {question[:200]}...") # Log more of the question
-         try:
-             response = self.agent.run(question)
-             logger.info(f"Response generated successfully for question: {question[:200]}")
-             return response
-         except Exception as e:
-             logger.error(f"Agent execution failed for question '{question[:100]}': {str(e)}", exc_info=True) # Log full traceback
-             return f"Error processing your request: {str(e)}. Please try again or rephrase your question."
-
 
  def run_and_submit_all( profile: gr.OAuthProfile | None):
      """

  import os
  import gradio as gr
  import requests
+ import inspect
  import pandas as pd
+ from smolagents import tool, Tool, CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, SpeechToTextTool, FinalAnswerTool
+ from dotenv import load_dotenv
  import heapq
  from collections import Counter
+ import re
  from io import BytesIO
  from youtube_transcript_api import YouTubeTranscriptApi
+ from langchain_community.tools.tavily_search import TavilySearchResults
+ from langchain_community.document_loaders import WikipediaLoader
+ from langchain_community.utilities import WikipediaAPIWrapper
+ from langchain_community.document_loaders import ArxivLoader
 
+ # (Keep Constants as is)
+ # --- Constants ---
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
+ # Load environment variables
  load_dotenv()
+
+
+ from smolagents import Tool
+ from langchain_community.document_loaders import WikipediaLoader
+
  class WikiSearchTool(Tool):
      name = "wiki_search"
+     description = "Search Wikipedia for a query and return up to 2 results."
+     inputs = {
+         "query": {"type": "string", "description": "The search term for Wikipedia."}
+     }
      output_type = "string"
 
      def forward(self, query: str) -> str:
+         search_docs = WikipediaLoader(query=query, load_max_docs=2).load()
+
+         formatted_search_docs = "\n\n---\n\n".join(
+             [
+                 f'<Document source="{doc.metadata.get("source", "Wikipedia")}" page="{doc.metadata.get("page", "")}"/>\n{doc.page_content}\n</Document>'
+                 for doc in search_docs
+             ]
+         )
+         return formatted_search_docs
+
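A quick sanity check of the new output format, as a sketch (it assumes langchain_community and the wikipedia package are installed; the query string is arbitrary):

wiki_tool = WikiSearchTool()
print(wiki_tool.forward("Python (programming language)"))
# Expected shape: one or two <Document source="..." page=""/> ... </Document> blocks joined by "---"
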
+ class StringReverseTool(Tool):
+     name = "reverse_message"
+     description = "When you receive a strange-looking text, try to reverse it and perform the action described in the reversed message."
+     inputs = {
+         "message": {
+             "type": "string",
+             "description": "A message that looks strange and can be reversed to get the actions to execute."
+         }
+     }
      output_type = "string"
 
+     def __init__(self):
+         return
+
+     def forward(self, message: str):
+         return message[::-1]
+
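The reversal is a plain string slice, so a direct call behaves as expected (a sketch; the input is an arbitrary example):

reverse_tool = StringReverseTool()
print(reverse_tool.forward("olleh"))  # -> "hello"
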
+ class KeywordsExtractorTool(Tool):
+     """Extracts the top 5 keywords from a given text based on frequency."""
+
+     name = "keywords_extractor"
+     description = "This tool returns the 5 most frequent keywords that occur in the provided block of text."
+
+     inputs = {
+         "text": {
+             "type": "string",
+             "description": "Text to analyze for keywords.",
+         }
+     }
+     output_type = "string"
 
+     def forward(self, text: str) -> str:
          try:
+             all_words = re.findall(r'\b\w+\b', text.lower())
+             conjunctions = {'a', 'and', 'of', 'is', 'in', 'to', 'the'}
+             filtered_words = []
+             for w in all_words:
+                 if w not in conjunctions:
+                     filtered_words.append(w)
+             word_counts = Counter(filtered_words)
+             k = 5
+             return heapq.nlargest(k, word_counts.items(), key=lambda x: x[1])
          except Exception as e:
+             return f"Error during extracting the most common words: {e}"
+
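Note that forward returns the raw heapq.nlargest result, i.e. a list of (word, count) tuples rather than a single string; a sketch of a direct call (the sample sentence is arbitrary):

kw_tool = KeywordsExtractorTool()
print(kw_tool.forward("the cat sat on the mat and the cat slept"))
# -> [('cat', 2), ('sat', 1), ('on', 1), ('mat', 1), ('slept', 1)]  (order among the ties can vary)
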
+ @tool
+ def parse_excel_to_json(task_id: str) -> dict:
+     """
+     For a given task_id, fetch and parse an Excel file and return the parsed data as structured JSON.
+     Args:
+         task_id: A task ID to fetch.
+
+     Returns:
+         {
+             "task_id": str,
+             "sheets": {
+                 "SheetName1": [ {col1: val1, col2: val2, ...}, ... ],
+                 ...
+             },
+             "status": "Success" | "Error"
+         }
+     """
+     url = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
+
+     try:
+         response = requests.get(url, timeout=100)
+         if response.status_code != 200:
+             return {"task_id": task_id, "sheets": {}, "status": f"{response.status_code} - Failed"}
+
+         xls_content = pd.ExcelFile(BytesIO(response.content))
+         json_sheets = {}
+
+         for sheet in xls_content.sheet_names:
+             df = xls_content.parse(sheet)
+             df = df.dropna(how="all")
+             rows = df.head(20).to_dict(orient="records")
+             json_sheets[sheet] = rows
+
+         return {
+             "task_id": task_id,
+             "sheets": json_sheets,
+             "status": "Success"
+         }
+
+     except Exception as e:
+         return {
+             "task_id": task_id,
+             "sheets": {},
+             "status": f"Error in parsing Excel file: {str(e)}"
+         }
+
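Because the function is wrapped with @tool, the agent calls it by name, but it can also be exercised directly (a sketch; it assumes smolagents tool objects remain callable, and the task_id below is a placeholder, not a real one):

result = parse_excel_to_json("some-task-id")  # placeholder ID; a 404 comes back as {"status": "404 - Failed", ...}
if result["status"] == "Success":
    for sheet_name, rows in result["sheets"].items():
        print(sheet_name, len(rows), "rows")
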
  class VideoTranscriptionTool(Tool):
+     """Fetch transcripts from YouTube videos"""
      name = "transcript_video"
+     description = "Fetch the text transcript from a YouTube video, with optional timestamps"
      inputs = {
+         "url": {"type": "string", "description": "YouTube video URL or ID"},
+         "include_timestamps": {"type": "boolean", "description": "Whether timestamps should be included in the output", "nullable": True}
      }
      output_type = "string"
 
      def forward(self, url: str, include_timestamps: bool = False) -> str:
+
+         if "youtube.com/watch" in url:
+             video_id = url.split("v=")[1].split("&")[0]
+         elif "youtu.be/" in url:
+             video_id = url.split("youtu.be/")[1].split("?")[0]
+         elif len(url.strip()) == 11: # Direct ID
+             video_id = url.strip()
+         else:
+             return f"YouTube URL or ID: {url} is invalid!"
+
          try:
+             transcription = YouTubeTranscriptApi.get_transcript(video_id)
 
              if include_timestamps:
+                 formatted_transcription = []
+                 for part in transcription:
+                     timestamp = f"{int(part['start']//60)}:{int(part['start']%60):02d}"
+                     formatted_transcription.append(f"[{timestamp}] {part['text']}")
+                 return "\n".join(formatted_transcription)
              else:
+                 return " ".join([part['text'] for part in transcription])
 
          except Exception as e:
+             return f"Error in extracting YouTube transcript: {str(e)}"
 
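The URL handling accepts full watch URLs, youtu.be short links, or a bare 11-character ID, so the calls below all resolve to the same video (a sketch; the ID is a placeholder of the right length, not a real video):

yt_tool = VideoTranscriptionTool()
yt_tool.forward("https://www.youtube.com/watch?v=abcdefghijk")
yt_tool.forward("https://youtu.be/abcdefghijk")
yt_tool.forward("abcdefghijk", include_timestamps=True)  # returns "[m:ss] text" lines, one per caption segment
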
  class BasicAgent:
      def __init__(self):
+         token = os.environ.get("HF_API_TOKEN")
+         model = HfApiModel(
+             temperature=0.1,
+             token=token
          )
+
+         search_tool = DuckDuckGoSearchTool()
+         wiki_search_tool = WikiSearchTool()
+         str_reverse_tool = StringReverseTool()
+         keywords_extract_tool = KeywordsExtractorTool()
+         speech_to_text_tool = SpeechToTextTool()
+         visit_webpage_tool = VisitWebpageTool()
+         final_answer_tool = FinalAnswerTool()
+         video_transcription_tool = VideoTranscriptionTool()
+
+         system_prompt = f"""
+         You are my general AI assistant. Your task is to answer the question I asked.
+         First, provide an explanation of your reasoning, step by step, to arrive at the answer.
+         Then, return your final answer in a single line, formatted as follows: "FINAL ANSWER: [YOUR FINAL ANSWER]".
+         [YOUR FINAL ANSWER] should be a number, a string, or a comma-separated list of numbers and/or strings, depending on the question.
+         If the answer is a number, do not use commas or units (e.g., $, %) unless specified.
+         If the answer is a string, do not use articles or abbreviations (e.g., for cities), and write digits in plain text unless specified.
+         If the answer is a comma-separated list, apply the above rules for each element based on whether it is a number or a string.
          """
+         self.agent = CodeAgent(
+             model=model,
+             tools=[search_tool, wiki_search_tool, str_reverse_tool, keywords_extract_tool, speech_to_text_tool, visit_webpage_tool, final_answer_tool, parse_excel_to_json, video_transcription_tool],
              add_base_tools=True
          )
+         self.agent.prompt_templates["system_prompt"] = self.agent.prompt_templates["system_prompt"] + system_prompt
 
      def __call__(self, question: str) -> str:
          print(f"Agent received question (first 50 chars): {question[:50]}...")
          answer = self.agent.run(question)
          print(f"Agent returning answer: {answer}")
          return answer
 
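With the suffix appended to the default prompt template, running the agent end to end looks like this (a sketch; it needs HF_API_TOKEN set in the environment, and the question is just an example):

agent = BasicAgent()
reply = agent("What is the capital of France?")
print(reply)  # CodeAgent.run() output; the prompt asks for a closing line of the form "FINAL ANSWER: ..."
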
  def run_and_submit_all( profile: gr.OAuthProfile | None):
      """