Spaces:

stillerman
/

wikihop-server

Runtime error

App Files Files Community

stillerman committed on Apr 25

Commit

fa70ae5

0 Parent(s):

init

Browse files

Files changed (6) hide show

.gitignore +2 -0
README.md +1 -0
agent.py +136 -0
cli.py +163 -0
wiki_parser.py +131 -0
wiki_run_engine.py +82 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ *.bz2
2	+ data

README.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ wget https://dumps.wikimedia.org/simplewiki/20250420/simplewiki-20250420-pages-articles-multistream.xml.bz2

agent.py ADDED Viewed

	@@ -0,0 +1,136 @@

+# LLM agent player using HuggingFace's smolagent
+from wiki_run_engine import WikiRunEnvironment
+from rich.console import Console
+console = Console()
+try:
+    from smolagent import Agent, AgentConfig
+except ImportError:
+    console.print("[red]smolagent package not found. Please install with 'uv pip install smolagent'[/red]")
+    raise
class AgentPlayer:
    """Plays Wiki Run by letting an LLM agent pick the next link at every step.

    NOTE(review): the ``Agent``/``AgentConfig`` API (constructor arguments,
    ``run(prompt, tools=...)`` and a dict-shaped result) is used the way the
    original code assumed it works -- confirm against the installed package.
    """

    def __init__(self, wiki_data_path, model_name="HuggingFaceH4/zephyr-7b-beta"):
        """Create the game environment and the LLM agent.

        Args:
            wiki_data_path: Path handed to ``WikiRunEnvironment``.
            model_name: Model identifier forwarded to ``AgentConfig``.
        """
        self.env = WikiRunEnvironment(wiki_data_path)
        config = AgentConfig(
            model=model_name,
            output_parser="json",
        )
        self.agent = Agent(config)

    def _tool_spec(self):
        """Return a fresh schema list for the single ``choose_next_article`` tool."""
        return [
            {
                "name": "choose_next_article",
                "description": "Choose the next Wikipedia article to navigate to",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "article": {
                            "type": "string",
                            "description": "The title of the next article to navigate to",
                        },
                        "reasoning": {
                            "type": "string",
                            "description": "Explanation of why this article was chosen",
                        },
                    },
                    "required": ["article", "reasoning"],
                },
            }
        ]

    def play(self, start_article=None, target_article=None, max_steps=20):
        """Play one game of Wiki Run with the LLM agent.

        Args:
            start_article: Title to start from, or None to let the environment choose.
            target_article: Title to reach, or None to let the environment choose.
            max_steps: Maximum number of moves before giving up.

        Returns:
            The final state dict produced by the environment.
        """
        # Function-scope import: this module has no top-level ``random`` import.
        import random

        state = self.env.reset(start_article, target_article)
        console.print("[bold]Agent Wiki Run[/bold]")
        console.print(f"Starting article: [cyan]{state['current_article']}[/cyan]")
        console.print(f"Target article: [red]{state['target_article']}[/red]")
        console.print()

        steps = 0
        dead_end = False
        while not state['is_complete'] and steps < max_steps:
            console.print(f"[bold]Step {steps + 1}:[/bold]")
            console.print(f"Current article: [cyan]{state['current_article']}[/cyan]")

            links = state['available_links']
            if not links:
                # Without this guard the random fallback below would raise
                # IndexError on an empty list; it also avoids a pointless
                # agent call when no move is possible.
                dead_end = True
                console.print("[red]Dead end: this article has no available links.[/red]")
                break

            prompt = self._create_agent_prompt(state)
            tool_result = self.agent.run(prompt, tools=self._tool_spec())

            # Tolerate a malformed agent result: anything that is not the
            # expected nested dict is treated as "no choice made", which falls
            # through to the random fallback.
            choice = tool_result.get("choose_next_article") if isinstance(tool_result, dict) else None
            if not isinstance(choice, dict):
                choice = {}
            next_article = choice.get("article", "")
            reasoning = choice.get("reasoning", "")
            console.print(f"[yellow]Agent reasoning: {reasoning}[/yellow]")
            console.print(f"Agent chooses: [green]{next_article}[/green]")

            if next_article in links:
                state, message = self.env.step(next_article)
                if message:
                    console.print(f"[bold]{message}[/bold]")
            else:
                console.print("[red]Invalid choice! Agent selected an article that's not in the available links.[/red]")
                # Keep the game moving with a random valid link.
                next_article = random.choice(links)
                console.print(f"[yellow]Falling back to random choice: {next_article}[/yellow]")
                state, _ = self.env.step(next_article)

            steps += 1
            console.print()

        if state['is_complete']:
            console.print("[bold green]Success! Agent reached the target article![/bold green]")
        elif dead_end:
            console.print("[bold red]Stopped at a dead end before reaching the target.[/bold red]")
        else:
            console.print("[bold red]Failed to reach target within step limit.[/bold red]")
        console.print(f"Steps taken: [bold]{state['steps_taken']}[/bold]")
        console.print(f"Path: [italic]{' → '.join(state['path_taken'])}[/italic]")
        return state

    def _create_agent_prompt(self, state):
        """Build the text prompt describing the current position and options.

        Args:
            state: State dict with ``current_article``, ``target_article`` and
                ``available_links`` entries.

        Returns:
            The prompt string passed to the agent.
        """
        current = state['current_article']
        target = state['target_article']
        links = state['available_links']
        prompt = f"""You are playing the Wiki Run game. Your goal is to navigate from the current Wikipedia article to the target article using only the available links.
Current article: {current}
Target article: {target}
Available links (choose one):
{', '.join(links)}
Choose the link that you think will get you closest to the target article. Consider:
1. Direct connections to the target
2. Articles that might be in the same category as the target
3. General articles that might have many links to other topics
Use the choose_next_article tool to make your selection."""
        return prompt
if __name__ == "__main__":
    import sys

    # Script entry point: the only argument is the path to the wiki data.
    cli_args = sys.argv[1:]
    if not cli_args:
        console.print("[red]Please provide the path to Wikipedia data[/red]")
        console.print("Usage: python agent.py <wiki_data_path>")
        sys.exit(1)

    wiki_data_path = cli_args[0]
    agent = AgentPlayer(wiki_data_path)
    agent.play()

cli.py ADDED Viewed

	@@ -0,0 +1,163 @@

+# Rich CLI interface for the Wiki Run game
+from rich.console import Console
+from rich.panel import Panel
+from rich.markdown import Markdown
+from rich.prompt import Prompt
+from rich.text import Text
+from rich.table import Table
+from wiki_run_engine import WikiRunEnvironment
+console = Console()
class WikiRunCLI:
    """Interactive terminal front-end for the Wiki Run game."""

    def __init__(self, wiki_data_path):
        """Load the game environment and create the console used for output.

        Args:
            wiki_data_path: Path handed to ``WikiRunEnvironment``.
        """
        self.env = WikiRunEnvironment(wiki_data_path)
        self.console = Console()

    def _display_article(self, state):
        """Print the current article followed by a numbered table of its links."""
        out = self.console
        out.print(Panel(f"[bold cyan]{state['current_article']}[/bold cyan]",
                        expand=False,
                        border_style="cyan"))
        # The article text is rendered as-is; links are not highlighted inline
        # but listed separately below.
        out.print(Markdown(state['article_text']))

        out.print("\n[bold green]Available Links:[/bold green]")
        table = Table(show_header=False, box=None)
        for column in (1, 2, 3):
            table.add_column(f"Link {column}", style="green")
        numbered = [f"{i}. {link}" for i, link in enumerate(state['available_links'], start=1)]
        for start in range(0, len(numbered), 3):
            row = numbered[start:start + 3]
            # Pad the last row so every row has exactly three cells.
            table.add_row(*row, *[''] * (3 - len(row)))
        out.print(table)

    def _display_game_info(self, state):
        """Print the target, the step count and the path walked so far."""
        out = self.console
        out.print("\n[bold yellow]Game Info:[/bold yellow]")
        out.print(f"Target Article: [bold red]{state['target_article']}[/bold red]")
        out.print(f"Steps Taken: [bold]{state['steps_taken']}[/bold]")
        out.print(f"Path: [italic]{' → '.join(state['path_taken'])}[/italic]")

    def _ask_for_article(self, label):
        """Prompt until the user enters a title that exists in the loaded data."""
        while True:
            title = Prompt.ask(label).strip()
            if title in self.env.wiki_data:
                return title
            # An unknown title would start a game on a page with no text and
            # no links, so ask again instead.
            self.console.print(f"[red]Unknown article: {title}[/red]")

    def _start_game(self):
        """Ask how to pick the start/target articles and reset the environment."""
        # Function-scope import: this module has no top-level ``random`` import.
        import random

        out = self.console
        mode = Prompt.ask("Start with [bold]r[/bold]andom articles or [bold]s[/bold]pecific ones?",
                          choices=["r", "s"],
                          default="r")
        if mode == "r":
            return self.env.reset()

        available_articles = list(self.env.wiki_data.keys())
        out.print(f"Available articles: {len(available_articles)}")
        out.print("Sample articles:")
        for article in random.sample(available_articles, min(10, len(available_articles))):
            out.print(f"- {article}")
        start = self._ask_for_article("Start article")
        target = self._ask_for_article("Target article")
        return self.env.reset(start, target)

    def _resolve_choice(self, choice, links):
        """Translate raw user input into one of ``links``.

        Args:
            choice: Text typed by the user: a 1-based link number or part of
                a link name (case-insensitive).
            links: The links currently available.

        Returns:
            The selected link, or None when the input is empty, out of range,
            matches nothing, or an ambiguous match was not resolved.
        """
        choice = choice.strip()
        if not choice:
            # An empty string is a substring of every link; treat it as
            # "no choice" rather than a match on everything.
            return None

        try:
            idx = int(choice) - 1
        except ValueError:
            pass
        else:
            return links[idx] if 0 <= idx < len(links) else None

        needle = choice.lower()
        matches = [link for link in links if needle in link.lower()]
        if not matches:
            return None
        if len(matches) == 1:
            return matches[0]

        self.console.print("[yellow]Multiple matches found:[/yellow]")
        for i, match in enumerate(matches):
            self.console.print(f"{i+1}. {match}")
        sub_choice = Prompt.ask("Choose a match (number)", default="1")
        try:
            idx = int(sub_choice) - 1
        except ValueError:
            return None
        return matches[idx] if 0 <= idx < len(matches) else None

    def _play_round(self):
        """Run a single game from setup until the target or a dead end is reached."""
        out = self.console
        out.clear()
        out.print("[bold]Welcome to Wiki Run![/bold]")
        out.print("Navigate from one Wikipedia article to another using only links.\n")
        state = self._start_game()

        while not state['is_complete']:
            out.clear()
            self._display_article(state)
            self._display_game_info(state)

            links = state['available_links']
            if not links:
                # No move is possible; end the round instead of asking for a
                # choice that can never be valid.
                out.print("\n[bold red]Dead end: this article has no links to follow. Game over.[/bold red]")
                return

            choice = Prompt.ask("\nChoose a link (number) or type part of a link name",
                                default="")
            next_article = self._resolve_choice(choice, links)
            if not next_article:
                out.print("[red]Invalid choice. Try again.[/red]")
                out.input("\nPress Enter to continue...")
                continue

            state, message = self.env.step(next_article)
            if message:
                out.print(f"[bold]{message}[/bold]")

        out.clear()
        out.print("[bold green]Congratulations! You've reached the target article![/bold green]")
        out.print(f"You took [bold]{state['steps_taken']}[/bold] steps.")
        out.print(f"Your path: [italic]{' → '.join(state['path_taken'])}[/italic]")

    def play(self):
        """Main game loop: play rounds until the user declines another one."""
        # A loop rather than recursion, so repeated games do not grow the call stack.
        while True:
            self._play_round()
            play_again = Prompt.ask("Play again? (y/n)", choices=["y", "n"], default="y")
            if play_again != "y":
                break
if __name__ == "__main__":
    import sys

    # Script entry point: the only argument is the path to the wiki data.
    cli_args = sys.argv[1:]
    if not cli_args:
        console.print("[red]Please provide the path to Wikipedia data[/red]")
        console.print("Usage: python cli.py <wiki_data_path>")
        sys.exit(1)

    wiki_data_path = cli_args[0]
    game = WikiRunCLI(wiki_data_path)
    game.play()

wiki_parser.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import bz2
+import json
+import re
+import xml.etree.ElementTree as ET
+from collections import defaultdict
+from pathlib import Path
+import os
def parse_wiki_dump(dump_path, output_path, max_articles=None):
    """
    Parse the Wikipedia XML dump and extract articles with their links.

    Only pages in the main namespace (``<ns>0</ns>``) that have text are kept.
    The XML namespace is taken from each ``page`` element's own tag instead of
    being hard-coded, so dumps using any export schema version are handled.

    Args:
        dump_path: Path to the bz2 Wikipedia dump
        output_path: Directory to save the extracted data in (created if missing)
        max_articles: Maximum number of articles to extract (None for all)
    Returns:
        The path to the saved JSON file
    """
    print(f"Parsing Wikipedia dump: {dump_path}")
    wiki_data = {}
    article_count = 0

    # The context manager closes the decompressor even on an early ``break``
    # or a parse error.
    with bz2.open(dump_path, 'rt', encoding='utf-8') as dump_file:
        for _event, elem in ET.iterparse(dump_file, events=('end',)):
            # Tags look like "{namespace-uri}page"; split off the namespace so
            # the match does not depend on the schema version.
            ns_head, ns_sep, local_tag = elem.tag.rpartition('}')
            if local_tag != 'page':
                continue
            ns_prefix = ns_head + ns_sep

            title_elem = elem.find(f'.//{ns_prefix}title')
            ns_elem = elem.find(f'.//{ns_prefix}ns')
            text_elem = elem.find(f'.//{ns_prefix}text')

            has_title = title_elem is not None and bool(title_elem.text)
            in_main_ns = ns_elem is None or ns_elem.text == '0'
            has_text = text_elem is not None and text_elem.text is not None
            if not (has_title and in_main_ns and has_text):
                # Free skipped pages too, so memory stays flat on large dumps.
                elem.clear()
                continue

            title = title_elem.text
            content = text_elem.text
            wiki_data[title] = {
                'title': title,
                'text': content,
                'links': extract_links(content),
            }
            article_count += 1
            # Clear element to save memory
            elem.clear()

            if article_count % 1000 == 0:
                print(f"Processed {article_count} articles...")
            if max_articles and article_count >= max_articles:
                break

    print(f"Extracted {article_count} articles with their links.")

    # Make the function usable on its own, not only via the script entry
    # point that pre-creates the directory.
    if output_path:
        os.makedirs(output_path, exist_ok=True)
    output_file = os.path.join(output_path, 'wiki_data.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(wiki_data, f)
    print(f"Data saved to {output_file}")
    return output_file
def extract_links(text):
    """Extract the distinct link targets from article wikitext.

    Recognises ``[[Link]]``, ``[[Link|Text]]`` and the pipe-trick form
    ``[[Link|]]``. Targets are normalised the way MediaWiki resolves titles
    (underscores become spaces, first letter upper-cased) so that ``[[dog]]``
    and ``[[New_York]]`` match the article titles ``Dog`` and ``New York``.

    Args:
        text: Raw wikitext of one article.

    Returns:
        A list of unique link targets in order of first appearance. Targets
        containing a namespace prefix are dropped, except ``Category:`` links,
        which are kept.
    """
    # Dict keys give de-duplication with a stable, first-seen order, so the
    # output is reproducible between runs (unlike list(set(...))).
    seen = {}
    for target in re.findall(r'\[\[([^|\]]+)(?:\|[^\]]*)?\]\]', text):
        # Skip non-article links (File:, Template:, interwiki, ...)
        if ':' in target and not target.startswith('Category:'):
            continue
        # Drop the section part (after '#') and normalise spacing
        target = target.split('#')[0].replace('_', ' ').strip()
        if not target:
            continue
        # Titles are case-insensitive in their first character only
        target = target[0].upper() + target[1:]
        seen.setdefault(target, None)
    return list(seen)
if __name__ == "__main__":
    import argparse

    # Command-line entry point: <dump_path> <output_path> [--max-articles N]
    cli = argparse.ArgumentParser(description='Parse Wikipedia XML dump')
    cli.add_argument('dump_path', help='Path to the Wikipedia XML dump (bz2 file)')
    cli.add_argument('output_path', help='Path to save the extracted data')
    cli.add_argument(
        '--max-articles',
        type=int,
        default=None,
        help='Maximum number of articles to extract (default: all)',
    )
    options = cli.parse_args()

    # The output directory must exist before the JSON file is written.
    os.makedirs(options.output_path, exist_ok=True)
    parse_wiki_dump(options.dump_path, options.output_path, options.max_articles)

wiki_run_engine.py ADDED Viewed

	@@ -0,0 +1,82 @@

+# Core game engine that manages:
+# 1. Loading/parsing Wikipedia data
+# 2. Game state management
+# 3. Navigation between articles
+import json
+import random
class WikiRunEnvironment:
    """Game state for Wiki Run: a current article, a target, and the path between.

    ``wiki_data`` maps an article title to a dict that may carry ``text`` and
    ``links`` entries (both are read with defaults).
    """

    def __init__(self, wiki_data_path):
        """Initialize with path to Wikipedia data"""
        self.wiki_data = self._load_wiki_data(wiki_data_path)
        self.current_article = None
        self.target_article = None
        self.path_taken = []
        self.steps = 0

    def _load_wiki_data(self, path):
        """Load Wikipedia data from JSON file"""
        print(f"Loading wiki data from {path}...")
        with open(path, 'r', encoding='utf-8') as f:
            wiki_data = json.load(f)
        print(f"Loaded {len(wiki_data)} articles")
        return wiki_data

    def _available_links(self):
        """Return the links of the current article that can actually be followed.

        Links to titles missing from ``wiki_data`` are dropped: stepping into
        one would leave the player on a page with no text and no way out. The
        target is always kept so a game with a target outside the data set can
        still be completed.
        """
        current = self.wiki_data.get(self.current_article, {})
        return [
            link for link in current.get('links', [])
            if link in self.wiki_data or link == self.target_article
        ]

    def reset(self, start_article=None, target_article=None):
        """Reset the environment with new start/target articles.

        Args:
            start_article: Title to start from; chosen at random when None.
            target_article: Title to reach; chosen at random (different from
                the start) when None.

        Returns:
            The state dict for the starting position.
        """
        if start_article is None or target_article is None:
            available_articles = list(self.wiki_data.keys())
            if start_article is None:
                start_article = random.choice(available_articles)
            if target_article is None:
                # Ensure target is different from start
                remaining = [a for a in available_articles if a != start_article]
                target_article = random.choice(remaining)
        self.current_article = start_article
        self.target_article = target_article
        self.path_taken = [start_article]
        self.steps = 0
        return self.get_current_state()

    def get_current_state(self):
        """Get the current state of the environment.

        Returns:
            None before the first ``reset()``; otherwise a dict with the keys
            ``current_article``, ``target_article``, ``article_text``,
            ``available_links``, ``steps_taken``, ``path_taken`` and
            ``is_complete``.
        """
        if self.current_article is None:
            return None
        current = self.wiki_data.get(self.current_article, {})
        return {
            'current_article': self.current_article,
            'target_article': self.target_article,
            'article_text': current.get('text', ''),
            'available_links': self._available_links(),
            'steps_taken': self.steps,
            # A copy, so callers cannot alter the recorded path by accident.
            'path_taken': list(self.path_taken),
            'is_complete': self.current_article == self.target_article
        }

    def step(self, next_article):
        """Take a step to the next article.

        Args:
            next_article: Title to move to; must be one of the current
                state's ``available_links``.

        Returns:
            A ``(state, message)`` tuple. ``state`` is None if the game was
            never reset. ``message`` is an error text for an invalid move,
            "Target reached!" on arrival at the target, and "" otherwise.
        """
        if self.current_article is None:
            return None, "Game not initialized. Call reset() first."
        if next_article not in self._available_links():
            return self.get_current_state(), f"Invalid link: {next_article} not in available links"
        self.current_article = next_article
        self.path_taken.append(next_article)
        self.steps += 1
        is_complete = self.current_article == self.target_article
        return self.get_current_state(), "Target reached!" if is_complete else ""